%useLatestDescriptors

%use dataframe

%use kandy

val rawDf = DataFrame.read("netflix_titles.csv")

// taking a look at types and columns
rawDf.schema()

rawDf.size() // rowsCount x columnsCount

rawDf.head() // return first five rows

// Getting general statistics and info for each columns
rawDf.describe()

val df = rawDf.dropNulls { date_added } // remove rows where `date_added` is not specified
    .convert { date_added }.toLocalDate("MMMM d, yyyy") // convert date_added to LocalDate using date pattern
    .sortBy { date_added } // and let's also sort by date for easy operation later
df

// let's look at what type of column it turned out
df.date_added.type()             

rawDf
    .valueCounts(sort = false) { type }
    .plot {
        bars {
            x(type)
            y("count")
            fillColor(type) {
                scale = categorical(range = listOf(Color.hex("#00BCD4"), Color.hex("#009688")))
            }
        }

        layout {
            title = "Count of TV Shows and Movies"
            size = 900 to 550
        }
    }

val df_date_count = df
    .convert { date_added }.with { it.year } // converting `date_added` to extract `year`
    .groupBy { date_added } // grouping by `year` stored in `date_added`
    .aggregate {
        count { type == "TV Show" } into "tvshows" // counting TV Shows into column `tvshows`
        count { type == "Movie" } into "movies" // counting Movies into column `movies`
    }
df_date_count

val df_date_count = df
    .groupBy { date_added.map { it.year } } // grouping by year added extracted from `date_added`
    .aggregate {
        count { type == "TV Show" } into "tvshows" // counting TV Shows into column `tvshows`
        count { type == "Movie" } into "movies" // counting Movies into column `movies`
    }
df_date_count

df.groupBy { date_added.map { it.year } }
    .pivot { type }

df.groupBy { date_added.map { it.year } }
    .pivot { type }.aggregate { count() }

df.groupBy { date_added.map { it.year } }
    .pivot { type }.count()

val df_date_count = df
    .groupBy { date_added.map { it.year } }.pivotCounts { type }
df_date_count

df_date_count.plot {
    x(date_added) { axis.name = "year" }

    area {
        y(type.`TV Show`) { axis.name = "count" }
        fillColor = Color.hex("#BF360C")
        borderLine.color = Color.hex("#BF360C")
        alpha = .5
    }

    area {
        y(type.Movie)
        fillColor = Color.hex("#01579B")
        borderLine.color = Color.hex("#01579B")
        alpha = .5
    }

    layout {
        title = "Number of titles by year"
        size = 800 to 500
        style {
            panel {
                background {
                    fillColor = Color.hex("#ECEFF1")
                    borderLineColor = Color.hex("#ECEFF1")
                }
                grid.lineGlobal { blank = true }
            }
        }
    }
}

val df_cumsum_titles = df_date_count
    .sortBy { date_added } // sorting by date_added
    .cumSum { type.allCols() } // count cumulative sum for columns `TV Show` and `Movie` that are nested under column `type`
df_cumsum_titles

df_cumsum_titles.plot {
    x(date_added) { axis.name = "year" }

    area {
        y(type.`TV Show`) { axis.name = "cumulative count" }
        fillColor = Color.hex("#BF360C")
        borderLine.color = Color.hex("#BF360C")
        alpha = .5
    }

    area {
        y(type.Movie)
        fillColor = Color.hex("#01579B")
        borderLine.color = Color.hex("#01579B")
        alpha = .5
    }

    layout {
        title = "Cumulative count of titles by year"
        size = 800 to 500

        style {
            panel {
                background {
                    fillColor = Color.hex("#ECEFF1")
                    borderLineColor = Color.hex("#ECEFF1")
                }
                grid.lineGlobal { blank = true }
            }
        }
    }
}

import kotlinx.datetime.*

val maxDate = df.date_added.max()
val df_days = df.add {
    "days_on_platform" from { date_added.daysUntil(maxDate) } // adding column for number of days on the platform
    "months_on_platform" from { date_added.monthsUntil(maxDate) } // adding column for number of months on the platform
    "years_on_platform" from { date_added.yearsUntil(maxDate) } // adding column for number of years on the platform
}

val p1 = df_days.select { type and days_on_platform }.plot {
    histogram(days_on_platform, binsOption = BinsOption.byNumber(30)) {
        y(Stat.density)
        fillColor = Color.hex("#ef0b0b")
        borderLine.color = Color.hex("#ECEFF1")
    }

    statBin(days_on_platform, binsOption = BinsOption.byNumber(30)) {
        area {
            x(Stat.x)
            y(Stat.density)
            alpha = .5
            fillColor = Color.hex("#0befef")
        }
    }

    layout {
        xAxisLabel = "days"
        title = "Age distribution (in days) on Netflix"
    }
}

val p2 = df_days.select { type and days_on_platform }.plot {
    boxplot(x = type, y = days_on_platform) {
        boxes {
            fillColor(Stat.x) {
                scale = categorical(range = listOf(Color.hex("#792020"), Color.hex("#207979")))
            }
        }
    }
    layout {
        yAxisLabel = "days"
        title = "Boxplot for age (in days) by type"
    }
}

plotBunch {
    add(p1, 0, 0, 500, 450)
    add(p2, 500, 0, 500, 450)
}

df_days.valueCounts(sort = false) { type and years_on_platform }.plot {
    bars {
        x(years_on_platform) { axis.name = "years" }
        y("count")
        fillColor(type) {
            scale = categorical(range = listOf(Color.hex("#bc3076"), Color.hex("#30bc76")))
        }
        position = Position.dodge()
    }
    layout {
        title = "Years of Movies and TV Shows on Netflix"
        size = 900 to 500
    }
}

val df_years = df
    // adding a new column of the difference between the year of release and the year of addition
    .add("years_off_platform") {
        date_added.year - release_year
    }
    // dropping negative values and equal to zero
    .filter { "years_off_platform"<Int>() > 0 }
df_years

df_years.valueCounts(false) { years_off_platform }.plot {
    x(years_off_platform) { axis.name = "years" }
    points {
        y("count")
        size = 7.5
        color(years_off_platform) {
            scale = continuous(range = Color.hex("#97a6d9")..Color.hex("#00256e"))
        }
    }
    layout {
        title = "How long does it take for a title to be added to Netflix?"
        size = 1000 to 500
    }
}

// Top 5 oldest movies
df_days
    .filter { type == "Movie" } // filtering by type
    .sortByDesc { days_on_platform } // sorting by number of days on Netflix
    .select { cols(type, title, country, date_added, release_year, duration) } // selecting required columns
    .head() // taking first five rows

// Top 5 newest movies
df_days
    .filter { type == "Movie" }
    .sortBy { days_on_platform }
    .select { cols(type, title, country, date_added, release_year, duration) }
    .head()

// Top 5 oldest shows
df_days
    .filter { type == "TV Show" }
    .sortByDesc { days_on_platform }
    .select { cols(type, title, country, date_added, release_year, duration) }
    .head()

// Top 5 newest shows
df_days
    .filter { type == "TV Show" }
    .sortBy { days_on_platform }
    .select { cols(type, title, country, date_added, release_year, duration) }
    .head()

val df_split_date = df
    // splitting dates into four columns
    .split { date_added }.by { listOf(it, it.dayOfWeek, it.month, it.year) }
    .into("date", "day", "month", "year")
    .sortBy("month") // sorting by month
df_split_date

df_split_date
    .valueCounts(false) { year and month }
    .plot {
        tiles {
            x(year)
            y(month)
            width = .9
            height = .9
            fillColor("count") {
                scale = continuous(range = Color.hex("#FFF3E0")..Color.hex("#E65100"))
            }
        }

        layout {
            title = "Content additions by month and year"
            size = 900 to 700
            style {
                panel {
                    background {
                        blank = true
                    }
                    grid.lineGlobal { blank = true }
                }
            }
        }
    }

// splitting cast and couting number of actors
val cast_df = df
    .split { cast }.by(',').inplace()
    .add("size_cast") { "cast"<List<String>>().size }
    .convert { date_added } // Since we need the time in milliseconds since epoch for the plots, let's convert date_added to an Instant
            .with { it.atStartOfDayIn(TimeZone.UTC) }  
cast_df

cast_df.plot { 
    histogram(size_cast, binsOption = BinsOption.byNumber(50)) {
        fillColor(Stat.count) { 
            scale = continuous(range = Color.hex("#E0F7FA")..Color.hex("#006064"))
            legend {
                type = LegendType.None
            }
        }
    }
    layout {
        xAxisLabel = "actors"
        title = "Number of people on cast"
        size = 950 to 650
    }
}

// counting the participation of each actor
val actors_df = cast_df.cast.explode().valueCounts()
actors_df

actors_df.take(30).plot { 
    barsH {
        y(cast) { scale = categorical() }
        x(count)
        fillColor(cast) { 
            scale = categoricalColorHue()
            legend {
                type = LegendType.None
            }
        }
    }
    layout.title = "Top 30 actors"
    layout.size = 950 to 900
}

val actors = cast_df.pivot { type }.aggregate {
    cast.explode().valueCounts()
}
actors

val p1 = actors.`TV Show`.take(30).plot {
    barsH {
        x(count)
        y(cast)
        fillColor(cast) {
            scale = continuous(Color.hex("#263238")..Color.hex("#ECEFF1"))
            legend { type = LegendType.None }
        }
    }
    layout.title = "Top 30 actors in Shows"
}

val p2 = actors.Movie.take(30).plot {
    barsH {
        x(count)
        y(cast)
        fillColor(cast) {
            scale = continuousColorGradientN(listOf(Color.hex("#006064"), Color.hex("#E0F7FA")))
            legend { type = LegendType.None }
        }
    }
    layout.title = "Top 30 actors in Movies"
}

plotBunch {
    add(p1, 0, 0, 500, 700)
    add(p2, 500, 0, 500, 700)
}

val directors_df = df.valueCounts { director }

directors_df.take(10).plot {
    barsH {
        x(count)
        y(director) { axis.name = "Name" }
        fillColor(director) { 
            scale = categoricalColorHue()
            legend { type = LegendType.None }
        }
    }
    layout.title = "Top 10 directors"
    layout.size = 850 to 500
}

%use lets-plot
%use lets-plot-gt(gt=30.1)

USE {
    repository("https://repo.osgeo.org/repository/release")
    dependencies {
        implementation("org.geotools:gt-shapefile:30.1")
        implementation("org.geotools:gt-cql:30.1")
    }
}

import org.geotools.data.shapefile.ShapefileDataStoreFactory
import org.geotools.data.simple.SimpleFeatureCollection
import java.net.URL

val factory = ShapefileDataStoreFactory()

val worldFeatures: SimpleFeatureCollection = with("naturalearth_lowres") {
    val url = "https://raw.githubusercontent.com/JetBrains/lets-plot-kotlin/master/docs/examples/shp/${this}/${this}.shp"
    factory.createDataStore(URL(url)).featureSource.features
}

// Convert Feature Collection to SpatialDataset.
// Use 10 decimals to encode floating point numbers (this is the default).
val world = worldFeatures.toSpatialDataset(10)
val voidTheme = theme(
    axisTitle = "blank",
    axisLine = "blank",
    axisTicks = "blank",
    axisText = "blank",
)
val worldLimits = coordMap(ylim = -55 to 85)

val countries = DataFrame.readCSV("country_codes.csv")
countries.head()

// counting number of titles by county and joining them with country codes dataframe
val df_country = df.valueCounts { country }.join(countries)
df_country

ggplot() +
        geomMap(
            data = df_country.toMap(),
            map = world,
            mapJoin = "iso" to "iso_a3",
            color = "white",
        ) { fill = "count" } +
        scaleFillGradient(
            low = "#FFF3E0",
            high = "#E65100",
            name = "Number of Titles",
        ) +
        ggsize(width = 1000, height = 800) +
        voidTheme +
        worldLimits

df_country[0..9].sortByDesc { count }.plot {
    bars {
        x(country)
        y(count)
        fillColor = Color.hex("#00796B")
    }
    layout.title = "Top 10 Countries"
    layout.size = 900 to 450
}

val df_dur = df
    .split { duration }.by(" ").inward("duration_num", "duration_scale") // splitting duration by time and scale inward
    .convert { "duration"["duration_num"] }.toInt() // converting by column path
    .update { "duration"["duration_scale"] }.with { if (it == "Seasons") "Season" else it }
df_dur.head()

val durations = df_dur.pivot { type }.values { duration }
durations

val p1 = durations.Movie.plot {
    histogram(duration_num, binsOption = BinsOption.byNumber(100)) {
        y(Stat.density)
        fillColor = Color.hex("#00BCD4")
    }

    statBin(duration_num, binsOption = BinsOption.byNumber(25)) {
        line {
            x(Stat.x) { axis.name = "minutes" }
            y(Stat.density) { axis.name = "density" }
            alpha = 1.0
            width = 1.0
            color = Color.hex("#d41900")
        }
    }

    layout.title = "Distribution of movies duration in minutes"
}

val p2 = durations.`TV Show`.plot {
    statBin(duration_num, binsOption = BinsOption.byNumber(15)) {
        bars {
            x(Stat.x)
            y(Stat.count)
            fillColor = Color.hex("#00BCD4")
        }
    }
}

plotBunch {
    add(p1, 0, 0, 1000, 500)
    add(p2, 0, 500, 1000, 500)
}

df_dur.xs("Movie") { type }
    .sortByDesc { duration.duration_num }.head()
    .select { title and country and date_added and release_year and duration.all() }

df_dur.xs("TV Show") { type }
    .sortByDesc { duration.duration_num }.head()
    .select { title and country and date_added and release_year and duration.all() }

val list_top_countries = df_country.country.take(10).toSet()

val df_cntr = df_dur
    .filter { country in list_top_countries }
    .pivot { type }.aggregate { 
        groupBy { country }.mean { duration.duration_num }
    }
df_cntr

val p1 = df_cntr.Movie.sortBy { duration_num }.plot {
    bars {
        x(country) { axis.name = "Name" }
        y(duration_num) { axis.name = "Minute" }
        fillColor(duration_num) {
            scale = continuous(Color.hex("#ECEFF1")..Color.hex("#263238"))
            legend.type = LegendType.None
        }
    }
    layout.title = "Top 10 cast on Movies by country"
}

val p2 = df_cntr.`TV Show`.sortBy { duration_num }.plot {
    bars {
        x(country) { axis.name = "Name" }
        y(duration_num) { axis.name = "Season" }
        fillColor(duration_num) {
            scale = continuous(Color.hex("#E0F7FA")..Color.hex("#006064"))
            legend.type = LegendType.None
        }
    }
    layout.title = "Top 10 cast on TV Shows by country"
}

plotBunch {
    add(p1, 0, 0, 900, 550)
    add(p2, 0, 550, 900, 550)
}

val dfInstants = df.convert { date_added }.with { it.atStartOfDayIn(TimeZone.UTC) }

dfInstants.valueCounts(false) { rating }.sortBy("count").plot {
    bars {
        x(rating)
        y("count")
        fillColor(rating) { 
            scale = categoricalColorHue()
            legend.type = LegendType.None
        }
    }
    layout.title = "Rating of Titles"
    layout.size = 950 to 500
}

dfInstants.valueCounts(sort = false) { rating and type }.plot {
    bars {
        x(rating)
        y("count")
        fillColor(type) { scale = categorical(listOf(Color.hex("#607D8B"), Color.hex("#00BCD4"))) }
        position = Position.dodge()
    }
    layout.title = "Rating of Titles"
    layout.size = 950 to 500
}