%useLatestDescriptors %use dataframe %use kandy val rawDf = DataFrame.read("netflix_titles.csv") // taking a look at types and columns rawDf.schema() rawDf.size() // rowsCount x columnsCount rawDf.head() // return first five rows // Getting general statistics and info for each columns rawDf.describe() val df = rawDf.dropNulls { date_added } // remove rows where `date_added` is not specified .convert { date_added }.toLocalDate("MMMM d, yyyy") // convert date_added to LocalDate using date pattern .sortBy { date_added } // and let's also sort by date for easy operation later df // let's look at what type of column it turned out df.date_added.type() rawDf .valueCounts(sort = false) { type } .plot { bars { x(type) y("count") fillColor(type) { scale = categorical(range = listOf(Color.hex("#00BCD4"), Color.hex("#009688"))) } } layout { title = "Count of TV Shows and Movies" size = 900 to 550 } } val df_date_count = df .convert { date_added }.with { it.year } // converting `date_added` to extract `year` .groupBy { date_added } // grouping by `year` stored in `date_added` .aggregate { count { type == "TV Show" } into "tvshows" // counting TV Shows into column `tvshows` count { type == "Movie" } into "movies" // counting Movies into column `movies` } df_date_count val df_date_count = df .groupBy { date_added.map { it.year } } // grouping by year added extracted from `date_added` .aggregate { count { type == "TV Show" } into "tvshows" // counting TV Shows into column `tvshows` count { type == "Movie" } into "movies" // counting Movies into column `movies` } df_date_count df.groupBy { date_added.map { it.year } } .pivot { type } df.groupBy { date_added.map { it.year } } .pivot { type }.aggregate { count() } df.groupBy { date_added.map { it.year } } .pivot { type }.count() val df_date_count = df .groupBy { date_added.map { it.year } }.pivotCounts { type } df_date_count df_date_count.plot { x(date_added) { axis.name = "year" } area { y(type.`TV Show`) { axis.name = "count" } fillColor = Color.hex("#BF360C") borderLine.color = Color.hex("#BF360C") alpha = .5 } area { y(type.Movie) fillColor = Color.hex("#01579B") borderLine.color = Color.hex("#01579B") alpha = .5 } layout { title = "Number of titles by year" size = 800 to 500 style { panel { background { fillColor = Color.hex("#ECEFF1") borderLineColor = Color.hex("#ECEFF1") } grid.lineGlobal { blank = true } } } } } val df_cumsum_titles = df_date_count .sortBy { date_added } // sorting by date_added .cumSum { type.allCols() } // count cumulative sum for columns `TV Show` and `Movie` that are nested under column `type` df_cumsum_titles df_cumsum_titles.plot { x(date_added) { axis.name = "year" } area { y(type.`TV Show`) { axis.name = "cumulative count" } fillColor = Color.hex("#BF360C") borderLine.color = Color.hex("#BF360C") alpha = .5 } area { y(type.Movie) fillColor = Color.hex("#01579B") borderLine.color = Color.hex("#01579B") alpha = .5 } layout { title = "Cumulative count of titles by year" size = 800 to 500 style { panel { background { fillColor = Color.hex("#ECEFF1") borderLineColor = Color.hex("#ECEFF1") } grid.lineGlobal { blank = true } } } } } import kotlinx.datetime.* val maxDate = df.date_added.max() val df_days = df.add { "days_on_platform" from { date_added.daysUntil(maxDate) } // adding column for number of days on the platform "months_on_platform" from { date_added.monthsUntil(maxDate) } // adding column for number of months on the platform "years_on_platform" from { date_added.yearsUntil(maxDate) } // adding column for number of years on the platform } val p1 = df_days.select { type and days_on_platform }.plot { histogram(days_on_platform, binsOption = BinsOption.byNumber(30)) { y(Stat.density) fillColor = Color.hex("#ef0b0b") borderLine.color = Color.hex("#ECEFF1") } statBin(days_on_platform, binsOption = BinsOption.byNumber(30)) { area { x(Stat.x) y(Stat.density) alpha = .5 fillColor = Color.hex("#0befef") } } layout { xAxisLabel = "days" title = "Age distribution (in days) on Netflix" } } val p2 = df_days.select { type and days_on_platform }.plot { boxplot(x = type, y = days_on_platform) { boxes { fillColor(Stat.x) { scale = categorical(range = listOf(Color.hex("#792020"), Color.hex("#207979"))) } } } layout { yAxisLabel = "days" title = "Boxplot for age (in days) by type" } } plotBunch { add(p1, 0, 0, 500, 450) add(p2, 500, 0, 500, 450) } df_days.valueCounts(sort = false) { type and years_on_platform }.plot { bars { x(years_on_platform) { axis.name = "years" } y("count") fillColor(type) { scale = categorical(range = listOf(Color.hex("#bc3076"), Color.hex("#30bc76"))) } position = Position.dodge() } layout { title = "Years of Movies and TV Shows on Netflix" size = 900 to 500 } } val df_years = df // adding a new column of the difference between the year of release and the year of addition .add("years_off_platform") { date_added.year - release_year } // dropping negative values and equal to zero .filter { "years_off_platform"() > 0 } df_years df_years.valueCounts(false) { years_off_platform }.plot { x(years_off_platform) { axis.name = "years" } points { y("count") size = 7.5 color(years_off_platform) { scale = continuous(range = Color.hex("#97a6d9")..Color.hex("#00256e")) } } layout { title = "How long does it take for a title to be added to Netflix?" size = 1000 to 500 } } // Top 5 oldest movies df_days .filter { type == "Movie" } // filtering by type .sortByDesc { days_on_platform } // sorting by number of days on Netflix .select { cols(type, title, country, date_added, release_year, duration) } // selecting required columns .head() // taking first five rows // Top 5 newest movies df_days .filter { type == "Movie" } .sortBy { days_on_platform } .select { cols(type, title, country, date_added, release_year, duration) } .head() // Top 5 oldest shows df_days .filter { type == "TV Show" } .sortByDesc { days_on_platform } .select { cols(type, title, country, date_added, release_year, duration) } .head() // Top 5 newest shows df_days .filter { type == "TV Show" } .sortBy { days_on_platform } .select { cols(type, title, country, date_added, release_year, duration) } .head() val df_split_date = df // splitting dates into four columns .split { date_added }.by { listOf(it, it.dayOfWeek, it.month, it.year) } .into("date", "day", "month", "year") .sortBy("month") // sorting by month df_split_date df_split_date .valueCounts(false) { year and month } .plot { tiles { x(year) y(month) width = .9 height = .9 fillColor("count") { scale = continuous(range = Color.hex("#FFF3E0")..Color.hex("#E65100")) } } layout { title = "Content additions by month and year" size = 900 to 700 style { panel { background { blank = true } grid.lineGlobal { blank = true } } } } } // splitting cast and couting number of actors val cast_df = df .split { cast }.by(',').inplace() .add("size_cast") { "cast">().size } .convert { date_added } // Since we need the time in milliseconds since epoch for the plots, let's convert date_added to an Instant .with { it.atStartOfDayIn(TimeZone.UTC) } cast_df cast_df.plot { histogram(size_cast, binsOption = BinsOption.byNumber(50)) { fillColor(Stat.count) { scale = continuous(range = Color.hex("#E0F7FA")..Color.hex("#006064")) legend { type = LegendType.None } } } layout { xAxisLabel = "actors" title = "Number of people on cast" size = 950 to 650 } } // counting the participation of each actor val actors_df = cast_df.cast.explode().valueCounts() actors_df actors_df.take(30).plot { barsH { y(cast) { scale = categorical() } x(count) fillColor(cast) { scale = categoricalColorHue() legend { type = LegendType.None } } } layout.title = "Top 30 actors" layout.size = 950 to 900 } val actors = cast_df.pivot { type }.aggregate { cast.explode().valueCounts() } actors val p1 = actors.`TV Show`.take(30).plot { barsH { x(count) y(cast) fillColor(cast) { scale = continuous(Color.hex("#263238")..Color.hex("#ECEFF1")) legend { type = LegendType.None } } } layout.title = "Top 30 actors in Shows" } val p2 = actors.Movie.take(30).plot { barsH { x(count) y(cast) fillColor(cast) { scale = continuousColorGradientN(listOf(Color.hex("#006064"), Color.hex("#E0F7FA"))) legend { type = LegendType.None } } } layout.title = "Top 30 actors in Movies" } plotBunch { add(p1, 0, 0, 500, 700) add(p2, 500, 0, 500, 700) } val directors_df = df.valueCounts { director } directors_df.take(10).plot { barsH { x(count) y(director) { axis.name = "Name" } fillColor(director) { scale = categoricalColorHue() legend { type = LegendType.None } } } layout.title = "Top 10 directors" layout.size = 850 to 500 } %use lets-plot %use lets-plot-gt(gt=30.1) USE { repository("https://repo.osgeo.org/repository/release") dependencies { implementation("org.geotools:gt-shapefile:30.1") implementation("org.geotools:gt-cql:30.1") } } import org.geotools.data.shapefile.ShapefileDataStoreFactory import org.geotools.data.simple.SimpleFeatureCollection import java.net.URL val factory = ShapefileDataStoreFactory() val worldFeatures: SimpleFeatureCollection = with("naturalearth_lowres") { val url = "https://raw.githubusercontent.com/JetBrains/lets-plot-kotlin/master/docs/examples/shp/${this}/${this}.shp" factory.createDataStore(URL(url)).featureSource.features } // Convert Feature Collection to SpatialDataset. // Use 10 decimals to encode floating point numbers (this is the default). val world = worldFeatures.toSpatialDataset(10) val voidTheme = theme( axisTitle = "blank", axisLine = "blank", axisTicks = "blank", axisText = "blank", ) val worldLimits = coordMap(ylim = -55 to 85) val countries = DataFrame.readCSV("country_codes.csv") countries.head() // counting number of titles by county and joining them with country codes dataframe val df_country = df.valueCounts { country }.join(countries) df_country ggplot() + geomMap( data = df_country.toMap(), map = world, mapJoin = "iso" to "iso_a3", color = "white", ) { fill = "count" } + scaleFillGradient( low = "#FFF3E0", high = "#E65100", name = "Number of Titles", ) + ggsize(width = 1000, height = 800) + voidTheme + worldLimits df_country[0..9].sortByDesc { count }.plot { bars { x(country) y(count) fillColor = Color.hex("#00796B") } layout.title = "Top 10 Countries" layout.size = 900 to 450 } val df_dur = df .split { duration }.by(" ").inward("duration_num", "duration_scale") // splitting duration by time and scale inward .convert { "duration"["duration_num"] }.toInt() // converting by column path .update { "duration"["duration_scale"] }.with { if (it == "Seasons") "Season" else it } df_dur.head() val durations = df_dur.pivot { type }.values { duration } durations val p1 = durations.Movie.plot { histogram(duration_num, binsOption = BinsOption.byNumber(100)) { y(Stat.density) fillColor = Color.hex("#00BCD4") } statBin(duration_num, binsOption = BinsOption.byNumber(25)) { line { x(Stat.x) { axis.name = "minutes" } y(Stat.density) { axis.name = "density" } alpha = 1.0 width = 1.0 color = Color.hex("#d41900") } } layout.title = "Distribution of movies duration in minutes" } val p2 = durations.`TV Show`.plot { statBin(duration_num, binsOption = BinsOption.byNumber(15)) { bars { x(Stat.x) y(Stat.count) fillColor = Color.hex("#00BCD4") } } } plotBunch { add(p1, 0, 0, 1000, 500) add(p2, 0, 500, 1000, 500) } df_dur.xs("Movie") { type } .sortByDesc { duration.duration_num }.head() .select { title and country and date_added and release_year and duration.all() } df_dur.xs("TV Show") { type } .sortByDesc { duration.duration_num }.head() .select { title and country and date_added and release_year and duration.all() } val list_top_countries = df_country.country.take(10).toSet() val df_cntr = df_dur .filter { country in list_top_countries } .pivot { type }.aggregate { groupBy { country }.mean { duration.duration_num } } df_cntr val p1 = df_cntr.Movie.sortBy { duration_num }.plot { bars { x(country) { axis.name = "Name" } y(duration_num) { axis.name = "Minute" } fillColor(duration_num) { scale = continuous(Color.hex("#ECEFF1")..Color.hex("#263238")) legend.type = LegendType.None } } layout.title = "Top 10 cast on Movies by country" } val p2 = df_cntr.`TV Show`.sortBy { duration_num }.plot { bars { x(country) { axis.name = "Name" } y(duration_num) { axis.name = "Season" } fillColor(duration_num) { scale = continuous(Color.hex("#E0F7FA")..Color.hex("#006064")) legend.type = LegendType.None } } layout.title = "Top 10 cast on TV Shows by country" } plotBunch { add(p1, 0, 0, 900, 550) add(p2, 0, 550, 900, 550) } val dfInstants = df.convert { date_added }.with { it.atStartOfDayIn(TimeZone.UTC) } dfInstants.valueCounts(false) { rating }.sortBy("count").plot { bars { x(rating) y("count") fillColor(rating) { scale = categoricalColorHue() legend.type = LegendType.None } } layout.title = "Rating of Titles" layout.size = 950 to 500 } dfInstants.valueCounts(sort = false) { rating and type }.plot { bars { x(rating) y("count") fillColor(type) { scale = categorical(listOf(Color.hex("#607D8B"), Color.hex("#00BCD4"))) } position = Position.dodge() } layout.title = "Rating of Titles" layout.size = 950 to 500 }