import pandas as pd
from lets_plot import *
# Set up lets-plot for HTML output before any plot is built.
LetsPlot.setup_html()

# Read the Netflix catalogue CSV straight from the lets-plot-docs repository.
df = pd.read_csv(
    "https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/NetFlix.csv"
)

# Keep only titles whose release year is 2000 or later.
df = df.loc[df["release_year"] >= 2000]

# Show the size of the filtered frame, then preview its first rows.
print(df.shape)
df.head()
(7338, 12)
| | show_id | type | title | director | cast | country | date_added | release_year | rating | duration | genres | description |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | s1 | TV Show | 3% | NaN | João Miguel, Bianca Comparato, Michel Gomes, R... | Brazil | 14-Aug-20 | 2020 | TV-MA | 4 | International TV Shows, TV Dramas, TV Sci-Fi &... | In a future where the elite inhabit an island ... |
1 | s10 | Movie | 1920 | Vikram Bhatt | Rajneesh Duggal, Adah Sharma, Indraneil Sengup... | India | 15-Dec-17 | 2008 | TV-MA | 143 | Horror Movies, International Movies, Thrillers | An architect and his wife move into a castle t... |
2 | s100 | Movie | 3 Heroines | Iman Brotoseno | Reza Rahadian, Bunga Citra Lestari, Tara Basro... | Indonesia | 05-Jan-19 | 2016 | TV-PG | 124 | Dramas, International Movies, Sports Movies | Three Indonesian women break records by becomi... |
3 | s1000 | Movie | Blue Mountain State: The Rise of Thadland | Lev L. Spiro | Alan Ritchson, Darin Brooks, James Cade, Rob R... | United States | 01-Mar-16 | 2016 | R | 90 | Comedies | New NFL star Thad buys his old teammates' belo... |
4 | s1001 | TV Show | Blue Planet II | NaN | David Attenborough | United Kingdom | 03-Dec-18 | 2017 | TV-G | 1 | British TV Shows, Docuseries, Science & Nature TV | This sequel to the award-winning nature series... |
# Keep movies only, and skip rows whose genres value is exactly the generic
# label "Movies".
movies_df = df[(df["type"] == "Movie") & (df["genres"] != "Movies")]

# Build a long table with one row per (movie, genre) pair: the comma-separated
# "genres" string is split into one column per genre and then melted.
# value_vars is intentionally omitted so that every split column is melted,
# however many genres a title lists, instead of assuming exactly three.
by_genre_df = pd.melt(
    movies_df["genres"].str.split(", ", expand=True).assign(
        duration=movies_df["duration"]
    ),
    id_vars=["duration"],
    value_name="genre",
)[["genre", "duration"]].dropna(subset=["genre"])

# Attach each genre's mean duration to every one of its rows and order the
# table from the longest-running genre to the shortest. transform("mean")
# broadcasts the group mean back onto the original rows as a numeric column,
# rather than using Series.replace as a genre-name -> number lookup table.
by_genre_df = by_genre_df.assign(
    duration_mean=by_genre_df.groupby("genre")["duration"].transform("mean")
).sort_values(by="duration_mean", ascending=False)
# Ridgeline chart: one density ridge of movie duration per genre, filled by
# the genre's mean duration. The x axis is log10-scaled.
(
    ggplot(by_genre_df, aes("duration", "genre"))
    + geom_area_ridges(
        aes(group="genre", fill="duration_mean"),
        scale=4,
        # The pick-sampling size equals the full row count
        # (presumably so that no rows are dropped by sampling - confirm).
        sampling=sampling_pick(len(by_genre_df)),
        tooltips=layer_tooltips().title("@genre").line("@|@duration"),
    )
    + scale_x_log10()
    + scale_fill_viridis(name="mean duration", option="plasma")
    + ggsize(800, 600)
    + ggtitle("Mean Netflix movie duration")
    + theme(axis_line_x="blank")
)