import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
from plotly.colors import n_colors
pd.options.plotting.backend = "plotly"
import plotly.io as pio
init_notebook_mode()
# For the sake of convenience, the data has been loaded as a Gist.
df = pd.read_csv("https://gist.github.com/drorata/385bbf8a0c5405b4ea5e22fd1ea7474d/raw/a02e19b941aae7ad2ce67bddef3f8d4764dad9c2/data.csv.gz")
df
cntry | cname | cedition | cproddat | cseqno | name | essround | edition | idno | dweight | pspwght | pweight | anweight | netustm | gndr | agea | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AT | ESS1-9e01 | 1.0 | 10.12.2020 | 1 | ESS1e06_6 | 1 | 6.6 | 1 | 0.945200 | 0.940933 | 0.271488 | 0.255452 | NaN | 1 | 54 |
1 | AT | ESS1-9e01 | 1.0 | 10.12.2020 | 2 | ESS1e06_6 | 1 | 6.6 | 2 | 0.472600 | 0.470466 | 0.271488 | 0.127726 | NaN | 1 | 50 |
2 | AT | ESS1-9e01 | 1.0 | 10.12.2020 | 3 | ESS1e06_6 | 1 | 6.6 | 3 | 0.945200 | 1.392155 | 0.271488 | 0.377953 | NaN | 2 | 63 |
3 | AT | ESS1-9e01 | 1.0 | 10.12.2020 | 4 | ESS1e06_6 | 1 | 6.6 | 4 | 0.945200 | 1.382163 | 0.271488 | 0.375240 | NaN | 1 | 44 |
4 | AT | ESS1-9e01 | 1.0 | 10.12.2020 | 5 | ESS1e06_6 | 1 | 6.6 | 6 | 1.890500 | 1.437766 | 0.271488 | 0.390336 | NaN | 2 | 41 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
106673 | IL | ESS1-9e01 | 1.0 | 10.12.2020 | 268361 | ESS8e02_2 | 8 | 2.2 | 8021 | 1.118244 | 0.682302 | 0.239750 | 0.163582 | 6666.0 | 2 | 65 |
106674 | IL | ESS1-9e01 | 1.0 | 10.12.2020 | 268362 | ESS8e02_2 | 8 | 2.2 | 8022 | 1.961206 | 2.062875 | 0.239750 | 0.494574 | 6666.0 | 2 | 70 |
106675 | IL | ESS1-9e01 | 1.0 | 10.12.2020 | 268363 | ESS8e02_2 | 8 | 2.2 | 8023 | 0.559122 | 0.341151 | 0.239750 | 0.081791 | 120.0 | 2 | 65 |
106676 | IL | ESS1-9e01 | 1.0 | 10.12.2020 | 268364 | ESS8e02_2 | 8 | 2.2 | 8024 | 1.118244 | 0.682302 | 0.239750 | 0.163582 | 6666.0 | 2 | 79 |
106677 | IL | ESS1-9e01 | 1.0 | 10.12.2020 | 268365 | ESS8e02_2 | 8 | 2.2 | 8025 | 1.118244 | 0.682302 | 0.239750 | 0.163582 | 300.0 | 2 | 59 |
106678 rows × 16 columns
_tmp = pd.DataFrame(df.cntry.value_counts(normalize=True)).reset_index()
fig = px.pie(
_tmp,
values="cntry",
names="index",
height=600,
title="Countries representation in the data"
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig
Note that countries representation is note uniform. This has to be kept in mind when reviewing the data.
fig = (
df[df.gndr != 9]
.groupby(["cntry", "gndr"])
.size()
.unstack()
# Taken from https://stackoverflow.com/a/46088484/671013
.assign(gender_1_ratio=lambda x: x[1] / x.sum(1))["gender_1_ratio"]
.plot.bar()
)
fig.update_layout(title="Ratio of gender (1) by country")
fig
px.histogram(
# The value 9 for gender is present in some of the countries;
# For better visualization it is ignored.
df[df.gndr!=9],
x="cntry",
color="gndr",
barmode='group',
histnorm='percent',
title="Distribution of genders by countries"
)
Starting with looking into the distribution of the ages
fig = make_subplots(
rows=1, cols=2, column_widths=[8, 3], specs=[[{"type": "xy"}, {"type": "domain"}]]
)
_tmp = pd.DataFrame(df.agea.describe().reset_index().rename({"index": "stat"}, axis=1))
fig.add_trace(
go.Table(
header=dict(values=list(_tmp.columns), align="center"),
cells=dict(values=[_tmp.stat, _tmp.agea], align="center",),
),
row=1,
col=2,
)
fig.add_trace(df.agea.hist().data[0])
fig.update_layout({"title": "Distribution of the `agea` values"})
We witness an outlier at 999. The documentation doesn't explain what is the meaning of this value. It will be ignored; yielding a much cleaner distribution of ages.
df[df.agea < 999].agea.hist(title="Filtered `agea` distribution")
Similarly, we review the netustm
.
The documentation mentions four values:
These values will be excluded from the review. Furthermore, the resultion will be hours (rather than minutes).
fig = make_subplots(
rows=1, cols=2, column_widths=[8, 3], specs=[[{"type": "xy"}, {"type": "domain"}]]
)
_tmp = pd.DataFrame(
(df[~df.netustm.isin([6666, 7777, 8888, 9999])].netustm / 60)
.describe()
.reset_index()
.rename({"index": "stat"}, axis=1)
)
fig.add_trace(
go.Table(
header=dict(values=list(_tmp.columns), align="center"),
cells=dict(values=[_tmp.stat, _tmp.netustm], align="center",),
),
row=1,
col=2,
)
fig.add_trace(
(df[~df.netustm.isin([6666, 7777, 8888, 9999])]["netustm"] / 60)
.round()
.hist()
.data[0]
)
fig.update_layout({"title": "Distribution of the `netustm` values (in hours per day)"})
Lastly, prepare a clean table with the data of interest
clean_age_net_use_df = df[
# Removing not useful usage time values
(~df.netustm.isin([6666, 7777, 8888, 9999]))
# Removing NULL values for the usage time
& (~df.netustm.isna())
# Retaining meaningful age data
& (df.agea < 999)
][["agea", "netustm", "cntry"]]
# For clearer story, rounding usage from minutes per day to hours per day
clean_age_net_use_df["netustm_hr"] = (clean_age_net_use_df.netustm / 60)
# Lastly, binning the age to age groups
bins= [0, 20, 30, 40, 50, 60, 70, 200]
labels = ['19-','20th','30th','40th','50th', "60th", "70th+"]
clean_age_net_use_df["age_group"] = pd.cut(
clean_age_net_use_df.agea,
bins=bins, labels=labels
)
clean_age_net_use_df
agea | netustm | cntry | netustm_hr | age_group | |
---|---|---|---|---|---|
8713 | 34 | 180.0 | AT | 3.0 | 30th |
8714 | 52 | 120.0 | AT | 2.0 | 50th |
8716 | 54 | 120.0 | AT | 2.0 | 50th |
8717 | 20 | 180.0 | AT | 3.0 | 19- |
8718 | 65 | 120.0 | AT | 2.0 | 60th |
... | ... | ... | ... | ... | ... |
106669 | 45 | 30.0 | IL | 0.5 | 40th |
106670 | 55 | 30.0 | IL | 0.5 | 50th |
106672 | 49 | 60.0 | IL | 1.0 | 40th |
106675 | 65 | 120.0 | IL | 2.0 | 60th |
106677 | 59 | 300.0 | IL | 5.0 | 50th |
17473 rows × 5 columns
ages = sorted(clean_age_net_use_df.agea.unique())
print(f"The minimal age in the cleaned data is: {ages[0]} and the maximal is {ages[-1]}")
The minimal age in the cleaned data is: 15 and the maximal is 93
Note that due to the restrictions on the net usage, the maximal age is now 93.
px.pie(
clean_age_net_use_df,
names="age_group",
title="Age groups distribution"
)
(
clean_age_net_use_df.groupby(["cntry", "age_group"])
.size()
.unstack()
.apply(lambda x: 100 * x / clean_age_net_use_df.groupby("cntry").size()[x.index])
.transpose()
.plot(title="Age group distribution by country")
)
We start with the distributions of the hour-usage by age (not age groups!):
colors = n_colors("rgb(5, 200, 200)", "rgb(200, 10, 10)", len(ages), colortype="rgb")
fig = go.Figure()
for age, color in zip(ages, colors):
fig.add_trace(
go.Violin(
x=clean_age_net_use_df[clean_age_net_use_df.agea == age].netustm,
line_color=color,
name=str(age),
)
)
fig.update_annotations()
fig.update_traces(
orientation="h", side="positive", width=3, points="suspectedoutliers",
)
fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False, height=800)
fig.show()
This is a very noise visualization that doesn't communicate well any story. We will recreate it with specified age group:
colors = n_colors(
"rgb(5, 200, 200)",
"rgb(200, 10, 10)",
len(clean_age_net_use_df.age_group.unique()),
colortype="rgb",
)
fig = go.Figure()
for age, color in zip(sorted(clean_age_net_use_df.age_group.unique()), colors):
fig.add_trace(
go.Violin(
x=clean_age_net_use_df[clean_age_net_use_df.age_group == age].netustm_hr,
line_color=color,
name=str(age),
)
)
fig.update_annotations()
fig.update_traces(
orientation="h", side="positive", width=3, points="suspectedoutliers",
)
fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False, height=800, title="Hours spent online by age group")
fig.show()
The wiggling in the graphs of the smaller age groups is due to the fact that the values are not continuous.
Insight: There's a clear correlation between the age group and the time spent online; the younger you are the more time you're going to spent online.
colors = n_colors(
"rgb(5, 200, 200)",
"rgb(200, 10, 10)",
len(clean_age_net_use_df.cntry.unique()),
colortype="rgb",
)
fig = go.Figure()
for country, color in zip(sorted(clean_age_net_use_df.cntry.unique()), colors):
fig.add_trace(
go.Violin(
x=clean_age_net_use_df[clean_age_net_use_df.cntry == country].netustm_hr,
line_color=color,
name=str(country),
)
)
fig.update_annotations()
fig.update_traces(
orientation="h", side="positive", width=3, points="suspectedoutliers",
)
fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False, height=800, title="Hours spent online by country")
fig.show()
Insight: In CH and DE people spend less time online compared to CZ, GB and IL. AT seems to have data issues related to this attribute.
fig = go.Figure(
go.Box(
x=clean_age_net_use_df["cntry"],
y=clean_age_net_use_df["netustm"],
boxmean="sd",
)
)
fig.update_layout({"height": 500})
Using Plotly has a great advantage, the plots are interactive and pretty. But, why are they static here you ask yourselves? Well, I don't know how to embed the interactive plots in the blog posts. Feel free to add in the comments ideas and discuss how to work this around.
In the meantime, I put the notebook that is used for creating this post in gist and it can be viewed interactively using NBViewer. Note it is not the very same notebook 🧐; this section is not available there.