https://github.com/BMClab/covid19
Laboratory of Biomechanics and Motor Control
Federal University of ABC, Brazil
The data used in this Jupyter notebook are available on the Figshare repository https://doi.org/10.6084/m9.figshare.16620238.
import sys, os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
pd.options.plotting.backend = "plotly"
from tqdm.notebook import tqdm
import pycountry_convert as pc
%load_ext watermark
%watermark
%watermark --iversions
Last updated: 2021-09-14T00:07:16.488680-03:00 Python implementation: CPython Python version : 3.8.10 IPython version : 7.27.0 Compiler : GCC 9.3.0 OS : Linux Release : 5.11.0-34-generic Machine : x86_64 Processor : x86_64 CPU cores : 12 Architecture: 64bit numpy : 1.20.3 plotly : 5.3.1 autopep8 : 1.5.6 matplotlib : 3.4.3 pycountry_convert: 0.7.2 seaborn : 0.11.2 json : 2.0.9 pandas : 1.3.3 sys : 3.8.10 | packaged by conda-forge | (default, May 11 2021, 07:01:05) [GCC 9.3.0]
path2 = r'./../data/'
pd.set_option('display.float_format', lambda x: '%.4g' % x)
plt.rcParams.update({'font.size': 14, 'xtick.labelsize': 12, 'ytick.labelsize': 12})
sns.set_style('whitegrid', rc={'xtick.bottom': True, 'xtick.top': True, 'ytick.left': True,
'ytick.right': True, 'xtick.direction': 'in', 'ytick.direction': 'in'})
colors = sns.color_palette()
colors
df = pd.read_parquet(os.path.join(path2, 'run_ww_2019_d.parquet'))
df['athlete'] = df['athlete'].astype('category') # bug in parquet
df = df[['athlete', 'gender', 'age_group', 'country', 'major']
].drop_duplicates(subset='athlete').sort_values('athlete').reset_index(drop=True)
display(df)
athlete | gender | age_group | country | major | |
---|---|---|---|---|---|
0 | 0 | F | 18 - 34 | United States | CHICAGO 2019 |
1 | 1 | M | 35 - 54 | Germany | BERLIN 2016 |
2 | 2 | M | 35 - 54 | United Kingdom | LONDON 2018,LONDON 2019 |
3 | 3 | M | 18 - 34 | United Kingdom | LONDON 2017 |
4 | 4 | M | 35 - 54 | United States | BOSTON 2017 |
... | ... | ... | ... | ... | ... |
36407 | 37594 | M | 18 - 34 | United Kingdom | BERLIN 2017 |
36408 | 37595 | M | 18 - 34 | United States | BERLIN 2019,NEW YORK 2015 |
36409 | 37596 | M | 18 - 34 | United States | BOSTON 2017 |
36410 | 37597 | F | 18 - 34 | United States | BOSTON 2015 |
36411 | 37598 | M | 35 - 54 | China | TOKYO 2012 |
36412 rows × 5 columns
df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36412 entries, 0 to 36411 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 athlete 36412 non-null category 1 gender 36412 non-null category 2 age_group 36412 non-null category 3 country 36083 non-null category 4 major 36412 non-null category dtypes: category(5) memory usage: 1.8 MB
pd.crosstab(df['gender'], df['age_group'], margins=True, normalize='all')
age_group | 18 - 34 | 35 - 54 | 55 + | All |
---|---|---|---|---|
gender | ||||
F | 0.1061 | 0.1271 | 0.01088 | 0.2441 |
M | 0.2301 | 0.4661 | 0.05968 | 0.7559 |
All | 0.3362 | 0.5932 | 0.07055 | 1 |
76% of athletes are male, 34% are between 18 and 34 years old, 59% are between 35 and 54 and the remaining 7% are 55 or older.
Fix some names in country:
print('Number of countries: {}'.format(df['country'].unique().size))
print('Number of athletes with unknown country: {} ({:.1f} %)'.format(df['country'].isnull().sum(),
df['country'].isnull().sum()/df['country'].size*100))
Number of countries: 130 Number of athletes with unknown country: 329 (0.9 %)
cn_pct = 100*df['country'].value_counts(normalize=True)
cn_pct = cn_pct.to_frame().reset_index()
cn_pct = cn_pct.rename(columns={'index':'country', 'country':'percentage'})
cn_pct['country'] = cn_pct['country'].astype('object')
cn_pct.head(10)
country | percentage | |
---|---|---|
0 | United States | 38.1 |
1 | United Kingdom | 20.97 |
2 | Germany | 5.925 |
3 | Canada | 3.17 |
4 | France | 2.503 |
5 | Japan | 2.375 |
6 | Netherlands | 2.3 |
7 | Brazil | 1.807 |
8 | Australia | 1.602 |
9 | Italy | 1.508 |
y = cn_pct.copy(deep=True)
y.loc[y['percentage'] < 1, 'country'] = 'Other countries'
fig = px.pie(y, values='percentage', names='country', labels='percentage',
title='Countries of the athletes')
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()
df['continent'] = np.nan
for country in df['country'].unique():
if pd.isnull(country):
country_code, continent_name = np.nan, np.nan
elif country == 'Kosovo':
country_code = 'XK'
continent_name = 'EU'
else:
country_code = pc.country_name_to_country_alpha2(country)
if country_code == 'TL':
country_code = 'TP'
continent_name = pc.country_alpha2_to_continent_code(country_code)
if not pd.isnull(country):
df.loc[df['country'] == country, 'continent'] = continent_name
df['continent'] = df['continent'].astype('category')
df
athlete | gender | age_group | country | major | continent | |
---|---|---|---|---|---|---|
0 | 0 | F | 18 - 34 | United States | CHICAGO 2019 | NA |
1 | 1 | M | 35 - 54 | Germany | BERLIN 2016 | EU |
2 | 2 | M | 35 - 54 | United Kingdom | LONDON 2018,LONDON 2019 | EU |
3 | 3 | M | 18 - 34 | United Kingdom | LONDON 2017 | EU |
4 | 4 | M | 35 - 54 | United States | BOSTON 2017 | NA |
... | ... | ... | ... | ... | ... | ... |
36407 | 37594 | M | 18 - 34 | United Kingdom | BERLIN 2017 | EU |
36408 | 37595 | M | 18 - 34 | United States | BERLIN 2019,NEW YORK 2015 | NA |
36409 | 37596 | M | 18 - 34 | United States | BOSTON 2017 | NA |
36410 | 37597 | F | 18 - 34 | United States | BOSTON 2015 | NA |
36411 | 37598 | M | 35 - 54 | China | TOKYO 2012 | AS |
36412 rows × 6 columns
ct_pct = 100*df['continent'].value_counts(normalize=True)
ct_pct = ct_pct.to_frame().reset_index()
ct_pct = ct_pct.rename(columns={'index':'continent', 'continent':'percentage'})
ct_pct
continent | percentage | |
---|---|---|
0 | EU | 43.6 |
1 | NA | 43.24 |
2 | AS | 7.765 |
3 | SA | 3.079 |
4 | OC | 1.774 |
5 | AF | 0.5487 |
fig = px.pie(ct_pct, values='percentage', names='continent', labels='percentage',
title='Continents of the athletes', height=400)
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()
f = lambda x: np.array(x, dtype=int).max()
df['major_y'] = df['major'].str.findall(r'[0-9]+').apply(f)
df['major_n'] = df['major'].str.split(',').apply(lambda x: len(x))
df['major_s'] = df['major'].str.replace(' \d+', '', regex=True).str.split(',')
df
athlete | gender | age_group | country | major | continent | major_y | major_n | major_s | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | F | 18 - 34 | United States | CHICAGO 2019 | NA | 2019 | 1 | [CHICAGO] |
1 | 1 | M | 35 - 54 | Germany | BERLIN 2016 | EU | 2016 | 1 | [BERLIN] |
2 | 2 | M | 35 - 54 | United Kingdom | LONDON 2018,LONDON 2019 | EU | 2019 | 2 | [LONDON, LONDON] |
3 | 3 | M | 18 - 34 | United Kingdom | LONDON 2017 | EU | 2017 | 1 | [LONDON] |
4 | 4 | M | 35 - 54 | United States | BOSTON 2017 | NA | 2017 | 1 | [BOSTON] |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
36407 | 37594 | M | 18 - 34 | United Kingdom | BERLIN 2017 | EU | 2017 | 1 | [BERLIN] |
36408 | 37595 | M | 18 - 34 | United States | BERLIN 2019,NEW YORK 2015 | NA | 2019 | 2 | [BERLIN, NEW YORK] |
36409 | 37596 | M | 18 - 34 | United States | BOSTON 2017 | NA | 2017 | 1 | [BOSTON] |
36410 | 37597 | F | 18 - 34 | United States | BOSTON 2015 | NA | 2015 | 1 | [BOSTON] |
36411 | 37598 | M | 35 - 54 | China | TOKYO 2012 | AS | 2012 | 1 | [TOKYO] |
36412 rows × 9 columns
mj_pct = 100*df['major_s'].explode().astype('category').value_counts(normalize=True)
mj_pct = mj_pct.to_frame().reset_index()
mj_pct = mj_pct.rename(columns={'index':'major', 'major_s':'percentage'})
mj_pct
major | percentage | |
---|---|---|
0 | BOSTON | 27.26 |
1 | BERLIN | 26.5 |
2 | NEW YORK | 16.27 |
3 | LONDON | 16.09 |
4 | CHICAGO | 8.199 |
5 | TOKYO | 5.686 |
fig = px.pie(mj_pct, values='percentage', names='major', labels='percentage',
title='Frequency of Majors', height=400)
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()
fun = lambda x, y: x[x['major'].isin(y)]['percentage'].values.sum()
print('Majors in America: {:.1f}'.format(fun(mj_pct, ['BOSTON', 'NEW YORK', 'CHICAGO'])))
print('Majors in Europe: {:.1f}'.format(fun(mj_pct, ['BERLIN', 'LONDON'])))
print('Majors in Asia: {:.1f}'.format(fun(mj_pct, ['TOKYO'])))
Majors in America: 51.7 Majors in Europe: 42.6 Majors in Asia: 5.7
pd.crosstab(df['major_y'], df['major_n'], margins=True, normalize='all')*100
major_n | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | All |
---|---|---|---|---|---|---|---|---|---|
major_y | |||||||||
2005 | 0.002746 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.002746 |
2008 | 0.01099 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.01099 |
2009 | 0.04669 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.04669 |
2010 | 0.1181 | 0.002746 | 0 | 0 | 0 | 0 | 0 | 0 | 0.1208 |
2011 | 0.2142 | 0.002746 | 0 | 0 | 0 | 0 | 0 | 0 | 0.217 |
2012 | 0.6152 | 0.01099 | 0 | 0 | 0 | 0 | 0 | 0 | 0.6262 |
2013 | 1.123 | 0.09063 | 0.01373 | 0 | 0 | 0 | 0 | 0 | 1.228 |
2014 | 1.999 | 0.1785 | 0.01922 | 0.005493 | 0 | 0 | 0 | 0 | 2.203 |
2015 | 5.251 | 0.4147 | 0.04943 | 0.002746 | 0 | 0 | 0 | 0 | 5.718 |
2016 | 7.418 | 0.5795 | 0.08514 | 0.005493 | 0 | 0 | 0 | 0 | 8.088 |
2017 | 12.25 | 1.112 | 0.173 | 0.06317 | 0 | 0 | 0 | 0 | 13.6 |
2018 | 21.96 | 2.153 | 0.2691 | 0.07415 | 0.01648 | 0.005493 | 0.002746 | 0.002746 | 24.49 |
2019 | 36.54 | 5.46 | 1.206 | 0.2801 | 0.06866 | 0.02197 | 0.002746 | 0.002746 | 43.58 |
2020 | 0.05767 | 0.01099 | 0 | 0 | 0 | 0 | 0 | 0 | 0.06866 |
All | 87.61 | 10.02 | 1.815 | 0.4312 | 0.08514 | 0.02746 | 0.005493 | 0.005493 | 100 |
print('Cumulative frequency of year of Majors:')
display(pd.concat((df['major_y'].value_counts(),
df['major_y'].value_counts(normalize=True).cumsum()), axis=1).T*100)
print('Cumulative frequency of number of Majors:')
display(pd.concat((df['major_n'].value_counts(),
df['major_n'].value_counts(normalize=True).cumsum()), axis=1).T*100)
Cumulative frequency of year of Majors:
2019 | 2018 | 2017 | 2016 | 2015 | 2014 | 2013 | 2012 | 2011 | 2010 | 2020 | 2009 | 2008 | 2005 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
major_y | 1.587e+06 | 8.916e+05 | 4.952e+05 | 2.945e+05 | 2.082e+05 | 8.02e+04 | 4.47e+04 | 2.28e+04 | 7900 | 4400 | 2500 | 1700 | 400 | 100 |
major_y | 43.58 | 68.07 | 81.67 | 89.76 | 95.48 | 97.68 | 98.91 | 99.53 | 99.75 | 99.87 | 99.94 | 99.99 | 100 | 100 |
Cumulative frequency of number of Majors:
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|
major_n | 3.19e+06 | 3.647e+05 | 6.61e+04 | 1.57e+04 | 3100 | 1000 | 200 | 200 |
major_n | 87.61 | 97.63 | 99.45 | 99.88 | 99.96 | 99.99 | 99.99 | 100 |
98% of the athletes ran in up to two Majors and 98% of the athletes ran a Major between the years 2014 and 2019.