#!/usr/bin/env python
# coding: utf-8
# # Frequencies of gender, age, country, and participation in Majors
#
# > [https://github.com/BMClab/covid19](https://github.com/BMClab/covid19)
# > [Laboratory of Biomechanics and Motor Control](http://pesquisa.ufabc.edu.br/bmclab/)
# > Federal University of ABC, Brazil
#
# **The data used in this Jupyter notebook are available on the Figshare repository https://doi.org/10.6084/m9.figshare.16620238.**
#
# ## Contents
#
# ## Setup
# In[1]:
import sys, os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import plotly.express as px
pd.options.plotting.backend = "plotly"
from tqdm.notebook import tqdm
import pycountry_convert as pc
get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '')
get_ipython().run_line_magic('watermark', '--iversions')
# ### Environment
# In[2]:
# Folder (relative to this notebook) that holds the data files.
path2 = r'./../data/'

# Render floats in pandas output with four significant digits.
pd.set_option('display.float_format', lambda x: '%.4g' % x)

# Matplotlib font sizes for text and tick labels.
font_sizes = {'font.size': 14, 'xtick.labelsize': 12, 'ytick.labelsize': 12}
plt.rcParams.update(font_sizes)

# Seaborn white grid with ticks enabled on all four sides, pointing inward.
tick_style = {
    'xtick.bottom': True,
    'xtick.top': True,
    'ytick.left': True,
    'ytick.right': True,
    'xtick.direction': 'in',
    'ytick.direction': 'in',
}
sns.set_style('whitegrid', rc=tick_style)

# Current seaborn palette; the bare expression shows it in the notebook.
colors = sns.color_palette()
colors
# ## Load dataset
# In[3]:
# Load the parquet file from the data folder.
parquet_file = os.path.join(path2, 'run_ww_2019_d.parquet')
df = pd.read_parquet(parquet_file)
# Re-cast 'athlete' to a categorical column after loading (the original
# author attributes the need for this to a parquet bug).
athlete_as_category = df['athlete'].astype('category')
df['athlete'] = athlete_as_category
# ### Subject's information
# In[4]:
# Reduce the data to one row per athlete: keep only the subject columns,
# drop repeated athletes (first occurrence wins), sort by athlete and
# renumber the rows.
subject_columns = ['athlete', 'gender', 'age_group', 'country', 'major']
df = (
    df.loc[:, subject_columns]
    .drop_duplicates(subset='athlete')
    .sort_values('athlete')
    .reset_index(drop=True)
)
display(df)
# In[5]:
# Column dtypes and the real ("deep") memory footprint of the frame.
df.info(memory_usage='deep')
# ## Gender and age
# In[6]:
# Joint relative frequencies of gender (rows) and age group (columns);
# normalize='all' divides by the grand total and margins=True adds the totals.
pd.crosstab(index=df['gender'], columns=df['age_group'], margins=True, normalize='all')
# 76% of athletes are male, 34% are between 18 and 34 years old, 59% are between 35 and 54 and the remaining 7% are 55 or older.
# ## Location
# ### By country
#
# Fix some names in country:
# In[7]:
# Number of distinct countries. nunique() ignores missing values, so athletes
# with an unknown country are not counted as one extra "country"
# (unique().size would include NaN in the total).
n_countries = df['country'].nunique()
# Athletes whose country is missing, as a count and as a share of all athletes.
n_unknown = df['country'].isnull().sum()
pct_unknown = n_unknown/df['country'].size*100
print('Number of countries: {}'.format(n_countries))
print('Number of athletes with unknown country: {} ({:.1f} %)'.format(n_unknown, pct_unknown))
# In[8]:
# Percentage of athletes per country, in value_counts() order (most frequent first).
cn_pct = 100*df['country'].value_counts(normalize=True)
# Name both output columns explicitly. The labels that value_counts() and
# reset_index() produce on their own changed in pandas 2.0 ('index'/<column>
# became <column>/'proportion'), so renaming by the old labels is not portable.
cn_pct = cn_pct.rename_axis('country').reset_index(name='percentage')
# Store country names as plain objects so that new labels can be assigned later.
cn_pct['country'] = cn_pct['country'].astype('object')
cn_pct.head(10)
# In[9]:
# Work on a copy so that cn_pct itself keeps the individual country names.
y = cn_pct.copy(deep=True)
# Relabel every country below 1 % so they share the 'Other countries' name.
below_one_percent = y['percentage'] < 1
y.loc[below_one_percent, 'country'] = 'Other countries'
# NOTE(review): plotly documents `labels` as a dict; the string value is kept
# exactly as in the original - confirm whether it is intended.
fig = px.pie(
    data_frame=y,
    values='percentage',
    names='country',
    labels='percentage',
    title='Countries of the athletes',
)
# Percent and label text outside the slices; no legend.
fig = fig.update_traces(textposition='outside', textinfo='percent+label')
fig = fig.update_layout(showlegend=False)
fig.show()
# ### By continent
# In[10]:
def _continent_code(country):
    """Return the two-letter continent code for a country name.

    Uses pycountry_convert for the lookups, with two special cases kept
    from the original notebook.
    """
    # 'Kosovo' is hard-coded to Europe and never passed to pycountry_convert
    # (presumably because the package does not know this name - confirm).
    if country == 'Kosovo':
        return 'EU'
    country_code = pc.country_name_to_country_alpha2(country)
    # 'TL' is replaced by 'TP' before the continent lookup
    # (presumably the code under which the continent table lists the
    # country - confirm).
    if country_code == 'TL':
        country_code = 'TP'
    return pc.country_alpha2_to_continent_code(country_code)


# Resolve each distinct, non-missing country once ...
continent_by_country = {
    country: _continent_code(country)
    for country in df['country'].dropna().unique()
}
# ... then map all rows in one pass. Athletes with a missing country get a
# missing continent. Building the column from the mapping avoids writing
# strings into a float (NaN) column, which recent pandas versions warn about.
df['continent'] = df['country'].astype(object).map(continent_by_country).astype('category')
df
# In[11]:
# Percentage of athletes per continent, in value_counts() order (most frequent first).
ct_pct = 100*df['continent'].value_counts(normalize=True)
# Name both output columns explicitly. The labels that value_counts() and
# reset_index() produce on their own changed in pandas 2.0, so renaming by
# the old labels ('index' / 'continent') is not portable.
ct_pct = ct_pct.rename_axis('continent').reset_index(name='percentage')
ct_pct
# In[12]:
# Pie chart of the share of athletes per continent.
# NOTE(review): plotly documents `labels` as a dict; the string value is kept
# exactly as in the original - confirm whether it is intended.
continent_pie_options = dict(
    values='percentage',
    names='continent',
    labels='percentage',
    title='Continents of the athletes',
    height=400,
)
fig = px.pie(ct_pct, **continent_pie_options)
# Percent and label text outside the slices; no legend.
fig.update_traces(textinfo='percent+label', textposition='outside')
fig.update_layout(showlegend=False)
fig.show()
# ## Majors
# In[13]:
def f(x):
    """Return the largest value in a sequence of integer-like strings."""
    return np.array(x, dtype=int).max()


# Largest number found in each 'major' string
# (presumably the most recent year the athlete ran a Major - confirm).
df['major_y'] = df['major'].str.findall(r'[0-9]+').apply(f)
# Number of comma-separated entries in each 'major' string.
df['major_n'] = df['major'].str.split(',').apply(len)
# Entries with the " <digits>" part removed, as a list per athlete.
# The pattern is a raw string so that \d is not read as an (invalid) string escape.
df['major_s'] = df['major'].str.replace(r' \d+', '', regex=True).str.split(',')
df
# ### Frequency
# In[14]:
# One row per (athlete, Major) pair, then the percentage of each Major
# over all such pairs.
mj_pct = 100*df['major_s'].explode().astype('category').value_counts(normalize=True)
# Name both output columns explicitly. The labels that value_counts() and
# reset_index() produce on their own changed in pandas 2.0, so renaming by
# the old labels ('index' / 'major_s') is not portable.
mj_pct = mj_pct.rename_axis('major').reset_index(name='percentage')
mj_pct
# In[15]:
# Pie chart of how often each Major appears.
# NOTE(review): plotly documents `labels` as a dict; the string value is kept
# exactly as in the original - confirm whether it is intended.
fig = px.pie(
    data_frame=mj_pct,
    names='major',
    values='percentage',
    labels='percentage',
    title='Frequency of Majors',
    height=400,
)
# Percent and label text outside the slices; no legend.
fig = fig.update_traces(textposition='outside', textinfo='percent+label')
fig = fig.update_layout(showlegend=False)
fig.show()
# In[16]:
def fun(x, y):
    """Sum the 'percentage' of the rows of x whose 'major' is listed in y."""
    listed = x['major'].isin(y)
    return x.loc[listed, 'percentage'].values.sum()


# Combined share of the Majors, grouped by the region named in each message.
america_pct = fun(mj_pct, ['BOSTON', 'NEW YORK', 'CHICAGO'])
europe_pct = fun(mj_pct, ['BERLIN', 'LONDON'])
asia_pct = fun(mj_pct, ['TOKYO'])
print('Majors in America: {:.1f}'.format(america_pct))
print('Majors in Europe: {:.1f}'.format(europe_pct))
print('Majors in Asia: {:.1f}'.format(asia_pct))
# ## Number and year of Majors
# In[17]:
# Joint distribution of 'major_y' (rows) and 'major_n' (columns), in percent
# of all athletes, with row and column totals.
pd.crosstab(index=df['major_y'], columns=df['major_n'], margins=True, normalize='all').mul(100)
# In[18]:
def _count_and_cumulative_pct(values):
    """Return a two-row table: count per value and cumulative percentage.

    The cumulative sum runs in value_counts() order (most frequent value
    first), not in sorted order of the values. Only the cumulative row is
    scaled to percent; the counts are left as plain counts.
    """
    counts = values.value_counts()
    cumulative_pct = 100*counts.cumsum()/counts.sum()
    # Distinct keys keep the two rows apart after the transpose.
    table = pd.concat((counts, cumulative_pct), axis=1, keys=['count', 'cumulative %'])
    return table.T


print('Cumulative frequency of year of Majors:')
display(_count_and_cumulative_pct(df['major_y']))
print('Cumulative frequency of number of Majors:')
display(_count_and_cumulative_pct(df['major_n']))
# 98% of the athletes ran in up to two Majors and 98% of the athletes ran a Major between the years 2014 and 2019.