#!/usr/bin/env python
# coding: utf-8

# # Frequencies of gender, age, country, and participation in Majors
#
# > [https://github.com/BMClab/covid19](https://github.com/BMClab/covid19)
# > [Laboratory of Biomechanics and Motor Control](http://pesquisa.ufabc.edu.br/bmclab/)
# > Federal University of ABC, Brazil
#
# **The data used in this Jupyter notebook are available on the Figshare repository https://doi.org/10.6084/m9.figshare.16620238.**

# 1 Setup
#   - 1.1 Environment
# 2 Load dataset
#   - 2.1 Subject's information
# 3 Gender and age
# 4 Location
#   - 4.1 By country
#   - 4.2 By continent
# 5 Majors
#   - 5.1 Frequency
# 6 Number and year of Majors

# ## Setup

# In[1]:

import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import plotly.express as px
# Route DataFrame.plot() calls through plotly.
pd.options.plotting.backend = "plotly"
from tqdm.notebook import tqdm
import pycountry_convert as pc

# Record the interpreter/package versions used for this run.
get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '')
get_ipython().run_line_magic('watermark', '--iversions')


# ### Environment

# In[2]:

# Directory holding the data files, relative to this notebook.
path2 = r'./../data/'

# Show floats with four significant digits.
pd.set_option('display.float_format', lambda x: '%.4g' % x)

# Matplotlib font sizes.
font_sizes = {'font.size': 14, 'xtick.labelsize': 12, 'ytick.labelsize': 12}
plt.rcParams.update(font_sizes)

# Seaborn grid style with inward ticks drawn on all four sides.
tick_style = {
    'xtick.bottom': True,
    'xtick.top': True,
    'ytick.left': True,
    'ytick.right': True,
    'xtick.direction': 'in',
    'ytick.direction': 'in',
}
sns.set_style('whitegrid', rc=tick_style)

colors = sns.color_palette()
colors


# ## Load dataset

# In[3]:

parquet_file = os.path.join(path2, 'run_ww_2019_d.parquet')
df = pd.read_parquet(parquet_file)
# Re-cast the athlete column to category after loading
# (original author's note: "bug in parquet").
df['athlete'] = df['athlete'].astype('category')


# ### Subject's information

# In[4]:

# Keep one row per athlete with only the descriptive columns, ordered by
# athlete and re-indexed from zero.
subject_columns = ['athlete', 'gender', 'age_group', 'country', 'major']
df = df[subject_columns]
df = df.drop_duplicates(subset='athlete')
df = df.sort_values('athlete')
df = df.reset_index(drop=True)
display(df)


# In[5]:

df.info(memory_usage='deep')


# ## Gender and age

# In[6]:

# Joint relative frequencies of gender and age group, with marginal totals.
pd.crosstab(index=df['gender'], columns=df['age_group'],
            margins=True, normalize='all')

# 76% of athletes are male, 34% are between 18 and 34 years old, 59% are between 35 and 54 and the remaining 7% are 55 or older.
# ## Location

# ### By country
#
# Fix some names in country:

# In[7]:

# nunique() ignores missing values, so athletes with an unknown country are
# not counted as one extra "country" (unique().size counts NaN as a value).
n_unknown = df['country'].isnull().sum()
print('Number of countries: {}'.format(df['country'].nunique()))
print('Number of athletes with unknown country: {} ({:.1f} %)'.format(
    n_unknown, n_unknown/df['country'].size*100))


# In[8]:

# Percentage of athletes per country as a two-column frame.
# rename_axis/reset_index(name=...) names the columns explicitly, so the
# result does not depend on how the installed pandas version labels the
# output of value_counts().
cn_pct = 100*df['country'].value_counts(normalize=True)
cn_pct = cn_pct.rename_axis('country').reset_index(name='percentage')
# Plain object dtype so that a new label can be written into the column below.
cn_pct['country'] = cn_pct['country'].astype('object')
cn_pct.head(10)


# In[9]:

# Relabel every country below 1 % as 'Other countries' before plotting.
y = cn_pct.copy(deep=True)
y.loc[y['percentage'] < 1, 'country'] = 'Other countries'
fig = px.pie(y, values='percentage', names='country', labels='percentage',
             title='Countries of the athletes')
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()


# ### By continent

# In[10]:

def _continent_code(country):
    """Return the continent code for a country name.

    'Kosovo' is hard-coded to 'EU' instead of going through the
    pycountry_convert lookups. The alpha-2 code 'TL' is replaced by 'TP'
    before the continent lookup (presumably the code pycountry_convert
    expects for that country -- TODO confirm).

    Any exception raised by pycountry_convert for an unrecognised name is
    propagated unchanged.
    """
    if country == 'Kosovo':
        return 'EU'
    country_code = pc.country_name_to_country_alpha2(country)
    if country_code == 'TL':
        country_code = 'TP'
    return pc.country_alpha2_to_continent_code(country_code)


# Resolve each distinct country once, then map all rows in a single pass.
# Rows with a missing country have no entry in the lookup and stay NaN.
continent_by_country = {name: _continent_code(name)
                        for name in df['country'].dropna().unique()}
df['continent'] = (df['country'].astype('object')
                   .map(continent_by_country)
                   .astype('category'))
df


# In[11]:

# Percentage of athletes per continent (columns named explicitly, as above).
ct_pct = 100*df['continent'].value_counts(normalize=True)
ct_pct = ct_pct.rename_axis('continent').reset_index(name='percentage')
ct_pct


# In[12]:

fig = px.pie(ct_pct, values='percentage', names='continent', labels='percentage',
             title='Continents of the athletes', height=400)
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()


# ## Majors

# In[13]:

def f(x):
    """Return the largest value in `x` after converting its items to int."""
    return np.array(x, dtype=int).max()


# Largest number found in each athlete's 'major' string.
df['major_y'] = df['major'].str.findall(r'[0-9]+').apply(f)
# Number of comma-separated entries in each athlete's 'major' string.
df['major_n'] = df['major'].str.split(',').apply(len)
# Entries with the ' <digits>' part removed, as a list per athlete.
# The pattern is a raw string: '\d' in a normal string literal is an invalid
# escape sequence.
df['major_s'] = df['major'].str.replace(r' \d+', '', regex=True).str.split(',')
df


# ### Frequency

# In[14]:

# Percentage of each entry over all athletes (one row per list element).
# rename_axis/reset_index(name=...) names the columns explicitly, so the
# result does not depend on how the installed pandas version labels the
# output of value_counts().
mj_pct = 100*df['major_s'].explode().astype('category').value_counts(normalize=True)
mj_pct = mj_pct.rename_axis('major').reset_index(name='percentage')
mj_pct


# In[15]:

fig = px.pie(mj_pct, values='percentage', names='major', labels='percentage',
             title='Frequency of Majors', height=400)
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()


# In[16]:

def fun(x, y):
    """Return the summed 'percentage' of the rows of `x` whose 'major' is in `y`."""
    return x[x['major'].isin(y)]['percentage'].values.sum()


print('Majors in America: {:.1f}'.format(fun(mj_pct, ['BOSTON', 'NEW YORK', 'CHICAGO'])))
print('Majors in Europe: {:.1f}'.format(fun(mj_pct, ['BERLIN', 'LONDON'])))
print('Majors in Asia: {:.1f}'.format(fun(mj_pct, ['TOKYO'])))


# ## Number and year of Majors

# In[17]:

# Joint relative frequencies (in %) of 'major_y' and 'major_n', with margins.
pd.crosstab(df['major_y'], df['major_n'], margins=True, normalize='all')*100


# In[18]:

def _cumulative_table(values):
    """Return a two-row table: count and cumulative percentage per value.

    Values are ordered as returned by value_counts(), i.e. by decreasing
    count. Only the cumulative proportion is scaled to percent; the counts
    are left as plain counts.
    """
    counts = values.value_counts()
    cumulative_pct = 100*values.value_counts(normalize=True).cumsum()
    return pd.concat((counts.rename('count'),
                      cumulative_pct.rename('cumulative %')), axis=1).T


print('Cumulative frequency of year of Majors:')
display(_cumulative_table(df['major_y']))
print('Cumulative frequency of number of Majors:')
display(_cumulative_table(df['major_n']))

# 98% of the athletes ran in up to two Majors and 98% of the athletes ran a Major between the years 2014 and 2019.

# Contents