In [1]:
import pandas as pd
import seaborn as sns

# Load the athlete events table from a CSV file located via a relative path
# (i.e. resolved against the notebook's working directory).
df = pd.read_csv('athlete_events.csv')
# Display (rows, columns) as a quick sanity check that the load succeeded.
df.shape
Out[1]:
(271116, 15)
In [2]:
# Restrict the analysis to rows whose Sex column is 'M'.
male_df = df[df.Sex == 'M']

# Per-sport min / max / mean of Weight and Height.
# The columns are selected with a *list* (double brackets): selecting several
# columns from a GroupBy with a bare tuple (`['Weight', 'Height']`) was
# deprecated in pandas 1.0 and raises in pandas 2.0.
# The result has two-level columns: (Weight|Height) x (min|max|mean).
sport_weight_height_metrics = (
    male_df
    .groupby('Sport')[['Weight', 'Height']]
    .agg(['min', 'max', 'mean'])
)
In [3]:
# Weight statistics per sport, heaviest mean first; sports with any missing
# weight statistic are dropped before sorting.
sport_weight_height_metrics['Weight'].dropna().sort_values(by='mean', ascending=False)
Out[3]:
min max mean
Sport
Tug-Of-War 75.0 118.0 95.615385
Basketball 59.0 156.0 91.683529
Rugby Sevens 65.0 113.0 91.006623
Bobsleigh 55.0 145.0 90.387385
Beach Volleyball 62.0 110.0 89.512821
Handball 62.0 132.0 89.387914
Water Polo 61.0 125.0 87.706172
Volleyball 56.0 120.0 86.925926
Baseball 38.0 120.0 85.707792
Ice Hockey 52.0 116.0 83.775593
Rowing 37.0 137.0 83.665663
Judo 52.0 214.0 83.573945
Skeleton 65.0 127.0 82.018349
Curling 61.0 105.0 81.465686
Luge 52.0 112.0 80.803311
Weightlifting 50.0 176.5 80.251796
Canoeing 53.0 115.0 79.972378
Golf 63.0 104.0 79.245283
Sailing 50.0 130.0 78.849712
Tennis 59.0 111.0 78.842912
Alpine Skiing 50.0 107.0 78.626035
Swimming 45.0 114.0 78.040567
Shooting 41.0 140.0 77.834960
Rugby 68.0 99.0 77.533333
Archery 46.0 130.0 77.066866
Motorboating 77.0 77.0 77.000000
Snowboarding 50.0 102.0 76.861598
Lacrosse 60.0 98.0 76.714286
Wrestling 47.0 190.0 76.400640
Speed Skating 50.0 100.0 76.300403
Fencing 48.0 108.0 75.381977
Art Competitions 59.0 93.0 75.290909
Taekwondo 54.0 110.0 74.653595
Freestyle Skiing 47.0 108.0 74.648148
Badminton 55.0 97.0 74.362536
Athletics 42.0 165.0 73.839129
Hockey 48.0 105.0 73.343761
Football 28.0 100.0 73.086644
Biathlon 51.0 95.0 72.632123
Cycling 48.0 104.0 72.190234
Modern Pentathlon 56.0 91.0 72.068824
Cross Country Skiing 53.0 100.0 71.700832
Table Tennis 50.0 99.0 71.414239
Short Track Speed Skating 51.0 86.0 71.401869
Equestrianism 50.0 100.0 70.924559
Figure Skating 47.0 90.0 69.591644
Triathlon 54.0 82.0 68.803774
Diving 37.0 91.0 67.069378
Nordic Combined 53.0 86.0 66.909560
Trampolining 57.0 84.0 65.837838
Boxing 46.0 140.0 65.296280
Ski Jumping 50.0 85.0 65.245881
Gymnastics 46.0 102.0 63.343605
In [4]:
# Distribution of the per-sport mean height (sports with missing height
# statistics are dropped first).
# NOTE(review): `distplot` is deprecated in newer seaborn releases; when the
# environment is upgraded, `sns.histplot(..., kde=True)` is the likely
# replacement -- confirm against the installed seaborn version.
sns.distplot(sport_weight_height_metrics['Height'].dropna()['mean'])
Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x122e09b90>
In [5]:
# Mean weight per sport (sports with missing weight statistics dropped),
# paired with the sport name and ordered from lightest to heaviest.
means = sport_weight_height_metrics['Weight'].dropna()['mean'].tolist()
sports = sport_weight_height_metrics['Weight'].dropna().index.tolist()
plot_data = sorted(zip(sports, means), key=lambda pair: pair[1])

# x is simply the rank position in the sorted list, y the mean weight;
# 'group' carries the sport name for each point (not used by the plot call).
plot_data_dict = {
    'x': list(range(len(plot_data))),
    'y': [mean for _, mean in plot_data],
    'group': [sport for sport, _ in plot_data],
}
sns.scatterplot(data=plot_data_dict, x='x', y='y')
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2698dc90>
In [6]:
# plot_data is sorted ascending by mean weight, so the first five entries are
# the lightest sports and the last five the heaviest.
print('lightest:')
for sport, weight in plot_data[:5]:
    print(sport, weight, sep=': ')

print('\nheaviest:')
for sport, weight in plot_data[-5:]:
    print(sport, weight, sep=': ')
lightest:
Gymnastics: 63.34360475924893
Ski Jumping: 65.24588053553038
Boxing: 65.29627979505457
Trampolining: 65.83783783783784
Nordic Combined: 66.9095595126523

heaviest:
Beach Volleyball: 89.51282051282051
Bobsleigh: 90.38738521024649
Rugby Sevens: 91.00662251655629
Basketball: 91.68352893565358
Tug-Of-War: 95.61538461538461
In [7]:
# Mean height per sport (sports with missing height statistics dropped),
# paired with the sport name and ordered from shortest to tallest.
means = sport_weight_height_metrics['Height'].dropna()['mean'].tolist()
sports = sport_weight_height_metrics['Height'].dropna().index.tolist()
plot_data = sorted(zip(sports, means), key=lambda pair: pair[1])

# x is simply the rank position in the sorted list, y the mean height;
# 'group' carries the sport name for each point (not used by the plot call).
plot_data_dict = {
    'x': list(range(len(plot_data))),
    'y': [mean for _, mean in plot_data],
    'group': [sport for sport, _ in plot_data],
}
sns.scatterplot(data=plot_data_dict, x='x', y='y')
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2783bd50>
In [8]:
# plot_data is sorted ascending by mean height, so the first five entries are
# the shortest sports and the last five the tallest.
print('shortest:')
for sport, height in plot_data[:5]:
    print(sport, height, sep=': ')

print('\ntallest:')
for sport, height in plot_data[-5:]:
    print(sport, height, sep=': ')
shortest:
Gymnastics: 167.6444383959354
Weightlifting: 169.1530612244898
Trampolining: 171.3684210526316
Diving: 171.55535224153707
Wrestling: 172.87068623562078

tallest:
Rowing: 186.88269794721407
Handball: 188.77837311251827
Volleyball: 193.26565995525726
Beach Volleyball: 193.29090909090908
Basketball: 194.87262357414448
In [9]:
# "Build" index per sport: mean weight divided by mean height.
mean_heights = sport_weight_height_metrics.Height.dropna()['mean']
mean_weights = sport_weight_height_metrics.Weight.dropna()['mean']

# The division aligns the two series on sport name. Because each side had its
# missing rows dropped independently, a sport present in only one of them
# yields NaN here. avg_build keeps those NaN rows so they remain visible when
# the series is inspected.
avg_build = mean_weights / mean_heights

# For plotting, keep only sports that have a defined ratio and sort ascending.
# (A bare `avg_build.sort_values(...)` statement would discard its result, so
# the sort is applied directly where its output is used.)
builds = avg_build.dropna().sort_values(ascending=True).tolist()

# x is the rank position in the sorted list, y the weight/height ratio.
plot_dict = {'x': list(range(len(builds))), 'y': builds}
sns.lineplot(data=plot_dict, x='x', y='y')
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a27e30c10>
In [10]:
# Weight/height ratio per sport, largest first. Sports whose ratio is NaN
# (no value on one side of the division) are listed at the end.
avg_build.sort_values(ascending=False)
Out[10]:
Sport
Tug-Of-War                   0.523977
Rugby Sevens                 0.497754
Bobsleigh                    0.496656
Weightlifting                0.474433
Handball                     0.473507
Judo                         0.470872
Basketball                   0.470479
Water Polo                   0.469515
Baseball                     0.469376
Beach Volleyball             0.463099
Ice Hockey                   0.462870
Skeleton                     0.452958
Curling                      0.450811
Luge                         0.450804
Volleyball                   0.449774
Rowing                       0.447691
Golf                         0.442746
Shooting                     0.442301
Alpine Skiing                0.441989
Wrestling                    0.441953
Canoeing                     0.441318
Lacrosse                     0.440887
Rugby                        0.440288
Sailing                      0.437672
Archery                      0.431801
Snowboarding                 0.430534
Art Competitions             0.430488
Tennis                       0.426529
Speed Skating                0.425824
Motorboating                 0.425414
Swimming                     0.423418
Freestyle Skiing             0.423074
Fencing                      0.418756
Hockey                       0.414717
Badminton                    0.413997
Football                     0.411801
Athletics                    0.410746
Taekwondo                    0.409244
Short Track Speed Skating    0.406590
Cycling                      0.406144
Biathlon                     0.406092
Table Tennis                 0.403451
Cross Country Skiing         0.403363
Modern Pentathlon            0.402422
Equestrianism                0.401695
Figure Skating               0.395267
Diving                       0.390949
Trampolining                 0.384189
Triathlon                    0.381829
Nordic Combined              0.379081
Gymnastics                   0.377845
Boxing                       0.377679
Ski Jumping                  0.369498
Jeu De Paume                      NaN
Polo                              NaN
Racquets                          NaN
Name: mean, dtype: float64
In [11]:
# Earliest Year in which each sport appears among the male rows, ordered by
# that year. sort_values() sorts a Series by its values; no positional
# argument is passed because the first parameter is `axis`, and recent pandas
# versions accept it by keyword only.
sport_min_year = male_df.groupby('Sport').Year.agg(['min', 'max'])['min'].sort_values()

# Count how many sports share each first year. dict.get supplies the starting
# count, so no exception handling is needed for unseen years.
year_count = {}
for y in sport_min_year:
    year_count[y] = year_count.get(y, 0) + 1

# Split the mapping into parallel lists: the year, and the number of sports
# whose first appearance is that year.
year = list(year_count.keys())
new_sports = list(year_count.values())

data = {'x': year, 'y': new_sports}
sns.scatterplot(data=data, x='x', y='y')
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2a084e50>
In [12]:
# Latest Year in which each sport appears among the male rows, ordered by
# that year. sort_values() sorts a Series by its values; no positional
# argument is passed because the first parameter is `axis`, and recent pandas
# versions accept it by keyword only.
sport_max_year = male_df.groupby('Sport').Year.agg(['min', 'max'])['max'].sort_values()

# Count how many sports share each last year. dict.get supplies the starting
# count, so no exception handling is needed for unseen years.
year_count = {}
for y in sport_max_year:
    year_count[y] = year_count.get(y, 0) + 1

# Split the mapping into parallel lists: the year, and the number of sports
# whose last appearance is that year.
year = list(year_count.keys())
deprecated_sports = list(year_count.values())

data = {'x': year, 'y': deprecated_sports}
sns.scatterplot(data=data, x='x', y='y')
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2a0f8090>
In [13]:
# Sports whose last recorded year is earlier than 2000.
sport_max_year[sport_max_year.lt(2000)]
Out[13]:
Sport
Basque Pelota          1900
Croquet                1900
Cricket                1900
Roque                  1904
Jeu De Paume           1908
Racquets               1908
Motorboating           1908
Lacrosse               1908
Tug-Of-War             1920
Rugby                  1924
Military Ski Patrol    1924
Polo                   1936
Aeronautics            1936
Alpinism               1936
Art Competitions       1948
Name: max, dtype: int64
In [14]:
# Sports whose first recorded year is later than 1936.
sport_min_year[sport_min_year.gt(1936)]
Out[14]:
Sport
Biathlon                     1960
Luge                         1964
Volleyball                   1964
Judo                         1964
Table Tennis                 1988
Baseball                     1992
Short Track Speed Skating    1992
Badminton                    1992
Freestyle Skiing             1992
Beach Volleyball             1996
Snowboarding                 1998
Taekwondo                    2000
Trampolining                 2000
Triathlon                    2000
Rugby Sevens                 2016
Name: min, dtype: int64