In [1]:
import pandas as pd
import seaborn as sns

df = pd.read_csv('athlete_events.csv')
df.shape
Out[1]:
(271116, 15)
In [2]:
list(df)
Out[2]:
['ID',
 'Name',
 'Sex',
 'Age',
 'Height',
 'Weight',
 'Team',
 'NOC',
 'Games',
 'Year',
 'Season',
 'City',
 'Sport',
 'Event',
 'Medal']
In [3]:
(sns.distplot(df[df.Sex=='F'].Weight.dropna()),
sns.distplot(df[df.Sex=='M'].Weight.dropna())
)
Out[3]:
(<matplotlib.axes._subplots.AxesSubplot at 0x1a1efde990>,
 <matplotlib.axes._subplots.AxesSubplot at 0x1a1efde990>)
In [4]:
(sns.distplot(df[df.Sex=='F'][ df.Sport=='Swimming'].Height.dropna()),
sns.distplot(df[df.Sex=='M'][df.Sport=='Swimming'].Height.dropna())
)
/Users/rod/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
/Users/rod/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  
Out[4]:
(<matplotlib.axes._subplots.AxesSubplot at 0x1a1e7ff1d0>,
 <matplotlib.axes._subplots.AxesSubplot at 0x1a1e7ff1d0>)
In [5]:
def NaN_percent(df, column_name):
    row_count = df[column_name].shape[0]
    empty_values = row_count - df[column_name].count()
    return (100.0*empty_values)/row_count
In [6]:
NaN_percent(df, 'Height')
Out[6]:
22.193821095029435
In [7]:
for i in list(df):
    print(i +': ' + str(NaN_percent(df,i))+'%')
ID: 0.0%
Name: 0.0%
Sex: 0.0%
Age: 3.49444518213606%
Height: 22.193821095029435%
Weight: 23.19118015904631%
Team: 0.0%
NOC: 0.0%
Games: 0.0%
Year: 0.0%
Season: 0.0%
City: 0.0%
Sport: 0.0%
Event: 0.0%
Medal: 85.3262072323286%
In [8]:
total_rows = df.shape[0]
unique_athletes = len(df.Name.unique())
medal_winners = len(df[df.Medal.fillna('None')!='None'].Name.unique())

"{0} {1} {2}".format(total_rows, unique_athletes, medal_winners)
Out[8]:
'271116 134732 28202'
In [9]:
print(df[df.Medal.fillna('None')!='None'].Medal.value_counts())
df[df.Medal.fillna('None')!='None'].shape[0]
Gold      13372
Bronze    13295
Silver    13116
Name: Medal, dtype: int64
Out[9]:
39783
In [10]:
team_medal_count = df.groupby(['Team','Medal']).Medal.agg('count')
team_medal_count = team_medal_count.reset_index(name='count').sort_values(['count'], ascending=False)
team_medal_count.head(10)
Out[10]:
Team Medal count
726 United States Gold 2474
727 United States Silver 1512
725 United States Bronze 1233
627 Soviet Union Gold 1058
628 Soviet Union Silver 716
263 Germany Gold 679
262 Germany Bronze 678
626 Soviet Union Bronze 677
264 Germany Silver 627
278 Great Britain Silver 582
In [11]:
def get_country_stats(country):
    return team_medal_count[team_medal_count.Team==country]
In [12]:
get_country_stats('Chile')
Out[12]:
Team Medal count
121 Chile Bronze 20
123 Chile Silver 9
122 Chile Gold 3
In [13]:
df[df.Team=='Croatia'].Year.unique()
Out[13]:
array([2006, 1996, 2000, 1992, 2008, 2012, 2004, 2016, 2014, 2010, 2002,
       1998, 1994])
In [14]:
unique_women = len(df[df.Sex=='F'].Name.unique())
unique_men = len(df[df.Sex=='M'].Name.unique())
women_medals = df[df.Sex=='F'].Medal.count()
men_medals = df[df.Sex=='M'].Medal.count()

print("{} {} {} {} ".format(unique_women, unique_men, women_medals, men_medals ))

df[df.Sex=='F'].Year.min()
33808 100979 11253 28530 
Out[14]:
1900
In [15]:
female = df[df.Sex=='F']
year_count = female.groupby('Year').agg('count')
years = list(year_count.index)
counts =  list(year_count.Name)
sns.scatterplot(x = years, y = counts)
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1eb1bdd0>
In [16]:
f_year_count = df[df.Sex=='F'].groupby('Year').agg('count').Name
m_year_count = df[df.Sex=='M'].groupby('Year').agg('count').Name
(sns.scatterplot(data= m_year_count),
 sns.scatterplot(data =f_year_count))
Out[16]:
(<matplotlib.axes._subplots.AxesSubplot at 0x1a1f970dd0>,
 <matplotlib.axes._subplots.AxesSubplot at 0x1a1f970dd0>)
In [17]:
df.Sport.value_counts()
Out[17]:
Athletics        38624
Gymnastics       26707
Swimming         23195
Shooting         11448
Cycling          10859
                 ...  
Racquets            12
Jeu De Paume        11
Roque                4
Basque Pelota        2
Aeronautics          1
Name: Sport, Length: 66, dtype: int64