In [1]:

import pandas as pd
import seaborn as sns

df = pd.read_csv('athlete_events.csv')
df.shape

Out[1]:

(271116, 15)

In [2]:

list(df)

Out[2]:

['ID',
 'Name',
 'Sex',
 'Age',
 'Height',
 'Weight',
 'Team',
 'NOC',
 'Games',
 'Year',
 'Season',
 'City',
 'Sport',
 'Event',
 'Medal']

In [3]:

(sns.distplot(df[df.Sex=='F'].Weight.dropna()),
sns.distplot(df[df.Sex=='M'].Weight.dropna())
)

Out[3]:

(<matplotlib.axes._subplots.AxesSubplot at 0x1a1efde990>,
 <matplotlib.axes._subplots.AxesSubplot at 0x1a1efde990>)

In [4]:

(sns.distplot(df[df.Sex=='F'][ df.Sport=='Swimming'].Height.dropna()),
sns.distplot(df[df.Sex=='M'][df.Sport=='Swimming'].Height.dropna())
)

/Users/rod/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
/Users/rod/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.

Out[4]:

(<matplotlib.axes._subplots.AxesSubplot at 0x1a1e7ff1d0>,
 <matplotlib.axes._subplots.AxesSubplot at 0x1a1e7ff1d0>)

In [5]:

def NaN_percent(df, column_name):
    row_count = df[column_name].shape[0]
    empty_values = row_count - df[column_name].count()
    return (100.0*empty_values)/row_count

In [6]:

NaN_percent(df, 'Height')

Out[6]:

22.193821095029435

In [7]:

for i in list(df):
    print(i +': ' + str(NaN_percent(df,i))+'%')

ID: 0.0%
Name: 0.0%
Sex: 0.0%
Age: 3.49444518213606%
Height: 22.193821095029435%
Weight: 23.19118015904631%
Team: 0.0%
NOC: 0.0%
Games: 0.0%
Year: 0.0%
Season: 0.0%
City: 0.0%
Sport: 0.0%
Event: 0.0%
Medal: 85.3262072323286%

In [8]:

total_rows = df.shape[0]
unique_athletes = len(df.Name.unique())
medal_winners = len(df[df.Medal.fillna('None')!='None'].Name.unique())

"{0} {1} {2}".format(total_rows, unique_athletes, medal_winners)

Out[8]:

'271116 134732 28202'

In [9]:

print(df[df.Medal.fillna('None')!='None'].Medal.value_counts())
df[df.Medal.fillna('None')!='None'].shape[0]

Gold      13372
Bronze    13295
Silver    13116
Name: Medal, dtype: int64

Out[9]:

In [10]:

team_medal_count = df.groupby(['Team','Medal']).Medal.agg('count')
team_medal_count = team_medal_count.reset_index(name='count').sort_values(['count'], ascending=False)
team_medal_count.head(10)

Out[10]:

	Team	Medal	count
726	United States	Gold	2474
727	United States	Silver	1512
725	United States	Bronze	1233
627	Soviet Union	Gold	1058
628	Soviet Union	Silver	716
263	Germany	Gold	679
262	Germany	Bronze	678
626	Soviet Union	Bronze	677
264	Germany	Silver	627
278	Great Britain	Silver	582

In [11]:

def get_country_stats(country):
    return team_medal_count[team_medal_count.Team==country]

In [12]:

get_country_stats('Chile')

Out[12]:

	Team	Medal	count
121	Chile	Bronze	20
123	Chile	Silver	9
122	Chile	Gold	3

In [13]:

df[df.Team=='Croatia'].Year.unique()

Out[13]:

array([2006, 1996, 2000, 1992, 2008, 2012, 2004, 2016, 2014, 2010, 2002,
       1998, 1994])

In [14]:

unique_women = len(df[df.Sex=='F'].Name.unique())
unique_men = len(df[df.Sex=='M'].Name.unique())
women_medals = df[df.Sex=='F'].Medal.count()
men_medals = df[df.Sex=='M'].Medal.count()

print("{} {} {} {} ".format(unique_women, unique_men, women_medals, men_medals ))

df[df.Sex=='F'].Year.min()

33808 100979 11253 28530

Out[14]:

In [15]:

female = df[df.Sex=='F']
year_count = female.groupby('Year').agg('count')
years = list(year_count.index)
counts =  list(year_count.Name)
sns.scatterplot(x = years, y = counts)

Out[15]:

<matplotlib.axes._subplots.AxesSubplot at 0x1a1eb1bdd0>

In [16]:

f_year_count = df[df.Sex=='F'].groupby('Year').agg('count').Name
m_year_count = df[df.Sex=='M'].groupby('Year').agg('count').Name
(sns.scatterplot(data= m_year_count),
 sns.scatterplot(data =f_year_count))

Out[16]:

(<matplotlib.axes._subplots.AxesSubplot at 0x1a1f970dd0>,
 <matplotlib.axes._subplots.AxesSubplot at 0x1a1f970dd0>)

In [17]:

df.Sport.value_counts()

Out[17]:

Athletics        38624
Gymnastics       26707
Swimming         23195
Shooting         11448
Cycling          10859
                 ...  
Racquets            12
Jeu De Paume        11
Roque                4
Basque Pelota        2
Aeronautics          1
Name: Sport, Length: 66, dtype: int64