# Notebook setup: helper package, numeric/data libraries and Bokeh output.
# NOTE(review): addutils is a project-local helper; toc.js() presumably injects
# the table-of-contents script into the notebook -- confirm.
import addutils.toc ; addutils.toc.js(ipy_notebook=True)
import numpy as np
import pandas as pd
import addutils
from IPython.display import display
import bokeh.plotting as bk
# Send Bokeh figures to the notebook output cells
bk.output_notebook()
# NOTE(review): presumably applies the addutils notebook stylesheet -- confirm.
addutils.css_notebook()
We downloaded statistics about baby names chosen over the years in the U.S. from http://www.babycenter.com/baby-names and stored them in our example data folder.
# Folder holding the yearly "yob<year>.txt" files and the column labels to use
dataFolder = 'temp/baby_names/'
columnNames = ['name', 'sex', 'births']

# names= supplies the column labels, so read_csv treats every line as data
names1880 = pd.read_csv('{0}yob1880.txt'.format(dataFolder), names=columnNames)
names1880.head()
This shows some of the names chosen during 1880.
Now we want to read all the files for the years from 1880 to 2011.
# Read one DataFrame per yearly file and tag each row with its year.
years = range(1880, 2012)  # upper bound is exclusive: 1880..2011
parts = []
for year in years:
    path = '{0}yob{1}.txt'.format(dataFolder, year)
    frame = pd.read_csv(path, names=columnNames)
    # The year is not one of the file's columns, so add it explicitly
    frame['year'] = year
    parts.append(frame)
Now `parts` is a Python `list` containing `pandas.DataFrame` objects.
Let's create a single `DataFrame` containing all the names.
# Stack the yearly frames row-wise and renumber the rows from 0 to n-1
names = pd.concat(parts, axis=0, ignore_index=True)
# Preview one row out of every 100000
names[::100000]
`pandas.concat` concatenates pandas objects along a particular axis. If the optional parameter `ignore_index` is `True`, `concat` won't use the index values on the concatenation axis: values from `0` to `n-1` will be used instead.
`DataFrame.pivot_table` creates a spreadsheet-style pivot table as a DataFrame. The `aggfunc` parameter specifies the aggregation function (or list of functions) to apply to the elements; `margins` tells whether grand totals/subtotals are to be added to all columns/rows.
from bokeh.models.ranges import Range1d
# Total births with one row per year and one column per sex.
# The aggregation is given as the string 'sum' instead of the builtin `sum`:
# the result is the same, but passing the builtin is deprecated in recent
# pandas releases.
totalBirths = names.pivot_table('births', index='year', columns='sex',
                                aggfunc='sum', margins=False)
#display(totalBirths.head())
#totalBirths[['F', 'M']][:-1].plot(title='Total births by sex and year')

# Draw one line per sex on a single figure
fig = bk.figure(plot_width=750, plot_height=300, title=None)
fig.line(x=totalBirths.index, y=totalBirths['F'], legend='F', line_color='magenta')
fig.line(x=totalBirths.index, y=totalBirths['M'], legend='M', line_color='royalblue')
fig.legend.location = 'bottom_right'
fig.xaxis.axis_label = 'Year'
fig.yaxis.axis_label = 'Total births'
# Plain numbers on the y axis instead of scientific notation
fig.yaxis[0].formatter.use_scientific = False
bk.show(fig)
Let's see a couple of examples of splitting data.
In the first example we are going to view the number of births grouped by year and sex.
# Total births for every (year, sex) pair; show the first few rows
birthsByYearSex = names.groupby(['year', 'sex'])['births'].sum()
birthsByYearSex.head()
The second example shows how to split the names into two groups: Boys and Girls.
# Boolean masks on the 'sex' column select the rows of each group
boys = names.loc[names['sex'] == 'M']
girls = names.loc[names['sex'] == 'F']
# Preview every 100th row among the first 2000 boys' rows
display(boys[:2000:100])
We can see how many boys with a specific name were born each year.
# Rows of the boys' table whose name is exactly 'Jayden' (one per year found)
isJayden = boys['name'] == 'Jayden'
boys[isJayden]
# Births with one row per year and one column per boy's name.
# The aggregation is given as the string 'sum' instead of the builtin `sum`:
# the result is the same, but passing the builtin is deprecated in recent
# pandas releases.
bBirths = boys.pivot_table('births', index='year', columns='name',
                           aggfunc='sum', margins=False)
subset = bBirths[['Ray', 'Elvis', 'Sam', 'John', 'Marvin', 'Bob']]

# One small figure per name; each figure is its own grid row, so the plots
# end up stacked vertically.
plots = []
for name in subset.columns:
    fig = bk.figure(plot_height=200, plot_width=700, title=None)
    fig.line(x=np.asarray(subset.index), y=np.asarray(subset[name]),
             line_color='black', legend=name)
    plots.append([fig])
bk.show(bk.gridplot(plots))
# Or directly using Pandas (which uses Matplotlib, not Bokeh):
#subset.plot(subplots=True, figsize=(12, 10), grid=False,
# title="Number of births per year")
Now we are going to add a column named `prop` that shows the ratio: $\frac{\text{children with a specific name}}{\text{total children}}$
def add_prop(group):
    """Return *group* with an extra 'prop' column.

    'prop' is each row's share of the group's births:
    ``births / births.sum()``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric 'births' column (here: the rows of one
        year/sex group handed over by ``groupby(...).apply``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns plus 'prop'. The frame
        passed in is not modified, because chunks handed over by
        ``groupby.apply`` should be treated as read-only.
    """
    births = group['births']
    # float() keeps the division a true (non-integer) division
    return group.assign(prop=births / float(births.sum()))
# Compute 'prop' separately inside every (year, sex) group.
# group_keys=False keeps the original row index instead of prepending the
# (year, sex) keys to it; otherwise recent pandas versions would make 'year'
# and 'sex' both index levels and columns, and later groupby calls on those
# names would be ambiguous.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
display(names.head())
Let's check our calculations by verifying that the proportions within each year/sex group sum to 1 (or at least very close to it).
# Sum of 'prop' inside each (year, sex) group; every total should be close to 1
propTotals = names.groupby(['year', 'sex'])['prop'].sum()
np.allclose(propTotals, 1)
Now we want to extract the top names for each sex/year combination.
def get_top(group, topNumber):
    """Return the *topNumber* rows of *group* with the most births.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a 'births' column to rank by.
    topNumber : int
        How many rows to keep; if the group has fewer rows, all of them
        are returned.

    Returns
    -------
    pandas.DataFrame
        The selected rows, ordered by 'births' from highest to lowest.
    """
    ranked = group.sort_values(by='births', ascending=False)
    return ranked[:topNumber]
# Ten most frequent names inside every (year, sex) group
grouped = names.groupby(['year', 'sex'])
topNames = grouped.apply(get_top, topNumber=10)
# Give the index levels names that differ from the 'year' and 'sex' columns
# (avoids the warning about index and columns sharing names)
topNames.index.names = ['year_', 'sex_', None]
topNames[:50]
This is our concluding example: we want to measure the increase in name diversity. The smaller the share of births covered by the top 10 names, the greater the diversity.
from bokeh.models.ranges import Range1d
# Share of all births covered by the top names, per year (rows) and sex (columns).
# The aggregation is given as the string 'sum' instead of the builtin `sum`:
# the result is the same, but passing the builtin is deprecated in recent
# pandas releases.
diversity = topNames.pivot_table('prop', index='year', columns='sex', aggfunc='sum')

fig = bk.figure(plot_width=750, plot_height=300, title=None)
fig.line(x=diversity.index, y=diversity['F'], line_color='green', legend='F')
fig.line(x=diversity.index, y=diversity['M'], line_color='blue', legend='M')
# Fixed y range so the proportions are read against the full 0..1 scale
fig.y_range = Range1d(0, 1.2)
bk.show(fig)
# Or, using directly Pandas' "plot" method (which calls Matplotlib, not Bokeh)
# diversity.plot(title='Sum of diversity.prop by year and sex',
# yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
Visit www.add-for.com for more tutorials and updates.
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.