Chapter 2: Introductory examples¶

1. USAGov Data From bit.ly¶

In [1]:

path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'

In [2]:

open(path).readline()

import json
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
records = [json.loads(line) for line in open(path)]

In [3]:

records[0]

Out[3]:

{u'a': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 u'al': u'en-US,en;q=0.8',
 u'c': u'US',
 u'cy': u'Danvers',
 u'g': u'A6qOVH',
 u'gr': u'MA',
 u'h': u'wfLQtf',
 u'hc': 1331822918,
 u'hh': u'1.usa.gov',
 u'l': u'orofrog',
 u'll': [42.576698, -70.954903],
 u'nk': 1,
 u'r': u'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 u't': 1331923247,
 u'tz': u'America/New_York',
 u'u': u'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}

In [4]:

records[0]['tz']

Out[4]:

u'America/New_York'

In [5]:

print (records[0]['tz'])

America/New_York

Counting Time Zones in pure Python¶

In [6]:

timezones = [rec['tz'] for rec in records]

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-6-33fd9d9b9434> in <module>()
----> 1 timezones = [rec['tz'] for rec in records]

KeyError: 'tz'

In [7]:

print (rec['tz'] for rec in records)

<generator object <genexpr> at 0x1098b13c0>

In [8]:

timezones = [rec['tz'] for rec in records if 'tz' in rec]

In [9]:

timezones[:10]

Out[9]:

[u'America/New_York',
 u'America/Denver',
 u'America/New_York',
 u'America/Sao_Paulo',
 u'America/New_York',
 u'America/New_York',
 u'Europe/Warsaw',
 u'',
 u'',
 u'']

In [10]:

def get_counts(sequences):
    count = {}
    for i in sequences:
        if x in count:
            count[x] += 1
        else:
            count[x] = 1
    return count

In [11]:

from collections import defaultdict

def get_counts(sequences):
    count = defaultdict(int)
    for x in count:
        count[x] += 1
    return count

In [12]:

counts = get_counts(timezones)

In [17]:

print counts

defaultdict(<type 'int'>, {'America/New_York': 0})

In [13]:

counts['America/New_York']

Out[13]:

In [14]:

len(timezones)

Out[14]:

In [15]:

def top_counts(count_dict, n = 10):
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n: ]

In [16]:

top_counts(counts)

Out[16]:

[(0, 'America/New_York')]

In [18]:

from collections import Counter

In [19]:

counts = Counter(timezones)

In [20]:

counts.most_common(10)

Out[20]:

[(u'America/New_York', 1251),
 (u'', 521),
 (u'America/Chicago', 400),
 (u'America/Los_Angeles', 382),
 (u'America/Denver', 191),
 (u'Europe/London', 74),
 (u'Asia/Tokyo', 37),
 (u'Pacific/Honolulu', 36),
 (u'Europe/Madrid', 35),
 (u'America/Sao_Paulo', 33)]

Counting Time Zones with Pandas¶

In [22]:

% matplotlib inline

/usr/local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [24]:

from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
plt.rc('figure', figsize = (10, 6))
np.set_printoptions(precision = 4)

In [25]:

import json
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
lines = open(path).readlines()
records = [json.loads(line) for line in lines]

In [26]:

from pandas import DataFrame, Series
import pandas as pd

frame = DataFrame(records)

In [27]:

frame.head()

Out[27]:

	_heartbeat_	a	al	c	cy	g	gr	h	hc	hh	kw	l	ll	nk	r	t	tz	u
0	NaN	Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...	en-US,en;q=0.8	US	Danvers	A6qOVH	MA	wfLQtf	1331822918	1.usa.gov	NaN	orofrog	[42.576698, -70.954903]	1	http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/...	1331923247	America/New_York	http://www.ncbi.nlm.nih.gov/pubmed/22415991
1	NaN	GoogleMaps/RochesterNY	NaN	US	Provo	mwszkS	UT	mwszkS	1308262393	j.mp	NaN	bitly	[40.218102, -111.613297]	0	http://www.AwareMap.com/	1331923249	America/Denver	http://www.monroecounty.gov/etc/911/rss.php
2	NaN	Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...	en-US	US	Washington	xxr3Qb	DC	xxr3Qb	1331919941	1.usa.gov	NaN	bitly	[38.9007, -77.043098]	1	http://t.co/03elZC4Q	1331923250	America/New_York	http://boxer.senate.gov/en/press/releases/0316...
3	NaN	Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...	pt-br	BR	Braz	zCaLwp	27	zUtuOu	1331923068	1.usa.gov	NaN	alelex88	[-23.549999, -46.616699]	0	direct	1331923249	America/Sao_Paulo	http://apod.nasa.gov/apod/ap120312.html
4	NaN	Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...	en-US,en;q=0.8	US	Shrewsbury	9b6kNl	MA	9b6kNl	1273672411	bit.ly	NaN	bitly	[42.286499, -71.714699]	0	http://www.shrewsbury-ma.gov/selco/	1331923251	America/New_York	http://www.shrewsbury-ma.gov/egov/gallery/1341...

In [28]:

frame['tz'][:10]

Out[28]:

0     America/New_York
1       America/Denver
2     America/New_York
3    America/Sao_Paulo
4     America/New_York
5     America/New_York
6        Europe/Warsaw
7                     
8                     
9                     
Name: tz, dtype: object

In [29]:

tz_counts = frame['tz'].value_counts()
tz_counts[:10]

Out[29]:

America/New_York       1251
                        521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
America/Sao_Paulo        33
Name: tz, dtype: int64

In [30]:

clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10]

Out[30]:

America/New_York       1251
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
Name: tz, dtype: int64

In [31]:

plt.figure(figsize = (10, 4))

Out[31]:

<matplotlib.figure.Figure at 0x10aed97d0>

<matplotlib.figure.Figure at 0x10aed97d0>

In [32]:

tz_counts[:10].plot(kind = 'barh', rot = 0)

Out[32]:

<matplotlib.axes._subplots.AxesSubplot at 0x10c09af10>

In [33]:

frame['a'][1]

Out[33]:

u'GoogleMaps/RochesterNY'

In [34]:

frame['a'][50]

Out[34]:

u'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'

In [35]:

frame['a'][51]

Out[35]:

u'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'

In [36]:

results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]

Out[36]:

0               Mozilla/5.0
1    GoogleMaps/RochesterNY
2               Mozilla/4.0
3               Mozilla/5.0
4               Mozilla/5.0
dtype: object

In [37]:

results.value_counts()[:8]

Out[37]:

Mozilla/5.0                 2594
Mozilla/4.0                  601
GoogleMaps/RochesterNY       121
Opera/9.80                    34
TEST_INTERNET_AGENT           24
GoogleProducer                21
Mozilla/6.0                    5
BlackBerry8520/5.0.0.681       4
dtype: int64

In [65]:

cframe = frame[frame.a.notnull()]

In [66]:

operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
operating_system[:5]

Out[66]:

array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'], 
      dtype='|S11')

In [69]:

by_tz_os = cframe.groupby(['tz', operating_system])

In [70]:

agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]

Out[70]:

	Not Windows	Windows
tz
	245	276
Africa/Cairo	0	3
Africa/Casablanca	0	1
Africa/Ceuta	0	2
Africa/Johannesburg	0	1
Africa/Lusaka	0	1
America/Anchorage	4	1
America/Argentina/Buenos_Aires	1	0
America/Argentina/Cordoba	0	1
America/Argentina/Mendoza	0	1

10 rows × 2 columns

In [71]:

# use to sort ascending order
indexer = agg_counts.sum(1).argsort()
indexer[:10]

Out[71]:

tz
                                  24
Africa/Cairo                      20
Africa/Casablanca                 21
Africa/Ceuta                      92
Africa/Johannesburg               87
Africa/Lusaka                     53
America/Anchorage                 54
America/Argentina/Buenos_Aires    57
America/Argentina/Cordoba         26
America/Argentina/Mendoza         55
dtype: int64

In [72]:

count_subset = agg_counts.take(indexer)[-10:]
count_subset

Out[72]:

	Not Windows	Windows
tz
America/Sao_Paulo	13	20
Europe/Madrid	16	19
Pacific/Honolulu	0	36
Asia/Tokyo	2	35
Europe/London	43	31
America/Denver	132	59
America/Los_Angeles	130	252
America/Chicago	115	285
	245	276
America/New_York	339	912

10 rows × 2 columns

In [73]:

plt.figure()

Out[73]:

<matplotlib.figure.Figure at 0x7fb885721e10>

<matplotlib.figure.Figure at 0x7fb885721e10>

In [74]:

count_subset.plot(kind = 'barh', stacked = True)

Out[74]:

<matplotlib.axes.AxesSubplot at 0x7fb885407a90>

In [75]:

plt.figure()

Out[75]:

<matplotlib.figure.Figure at 0x7fb885218f10>

<matplotlib.figure.Figure at 0x7fb885218f10>

In [76]:

normed_subset = count_subset.div(count_subset.sum(1), axis = 0)
normed_subset.plot(kind = 'barh', stacked = True)

Out[76]:

<matplotlib.axes.AxesSubplot at 0x7fb8854b2790>

MovieLens 1M Data Set¶

In [77]:

import pandas as pd
encoding = 'latin1'

upath = os.path.expanduser('ch02/movielens/users.dat')
rpath = os.path.expanduser('ch02/movielens/ratings.dat')
mpath = os.path.expanduser('ch02/movielens/movies.dat')

unames = ['user_id', 'gender', 'age', 'accupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']

users = pd.read_csv(upath, sep = '::', header = None, names = unames, encoding = encoding)
ratings = pd.read_csv(rpath, sep = '::', header = None, names = rnames, encoding = encoding)
movies = pd.read_csv(mpath, sep = '::', header = None, names = mnames, encoding = encoding)

In [78]:

users[:5]

Out[78]:

	user_id	gender	age	accupation	zip
0	1	F	1	10	48067
1	2	M	56	16	70072
2	3	M	25	15	55117
3	4	M	45	7	02460
4	5	M	25	20	55455

5 rows × 5 columns

In [79]:

ratings[:5]

Out[79]:

	user_id	movie_id	rating	timestamp
0	1	1193	5	978300760
1	1	661	3	978302109
2	1	914	3	978301968
3	1	3408	4	978300275
4	1	2355	5	978824291

5 rows × 4 columns

In [80]:

movies[:5]

Out[80]:

	movie_id	title	genres
0	1	Toy Story (1995)	Animation\|Children's\|Comedy
1	2	Jumanji (1995)	Adventure\|Children's\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama
4	5	Father of the Bride Part II (1995)	Comedy

5 rows × 3 columns

In [81]:

ratings

Out[81]:

	user_id	movie_id	rating	timestamp
0	1	1193	5	978300760
1	1	661	3	978302109
2	1	914	3	978301968
3	1	3408	4	978300275
4	1	2355	5	978824291
5	1	1197	3	978302268
6	1	1287	5	978302039
7	1	2804	5	978300719
8	1	594	4	978302268
9	1	919	4	978301368
10	1	595	5	978824268
11	1	938	4	978301752
12	1	2398	4	978302281
13	1	2918	4	978302124
14	1	1035	5	978301753
15	1	2791	4	978302188
16	1	2687	3	978824268
17	1	2018	4	978301777
18	1	3105	5	978301713
19	1	2797	4	978302039
20	1	2321	3	978302205
21	1	720	3	978300760
22	1	1270	5	978300055
23	1	527	5	978824195
24	1	2340	3	978300103
25	1	48	5	978824351
26	1	1097	4	978301953
27	1	1721	4	978300055
28	1	1545	4	978824139
29	1	745	3	978824268
30	1	2294	4	978824291
31	1	3186	4	978300019
32	1	1566	4	978824330
33	1	588	4	978824268
34	1	1907	4	978824330
35	1	783	4	978824291
36	1	1836	5	978300172
37	1	1022	5	978300055
38	1	2762	4	978302091
39	1	150	5	978301777
40	1	1	5	978824268
41	1	1961	5	978301590
42	1	1962	4	978301753
43	1	2692	4	978301570
44	1	260	4	978300760
45	1	1028	5	978301777
46	1	1029	5	978302205
47	1	1207	4	978300719
48	1	2028	5	978301619
49	1	531	4	978302149
50	1	3114	4	978302174
51	1	608	4	978301398
52	1	1246	4	978302091
53	2	1357	5	978298709
54	2	3068	4	978299000
55	2	1537	4	978299620
56	2	647	3	978299351
57	2	2194	4	978299297
58	2	648	4	978299913
59	2	2268	5	978299297
	...	...	...	...

1000209 rows × 4 columns

In [82]:

data = pd.merge(pd.merge(ratings, users), movies)
data

Out[82]:

	user_id	movie_id	rating	timestamp	gender	age	accupation	zip	title	genres
0	1	1193	5	978300760	F	1	10	48067	One Flew Over the Cuckoo's Nest (1975)	Drama
1	2	1193	5	978298413	M	56	16	70072	One Flew Over the Cuckoo's Nest (1975)	Drama
2	12	1193	4	978220179	M	25	12	32793	One Flew Over the Cuckoo's Nest (1975)	Drama
3	15	1193	4	978199279	M	25	7	22903	One Flew Over the Cuckoo's Nest (1975)	Drama
4	17	1193	5	978158471	M	50	1	95350	One Flew Over the Cuckoo's Nest (1975)	Drama
5	18	1193	4	978156168	F	18	3	95825	One Flew Over the Cuckoo's Nest (1975)	Drama
6	19	1193	5	982730936	M	1	10	48073	One Flew Over the Cuckoo's Nest (1975)	Drama
7	24	1193	5	978136709	F	25	7	10023	One Flew Over the Cuckoo's Nest (1975)	Drama
8	28	1193	3	978125194	F	25	1	14607	One Flew Over the Cuckoo's Nest (1975)	Drama
9	33	1193	5	978557765	M	45	3	55421	One Flew Over the Cuckoo's Nest (1975)	Drama
10	39	1193	5	978043535	M	18	4	61820	One Flew Over the Cuckoo's Nest (1975)	Drama
11	42	1193	3	978038981	M	25	8	24502	One Flew Over the Cuckoo's Nest (1975)	Drama
12	44	1193	4	978018995	M	45	17	98052	One Flew Over the Cuckoo's Nest (1975)	Drama
13	47	1193	4	977978345	M	18	4	94305	One Flew Over the Cuckoo's Nest (1975)	Drama
14	48	1193	4	977975061	M	25	4	92107	One Flew Over the Cuckoo's Nest (1975)	Drama
15	49	1193	4	978813972	M	18	12	77084	One Flew Over the Cuckoo's Nest (1975)	Drama
16	53	1193	5	977946400	M	25	0	96931	One Flew Over the Cuckoo's Nest (1975)	Drama
17	54	1193	5	977944039	M	50	1	56723	One Flew Over the Cuckoo's Nest (1975)	Drama
18	58	1193	5	977933866	M	25	2	30303	One Flew Over the Cuckoo's Nest (1975)	Drama
19	59	1193	4	977934292	F	50	1	55413	One Flew Over the Cuckoo's Nest (1975)	Drama
20	62	1193	4	977968584	F	35	3	98105	One Flew Over the Cuckoo's Nest (1975)	Drama
21	80	1193	4	977786172	M	56	1	49327	One Flew Over the Cuckoo's Nest (1975)	Drama
22	81	1193	5	977785864	F	25	0	60640	One Flew Over the Cuckoo's Nest (1975)	Drama
23	88	1193	5	977694161	F	45	1	02476	One Flew Over the Cuckoo's Nest (1975)	Drama
24	89	1193	5	977683596	F	56	9	85749	One Flew Over the Cuckoo's Nest (1975)	Drama
25	95	1193	5	977626632	M	45	0	98201	One Flew Over the Cuckoo's Nest (1975)	Drama
26	96	1193	3	977621789	F	25	16	78028	One Flew Over the Cuckoo's Nest (1975)	Drama
27	99	1193	2	982791053	F	1	10	19390	One Flew Over the Cuckoo's Nest (1975)	Drama
28	102	1193	5	1040737607	M	35	19	20871	One Flew Over the Cuckoo's Nest (1975)	Drama
29	104	1193	2	977546620	M	25	12	00926	One Flew Over the Cuckoo's Nest (1975)	Drama
30	107	1193	5	977533908	M	45	18	63129	One Flew Over the Cuckoo's Nest (1975)	Drama
31	109	1193	2	977516253	M	45	15	92028	One Flew Over the Cuckoo's Nest (1975)	Drama
32	120	1193	5	977458103	M	25	11	27106	One Flew Over the Cuckoo's Nest (1975)	Drama
33	123	1193	5	977448350	M	35	9	67208	One Flew Over the Cuckoo's Nest (1975)	Drama
34	131	1193	5	977431822	M	18	4	06520	One Flew Over the Cuckoo's Nest (1975)	Drama
35	136	1193	4	977421091	M	18	2	21202	One Flew Over the Cuckoo's Nest (1975)	Drama
36	139	1193	4	977359402	F	25	20	45409	One Flew Over the Cuckoo's Nest (1975)	Drama
37	146	1193	4	979940868	F	35	20	10954	One Flew Over the Cuckoo's Nest (1975)	Drama
38	150	1193	5	978163667	M	35	7	98144	One Flew Over the Cuckoo's Nest (1975)	Drama
39	151	1193	4	993121122	F	25	20	85013	One Flew Over the Cuckoo's Nest (1975)	Drama
40	155	1193	5	977253254	M	35	12	07470	One Flew Over the Cuckoo's Nest (1975)	Drama
41	166	1193	5	977299851	M	18	4	92802	One Flew Over the Cuckoo's Nest (1975)	Drama
42	169	1193	5	977197066	M	25	7	55439	One Flew Over the Cuckoo's Nest (1975)	Drama
43	175	1193	5	978051929	F	25	2	95123	One Flew Over the Cuckoo's Nest (1975)	Drama
44	178	1193	4	977098352	M	56	17	53705	One Flew Over the Cuckoo's Nest (1975)	Drama
45	180	1193	5	977089623	M	45	12	01603	One Flew Over the Cuckoo's Nest (1975)	Drama
46	190	1193	5	977026946	M	25	17	55125	One Flew Over the Cuckoo's Nest (1975)	Drama
47	191	1193	3	977026629	M	18	4	04915	One Flew Over the Cuckoo's Nest (1975)	Drama
48	192	1193	4	977089850	M	18	1	10977	One Flew Over the Cuckoo's Nest (1975)	Drama
49	195	1193	5	977064554	M	25	12	10458	One Flew Over the Cuckoo's Nest (1975)	Drama
50	196	1193	4	977004875	F	35	9	94587	One Flew Over the Cuckoo's Nest (1975)	Drama
51	202	1193	4	976937045	M	18	4	53706	One Flew Over the Cuckoo's Nest (1975)	Drama
52	206	1193	1	976921007	M	25	17	20194	One Flew Over the Cuckoo's Nest (1975)	Drama
53	208	1193	5	984510357	M	35	17	55432	One Flew Over the Cuckoo's Nest (1975)	Drama
54	216	1193	4	976868526	M	45	13	52761	One Flew Over the Cuckoo's Nest (1975)	Drama
55	224	1193	4	976833559	F	18	4	14850	One Flew Over the Cuckoo's Nest (1975)	Drama
56	225	1193	4	976832915	M	25	7	11215	One Flew Over the Cuckoo's Nest (1975)	Drama
57	227	1193	4	976829587	M	35	20	90291	One Flew Over the Cuckoo's Nest (1975)	Drama
58	230	1193	4	976824488	M	45	1	43210	One Flew Over the Cuckoo's Nest (1975)	Drama
59	231	1193	4	997057648	M	25	3	55455	One Flew Over the Cuckoo's Nest (1975)	Drama
	...	...	...	...	...	...	...	...	...	...

1000209 rows × 10 columns

In [83]:

data.ix[0]

Out[83]:

user_id                                            1
movie_id                                        1193
rating                                             5
timestamp                                  978300760
gender                                             F
age                                                1
accupation                                        10
zip                                            48067
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 0, dtype: object

In [86]:

mean_rating = data.pivot_table('rating', rows = 'title', cols = 'gender', aggfunc = 'mean')
mean_rating[:10]

Out[86]:

gender	F	M
title
$1,000,000 Duck (1971)	3.375000	2.761905
'Night Mother (1986)	3.388889	3.352941
'Til There Was You (1997)	2.675676	2.733333
'burbs, The (1989)	2.793478	2.962085
...And Justice for All (1979)	3.828571	3.689024
1-900 (1994)	2.000000	3.000000
10 Things I Hate About You (1999)	3.646552	3.311966
101 Dalmatians (1961)	3.791444	3.500000
101 Dalmatians (1996)	3.240000	2.911215
12 Angry Men (1957)	4.184397	4.328421

10 rows × 2 columns

In [87]:

rating_by_title = data.groupby('title').size()

In [88]:

rating_by_title[0:10]

Out[88]:

title
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64

In [89]:

active_titles = rating_by_title.index[rating_by_title >= 250]
active_titles[:10]

Out[89]:

Index([u''burbs, The (1989)', u'10 Things I Hate About You (1999)', u'101 Dalmatians (1961)', u'101 Dalmatians (1996)', u'12 Angry Men (1957)', u'13th Warrior, The (1999)', u'2 Days in the Valley (1996)', u'20,000 Leagues Under the Sea (1954)', u'2001: A Space Odyssey (1968)', u'2010 (1984)'], dtype='object')

In [90]:

mean_rating = mean_rating.ix[active_titles]
mean_rating

Out[90]:

gender	F	M
title
'burbs, The (1989)	2.793478	2.962085
10 Things I Hate About You (1999)	3.646552	3.311966
101 Dalmatians (1961)	3.791444	3.500000
101 Dalmatians (1996)	3.240000	2.911215
12 Angry Men (1957)	4.184397	4.328421
13th Warrior, The (1999)	3.112000	3.168000
2 Days in the Valley (1996)	3.488889	3.244813
20,000 Leagues Under the Sea (1954)	3.670103	3.709205
2001: A Space Odyssey (1968)	3.825581	4.129738
2010 (1984)	3.446809	3.413712
28 Days (2000)	3.209424	2.977707
39 Steps, The (1935)	3.965517	4.107692
54 (1998)	2.701754	2.782178
7th Voyage of Sinbad, The (1958)	3.409091	3.658879
8MM (1999)	2.906250	2.850962
About Last Night... (1986)	3.188679	3.140909
Absent Minded Professor, The (1961)	3.469388	3.446809
Absolute Power (1997)	3.469136	3.327759
Abyss, The (1989)	3.659236	3.689507
Ace Ventura: Pet Detective (1994)	3.000000	3.197917
Ace Ventura: When Nature Calls (1995)	2.269663	2.543333
Addams Family Values (1993)	3.000000	2.878531
Addams Family, The (1991)	3.186170	3.163498
Adventures in Babysitting (1987)	3.455782	3.208122
Adventures of Buckaroo Bonzai Across the 8th Dimension, The (1984)	3.308511	3.402321
Adventures of Priscilla, Queen of the Desert, The (1994)	3.989071	3.688811
Adventures of Robin Hood, The (1938)	4.166667	3.918367
African Queen, The (1951)	4.324232	4.223822
Age of Innocence, The (1993)	3.827068	3.339506
Agnes of God (1985)	3.534884	3.244898
Air America (1990)	2.823529	2.741176
Air Force One (1997)	3.699588	3.555822
Airplane II: The Sequel (1982)	2.906250	2.995671
Airplane! (1980)	3.656566	4.064419
Akira (1988)	3.511111	3.980344
Aladdin (1992)	3.857143	3.756494
Alice in Wonderland (1951)	3.705882	3.692308
Alien (1979)	3.888252	4.216119
Alien Nation (1988)	3.433333	3.195946
Alien: Resurrection (1997)	2.708738	2.997041
Aliens (1986)	3.802083	4.186684
Alien� (1992)	3.008264	3.042289
Alive (1993)	3.469388	3.376866
All About Eve (1950)	4.363057	4.186992
All About My Mother (Todo Sobre Mi Madre) (1999)	4.333333	3.944000
All Quiet on the Western Front (1930)	3.957447	4.244344
All That Jazz (1979)	4.061856	3.718182
Almost Famous (2000)	4.220217	4.228731
Amadeus (1984)	4.346734	4.213415
American Beauty (1999)	4.238901	4.347301
American Gigolo (1980)	3.337500	3.106977
American Graffiti (1973)	3.940476	4.055556
American History X (1998)	4.181818	4.240741
American Movie (1999)	4.028986	4.008850
American Pie (1999)	3.539792	3.754545
American President, The (1995)	3.923483	3.718654
American Psycho (2000)	3.085271	3.253493
American Tail, An (1986)	3.649254	3.318519
American Werewolf in London, An (1981)	3.548077	3.800000
American Werewolf in Paris, An (1997)	2.814815	2.739336
	...	...

1216 rows × 2 columns

In [91]:

mean_rating = mean_rating.rename(index = {'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)':
                                          'Seven Samurai (Shichinin no samurai)(1954)'})                                          

In [93]:

top_female_rating = mean_rating.sort_index(by = 'F', ascending = False)
top_female_rating[:10]                                            

Out[93]:

gender	F	M
title
Close Shave, A (1995)	4.644444	4.473795
Wrong Trousers, The (1993)	4.588235	4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)	4.572650	4.464589
Wallace & Gromit: The Best of Aardman Animation (1996)	4.563107	4.385075
Schindler's List (1993)	4.562602	4.491415
Shawshank Redemption, The (1994)	4.539075	4.560625
Grand Day Out, A (1992)	4.537879	4.293255
To Kill a Mockingbird (1962)	4.536667	4.372611
Creature Comforts (1990)	4.513889	4.272277
Usual Suspects, The (1995)	4.513317	4.518248

10 rows × 2 columns

Measure rating disaggreement¶

In [94]:

mean_rating['diff'] = mean_rating['M'] - mean_rating['F']

In [95]:

sorted_by_diff = mean_rating.sort_index(by = 'diff')

In [96]:

sorted_by_diff[:15]

Out[96]:

gender	F	M	diff
title
Dirty Dancing (1987)	3.790378	2.959596	-0.830782
Jumpin' Jack Flash (1986)	3.254717	2.578358	-0.676359
Grease (1978)	3.975265	3.367041	-0.608224
Little Women (1994)	3.870588	3.321739	-0.548849
Steel Magnolias (1989)	3.901734	3.365957	-0.535777
Anastasia (1997)	3.800000	3.281609	-0.518391
Rocky Horror Picture Show, The (1975)	3.673016	3.160131	-0.512885
Color Purple, The (1985)	4.158192	3.659341	-0.498851
Age of Innocence, The (1993)	3.827068	3.339506	-0.487561
Free Willy (1993)	2.921348	2.438776	-0.482573
French Kiss (1995)	3.535714	3.056962	-0.478752
Little Shop of Horrors, The (1960)	3.650000	3.179688	-0.470312
Guys and Dolls (1955)	4.051724	3.583333	-0.468391
Mary Poppins (1964)	4.197740	3.730594	-0.467147
Patch Adams (1998)	3.473282	3.008746	-0.464536

15 rows × 3 columns

In [97]:

# Reverse order of rows, taking first 15 rows
sorted_by_diff[::-1][:15]

Out[97]:

gender	F	M	diff
title
Good, The Bad and The Ugly, The (1966)	3.494949	4.221300	0.726351
Kentucky Fried Movie, The (1977)	2.878788	3.555147	0.676359
Dumb & Dumber (1994)	2.697987	3.336595	0.638608
Longest Day, The (1962)	3.411765	4.031447	0.619682
Cable Guy, The (1996)	2.250000	2.863787	0.613787
Evil Dead II (Dead By Dawn) (1987)	3.297297	3.909283	0.611985
Hidden, The (1987)	3.137931	3.745098	0.607167
Rocky III (1982)	2.361702	2.943503	0.581801
Caddyshack (1980)	3.396135	3.969737	0.573602
For a Few Dollars More (1965)	3.409091	3.953795	0.544704
Porky's (1981)	2.296875	2.836364	0.539489
Animal House (1978)	3.628906	4.167192	0.538286
Exorcist, The (1973)	3.537634	4.067239	0.529605
Fright Night (1985)	2.973684	3.500000	0.526316
Barb Wire (1996)	1.585366	2.100386	0.515020

15 rows × 3 columns

In [98]:

# Standard deviation of rating grouped by title
rating_sd_by_title = data.groupby('title')['rating'].std()

In [99]:

# Filter down to active titles
rating_sd_by_title = rating_sd_by_title.ix[active_titles]

In [100]:

# Order series by values in ascending order
rating_sd_by_title.order(ascending=False)[:10]

Out[100]:

title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Eyes Wide Shut (1999)                    1.259624
Evita (1996)                             1.253631
Billy Madison (1995)                     1.249970
Fear and Loathing in Las Vegas (1998)    1.246408
Bicentennial Man (1999)                  1.245533
Name: rating, dtype: float64

US baby name¶

In [103]:

from __future__ import division
from numpy.random import randn
import numpy as np
import matplotlib.pylab as plt
plt.rc('figure', figsize = (12, 5))
np.set_printoptions(precision = 4)

http://www.ssa.gov/oact/babynames/limits.html

In [106]:

import pandas as pd
names1880 = pd.read_csv('ch02/names/yob1880.txt', names = ['names', 'sex', 'births'])
names1880

Out[106]:

	names	sex	births
0	Mary	F	7065
1	Anna	F	2604
2	Emma	F	2003
3	Elizabeth	F	1939
4	Minnie	F	1746
5	Margaret	F	1578
6	Ida	F	1472
7	Alice	F	1414
8	Bertha	F	1320
9	Sarah	F	1288
10	Annie	F	1258
11	Clara	F	1226
12	Ella	F	1156
13	Florence	F	1063
14	Cora	F	1045
15	Martha	F	1040
16	Laura	F	1012
17	Nellie	F	995
18	Grace	F	982
19	Carrie	F	949
20	Maude	F	858
21	Mabel	F	808
22	Bessie	F	794
23	Jennie	F	793
24	Gertrude	F	787
25	Julia	F	783
26	Hattie	F	769
27	Edith	F	768
28	Mattie	F	704
29	Rose	F	700
30	Catherine	F	688
31	Lillian	F	672
32	Ada	F	652
33	Lillie	F	647
34	Helen	F	636
35	Jessie	F	635
36	Louise	F	635
37	Ethel	F	633
38	Lula	F	621
39	Myrtle	F	615
40	Eva	F	614
41	Frances	F	605
42	Lena	F	603
43	Lucy	F	591
44	Edna	F	588
45	Maggie	F	582
46	Pearl	F	569
47	Daisy	F	564
48	Fannie	F	560
49	Josephine	F	544
50	Dora	F	524
51	Rosa	F	507
52	Katherine	F	502
53	Agnes	F	473
54	Marie	F	471
55	Nora	F	471
56	May	F	462
57	Mamie	F	436
58	Blanche	F	427
59	Stella	F	414
	...	...	...

2000 rows × 3 columns

In [107]:

names1880.groupby('sex').births.sum()

Out[107]:

sex
F       90993
M      110493
Name: births, dtype: int64

In [111]:

# year 2010 is the last year available right now
years = range(1880, 2011)

pieces = []
columns = ['names', 'sex', 'births']

for year in years:
    path = 'ch02/names/yob%d.txt' %year
    frame = pd.read_csv(path, names = columns)
    frame['year'] = year
    pieces.append(frame)
    
# concatenate them into a single dataframe
names = pd.concat(pieces, ignore_index = True)

In [112]:

# names = pd.read_pickle('ch02/all_names')

---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-112-af1154dd18f9> in <module>()
----> 1 names = pd.read_pickle('ch02/all_names')

/usr/lib/python2.7/dist-packages/pandas/io/pickle.pyc in read_pickle(path)
     47 
     48     try:
---> 49         return try_read(path)
     50     except:
     51         if PY3:

/usr/lib/python2.7/dist-packages/pandas/io/pickle.pyc in try_read(path, encoding)
     43                 return pc.load(fh, encoding=encoding, compat=False)
     44         except:
---> 45             with open(path, 'rb') as fh:
     46                 return pc.load(fh, encoding=encoding, compat=True)
     47 

IOError: [Errno 2] No such file or directory: 'ch02/all_names'

In [114]:

total_births = names.pivot_table('births', rows = 'year', cols = 'sex', aggfunc = sum)
total_births.tail()

Out[114]:

sex	F	M
year
2006	1896468	2050234
2007	1916888	2069242
2008	1883645	2032310
2009	1827643	1973359
2010	1759010	1898382

5 rows × 2 columns

In [115]:

total_births.plot(title = 'Total birth by sex and year')

Out[115]:

<matplotlib.axes.AxesSubplot at 0x7fb871688890>

In [117]:

def add_prop(group):
    # integers division floors
    births = group.births.astype(float)
    group['prop'] = births/births.sum()
    return group

names = names.groupby(['year', 'sex']).apply(add_prop)
names

Out[117]:

	names	sex	births	year	prop
0	Mary	F	7065	1880	0.077643
1	Anna	F	2604	1880	0.028618
2	Emma	F	2003	1880	0.022013
3	Elizabeth	F	1939	1880	0.021309
4	Minnie	F	1746	1880	0.019188
5	Margaret	F	1578	1880	0.017342
6	Ida	F	1472	1880	0.016177
7	Alice	F	1414	1880	0.015540
8	Bertha	F	1320	1880	0.014507
9	Sarah	F	1288	1880	0.014155
10	Annie	F	1258	1880	0.013825
11	Clara	F	1226	1880	0.013474
12	Ella	F	1156	1880	0.012704
13	Florence	F	1063	1880	0.011682
14	Cora	F	1045	1880	0.011484
15	Martha	F	1040	1880	0.011429
16	Laura	F	1012	1880	0.011122
17	Nellie	F	995	1880	0.010935
18	Grace	F	982	1880	0.010792
19	Carrie	F	949	1880	0.010429
20	Maude	F	858	1880	0.009429
21	Mabel	F	808	1880	0.008880
22	Bessie	F	794	1880	0.008726
23	Jennie	F	793	1880	0.008715
24	Gertrude	F	787	1880	0.008649
25	Julia	F	783	1880	0.008605
26	Hattie	F	769	1880	0.008451
27	Edith	F	768	1880	0.008440
28	Mattie	F	704	1880	0.007737
29	Rose	F	700	1880	0.007693
30	Catherine	F	688	1880	0.007561
31	Lillian	F	672	1880	0.007385
32	Ada	F	652	1880	0.007165
33	Lillie	F	647	1880	0.007110
34	Helen	F	636	1880	0.006990
35	Jessie	F	635	1880	0.006979
36	Louise	F	635	1880	0.006979
37	Ethel	F	633	1880	0.006957
38	Lula	F	621	1880	0.006825
39	Myrtle	F	615	1880	0.006759
40	Eva	F	614	1880	0.006748
41	Frances	F	605	1880	0.006649
42	Lena	F	603	1880	0.006627
43	Lucy	F	591	1880	0.006495
44	Edna	F	588	1880	0.006462
45	Maggie	F	582	1880	0.006396
46	Pearl	F	569	1880	0.006253
47	Daisy	F	564	1880	0.006198
48	Fannie	F	560	1880	0.006154
49	Josephine	F	544	1880	0.005978
50	Dora	F	524	1880	0.005759
51	Rosa	F	507	1880	0.005572
52	Katherine	F	502	1880	0.005517
53	Agnes	F	473	1880	0.005198
54	Marie	F	471	1880	0.005176
55	Nora	F	471	1880	0.005176
56	May	F	462	1880	0.005077
57	Mamie	F	436	1880	0.004792
58	Blanche	F	427	1880	0.004693
59	Stella	F	414	1880	0.004550
	...	...	...	...	...

1690784 rows × 5 columns

In [119]:

np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)

Out[119]:

True

In [120]:

def get_top1000(group):
    return group.sort_index(by = 'births', ascending = False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)

In [122]:

pieces = []

for year, group in names.groupby(['year', 'sex']):
    pieces.append(group.sort_index(by = 'births', ascending = False)[:1000])
top1000 = pd.concat(pieces, ignore_index = True)

In [123]:

top1000.index = np.arange(len(top1000))
top1000

Out[123]:

	names	sex	births	year	prop
0	Mary	F	7065	1880	0.077643
1	Anna	F	2604	1880	0.028618
2	Emma	F	2003	1880	0.022013
3	Elizabeth	F	1939	1880	0.021309
4	Minnie	F	1746	1880	0.019188
5	Margaret	F	1578	1880	0.017342
6	Ida	F	1472	1880	0.016177
7	Alice	F	1414	1880	0.015540
8	Bertha	F	1320	1880	0.014507
9	Sarah	F	1288	1880	0.014155
10	Annie	F	1258	1880	0.013825
11	Clara	F	1226	1880	0.013474
12	Ella	F	1156	1880	0.012704
13	Florence	F	1063	1880	0.011682
14	Cora	F	1045	1880	0.011484
15	Martha	F	1040	1880	0.011429
16	Laura	F	1012	1880	0.011122
17	Nellie	F	995	1880	0.010935
18	Grace	F	982	1880	0.010792
19	Carrie	F	949	1880	0.010429
20	Maude	F	858	1880	0.009429
21	Mabel	F	808	1880	0.008880
22	Bessie	F	794	1880	0.008726
23	Jennie	F	793	1880	0.008715
24	Gertrude	F	787	1880	0.008649
25	Julia	F	783	1880	0.008605
26	Hattie	F	769	1880	0.008451
27	Edith	F	768	1880	0.008440
28	Mattie	F	704	1880	0.007737
29	Rose	F	700	1880	0.007693
30	Catherine	F	688	1880	0.007561
31	Lillian	F	672	1880	0.007385
32	Ada	F	652	1880	0.007165
33	Lillie	F	647	1880	0.007110
34	Helen	F	636	1880	0.006990
35	Jessie	F	635	1880	0.006979
36	Louise	F	635	1880	0.006979
37	Ethel	F	633	1880	0.006957
38	Lula	F	621	1880	0.006825
39	Myrtle	F	615	1880	0.006759
40	Eva	F	614	1880	0.006748
41	Frances	F	605	1880	0.006649
42	Lena	F	603	1880	0.006627
43	Lucy	F	591	1880	0.006495
44	Edna	F	588	1880	0.006462
45	Maggie	F	582	1880	0.006396
46	Pearl	F	569	1880	0.006253
47	Daisy	F	564	1880	0.006198
48	Fannie	F	560	1880	0.006154
49	Josephine	F	544	1880	0.005978
50	Dora	F	524	1880	0.005759
51	Rosa	F	507	1880	0.005572
52	Katherine	F	502	1880	0.005517
53	Agnes	F	473	1880	0.005198
54	Marie	F	471	1880	0.005176
55	Nora	F	471	1880	0.005176
56	May	F	462	1880	0.005077
57	Mamie	F	436	1880	0.004792
58	Blanche	F	427	1880	0.004693
59	Stella	F	414	1880	0.004550
	...	...	...	...	...

261877 rows × 5 columns

Analyzing naming trends¶

In [124]:

boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']

In [126]:

total_births = top1000.pivot_table('births', rows = 'year', cols = 'names', aggfunc = sum)

In [127]:

total_births

Out[127]:

names	Aaden	Aaliyah	Aarav	Aaron	Aarush	Ab	Abagail	Abb	Abbey	Abbie	Abbigail	Abbott	Abby	Abdiel	Abdul	Abdullah	Abe	Abel	Abelardo	Abigail
year
1880	NaN	NaN	NaN	102	NaN	NaN	NaN	NaN	NaN	71	NaN	NaN	6	NaN	NaN	NaN	50	9	NaN	12	...
1881	NaN	NaN	NaN	94	NaN	NaN	NaN	NaN	NaN	81	NaN	NaN	7	NaN	NaN	NaN	36	12	NaN	8	...
1882	NaN	NaN	NaN	85	NaN	NaN	NaN	NaN	NaN	80	NaN	NaN	11	NaN	NaN	NaN	50	10	NaN	14	...
1883	NaN	NaN	NaN	105	NaN	NaN	NaN	NaN	NaN	79	NaN	NaN	NaN	NaN	NaN	NaN	43	12	NaN	11	...
1884	NaN	NaN	NaN	97	NaN	NaN	NaN	NaN	NaN	98	NaN	NaN	6	NaN	NaN	NaN	45	14	NaN	13	...
1885	NaN	NaN	NaN	88	NaN	6	NaN	NaN	NaN	88	NaN	NaN	NaN	NaN	NaN	NaN	47	6	NaN	9	...
1886	NaN	NaN	NaN	86	NaN	NaN	NaN	NaN	NaN	84	NaN	NaN	7	NaN	NaN	NaN	50	16	NaN	15	...
1887	NaN	NaN	NaN	78	NaN	NaN	NaN	NaN	NaN	104	NaN	NaN	NaN	NaN	NaN	NaN	37	11	NaN	13	...
1888	NaN	NaN	NaN	90	NaN	NaN	NaN	NaN	NaN	137	NaN	6	9	NaN	NaN	NaN	46	8	NaN	18	...
1889	NaN	NaN	NaN	85	NaN	NaN	NaN	NaN	NaN	107	NaN	NaN	NaN	NaN	NaN	NaN	39	9	NaN	20	...
1890	NaN	NaN	NaN	96	NaN	NaN	NaN	6	NaN	140	NaN	NaN	NaN	NaN	NaN	NaN	49	14	NaN	NaN	...
1891	NaN	NaN	NaN	69	NaN	NaN	NaN	NaN	NaN	124	NaN	NaN	12	NaN	NaN	NaN	40	10	NaN	11	...
1892	NaN	NaN	NaN	95	NaN	NaN	NaN	NaN	NaN	119	NaN	NaN	NaN	NaN	NaN	NaN	53	8	NaN	11	...
1893	NaN	NaN	NaN	81	NaN	NaN	NaN	NaN	NaN	115	NaN	NaN	NaN	NaN	NaN	NaN	43	13	NaN	21	...
1894	NaN	NaN	NaN	79	NaN	NaN	NaN	NaN	NaN	118	NaN	NaN	NaN	NaN	NaN	NaN	50	15	NaN	13	...
1895	NaN	NaN	NaN	94	NaN	NaN	NaN	NaN	NaN	92	NaN	NaN	NaN	NaN	NaN	NaN	54	NaN	NaN	15	...
1896	NaN	NaN	NaN	69	NaN	NaN	NaN	NaN	NaN	121	NaN	NaN	NaN	NaN	NaN	NaN	51	15	NaN	15	...
1897	NaN	NaN	NaN	87	NaN	NaN	NaN	NaN	NaN	97	NaN	NaN	NaN	NaN	NaN	NaN	58	12	NaN	15	...
1898	NaN	NaN	NaN	89	NaN	NaN	NaN	NaN	NaN	120	NaN	NaN	NaN	NaN	NaN	NaN	67	10	NaN	NaN	...
1899	NaN	NaN	NaN	71	NaN	NaN	NaN	NaN	NaN	87	NaN	NaN	NaN	NaN	NaN	NaN	39	15	NaN	NaN	...
1900	NaN	NaN	NaN	104	NaN	NaN	NaN	NaN	NaN	112	NaN	NaN	NaN	NaN	NaN	NaN	56	15	NaN	NaN	...
1901	NaN	NaN	NaN	80	NaN	NaN	NaN	NaN	NaN	87	NaN	NaN	NaN	NaN	NaN	NaN	48	13	NaN	13	...
1902	NaN	NaN	NaN	78	NaN	NaN	NaN	NaN	NaN	91	NaN	NaN	NaN	NaN	NaN	NaN	54	11	NaN	18	...
1903	NaN	NaN	NaN	93	NaN	NaN	NaN	NaN	NaN	91	NaN	NaN	NaN	NaN	NaN	NaN	58	14	NaN	20	...
1904	NaN	NaN	NaN	117	NaN	NaN	NaN	NaN	NaN	80	NaN	NaN	NaN	NaN	NaN	NaN	52	10	NaN	NaN	...
1905	NaN	NaN	NaN	96	NaN	NaN	NaN	NaN	NaN	73	NaN	NaN	NaN	NaN	NaN	NaN	65	NaN	NaN	NaN	...
1906	NaN	NaN	NaN	96	NaN	NaN	NaN	NaN	NaN	72	NaN	NaN	NaN	NaN	NaN	NaN	50	11	NaN	15	...
1907	NaN	NaN	NaN	130	NaN	NaN	NaN	NaN	NaN	79	NaN	NaN	NaN	NaN	NaN	NaN	65	10	NaN	NaN	...
1908	NaN	NaN	NaN	114	NaN	NaN	NaN	NaN	NaN	84	NaN	NaN	NaN	NaN	NaN	NaN	57	15	NaN	NaN	...
1909	NaN	NaN	NaN	142	NaN	NaN	NaN	NaN	NaN	57	NaN	NaN	NaN	NaN	NaN	NaN	67	16	NaN	NaN	...
1910	NaN	NaN	NaN	145	NaN	NaN	NaN	NaN	NaN	79	NaN	NaN	NaN	NaN	NaN	NaN	74	NaN	NaN	NaN	...
1911	NaN	NaN	NaN	187	NaN	NaN	NaN	NaN	NaN	77	NaN	NaN	NaN	NaN	NaN	NaN	94	30	NaN	NaN	...
1912	NaN	NaN	NaN	303	NaN	NaN	NaN	NaN	NaN	90	NaN	NaN	NaN	NaN	NaN	NaN	172	40	NaN	NaN	...
1913	NaN	NaN	NaN	417	NaN	NaN	NaN	NaN	NaN	121	NaN	NaN	NaN	NaN	NaN	NaN	202	48	NaN	NaN	...
1914	NaN	NaN	NaN	485	NaN	NaN	NaN	NaN	NaN	99	NaN	NaN	NaN	NaN	NaN	NaN	225	51	NaN	NaN	...
1915	NaN	NaN	NaN	553	NaN	NaN	NaN	NaN	NaN	124	NaN	NaN	NaN	NaN	NaN	NaN	271	64	NaN	NaN	...
1916	NaN	NaN	NaN	584	NaN	NaN	NaN	NaN	NaN	146	NaN	NaN	NaN	NaN	NaN	NaN	261	62	NaN	NaN	...
1917	NaN	NaN	NaN	597	NaN	NaN	NaN	NaN	NaN	136	NaN	NaN	NaN	NaN	NaN	NaN	280	86	NaN	NaN	...
1918	NaN	NaN	NaN	656	NaN	NaN	NaN	NaN	NaN	120	NaN	NaN	NaN	NaN	NaN	NaN	256	76	NaN	NaN	...
1919	NaN	NaN	NaN	645	NaN	NaN	NaN	NaN	NaN	147	NaN	NaN	NaN	NaN	NaN	NaN	265	91	NaN	NaN	...
1920	NaN	NaN	NaN	668	NaN	NaN	NaN	NaN	NaN	98	NaN	NaN	NaN	NaN	NaN	NaN	206	79	NaN	NaN	...
1921	NaN	NaN	NaN	696	NaN	NaN	NaN	NaN	NaN	134	NaN	NaN	NaN	NaN	NaN	NaN	230	84	NaN	NaN	...
1922	NaN	NaN	NaN	700	NaN	NaN	NaN	NaN	NaN	142	NaN	NaN	NaN	NaN	NaN	NaN	176	91	NaN	NaN	...
1923	NaN	NaN	NaN	616	NaN	NaN	NaN	NaN	NaN	129	NaN	NaN	NaN	NaN	NaN	NaN	174	73	NaN	NaN	...
1924	NaN	NaN	NaN	683	NaN	NaN	NaN	NaN	NaN	96	NaN	NaN	NaN	NaN	NaN	NaN	174	111	NaN	NaN	...
1925	NaN	NaN	NaN	644	NaN	NaN	NaN	NaN	NaN	128	NaN	NaN	NaN	NaN	NaN	NaN	117	83	NaN	NaN	...
1926	NaN	NaN	NaN	593	NaN	NaN	NaN	NaN	NaN	105	NaN	NaN	NaN	NaN	NaN	NaN	134	101	NaN	NaN	...
1927	NaN	NaN	NaN	546	NaN	NaN	NaN	NaN	NaN	103	NaN	NaN	NaN	NaN	NaN	NaN	139	78	NaN	NaN	...
1928	NaN	NaN	NaN	557	NaN	NaN	NaN	NaN	NaN	92	NaN	NaN	NaN	NaN	NaN	NaN	90	85	NaN	NaN	...
1929	NaN	NaN	NaN	469	NaN	NaN	NaN	NaN	NaN	78	NaN	NaN	NaN	NaN	NaN	NaN	61	96	NaN	NaN	...
1930	NaN	NaN	NaN	500	NaN	NaN	NaN	NaN	NaN	77	NaN	NaN	NaN	NaN	NaN	NaN	85	123	NaN	NaN	...
1931	NaN	NaN	NaN	458	NaN	NaN	NaN	NaN	NaN	71	NaN	NaN	NaN	NaN	NaN	NaN	79	93	NaN	NaN	...
1932	NaN	NaN	NaN	514	NaN	NaN	NaN	NaN	NaN	57	NaN	NaN	NaN	NaN	NaN	NaN	83	94	NaN	NaN	...
1933	NaN	NaN	NaN	460	NaN	NaN	NaN	NaN	NaN	64	NaN	NaN	NaN	NaN	NaN	NaN	68	96	NaN	NaN	...
1934	NaN	NaN	NaN	478	NaN	NaN	NaN	NaN	NaN	60	NaN	NaN	NaN	NaN	NaN	NaN	75	93	NaN	NaN	...
1935	NaN	NaN	NaN	461	NaN	NaN	NaN	NaN	NaN	60	NaN	NaN	NaN	NaN	NaN	NaN	59	92	NaN	NaN	...
1936	NaN	NaN	NaN	443	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	70	74	NaN	NaN	...
1937	NaN	NaN	NaN	464	NaN	NaN	NaN	NaN	NaN	52	NaN	NaN	NaN	NaN	NaN	NaN	64	91	NaN	NaN	...
1938	NaN	NaN	NaN	478	NaN	NaN	NaN	NaN	NaN	57	NaN	NaN	NaN	NaN	NaN	NaN	55	87	NaN	NaN	...
1939	NaN	NaN	NaN	471	NaN	NaN	NaN	NaN	NaN	69	NaN	NaN	NaN	NaN	NaN	NaN	56	95	NaN	53	...
	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...

131 rows × 6865 columns

In [128]:

subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots = True, figsize = (12, 10), grid = False, title = 'Number of births per year')

Out[128]:

array([<matplotlib.axes.AxesSubplot object at 0x7fb873f6c210>,
       <matplotlib.axes.AxesSubplot object at 0x7fb87152c150>,
       <matplotlib.axes.AxesSubplot object at 0x7fb8714f86d0>,
       <matplotlib.axes.AxesSubplot object at 0x7fb87141aad0>], dtype=object)

Measure the increase of naming density¶

In [129]:

plt.figure()

Out[129]:

<matplotlib.figure.Figure at 0x7fb87175d050>

<matplotlib.figure.Figure at 0x7fb87175d050>

In [131]:

table = top1000.pivot_table('prop', rows = 'year', cols = 'sex', aggfunc = sum)
table.plot(title = 'Sum of table.top1000.prop by sex and year', 
           yticks = np.linspace(0, 1.2, 13),
           xticks = range(1880, 2010, 10))

Out[131]:

<matplotlib.axes.AxesSubplot at 0x7fb8713d91d0>

In [132]:

df = boys[boys.year == 2010]
df

Out[132]:

	names	sex	births	year	prop
260877	Jacob	M	21875	2010	0.011523
260878	Ethan	M	17866	2010	0.009411
260879	Michael	M	17133	2010	0.009025
260880	Jayden	M	17030	2010	0.008971
260881	William	M	16870	2010	0.008887
260882	Alexander	M	16634	2010	0.008762
260883	Noah	M	16281	2010	0.008576
260884	Daniel	M	15679	2010	0.008259
260885	Aiden	M	15403	2010	0.008114
260886	Anthony	M	15364	2010	0.008093
260887	Joshua	M	15238	2010	0.008027
260888	Mason	M	14728	2010	0.007758
260889	Christopher	M	14135	2010	0.007446
260890	Andrew	M	14093	2010	0.007424
260891	David	M	14042	2010	0.007397
260892	Matthew	M	13954	2010	0.007350
260893	Logan	M	13943	2010	0.007345
260894	Elijah	M	13735	2010	0.007235
260895	James	M	13714	2010	0.007224
260896	Joseph	M	13657	2010	0.007194
260897	Gabriel	M	12722	2010	0.006701
260898	Benjamin	M	12280	2010	0.006469
260899	Ryan	M	11886	2010	0.006261
260900	Samuel	M	11776	2010	0.006203
260901	Jackson	M	11693	2010	0.006159
260902	John	M	11424	2010	0.006018
260903	Nathan	M	11269	2010	0.005936
260904	Jonathan	M	11028	2010	0.005809
260905	Christian	M	10965	2010	0.005776
260906	Liam	M	10852	2010	0.005716
260907	Dylan	M	10455	2010	0.005507
260908	Landon	M	10400	2010	0.005478
260909	Caleb	M	10359	2010	0.005457
260910	Tyler	M	10357	2010	0.005456
260911	Lucas	M	10294	2010	0.005423
260912	Evan	M	9655	2010	0.005086
260913	Gavin	M	9551	2010	0.005031
260914	Nicholas	M	9549	2010	0.005030
260915	Isaac	M	9254	2010	0.004875
260916	Brayden	M	9046	2010	0.004765
260917	Luke	M	8767	2010	0.004618
260918	Angel	M	8716	2010	0.004591
260919	Brandon	M	8473	2010	0.004463
260920	Jack	M	8457	2010	0.004455
260921	Isaiah	M	8443	2010	0.004447
260922	Jordan	M	8156	2010	0.004296
260923	Owen	M	8136	2010	0.004286
260924	Carter	M	8101	2010	0.004267
260925	Connor	M	7991	2010	0.004209
260926	Justin	M	7792	2010	0.004105
260927	Jose	M	7656	2010	0.004033
260928	Jeremiah	M	7645	2010	0.004027
260929	Julian	M	7534	2010	0.003969
260930	Robert	M	7471	2010	0.003935
260931	Aaron	M	7374	2010	0.003884
260932	Adrian	M	7354	2010	0.003874
260933	Wyatt	M	7319	2010	0.003855
260934	Kevin	M	7278	2010	0.003834
260935	Hunter	M	7271	2010	0.003830
260936	Cameron	M	7137	2010	0.003760
	...	...	...	...	...

1000 rows × 5 columns

In [133]:

prop_cumsum = df.sort_index(by = 'prop', ascending = False).prop.cumsum()
prop_cumsum[:10]

Out[133]:

260877    0.011523
260878    0.020934
260879    0.029959
260880    0.038930
260881    0.047817
260882    0.056579
260883    0.065155
260884    0.073414
260885    0.081528
260886    0.089621
Name: prop, dtype: float64

In [135]:

prop_cumsum.values.searchsorted(0.5)

Out[135]:

In [138]:

df = boys[boys.year == 1900]
in1900 = df.sort_index(by = 'prop', ascending = False).prop.cumsum()
in1900.values.searchsorted(0.5) + 1

Out[138]:

In [140]:

def get_quantile_count(group, q = 0.5):
    group = group.sort_index(by = 'prop', ascending = False)
    return group.prop.cumsum().values.searchsorted(q) + 1

diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')
diversity.head()

Out[140]:

sex	F	M
year
1880	38	14
1881	38	14
1882	38	15
1883	39	15
1884	39	16

5 rows × 2 columns

In [141]:

diversity.plot(title = 'Number of Popular names in top 50%')

Out[141]:

<matplotlib.axes.AxesSubplot at 0x7fb87124ffd0>

The last letter revolution¶

In [145]:

# Extract the last letter from the names
get_last_letter = lambda x: x[-1]

last_letters = names.names.apply(get_last_letter)
last_letters.names = 'last_letter'

table = names.pivot_table('births', rows = last_letters, cols = ['sex', 'year'], aggfunc=sum)

In [147]:

subtable = table.reindex(colums = [1910, 1960, 2010], levels = 'year')
subtable.head()

Out[147]:

sex	F
year	1880	1881	1882	1883	1884	1885	1886	1887	1888	1889	1890	1891	1892	1893	1894	1895	1896	1897	1898	1899
names
a	31446	31581	36536	38330	43680	45408	49100	48942	59442	58631	62313	60582	68331	67821	70631	73002	73584	72148	79150	70712	...
b	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...
c	NaN	NaN	5	5	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	5	NaN	NaN	NaN	...
d	609	607	734	810	916	862	1007	1027	1298	1374	1438	1512	1775	1821	1985	2268	2372	2455	2953	3028	...
e	33378	34080	40399	41914	48089	49616	53884	54353	66750	66663	70948	67750	77186	76455	79938	83875	84355	82783	91151	81395	...

5 rows × 262 columns

In [148]:

subtable.sum()

Out[148]:

sex  year
F    1880     90993
     1881     91955
     1882    107851
     1883    112322
     1884    129021
     1885    133056
     1886    144538
     1887    145983
     1888    178631
     1889    178369
     1890    190377
     1891    185486
     1892    212350
     1893    212908
     1894    222923
...
M    1996    1892700
     1997    1883571
     1998    1909676
     1999    1918267
     2000    1961702
     2001    1940498
     2002    1938941
     2003    1972439
     2004    1981557
     2005    1993285
     2006    2050234
     2007    2069242
     2008    2032310
     2009    1973359
     2010    1898382
Length: 262, dtype: float64

In [149]:

letter_prop = subtable/subtable.sum().astype(float)

In [152]:

import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 1, figsize = (10, 8))
letter_prop['M'].plot(kind = 'bar', rot = 0, ax = axes[0], title = 'Male', legend = False)
letter_prop['F'].plot(kind = 'bar', rot = 0, ax = axes[1], title = 'Female', legend = False)

Out[152]:

<matplotlib.axes.AxesSubplot at 0x7fb87080edd0>

In [153]:

plt.subplots_adjust(hspace = 0.25)

<matplotlib.figure.Figure at 0x7fb870c82d90>

In [157]:

letter_prop = table/table.sum().astype(float)
dny_ts = letter_prop.ix[['d', 'n', 'y', 'm'], 'M'].T
dny_ts.head()

Out[157]:

	d	n	y	m
year
1880	0.083055	0.153213	0.075760	0.101907
1881	0.083247	0.153214	0.077451	0.101411
1882	0.085340	0.149560	0.077537	0.099000
1883	0.084066	0.151646	0.079144	0.096663
1884	0.086120	0.149915	0.080405	0.093774

5 rows × 4 columns

In [158]:

plt.close('all')

In [159]:

dny_ts.plot()

Out[159]:

<matplotlib.axes.AxesSubplot at 0x7fb868bd7490>

boy names that became girls' names (and vice versa)¶

In [160]:

all_names = top1000.names.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])
lesley_like = all_names[mask]
lesley_like

Out[160]:

array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)

In [161]:

filtered = top1000[top1000.names.isin(lesley_like)]
filtered.groupby('names').births.sum()

Out[161]:

names
Leslee      1082
Lesley     35022
Lesli        929
Leslie    370429
Lesly      10067
Name: births, dtype: int64

In [163]:

table = filtered.pivot_table('births', rows = 'year', cols = 'sex', aggfunc = sum)
table = table.div(table.sum(1), axis = 0)
table.tail()

Out[163]:

sex	F	M
year
1880	0.091954	0.908046
1881	0.106796	0.893204
1882	0.065693	0.934307
1883	0.053030	0.946970
1884	0.107143	0.892857
1885	0.075758	0.924242
1886	0.055556	0.944444
1887	0.067416	0.932584
1888	0.116162	0.883838
1889	0.129213	0.870787
1890	0.099502	0.900498
1891	0.145833	0.854167
1892	0.096070	0.903930
1893	0.123223	0.876777
1894	0.138996	0.861004
1895	0.085603	0.914397
1896	0.102273	0.897727
1897	0.132812	0.867188
1898	0.092308	0.907692
1899	0.090452	0.909548
1900	0.095238	0.904762
1901	0.124464	0.875536
1902	0.128472	0.871528
1903	0.089552	0.910448
1904	0.109890	0.890110
1905	0.124113	0.875887
1906	0.099315	0.900685
1907	0.110749	0.889251
1908	0.123867	0.876133
1909	0.107034	0.892966
1910	0.137931	0.862069
1911	0.097765	0.902235
1912	0.066038	0.933962
1913	0.058427	0.941573
1914	0.059441	0.940559
1915	0.058824	0.941176
1916	0.069368	0.930632
1917	0.064286	0.935714
1918	0.071630	0.928370
1919	0.069049	0.930951
1920	0.053026	0.946974
1921	0.071106	0.928894
1922	0.070267	0.929733
1923	0.088252	0.911748
1924	0.078246	0.921754
1925	0.081382	0.918618
1926	0.086497	0.913503
1927	0.078541	0.921459
1928	0.068780	0.931220
1929	0.075166	0.924834
1930	0.096140	0.903860
1931	0.078234	0.921766
1932	0.073841	0.926159
1933	0.095423	0.904577
1934	0.090514	0.909486
1935	0.116968	0.883032
1936	0.125526	0.874474
1937	0.124736	0.875264
1938	0.123856	0.876144
1939	0.117841	0.882159
	...	...

131 rows × 2 columns

In [164]:

plt.close('all')

In [166]:

table.plot(style = {'M', 'k-', 'F': 'k--'})

  File "<ipython-input-166-e72db582d4c7>", line 1
    table.plot(style = {'M', 'k-', 'F': 'k--'})
                                      ^
SyntaxError: invalid syntax

In [ ]: