path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
open(path).readline()
import json
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
records = [json.loads(line) for line in open(path)]
records[0]
{u'a': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11', u'al': u'en-US,en;q=0.8', u'c': u'US', u'cy': u'Danvers', u'g': u'A6qOVH', u'gr': u'MA', u'h': u'wfLQtf', u'hc': 1331822918, u'hh': u'1.usa.gov', u'l': u'orofrog', u'll': [42.576698, -70.954903], u'nk': 1, u'r': u'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf', u't': 1331923247, u'tz': u'America/New_York', u'u': u'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}
records[0]['tz']
u'America/New_York'
print (records[0]['tz'])
America/New_York
timezones = [rec['tz'] for rec in records]
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-6-33fd9d9b9434> in <module>() ----> 1 timezones = [rec['tz'] for rec in records] KeyError: 'tz'
print (rec['tz'] for rec in records)
<generator object <genexpr> at 0x1098b13c0>
timezones = [rec['tz'] for rec in records if 'tz' in rec]
timezones[:10]
[u'America/New_York', u'America/Denver', u'America/New_York', u'America/Sao_Paulo', u'America/New_York', u'America/New_York', u'Europe/Warsaw', u'', u'', u'']
def get_counts(sequences):
count = {}
for i in sequences:
if x in count:
count[x] += 1
else:
count[x] = 1
return count
from collections import defaultdict
def get_counts(sequences):
count = defaultdict(int)
for x in count:
count[x] += 1
return count
counts = get_counts(timezones)
print counts
defaultdict(<type 'int'>, {'America/New_York': 0})
counts['America/New_York']
0
len(timezones)
3440
def top_counts(count_dict, n = 10):
value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
value_key_pairs.sort()
return value_key_pairs[-n: ]
top_counts(counts)
[(0, 'America/New_York')]
from collections import Counter
counts = Counter(timezones)
counts.most_common(10)
[(u'America/New_York', 1251), (u'', 521), (u'America/Chicago', 400), (u'America/Los_Angeles', 382), (u'America/Denver', 191), (u'Europe/London', 74), (u'Asia/Tokyo', 37), (u'Pacific/Honolulu', 36), (u'Europe/Madrid', 35), (u'America/Sao_Paulo', 33)]
% matplotlib inline
/usr/local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment. warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
plt.rc('figure', figsize = (10, 6))
np.set_printoptions(precision = 4)
import json
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
lines = open(path).readlines()
records = [json.loads(line) for line in lines]
from pandas import DataFrame, Series
import pandas as pd
frame = DataFrame(records)
frame.head()
_heartbeat_ | a | al | c | cy | g | gr | h | hc | hh | kw | l | ll | nk | r | t | tz | u | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | en-US,en;q=0.8 | US | Danvers | A6qOVH | MA | wfLQtf | 1331822918 | 1.usa.gov | NaN | orofrog | [42.576698, -70.954903] | 1 | http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... | 1331923247 | America/New_York | http://www.ncbi.nlm.nih.gov/pubmed/22415991 |
1 | NaN | GoogleMaps/RochesterNY | NaN | US | Provo | mwszkS | UT | mwszkS | 1308262393 | j.mp | NaN | bitly | [40.218102, -111.613297] | 0 | http://www.AwareMap.com/ | 1331923249 | America/Denver | http://www.monroecounty.gov/etc/911/rss.php |
2 | NaN | Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... | en-US | US | Washington | xxr3Qb | DC | xxr3Qb | 1331919941 | 1.usa.gov | NaN | bitly | [38.9007, -77.043098] | 1 | http://t.co/03elZC4Q | 1331923250 | America/New_York | http://boxer.senate.gov/en/press/releases/0316... |
3 | NaN | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... | pt-br | BR | Braz | zCaLwp | 27 | zUtuOu | 1331923068 | 1.usa.gov | NaN | alelex88 | [-23.549999, -46.616699] | 0 | direct | 1331923249 | America/Sao_Paulo | http://apod.nasa.gov/apod/ap120312.html |
4 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | en-US,en;q=0.8 | US | Shrewsbury | 9b6kNl | MA | 9b6kNl | 1273672411 | bit.ly | NaN | bitly | [42.286499, -71.714699] | 0 | http://www.shrewsbury-ma.gov/selco/ | 1331923251 | America/New_York | http://www.shrewsbury-ma.gov/egov/gallery/1341... |
frame['tz'][:10]
0 America/New_York 1 America/Denver 2 America/New_York 3 America/Sao_Paulo 4 America/New_York 5 America/New_York 6 Europe/Warsaw 7 8 9 Name: tz, dtype: object
tz_counts = frame['tz'].value_counts()
tz_counts[:10]
America/New_York 1251 521 America/Chicago 400 America/Los_Angeles 382 America/Denver 191 Europe/London 74 Asia/Tokyo 37 Pacific/Honolulu 36 Europe/Madrid 35 America/Sao_Paulo 33 Name: tz, dtype: int64
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10]
America/New_York 1251 Unknown 521 America/Chicago 400 America/Los_Angeles 382 America/Denver 191 Missing 120 Europe/London 74 Asia/Tokyo 37 Pacific/Honolulu 36 Europe/Madrid 35 Name: tz, dtype: int64
plt.figure(figsize = (10, 4))
<matplotlib.figure.Figure at 0x10aed97d0>
<matplotlib.figure.Figure at 0x10aed97d0>
tz_counts[:10].plot(kind = 'barh', rot = 0)
<matplotlib.axes._subplots.AxesSubplot at 0x10c09af10>
frame['a'][1]
u'GoogleMaps/RochesterNY'
frame['a'][50]
u'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
frame['a'][51]
u'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]
0 Mozilla/5.0 1 GoogleMaps/RochesterNY 2 Mozilla/4.0 3 Mozilla/5.0 4 Mozilla/5.0 dtype: object
results.value_counts()[:8]
Mozilla/5.0 2594 Mozilla/4.0 601 GoogleMaps/RochesterNY 121 Opera/9.80 34 TEST_INTERNET_AGENT 24 GoogleProducer 21 Mozilla/6.0 5 BlackBerry8520/5.0.0.681 4 dtype: int64
cframe = frame[frame.a.notnull()]
operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
operating_system[:5]
array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'], dtype='|S11')
by_tz_os = cframe.groupby(['tz', operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]
Not Windows | Windows | |
---|---|---|
tz | ||
245 | 276 | |
Africa/Cairo | 0 | 3 |
Africa/Casablanca | 0 | 1 |
Africa/Ceuta | 0 | 2 |
Africa/Johannesburg | 0 | 1 |
Africa/Lusaka | 0 | 1 |
America/Anchorage | 4 | 1 |
America/Argentina/Buenos_Aires | 1 | 0 |
America/Argentina/Cordoba | 0 | 1 |
America/Argentina/Mendoza | 0 | 1 |
10 rows × 2 columns
# use to sort ascending order
indexer = agg_counts.sum(1).argsort()
indexer[:10]
tz 24 Africa/Cairo 20 Africa/Casablanca 21 Africa/Ceuta 92 Africa/Johannesburg 87 Africa/Lusaka 53 America/Anchorage 54 America/Argentina/Buenos_Aires 57 America/Argentina/Cordoba 26 America/Argentina/Mendoza 55 dtype: int64
count_subset = agg_counts.take(indexer)[-10:]
count_subset
Not Windows | Windows | |
---|---|---|
tz | ||
America/Sao_Paulo | 13 | 20 |
Europe/Madrid | 16 | 19 |
Pacific/Honolulu | 0 | 36 |
Asia/Tokyo | 2 | 35 |
Europe/London | 43 | 31 |
America/Denver | 132 | 59 |
America/Los_Angeles | 130 | 252 |
America/Chicago | 115 | 285 |
245 | 276 | |
America/New_York | 339 | 912 |
10 rows × 2 columns
plt.figure()
<matplotlib.figure.Figure at 0x7fb885721e10>
<matplotlib.figure.Figure at 0x7fb885721e10>
count_subset.plot(kind = 'barh', stacked = True)
<matplotlib.axes.AxesSubplot at 0x7fb885407a90>
plt.figure()
<matplotlib.figure.Figure at 0x7fb885218f10>
<matplotlib.figure.Figure at 0x7fb885218f10>
normed_subset = count_subset.div(count_subset.sum(1), axis = 0)
normed_subset.plot(kind = 'barh', stacked = True)
<matplotlib.axes.AxesSubplot at 0x7fb8854b2790>
import pandas as pd
encoding = 'latin1'
upath = os.path.expanduser('ch02/movielens/users.dat')
rpath = os.path.expanduser('ch02/movielens/ratings.dat')
mpath = os.path.expanduser('ch02/movielens/movies.dat')
unames = ['user_id', 'gender', 'age', 'accupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']
users = pd.read_csv(upath, sep = '::', header = None, names = unames, encoding = encoding)
ratings = pd.read_csv(rpath, sep = '::', header = None, names = rnames, encoding = encoding)
movies = pd.read_csv(mpath, sep = '::', header = None, names = mnames, encoding = encoding)
users[:5]
user_id | gender | age | accupation | zip | |
---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 |
1 | 2 | M | 56 | 16 | 70072 |
2 | 3 | M | 25 | 15 | 55117 |
3 | 4 | M | 45 | 7 | 02460 |
4 | 5 | M | 25 | 20 | 55455 |
5 rows × 5 columns
ratings[:5]
user_id | movie_id | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 |
1 | 1 | 661 | 3 | 978302109 |
2 | 1 | 914 | 3 | 978301968 |
3 | 1 | 3408 | 4 | 978300275 |
4 | 1 | 2355 | 5 | 978824291 |
5 rows × 4 columns
movies[:5]
movie_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
1 | 2 | Jumanji (1995) | Adventure|Children's|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
5 rows × 3 columns
ratings
user_id | movie_id | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 |
1 | 1 | 661 | 3 | 978302109 |
2 | 1 | 914 | 3 | 978301968 |
3 | 1 | 3408 | 4 | 978300275 |
4 | 1 | 2355 | 5 | 978824291 |
5 | 1 | 1197 | 3 | 978302268 |
6 | 1 | 1287 | 5 | 978302039 |
7 | 1 | 2804 | 5 | 978300719 |
8 | 1 | 594 | 4 | 978302268 |
9 | 1 | 919 | 4 | 978301368 |
10 | 1 | 595 | 5 | 978824268 |
11 | 1 | 938 | 4 | 978301752 |
12 | 1 | 2398 | 4 | 978302281 |
13 | 1 | 2918 | 4 | 978302124 |
14 | 1 | 1035 | 5 | 978301753 |
15 | 1 | 2791 | 4 | 978302188 |
16 | 1 | 2687 | 3 | 978824268 |
17 | 1 | 2018 | 4 | 978301777 |
18 | 1 | 3105 | 5 | 978301713 |
19 | 1 | 2797 | 4 | 978302039 |
20 | 1 | 2321 | 3 | 978302205 |
21 | 1 | 720 | 3 | 978300760 |
22 | 1 | 1270 | 5 | 978300055 |
23 | 1 | 527 | 5 | 978824195 |
24 | 1 | 2340 | 3 | 978300103 |
25 | 1 | 48 | 5 | 978824351 |
26 | 1 | 1097 | 4 | 978301953 |
27 | 1 | 1721 | 4 | 978300055 |
28 | 1 | 1545 | 4 | 978824139 |
29 | 1 | 745 | 3 | 978824268 |
30 | 1 | 2294 | 4 | 978824291 |
31 | 1 | 3186 | 4 | 978300019 |
32 | 1 | 1566 | 4 | 978824330 |
33 | 1 | 588 | 4 | 978824268 |
34 | 1 | 1907 | 4 | 978824330 |
35 | 1 | 783 | 4 | 978824291 |
36 | 1 | 1836 | 5 | 978300172 |
37 | 1 | 1022 | 5 | 978300055 |
38 | 1 | 2762 | 4 | 978302091 |
39 | 1 | 150 | 5 | 978301777 |
40 | 1 | 1 | 5 | 978824268 |
41 | 1 | 1961 | 5 | 978301590 |
42 | 1 | 1962 | 4 | 978301753 |
43 | 1 | 2692 | 4 | 978301570 |
44 | 1 | 260 | 4 | 978300760 |
45 | 1 | 1028 | 5 | 978301777 |
46 | 1 | 1029 | 5 | 978302205 |
47 | 1 | 1207 | 4 | 978300719 |
48 | 1 | 2028 | 5 | 978301619 |
49 | 1 | 531 | 4 | 978302149 |
50 | 1 | 3114 | 4 | 978302174 |
51 | 1 | 608 | 4 | 978301398 |
52 | 1 | 1246 | 4 | 978302091 |
53 | 2 | 1357 | 5 | 978298709 |
54 | 2 | 3068 | 4 | 978299000 |
55 | 2 | 1537 | 4 | 978299620 |
56 | 2 | 647 | 3 | 978299351 |
57 | 2 | 2194 | 4 | 978299297 |
58 | 2 | 648 | 4 | 978299913 |
59 | 2 | 2268 | 5 | 978299297 |
... | ... | ... | ... |
1000209 rows × 4 columns
data = pd.merge(pd.merge(ratings, users), movies)
data
user_id | movie_id | rating | timestamp | gender | age | accupation | zip | title | genres | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 | F | 1 | 10 | 48067 | One Flew Over the Cuckoo's Nest (1975) | Drama |
1 | 2 | 1193 | 5 | 978298413 | M | 56 | 16 | 70072 | One Flew Over the Cuckoo's Nest (1975) | Drama |
2 | 12 | 1193 | 4 | 978220179 | M | 25 | 12 | 32793 | One Flew Over the Cuckoo's Nest (1975) | Drama |
3 | 15 | 1193 | 4 | 978199279 | M | 25 | 7 | 22903 | One Flew Over the Cuckoo's Nest (1975) | Drama |
4 | 17 | 1193 | 5 | 978158471 | M | 50 | 1 | 95350 | One Flew Over the Cuckoo's Nest (1975) | Drama |
5 | 18 | 1193 | 4 | 978156168 | F | 18 | 3 | 95825 | One Flew Over the Cuckoo's Nest (1975) | Drama |
6 | 19 | 1193 | 5 | 982730936 | M | 1 | 10 | 48073 | One Flew Over the Cuckoo's Nest (1975) | Drama |
7 | 24 | 1193 | 5 | 978136709 | F | 25 | 7 | 10023 | One Flew Over the Cuckoo's Nest (1975) | Drama |
8 | 28 | 1193 | 3 | 978125194 | F | 25 | 1 | 14607 | One Flew Over the Cuckoo's Nest (1975) | Drama |
9 | 33 | 1193 | 5 | 978557765 | M | 45 | 3 | 55421 | One Flew Over the Cuckoo's Nest (1975) | Drama |
10 | 39 | 1193 | 5 | 978043535 | M | 18 | 4 | 61820 | One Flew Over the Cuckoo's Nest (1975) | Drama |
11 | 42 | 1193 | 3 | 978038981 | M | 25 | 8 | 24502 | One Flew Over the Cuckoo's Nest (1975) | Drama |
12 | 44 | 1193 | 4 | 978018995 | M | 45 | 17 | 98052 | One Flew Over the Cuckoo's Nest (1975) | Drama |
13 | 47 | 1193 | 4 | 977978345 | M | 18 | 4 | 94305 | One Flew Over the Cuckoo's Nest (1975) | Drama |
14 | 48 | 1193 | 4 | 977975061 | M | 25 | 4 | 92107 | One Flew Over the Cuckoo's Nest (1975) | Drama |
15 | 49 | 1193 | 4 | 978813972 | M | 18 | 12 | 77084 | One Flew Over the Cuckoo's Nest (1975) | Drama |
16 | 53 | 1193 | 5 | 977946400 | M | 25 | 0 | 96931 | One Flew Over the Cuckoo's Nest (1975) | Drama |
17 | 54 | 1193 | 5 | 977944039 | M | 50 | 1 | 56723 | One Flew Over the Cuckoo's Nest (1975) | Drama |
18 | 58 | 1193 | 5 | 977933866 | M | 25 | 2 | 30303 | One Flew Over the Cuckoo's Nest (1975) | Drama |
19 | 59 | 1193 | 4 | 977934292 | F | 50 | 1 | 55413 | One Flew Over the Cuckoo's Nest (1975) | Drama |
20 | 62 | 1193 | 4 | 977968584 | F | 35 | 3 | 98105 | One Flew Over the Cuckoo's Nest (1975) | Drama |
21 | 80 | 1193 | 4 | 977786172 | M | 56 | 1 | 49327 | One Flew Over the Cuckoo's Nest (1975) | Drama |
22 | 81 | 1193 | 5 | 977785864 | F | 25 | 0 | 60640 | One Flew Over the Cuckoo's Nest (1975) | Drama |
23 | 88 | 1193 | 5 | 977694161 | F | 45 | 1 | 02476 | One Flew Over the Cuckoo's Nest (1975) | Drama |
24 | 89 | 1193 | 5 | 977683596 | F | 56 | 9 | 85749 | One Flew Over the Cuckoo's Nest (1975) | Drama |
25 | 95 | 1193 | 5 | 977626632 | M | 45 | 0 | 98201 | One Flew Over the Cuckoo's Nest (1975) | Drama |
26 | 96 | 1193 | 3 | 977621789 | F | 25 | 16 | 78028 | One Flew Over the Cuckoo's Nest (1975) | Drama |
27 | 99 | 1193 | 2 | 982791053 | F | 1 | 10 | 19390 | One Flew Over the Cuckoo's Nest (1975) | Drama |
28 | 102 | 1193 | 5 | 1040737607 | M | 35 | 19 | 20871 | One Flew Over the Cuckoo's Nest (1975) | Drama |
29 | 104 | 1193 | 2 | 977546620 | M | 25 | 12 | 00926 | One Flew Over the Cuckoo's Nest (1975) | Drama |
30 | 107 | 1193 | 5 | 977533908 | M | 45 | 18 | 63129 | One Flew Over the Cuckoo's Nest (1975) | Drama |
31 | 109 | 1193 | 2 | 977516253 | M | 45 | 15 | 92028 | One Flew Over the Cuckoo's Nest (1975) | Drama |
32 | 120 | 1193 | 5 | 977458103 | M | 25 | 11 | 27106 | One Flew Over the Cuckoo's Nest (1975) | Drama |
33 | 123 | 1193 | 5 | 977448350 | M | 35 | 9 | 67208 | One Flew Over the Cuckoo's Nest (1975) | Drama |
34 | 131 | 1193 | 5 | 977431822 | M | 18 | 4 | 06520 | One Flew Over the Cuckoo's Nest (1975) | Drama |
35 | 136 | 1193 | 4 | 977421091 | M | 18 | 2 | 21202 | One Flew Over the Cuckoo's Nest (1975) | Drama |
36 | 139 | 1193 | 4 | 977359402 | F | 25 | 20 | 45409 | One Flew Over the Cuckoo's Nest (1975) | Drama |
37 | 146 | 1193 | 4 | 979940868 | F | 35 | 20 | 10954 | One Flew Over the Cuckoo's Nest (1975) | Drama |
38 | 150 | 1193 | 5 | 978163667 | M | 35 | 7 | 98144 | One Flew Over the Cuckoo's Nest (1975) | Drama |
39 | 151 | 1193 | 4 | 993121122 | F | 25 | 20 | 85013 | One Flew Over the Cuckoo's Nest (1975) | Drama |
40 | 155 | 1193 | 5 | 977253254 | M | 35 | 12 | 07470 | One Flew Over the Cuckoo's Nest (1975) | Drama |
41 | 166 | 1193 | 5 | 977299851 | M | 18 | 4 | 92802 | One Flew Over the Cuckoo's Nest (1975) | Drama |
42 | 169 | 1193 | 5 | 977197066 | M | 25 | 7 | 55439 | One Flew Over the Cuckoo's Nest (1975) | Drama |
43 | 175 | 1193 | 5 | 978051929 | F | 25 | 2 | 95123 | One Flew Over the Cuckoo's Nest (1975) | Drama |
44 | 178 | 1193 | 4 | 977098352 | M | 56 | 17 | 53705 | One Flew Over the Cuckoo's Nest (1975) | Drama |
45 | 180 | 1193 | 5 | 977089623 | M | 45 | 12 | 01603 | One Flew Over the Cuckoo's Nest (1975) | Drama |
46 | 190 | 1193 | 5 | 977026946 | M | 25 | 17 | 55125 | One Flew Over the Cuckoo's Nest (1975) | Drama |
47 | 191 | 1193 | 3 | 977026629 | M | 18 | 4 | 04915 | One Flew Over the Cuckoo's Nest (1975) | Drama |
48 | 192 | 1193 | 4 | 977089850 | M | 18 | 1 | 10977 | One Flew Over the Cuckoo's Nest (1975) | Drama |
49 | 195 | 1193 | 5 | 977064554 | M | 25 | 12 | 10458 | One Flew Over the Cuckoo's Nest (1975) | Drama |
50 | 196 | 1193 | 4 | 977004875 | F | 35 | 9 | 94587 | One Flew Over the Cuckoo's Nest (1975) | Drama |
51 | 202 | 1193 | 4 | 976937045 | M | 18 | 4 | 53706 | One Flew Over the Cuckoo's Nest (1975) | Drama |
52 | 206 | 1193 | 1 | 976921007 | M | 25 | 17 | 20194 | One Flew Over the Cuckoo's Nest (1975) | Drama |
53 | 208 | 1193 | 5 | 984510357 | M | 35 | 17 | 55432 | One Flew Over the Cuckoo's Nest (1975) | Drama |
54 | 216 | 1193 | 4 | 976868526 | M | 45 | 13 | 52761 | One Flew Over the Cuckoo's Nest (1975) | Drama |
55 | 224 | 1193 | 4 | 976833559 | F | 18 | 4 | 14850 | One Flew Over the Cuckoo's Nest (1975) | Drama |
56 | 225 | 1193 | 4 | 976832915 | M | 25 | 7 | 11215 | One Flew Over the Cuckoo's Nest (1975) | Drama |
57 | 227 | 1193 | 4 | 976829587 | M | 35 | 20 | 90291 | One Flew Over the Cuckoo's Nest (1975) | Drama |
58 | 230 | 1193 | 4 | 976824488 | M | 45 | 1 | 43210 | One Flew Over the Cuckoo's Nest (1975) | Drama |
59 | 231 | 1193 | 4 | 997057648 | M | 25 | 3 | 55455 | One Flew Over the Cuckoo's Nest (1975) | Drama |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1000209 rows × 10 columns
data.ix[0]
user_id 1 movie_id 1193 rating 5 timestamp 978300760 gender F age 1 accupation 10 zip 48067 title One Flew Over the Cuckoo's Nest (1975) genres Drama Name: 0, dtype: object
mean_rating = data.pivot_table('rating', rows = 'title', cols = 'gender', aggfunc = 'mean')
mean_rating[:10]
gender | F | M |
---|---|---|
title | ||
$1,000,000 Duck (1971) | 3.375000 | 2.761905 |
'Night Mother (1986) | 3.388889 | 3.352941 |
'Til There Was You (1997) | 2.675676 | 2.733333 |
'burbs, The (1989) | 2.793478 | 2.962085 |
...And Justice for All (1979) | 3.828571 | 3.689024 |
1-900 (1994) | 2.000000 | 3.000000 |
10 Things I Hate About You (1999) | 3.646552 | 3.311966 |
101 Dalmatians (1961) | 3.791444 | 3.500000 |
101 Dalmatians (1996) | 3.240000 | 2.911215 |
12 Angry Men (1957) | 4.184397 | 4.328421 |
10 rows × 2 columns
rating_by_title = data.groupby('title').size()
rating_by_title[0:10]
title $1,000,000 Duck (1971) 37 'Night Mother (1986) 70 'Til There Was You (1997) 52 'burbs, The (1989) 303 ...And Justice for All (1979) 199 1-900 (1994) 2 10 Things I Hate About You (1999) 700 101 Dalmatians (1961) 565 101 Dalmatians (1996) 364 12 Angry Men (1957) 616 dtype: int64
active_titles = rating_by_title.index[rating_by_title >= 250]
active_titles[:10]
Index([u''burbs, The (1989)', u'10 Things I Hate About You (1999)', u'101 Dalmatians (1961)', u'101 Dalmatians (1996)', u'12 Angry Men (1957)', u'13th Warrior, The (1999)', u'2 Days in the Valley (1996)', u'20,000 Leagues Under the Sea (1954)', u'2001: A Space Odyssey (1968)', u'2010 (1984)'], dtype='object')
mean_rating = mean_rating.ix[active_titles]
mean_rating
gender | F | M |
---|---|---|
title | ||
'burbs, The (1989) | 2.793478 | 2.962085 |
10 Things I Hate About You (1999) | 3.646552 | 3.311966 |
101 Dalmatians (1961) | 3.791444 | 3.500000 |
101 Dalmatians (1996) | 3.240000 | 2.911215 |
12 Angry Men (1957) | 4.184397 | 4.328421 |
13th Warrior, The (1999) | 3.112000 | 3.168000 |
2 Days in the Valley (1996) | 3.488889 | 3.244813 |
20,000 Leagues Under the Sea (1954) | 3.670103 | 3.709205 |
2001: A Space Odyssey (1968) | 3.825581 | 4.129738 |
2010 (1984) | 3.446809 | 3.413712 |
28 Days (2000) | 3.209424 | 2.977707 |
39 Steps, The (1935) | 3.965517 | 4.107692 |
54 (1998) | 2.701754 | 2.782178 |
7th Voyage of Sinbad, The (1958) | 3.409091 | 3.658879 |
8MM (1999) | 2.906250 | 2.850962 |
About Last Night... (1986) | 3.188679 | 3.140909 |
Absent Minded Professor, The (1961) | 3.469388 | 3.446809 |
Absolute Power (1997) | 3.469136 | 3.327759 |
Abyss, The (1989) | 3.659236 | 3.689507 |
Ace Ventura: Pet Detective (1994) | 3.000000 | 3.197917 |
Ace Ventura: When Nature Calls (1995) | 2.269663 | 2.543333 |
Addams Family Values (1993) | 3.000000 | 2.878531 |
Addams Family, The (1991) | 3.186170 | 3.163498 |
Adventures in Babysitting (1987) | 3.455782 | 3.208122 |
Adventures of Buckaroo Bonzai Across the 8th Dimension, The (1984) | 3.308511 | 3.402321 |
Adventures of Priscilla, Queen of the Desert, The (1994) | 3.989071 | 3.688811 |
Adventures of Robin Hood, The (1938) | 4.166667 | 3.918367 |
African Queen, The (1951) | 4.324232 | 4.223822 |
Age of Innocence, The (1993) | 3.827068 | 3.339506 |
Agnes of God (1985) | 3.534884 | 3.244898 |
Air America (1990) | 2.823529 | 2.741176 |
Air Force One (1997) | 3.699588 | 3.555822 |
Airplane II: The Sequel (1982) | 2.906250 | 2.995671 |
Airplane! (1980) | 3.656566 | 4.064419 |
Akira (1988) | 3.511111 | 3.980344 |
Aladdin (1992) | 3.857143 | 3.756494 |
Alice in Wonderland (1951) | 3.705882 | 3.692308 |
Alien (1979) | 3.888252 | 4.216119 |
Alien Nation (1988) | 3.433333 | 3.195946 |
Alien: Resurrection (1997) | 2.708738 | 2.997041 |
Aliens (1986) | 3.802083 | 4.186684 |
Alien� (1992) | 3.008264 | 3.042289 |
Alive (1993) | 3.469388 | 3.376866 |
All About Eve (1950) | 4.363057 | 4.186992 |
All About My Mother (Todo Sobre Mi Madre) (1999) | 4.333333 | 3.944000 |
All Quiet on the Western Front (1930) | 3.957447 | 4.244344 |
All That Jazz (1979) | 4.061856 | 3.718182 |
Almost Famous (2000) | 4.220217 | 4.228731 |
Amadeus (1984) | 4.346734 | 4.213415 |
American Beauty (1999) | 4.238901 | 4.347301 |
American Gigolo (1980) | 3.337500 | 3.106977 |
American Graffiti (1973) | 3.940476 | 4.055556 |
American History X (1998) | 4.181818 | 4.240741 |
American Movie (1999) | 4.028986 | 4.008850 |
American Pie (1999) | 3.539792 | 3.754545 |
American President, The (1995) | 3.923483 | 3.718654 |
American Psycho (2000) | 3.085271 | 3.253493 |
American Tail, An (1986) | 3.649254 | 3.318519 |
American Werewolf in London, An (1981) | 3.548077 | 3.800000 |
American Werewolf in Paris, An (1997) | 2.814815 | 2.739336 |
... | ... |
1216 rows × 2 columns
mean_rating = mean_rating.rename(index = {'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)':
'Seven Samurai (Shichinin no samurai)(1954)'})
top_female_rating = mean_rating.sort_index(by = 'F', ascending = False)
top_female_rating[:10]
gender | F | M |
---|---|---|
title | ||
Close Shave, A (1995) | 4.644444 | 4.473795 |
Wrong Trousers, The (1993) | 4.588235 | 4.478261 |
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) | 4.572650 | 4.464589 |
Wallace & Gromit: The Best of Aardman Animation (1996) | 4.563107 | 4.385075 |
Schindler's List (1993) | 4.562602 | 4.491415 |
Shawshank Redemption, The (1994) | 4.539075 | 4.560625 |
Grand Day Out, A (1992) | 4.537879 | 4.293255 |
To Kill a Mockingbird (1962) | 4.536667 | 4.372611 |
Creature Comforts (1990) | 4.513889 | 4.272277 |
Usual Suspects, The (1995) | 4.513317 | 4.518248 |
10 rows × 2 columns
mean_rating['diff'] = mean_rating['M'] - mean_rating['F']
sorted_by_diff = mean_rating.sort_index(by = 'diff')
sorted_by_diff[:15]
gender | F | M | diff |
---|---|---|---|
title | |||
Dirty Dancing (1987) | 3.790378 | 2.959596 | -0.830782 |
Jumpin' Jack Flash (1986) | 3.254717 | 2.578358 | -0.676359 |
Grease (1978) | 3.975265 | 3.367041 | -0.608224 |
Little Women (1994) | 3.870588 | 3.321739 | -0.548849 |
Steel Magnolias (1989) | 3.901734 | 3.365957 | -0.535777 |
Anastasia (1997) | 3.800000 | 3.281609 | -0.518391 |
Rocky Horror Picture Show, The (1975) | 3.673016 | 3.160131 | -0.512885 |
Color Purple, The (1985) | 4.158192 | 3.659341 | -0.498851 |
Age of Innocence, The (1993) | 3.827068 | 3.339506 | -0.487561 |
Free Willy (1993) | 2.921348 | 2.438776 | -0.482573 |
French Kiss (1995) | 3.535714 | 3.056962 | -0.478752 |
Little Shop of Horrors, The (1960) | 3.650000 | 3.179688 | -0.470312 |
Guys and Dolls (1955) | 4.051724 | 3.583333 | -0.468391 |
Mary Poppins (1964) | 4.197740 | 3.730594 | -0.467147 |
Patch Adams (1998) | 3.473282 | 3.008746 | -0.464536 |
15 rows × 3 columns
# Reverse order of rows, taking first 15 rows
sorted_by_diff[::-1][:15]
gender | F | M | diff |
---|---|---|---|
title | |||
Good, The Bad and The Ugly, The (1966) | 3.494949 | 4.221300 | 0.726351 |
Kentucky Fried Movie, The (1977) | 2.878788 | 3.555147 | 0.676359 |
Dumb & Dumber (1994) | 2.697987 | 3.336595 | 0.638608 |
Longest Day, The (1962) | 3.411765 | 4.031447 | 0.619682 |
Cable Guy, The (1996) | 2.250000 | 2.863787 | 0.613787 |
Evil Dead II (Dead By Dawn) (1987) | 3.297297 | 3.909283 | 0.611985 |
Hidden, The (1987) | 3.137931 | 3.745098 | 0.607167 |
Rocky III (1982) | 2.361702 | 2.943503 | 0.581801 |
Caddyshack (1980) | 3.396135 | 3.969737 | 0.573602 |
For a Few Dollars More (1965) | 3.409091 | 3.953795 | 0.544704 |
Porky's (1981) | 2.296875 | 2.836364 | 0.539489 |
Animal House (1978) | 3.628906 | 4.167192 | 0.538286 |
Exorcist, The (1973) | 3.537634 | 4.067239 | 0.529605 |
Fright Night (1985) | 2.973684 | 3.500000 | 0.526316 |
Barb Wire (1996) | 1.585366 | 2.100386 | 0.515020 |
15 rows × 3 columns
# Standard deviation of rating grouped by title
rating_sd_by_title = data.groupby('title')['rating'].std()
# Filter down to active titles
rating_sd_by_title = rating_sd_by_title.ix[active_titles]
# Order series by values in ascending order
rating_sd_by_title.order(ascending=False)[:10]
title Dumb & Dumber (1994) 1.321333 Blair Witch Project, The (1999) 1.316368 Natural Born Killers (1994) 1.307198 Tank Girl (1995) 1.277695 Rocky Horror Picture Show, The (1975) 1.260177 Eyes Wide Shut (1999) 1.259624 Evita (1996) 1.253631 Billy Madison (1995) 1.249970 Fear and Loathing in Las Vegas (1998) 1.246408 Bicentennial Man (1999) 1.245533 Name: rating, dtype: float64
from __future__ import division
from numpy.random import randn
import numpy as np
import matplotlib.pylab as plt
plt.rc('figure', figsize = (12, 5))
np.set_printoptions(precision = 4)
import pandas as pd
names1880 = pd.read_csv('ch02/names/yob1880.txt', names = ['names', 'sex', 'births'])
names1880
names | sex | births | |
---|---|---|---|
0 | Mary | F | 7065 |
1 | Anna | F | 2604 |
2 | Emma | F | 2003 |
3 | Elizabeth | F | 1939 |
4 | Minnie | F | 1746 |
5 | Margaret | F | 1578 |
6 | Ida | F | 1472 |
7 | Alice | F | 1414 |
8 | Bertha | F | 1320 |
9 | Sarah | F | 1288 |
10 | Annie | F | 1258 |
11 | Clara | F | 1226 |
12 | Ella | F | 1156 |
13 | Florence | F | 1063 |
14 | Cora | F | 1045 |
15 | Martha | F | 1040 |
16 | Laura | F | 1012 |
17 | Nellie | F | 995 |
18 | Grace | F | 982 |
19 | Carrie | F | 949 |
20 | Maude | F | 858 |
21 | Mabel | F | 808 |
22 | Bessie | F | 794 |
23 | Jennie | F | 793 |
24 | Gertrude | F | 787 |
25 | Julia | F | 783 |
26 | Hattie | F | 769 |
27 | Edith | F | 768 |
28 | Mattie | F | 704 |
29 | Rose | F | 700 |
30 | Catherine | F | 688 |
31 | Lillian | F | 672 |
32 | Ada | F | 652 |
33 | Lillie | F | 647 |
34 | Helen | F | 636 |
35 | Jessie | F | 635 |
36 | Louise | F | 635 |
37 | Ethel | F | 633 |
38 | Lula | F | 621 |
39 | Myrtle | F | 615 |
40 | Eva | F | 614 |
41 | Frances | F | 605 |
42 | Lena | F | 603 |
43 | Lucy | F | 591 |
44 | Edna | F | 588 |
45 | Maggie | F | 582 |
46 | Pearl | F | 569 |
47 | Daisy | F | 564 |
48 | Fannie | F | 560 |
49 | Josephine | F | 544 |
50 | Dora | F | 524 |
51 | Rosa | F | 507 |
52 | Katherine | F | 502 |
53 | Agnes | F | 473 |
54 | Marie | F | 471 |
55 | Nora | F | 471 |
56 | May | F | 462 |
57 | Mamie | F | 436 |
58 | Blanche | F | 427 |
59 | Stella | F | 414 |
... | ... | ... |
2000 rows × 3 columns
names1880.groupby('sex').births.sum()
sex F 90993 M 110493 Name: births, dtype: int64
# year 2010 is the last year available right now
years = range(1880, 2011)
pieces = []
columns = ['names', 'sex', 'births']
for year in years:
path = 'ch02/names/yob%d.txt' %year
frame = pd.read_csv(path, names = columns)
frame['year'] = year
pieces.append(frame)
# concatenate them into a single dataframe
names = pd.concat(pieces, ignore_index = True)
# names = pd.read_pickle('ch02/all_names')
--------------------------------------------------------------------------- IOError Traceback (most recent call last) <ipython-input-112-af1154dd18f9> in <module>() ----> 1 names = pd.read_pickle('ch02/all_names') /usr/lib/python2.7/dist-packages/pandas/io/pickle.pyc in read_pickle(path) 47 48 try: ---> 49 return try_read(path) 50 except: 51 if PY3: /usr/lib/python2.7/dist-packages/pandas/io/pickle.pyc in try_read(path, encoding) 43 return pc.load(fh, encoding=encoding, compat=False) 44 except: ---> 45 with open(path, 'rb') as fh: 46 return pc.load(fh, encoding=encoding, compat=True) 47 IOError: [Errno 2] No such file or directory: 'ch02/all_names'
total_births = names.pivot_table('births', rows = 'year', cols = 'sex', aggfunc = sum)
total_births.tail()
sex | F | M |
---|---|---|
year | ||
2006 | 1896468 | 2050234 |
2007 | 1916888 | 2069242 |
2008 | 1883645 | 2032310 |
2009 | 1827643 | 1973359 |
2010 | 1759010 | 1898382 |
5 rows × 2 columns
total_births.plot(title = 'Total birth by sex and year')
<matplotlib.axes.AxesSubplot at 0x7fb871688890>
def add_prop(group):
# integers division floors
births = group.births.astype(float)
group['prop'] = births/births.sum()
return group
names = names.groupby(['year', 'sex']).apply(add_prop)
names
names | sex | births | year | prop | |
---|---|---|---|---|---|
0 | Mary | F | 7065 | 1880 | 0.077643 |
1 | Anna | F | 2604 | 1880 | 0.028618 |
2 | Emma | F | 2003 | 1880 | 0.022013 |
3 | Elizabeth | F | 1939 | 1880 | 0.021309 |
4 | Minnie | F | 1746 | 1880 | 0.019188 |
5 | Margaret | F | 1578 | 1880 | 0.017342 |
6 | Ida | F | 1472 | 1880 | 0.016177 |
7 | Alice | F | 1414 | 1880 | 0.015540 |
8 | Bertha | F | 1320 | 1880 | 0.014507 |
9 | Sarah | F | 1288 | 1880 | 0.014155 |
10 | Annie | F | 1258 | 1880 | 0.013825 |
11 | Clara | F | 1226 | 1880 | 0.013474 |
12 | Ella | F | 1156 | 1880 | 0.012704 |
13 | Florence | F | 1063 | 1880 | 0.011682 |
14 | Cora | F | 1045 | 1880 | 0.011484 |
15 | Martha | F | 1040 | 1880 | 0.011429 |
16 | Laura | F | 1012 | 1880 | 0.011122 |
17 | Nellie | F | 995 | 1880 | 0.010935 |
18 | Grace | F | 982 | 1880 | 0.010792 |
19 | Carrie | F | 949 | 1880 | 0.010429 |
20 | Maude | F | 858 | 1880 | 0.009429 |
21 | Mabel | F | 808 | 1880 | 0.008880 |
22 | Bessie | F | 794 | 1880 | 0.008726 |
23 | Jennie | F | 793 | 1880 | 0.008715 |
24 | Gertrude | F | 787 | 1880 | 0.008649 |
25 | Julia | F | 783 | 1880 | 0.008605 |
26 | Hattie | F | 769 | 1880 | 0.008451 |
27 | Edith | F | 768 | 1880 | 0.008440 |
28 | Mattie | F | 704 | 1880 | 0.007737 |
29 | Rose | F | 700 | 1880 | 0.007693 |
30 | Catherine | F | 688 | 1880 | 0.007561 |
31 | Lillian | F | 672 | 1880 | 0.007385 |
32 | Ada | F | 652 | 1880 | 0.007165 |
33 | Lillie | F | 647 | 1880 | 0.007110 |
34 | Helen | F | 636 | 1880 | 0.006990 |
35 | Jessie | F | 635 | 1880 | 0.006979 |
36 | Louise | F | 635 | 1880 | 0.006979 |
37 | Ethel | F | 633 | 1880 | 0.006957 |
38 | Lula | F | 621 | 1880 | 0.006825 |
39 | Myrtle | F | 615 | 1880 | 0.006759 |
40 | Eva | F | 614 | 1880 | 0.006748 |
41 | Frances | F | 605 | 1880 | 0.006649 |
42 | Lena | F | 603 | 1880 | 0.006627 |
43 | Lucy | F | 591 | 1880 | 0.006495 |
44 | Edna | F | 588 | 1880 | 0.006462 |
45 | Maggie | F | 582 | 1880 | 0.006396 |
46 | Pearl | F | 569 | 1880 | 0.006253 |
47 | Daisy | F | 564 | 1880 | 0.006198 |
48 | Fannie | F | 560 | 1880 | 0.006154 |
49 | Josephine | F | 544 | 1880 | 0.005978 |
50 | Dora | F | 524 | 1880 | 0.005759 |
51 | Rosa | F | 507 | 1880 | 0.005572 |
52 | Katherine | F | 502 | 1880 | 0.005517 |
53 | Agnes | F | 473 | 1880 | 0.005198 |
54 | Marie | F | 471 | 1880 | 0.005176 |
55 | Nora | F | 471 | 1880 | 0.005176 |
56 | May | F | 462 | 1880 | 0.005077 |
57 | Mamie | F | 436 | 1880 | 0.004792 |
58 | Blanche | F | 427 | 1880 | 0.004693 |
59 | Stella | F | 414 | 1880 | 0.004550 |
... | ... | ... | ... | ... |
1690784 rows × 5 columns
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)
True
def get_top1000(group):
return group.sort_index(by = 'births', ascending = False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
pieces = []
for year, group in names.groupby(['year', 'sex']):
pieces.append(group.sort_index(by = 'births', ascending = False)[:1000])
top1000 = pd.concat(pieces, ignore_index = True)
top1000.index = np.arange(len(top1000))
top1000
names | sex | births | year | prop | |
---|---|---|---|---|---|
0 | Mary | F | 7065 | 1880 | 0.077643 |
1 | Anna | F | 2604 | 1880 | 0.028618 |
2 | Emma | F | 2003 | 1880 | 0.022013 |
3 | Elizabeth | F | 1939 | 1880 | 0.021309 |
4 | Minnie | F | 1746 | 1880 | 0.019188 |
5 | Margaret | F | 1578 | 1880 | 0.017342 |
6 | Ida | F | 1472 | 1880 | 0.016177 |
7 | Alice | F | 1414 | 1880 | 0.015540 |
8 | Bertha | F | 1320 | 1880 | 0.014507 |
9 | Sarah | F | 1288 | 1880 | 0.014155 |
10 | Annie | F | 1258 | 1880 | 0.013825 |
11 | Clara | F | 1226 | 1880 | 0.013474 |
12 | Ella | F | 1156 | 1880 | 0.012704 |
13 | Florence | F | 1063 | 1880 | 0.011682 |
14 | Cora | F | 1045 | 1880 | 0.011484 |
15 | Martha | F | 1040 | 1880 | 0.011429 |
16 | Laura | F | 1012 | 1880 | 0.011122 |
17 | Nellie | F | 995 | 1880 | 0.010935 |
18 | Grace | F | 982 | 1880 | 0.010792 |
19 | Carrie | F | 949 | 1880 | 0.010429 |
20 | Maude | F | 858 | 1880 | 0.009429 |
21 | Mabel | F | 808 | 1880 | 0.008880 |
22 | Bessie | F | 794 | 1880 | 0.008726 |
23 | Jennie | F | 793 | 1880 | 0.008715 |
24 | Gertrude | F | 787 | 1880 | 0.008649 |
25 | Julia | F | 783 | 1880 | 0.008605 |
26 | Hattie | F | 769 | 1880 | 0.008451 |
27 | Edith | F | 768 | 1880 | 0.008440 |
28 | Mattie | F | 704 | 1880 | 0.007737 |
29 | Rose | F | 700 | 1880 | 0.007693 |
30 | Catherine | F | 688 | 1880 | 0.007561 |
31 | Lillian | F | 672 | 1880 | 0.007385 |
32 | Ada | F | 652 | 1880 | 0.007165 |
33 | Lillie | F | 647 | 1880 | 0.007110 |
34 | Helen | F | 636 | 1880 | 0.006990 |
35 | Jessie | F | 635 | 1880 | 0.006979 |
36 | Louise | F | 635 | 1880 | 0.006979 |
37 | Ethel | F | 633 | 1880 | 0.006957 |
38 | Lula | F | 621 | 1880 | 0.006825 |
39 | Myrtle | F | 615 | 1880 | 0.006759 |
40 | Eva | F | 614 | 1880 | 0.006748 |
41 | Frances | F | 605 | 1880 | 0.006649 |
42 | Lena | F | 603 | 1880 | 0.006627 |
43 | Lucy | F | 591 | 1880 | 0.006495 |
44 | Edna | F | 588 | 1880 | 0.006462 |
45 | Maggie | F | 582 | 1880 | 0.006396 |
46 | Pearl | F | 569 | 1880 | 0.006253 |
47 | Daisy | F | 564 | 1880 | 0.006198 |
48 | Fannie | F | 560 | 1880 | 0.006154 |
49 | Josephine | F | 544 | 1880 | 0.005978 |
50 | Dora | F | 524 | 1880 | 0.005759 |
51 | Rosa | F | 507 | 1880 | 0.005572 |
52 | Katherine | F | 502 | 1880 | 0.005517 |
53 | Agnes | F | 473 | 1880 | 0.005198 |
54 | Marie | F | 471 | 1880 | 0.005176 |
55 | Nora | F | 471 | 1880 | 0.005176 |
56 | May | F | 462 | 1880 | 0.005077 |
57 | Mamie | F | 436 | 1880 | 0.004792 |
58 | Blanche | F | 427 | 1880 | 0.004693 |
59 | Stella | F | 414 | 1880 | 0.004550 |
... | ... | ... | ... | ... |
261877 rows × 5 columns
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']
total_births = top1000.pivot_table('births', rows = 'year', cols = 'names', aggfunc = sum)
total_births
names | Aaden | Aaliyah | Aarav | Aaron | Aarush | Ab | Abagail | Abb | Abbey | Abbie | Abbigail | Abbott | Abby | Abdiel | Abdul | Abdullah | Abe | Abel | Abelardo | Abigail | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
year | |||||||||||||||||||||
1880 | NaN | NaN | NaN | 102 | NaN | NaN | NaN | NaN | NaN | 71 | NaN | NaN | 6 | NaN | NaN | NaN | 50 | 9 | NaN | 12 | ... |
1881 | NaN | NaN | NaN | 94 | NaN | NaN | NaN | NaN | NaN | 81 | NaN | NaN | 7 | NaN | NaN | NaN | 36 | 12 | NaN | 8 | ... |
1882 | NaN | NaN | NaN | 85 | NaN | NaN | NaN | NaN | NaN | 80 | NaN | NaN | 11 | NaN | NaN | NaN | 50 | 10 | NaN | 14 | ... |
1883 | NaN | NaN | NaN | 105 | NaN | NaN | NaN | NaN | NaN | 79 | NaN | NaN | NaN | NaN | NaN | NaN | 43 | 12 | NaN | 11 | ... |
1884 | NaN | NaN | NaN | 97 | NaN | NaN | NaN | NaN | NaN | 98 | NaN | NaN | 6 | NaN | NaN | NaN | 45 | 14 | NaN | 13 | ... |
1885 | NaN | NaN | NaN | 88 | NaN | 6 | NaN | NaN | NaN | 88 | NaN | NaN | NaN | NaN | NaN | NaN | 47 | 6 | NaN | 9 | ... |
1886 | NaN | NaN | NaN | 86 | NaN | NaN | NaN | NaN | NaN | 84 | NaN | NaN | 7 | NaN | NaN | NaN | 50 | 16 | NaN | 15 | ... |
1887 | NaN | NaN | NaN | 78 | NaN | NaN | NaN | NaN | NaN | 104 | NaN | NaN | NaN | NaN | NaN | NaN | 37 | 11 | NaN | 13 | ... |
1888 | NaN | NaN | NaN | 90 | NaN | NaN | NaN | NaN | NaN | 137 | NaN | 6 | 9 | NaN | NaN | NaN | 46 | 8 | NaN | 18 | ... |
1889 | NaN | NaN | NaN | 85 | NaN | NaN | NaN | NaN | NaN | 107 | NaN | NaN | NaN | NaN | NaN | NaN | 39 | 9 | NaN | 20 | ... |
1890 | NaN | NaN | NaN | 96 | NaN | NaN | NaN | 6 | NaN | 140 | NaN | NaN | NaN | NaN | NaN | NaN | 49 | 14 | NaN | NaN | ... |
1891 | NaN | NaN | NaN | 69 | NaN | NaN | NaN | NaN | NaN | 124 | NaN | NaN | 12 | NaN | NaN | NaN | 40 | 10 | NaN | 11 | ... |
1892 | NaN | NaN | NaN | 95 | NaN | NaN | NaN | NaN | NaN | 119 | NaN | NaN | NaN | NaN | NaN | NaN | 53 | 8 | NaN | 11 | ... |
1893 | NaN | NaN | NaN | 81 | NaN | NaN | NaN | NaN | NaN | 115 | NaN | NaN | NaN | NaN | NaN | NaN | 43 | 13 | NaN | 21 | ... |
1894 | NaN | NaN | NaN | 79 | NaN | NaN | NaN | NaN | NaN | 118 | NaN | NaN | NaN | NaN | NaN | NaN | 50 | 15 | NaN | 13 | ... |
1895 | NaN | NaN | NaN | 94 | NaN | NaN | NaN | NaN | NaN | 92 | NaN | NaN | NaN | NaN | NaN | NaN | 54 | NaN | NaN | 15 | ... |
1896 | NaN | NaN | NaN | 69 | NaN | NaN | NaN | NaN | NaN | 121 | NaN | NaN | NaN | NaN | NaN | NaN | 51 | 15 | NaN | 15 | ... |
1897 | NaN | NaN | NaN | 87 | NaN | NaN | NaN | NaN | NaN | 97 | NaN | NaN | NaN | NaN | NaN | NaN | 58 | 12 | NaN | 15 | ... |
1898 | NaN | NaN | NaN | 89 | NaN | NaN | NaN | NaN | NaN | 120 | NaN | NaN | NaN | NaN | NaN | NaN | 67 | 10 | NaN | NaN | ... |
1899 | NaN | NaN | NaN | 71 | NaN | NaN | NaN | NaN | NaN | 87 | NaN | NaN | NaN | NaN | NaN | NaN | 39 | 15 | NaN | NaN | ... |
1900 | NaN | NaN | NaN | 104 | NaN | NaN | NaN | NaN | NaN | 112 | NaN | NaN | NaN | NaN | NaN | NaN | 56 | 15 | NaN | NaN | ... |
1901 | NaN | NaN | NaN | 80 | NaN | NaN | NaN | NaN | NaN | 87 | NaN | NaN | NaN | NaN | NaN | NaN | 48 | 13 | NaN | 13 | ... |
1902 | NaN | NaN | NaN | 78 | NaN | NaN | NaN | NaN | NaN | 91 | NaN | NaN | NaN | NaN | NaN | NaN | 54 | 11 | NaN | 18 | ... |
1903 | NaN | NaN | NaN | 93 | NaN | NaN | NaN | NaN | NaN | 91 | NaN | NaN | NaN | NaN | NaN | NaN | 58 | 14 | NaN | 20 | ... |
1904 | NaN | NaN | NaN | 117 | NaN | NaN | NaN | NaN | NaN | 80 | NaN | NaN | NaN | NaN | NaN | NaN | 52 | 10 | NaN | NaN | ... |
1905 | NaN | NaN | NaN | 96 | NaN | NaN | NaN | NaN | NaN | 73 | NaN | NaN | NaN | NaN | NaN | NaN | 65 | NaN | NaN | NaN | ... |
1906 | NaN | NaN | NaN | 96 | NaN | NaN | NaN | NaN | NaN | 72 | NaN | NaN | NaN | NaN | NaN | NaN | 50 | 11 | NaN | 15 | ... |
1907 | NaN | NaN | NaN | 130 | NaN | NaN | NaN | NaN | NaN | 79 | NaN | NaN | NaN | NaN | NaN | NaN | 65 | 10 | NaN | NaN | ... |
1908 | NaN | NaN | NaN | 114 | NaN | NaN | NaN | NaN | NaN | 84 | NaN | NaN | NaN | NaN | NaN | NaN | 57 | 15 | NaN | NaN | ... |
1909 | NaN | NaN | NaN | 142 | NaN | NaN | NaN | NaN | NaN | 57 | NaN | NaN | NaN | NaN | NaN | NaN | 67 | 16 | NaN | NaN | ... |
1910 | NaN | NaN | NaN | 145 | NaN | NaN | NaN | NaN | NaN | 79 | NaN | NaN | NaN | NaN | NaN | NaN | 74 | NaN | NaN | NaN | ... |
1911 | NaN | NaN | NaN | 187 | NaN | NaN | NaN | NaN | NaN | 77 | NaN | NaN | NaN | NaN | NaN | NaN | 94 | 30 | NaN | NaN | ... |
1912 | NaN | NaN | NaN | 303 | NaN | NaN | NaN | NaN | NaN | 90 | NaN | NaN | NaN | NaN | NaN | NaN | 172 | 40 | NaN | NaN | ... |
1913 | NaN | NaN | NaN | 417 | NaN | NaN | NaN | NaN | NaN | 121 | NaN | NaN | NaN | NaN | NaN | NaN | 202 | 48 | NaN | NaN | ... |
1914 | NaN | NaN | NaN | 485 | NaN | NaN | NaN | NaN | NaN | 99 | NaN | NaN | NaN | NaN | NaN | NaN | 225 | 51 | NaN | NaN | ... |
1915 | NaN | NaN | NaN | 553 | NaN | NaN | NaN | NaN | NaN | 124 | NaN | NaN | NaN | NaN | NaN | NaN | 271 | 64 | NaN | NaN | ... |
1916 | NaN | NaN | NaN | 584 | NaN | NaN | NaN | NaN | NaN | 146 | NaN | NaN | NaN | NaN | NaN | NaN | 261 | 62 | NaN | NaN | ... |
1917 | NaN | NaN | NaN | 597 | NaN | NaN | NaN | NaN | NaN | 136 | NaN | NaN | NaN | NaN | NaN | NaN | 280 | 86 | NaN | NaN | ... |
1918 | NaN | NaN | NaN | 656 | NaN | NaN | NaN | NaN | NaN | 120 | NaN | NaN | NaN | NaN | NaN | NaN | 256 | 76 | NaN | NaN | ... |
1919 | NaN | NaN | NaN | 645 | NaN | NaN | NaN | NaN | NaN | 147 | NaN | NaN | NaN | NaN | NaN | NaN | 265 | 91 | NaN | NaN | ... |
1920 | NaN | NaN | NaN | 668 | NaN | NaN | NaN | NaN | NaN | 98 | NaN | NaN | NaN | NaN | NaN | NaN | 206 | 79 | NaN | NaN | ... |
1921 | NaN | NaN | NaN | 696 | NaN | NaN | NaN | NaN | NaN | 134 | NaN | NaN | NaN | NaN | NaN | NaN | 230 | 84 | NaN | NaN | ... |
1922 | NaN | NaN | NaN | 700 | NaN | NaN | NaN | NaN | NaN | 142 | NaN | NaN | NaN | NaN | NaN | NaN | 176 | 91 | NaN | NaN | ... |
1923 | NaN | NaN | NaN | 616 | NaN | NaN | NaN | NaN | NaN | 129 | NaN | NaN | NaN | NaN | NaN | NaN | 174 | 73 | NaN | NaN | ... |
1924 | NaN | NaN | NaN | 683 | NaN | NaN | NaN | NaN | NaN | 96 | NaN | NaN | NaN | NaN | NaN | NaN | 174 | 111 | NaN | NaN | ... |
1925 | NaN | NaN | NaN | 644 | NaN | NaN | NaN | NaN | NaN | 128 | NaN | NaN | NaN | NaN | NaN | NaN | 117 | 83 | NaN | NaN | ... |
1926 | NaN | NaN | NaN | 593 | NaN | NaN | NaN | NaN | NaN | 105 | NaN | NaN | NaN | NaN | NaN | NaN | 134 | 101 | NaN | NaN | ... |
1927 | NaN | NaN | NaN | 546 | NaN | NaN | NaN | NaN | NaN | 103 | NaN | NaN | NaN | NaN | NaN | NaN | 139 | 78 | NaN | NaN | ... |
1928 | NaN | NaN | NaN | 557 | NaN | NaN | NaN | NaN | NaN | 92 | NaN | NaN | NaN | NaN | NaN | NaN | 90 | 85 | NaN | NaN | ... |
1929 | NaN | NaN | NaN | 469 | NaN | NaN | NaN | NaN | NaN | 78 | NaN | NaN | NaN | NaN | NaN | NaN | 61 | 96 | NaN | NaN | ... |
1930 | NaN | NaN | NaN | 500 | NaN | NaN | NaN | NaN | NaN | 77 | NaN | NaN | NaN | NaN | NaN | NaN | 85 | 123 | NaN | NaN | ... |
1931 | NaN | NaN | NaN | 458 | NaN | NaN | NaN | NaN | NaN | 71 | NaN | NaN | NaN | NaN | NaN | NaN | 79 | 93 | NaN | NaN | ... |
1932 | NaN | NaN | NaN | 514 | NaN | NaN | NaN | NaN | NaN | 57 | NaN | NaN | NaN | NaN | NaN | NaN | 83 | 94 | NaN | NaN | ... |
1933 | NaN | NaN | NaN | 460 | NaN | NaN | NaN | NaN | NaN | 64 | NaN | NaN | NaN | NaN | NaN | NaN | 68 | 96 | NaN | NaN | ... |
1934 | NaN | NaN | NaN | 478 | NaN | NaN | NaN | NaN | NaN | 60 | NaN | NaN | NaN | NaN | NaN | NaN | 75 | 93 | NaN | NaN | ... |
1935 | NaN | NaN | NaN | 461 | NaN | NaN | NaN | NaN | NaN | 60 | NaN | NaN | NaN | NaN | NaN | NaN | 59 | 92 | NaN | NaN | ... |
1936 | NaN | NaN | NaN | 443 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 70 | 74 | NaN | NaN | ... |
1937 | NaN | NaN | NaN | 464 | NaN | NaN | NaN | NaN | NaN | 52 | NaN | NaN | NaN | NaN | NaN | NaN | 64 | 91 | NaN | NaN | ... |
1938 | NaN | NaN | NaN | 478 | NaN | NaN | NaN | NaN | NaN | 57 | NaN | NaN | NaN | NaN | NaN | NaN | 55 | 87 | NaN | NaN | ... |
1939 | NaN | NaN | NaN | 471 | NaN | NaN | NaN | NaN | NaN | 69 | NaN | NaN | NaN | NaN | NaN | NaN | 56 | 95 | NaN | 53 | ... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
131 rows × 6865 columns
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots = True, figsize = (12, 10), grid = False, title = 'Number of births per year')
array([<matplotlib.axes.AxesSubplot object at 0x7fb873f6c210>, <matplotlib.axes.AxesSubplot object at 0x7fb87152c150>, <matplotlib.axes.AxesSubplot object at 0x7fb8714f86d0>, <matplotlib.axes.AxesSubplot object at 0x7fb87141aad0>], dtype=object)
plt.figure()
<matplotlib.figure.Figure at 0x7fb87175d050>
<matplotlib.figure.Figure at 0x7fb87175d050>
table = top1000.pivot_table('prop', rows = 'year', cols = 'sex', aggfunc = sum)
table.plot(title = 'Sum of table.top1000.prop by sex and year',
yticks = np.linspace(0, 1.2, 13),
xticks = range(1880, 2010, 10))
<matplotlib.axes.AxesSubplot at 0x7fb8713d91d0>
df = boys[boys.year == 2010]
df
names | sex | births | year | prop | |
---|---|---|---|---|---|
260877 | Jacob | M | 21875 | 2010 | 0.011523 |
260878 | Ethan | M | 17866 | 2010 | 0.009411 |
260879 | Michael | M | 17133 | 2010 | 0.009025 |
260880 | Jayden | M | 17030 | 2010 | 0.008971 |
260881 | William | M | 16870 | 2010 | 0.008887 |
260882 | Alexander | M | 16634 | 2010 | 0.008762 |
260883 | Noah | M | 16281 | 2010 | 0.008576 |
260884 | Daniel | M | 15679 | 2010 | 0.008259 |
260885 | Aiden | M | 15403 | 2010 | 0.008114 |
260886 | Anthony | M | 15364 | 2010 | 0.008093 |
260887 | Joshua | M | 15238 | 2010 | 0.008027 |
260888 | Mason | M | 14728 | 2010 | 0.007758 |
260889 | Christopher | M | 14135 | 2010 | 0.007446 |
260890 | Andrew | M | 14093 | 2010 | 0.007424 |
260891 | David | M | 14042 | 2010 | 0.007397 |
260892 | Matthew | M | 13954 | 2010 | 0.007350 |
260893 | Logan | M | 13943 | 2010 | 0.007345 |
260894 | Elijah | M | 13735 | 2010 | 0.007235 |
260895 | James | M | 13714 | 2010 | 0.007224 |
260896 | Joseph | M | 13657 | 2010 | 0.007194 |
260897 | Gabriel | M | 12722 | 2010 | 0.006701 |
260898 | Benjamin | M | 12280 | 2010 | 0.006469 |
260899 | Ryan | M | 11886 | 2010 | 0.006261 |
260900 | Samuel | M | 11776 | 2010 | 0.006203 |
260901 | Jackson | M | 11693 | 2010 | 0.006159 |
260902 | John | M | 11424 | 2010 | 0.006018 |
260903 | Nathan | M | 11269 | 2010 | 0.005936 |
260904 | Jonathan | M | 11028 | 2010 | 0.005809 |
260905 | Christian | M | 10965 | 2010 | 0.005776 |
260906 | Liam | M | 10852 | 2010 | 0.005716 |
260907 | Dylan | M | 10455 | 2010 | 0.005507 |
260908 | Landon | M | 10400 | 2010 | 0.005478 |
260909 | Caleb | M | 10359 | 2010 | 0.005457 |
260910 | Tyler | M | 10357 | 2010 | 0.005456 |
260911 | Lucas | M | 10294 | 2010 | 0.005423 |
260912 | Evan | M | 9655 | 2010 | 0.005086 |
260913 | Gavin | M | 9551 | 2010 | 0.005031 |
260914 | Nicholas | M | 9549 | 2010 | 0.005030 |
260915 | Isaac | M | 9254 | 2010 | 0.004875 |
260916 | Brayden | M | 9046 | 2010 | 0.004765 |
260917 | Luke | M | 8767 | 2010 | 0.004618 |
260918 | Angel | M | 8716 | 2010 | 0.004591 |
260919 | Brandon | M | 8473 | 2010 | 0.004463 |
260920 | Jack | M | 8457 | 2010 | 0.004455 |
260921 | Isaiah | M | 8443 | 2010 | 0.004447 |
260922 | Jordan | M | 8156 | 2010 | 0.004296 |
260923 | Owen | M | 8136 | 2010 | 0.004286 |
260924 | Carter | M | 8101 | 2010 | 0.004267 |
260925 | Connor | M | 7991 | 2010 | 0.004209 |
260926 | Justin | M | 7792 | 2010 | 0.004105 |
260927 | Jose | M | 7656 | 2010 | 0.004033 |
260928 | Jeremiah | M | 7645 | 2010 | 0.004027 |
260929 | Julian | M | 7534 | 2010 | 0.003969 |
260930 | Robert | M | 7471 | 2010 | 0.003935 |
260931 | Aaron | M | 7374 | 2010 | 0.003884 |
260932 | Adrian | M | 7354 | 2010 | 0.003874 |
260933 | Wyatt | M | 7319 | 2010 | 0.003855 |
260934 | Kevin | M | 7278 | 2010 | 0.003834 |
260935 | Hunter | M | 7271 | 2010 | 0.003830 |
260936 | Cameron | M | 7137 | 2010 | 0.003760 |
... | ... | ... | ... | ... |
1000 rows × 5 columns
prop_cumsum = df.sort_index(by = 'prop', ascending = False).prop.cumsum()
prop_cumsum[:10]
260877 0.011523 260878 0.020934 260879 0.029959 260880 0.038930 260881 0.047817 260882 0.056579 260883 0.065155 260884 0.073414 260885 0.081528 260886 0.089621 Name: prop, dtype: float64
prop_cumsum.values.searchsorted(0.5)
116
df = boys[boys.year == 1900]
in1900 = df.sort_index(by = 'prop', ascending = False).prop.cumsum()
in1900.values.searchsorted(0.5) + 1
25
def get_quantile_count(group, q = 0.5):
group = group.sort_index(by = 'prop', ascending = False)
return group.prop.cumsum().values.searchsorted(q) + 1
diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')
diversity.head()
sex | F | M |
---|---|---|
year | ||
1880 | 38 | 14 |
1881 | 38 | 14 |
1882 | 38 | 15 |
1883 | 39 | 15 |
1884 | 39 | 16 |
5 rows × 2 columns
diversity.plot(title = 'Number of Popular names in top 50%')
<matplotlib.axes.AxesSubplot at 0x7fb87124ffd0>
# Extract the last letter from the names
get_last_letter = lambda x: x[-1]
last_letters = names.names.apply(get_last_letter)
last_letters.names = 'last_letter'
table = names.pivot_table('births', rows = last_letters, cols = ['sex', 'year'], aggfunc=sum)
subtable = table.reindex(colums = [1910, 1960, 2010], levels = 'year')
subtable.head()
sex | F | ||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
year | 1880 | 1881 | 1882 | 1883 | 1884 | 1885 | 1886 | 1887 | 1888 | 1889 | 1890 | 1891 | 1892 | 1893 | 1894 | 1895 | 1896 | 1897 | 1898 | 1899 | |
names | |||||||||||||||||||||
a | 31446 | 31581 | 36536 | 38330 | 43680 | 45408 | 49100 | 48942 | 59442 | 58631 | 62313 | 60582 | 68331 | 67821 | 70631 | 73002 | 73584 | 72148 | 79150 | 70712 | ... |
b | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... |
c | NaN | NaN | 5 | 5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5 | NaN | NaN | NaN | ... |
d | 609 | 607 | 734 | 810 | 916 | 862 | 1007 | 1027 | 1298 | 1374 | 1438 | 1512 | 1775 | 1821 | 1985 | 2268 | 2372 | 2455 | 2953 | 3028 | ... |
e | 33378 | 34080 | 40399 | 41914 | 48089 | 49616 | 53884 | 54353 | 66750 | 66663 | 70948 | 67750 | 77186 | 76455 | 79938 | 83875 | 84355 | 82783 | 91151 | 81395 | ... |
5 rows × 262 columns
subtable.sum()
sex year F 1880 90993 1881 91955 1882 107851 1883 112322 1884 129021 1885 133056 1886 144538 1887 145983 1888 178631 1889 178369 1890 190377 1891 185486 1892 212350 1893 212908 1894 222923 ... M 1996 1892700 1997 1883571 1998 1909676 1999 1918267 2000 1961702 2001 1940498 2002 1938941 2003 1972439 2004 1981557 2005 1993285 2006 2050234 2007 2069242 2008 2032310 2009 1973359 2010 1898382 Length: 262, dtype: float64
letter_prop = subtable/subtable.sum().astype(float)
import matplotlib.pyplot as plt
fig, axes = plt.subplots(2, 1, figsize = (10, 8))
letter_prop['M'].plot(kind = 'bar', rot = 0, ax = axes[0], title = 'Male', legend = False)
letter_prop['F'].plot(kind = 'bar', rot = 0, ax = axes[1], title = 'Female', legend = False)
<matplotlib.axes.AxesSubplot at 0x7fb87080edd0>
plt.subplots_adjust(hspace = 0.25)
<matplotlib.figure.Figure at 0x7fb870c82d90>
letter_prop = table/table.sum().astype(float)
dny_ts = letter_prop.ix[['d', 'n', 'y', 'm'], 'M'].T
dny_ts.head()
d | n | y | m | |
---|---|---|---|---|
year | ||||
1880 | 0.083055 | 0.153213 | 0.075760 | 0.101907 |
1881 | 0.083247 | 0.153214 | 0.077451 | 0.101411 |
1882 | 0.085340 | 0.149560 | 0.077537 | 0.099000 |
1883 | 0.084066 | 0.151646 | 0.079144 | 0.096663 |
1884 | 0.086120 | 0.149915 | 0.080405 | 0.093774 |
5 rows × 4 columns
plt.close('all')
dny_ts.plot()
<matplotlib.axes.AxesSubplot at 0x7fb868bd7490>
all_names = top1000.names.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])
lesley_like = all_names[mask]
lesley_like
array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)
filtered = top1000[top1000.names.isin(lesley_like)]
filtered.groupby('names').births.sum()
names Leslee 1082 Lesley 35022 Lesli 929 Leslie 370429 Lesly 10067 Name: births, dtype: int64
table = filtered.pivot_table('births', rows = 'year', cols = 'sex', aggfunc = sum)
table = table.div(table.sum(1), axis = 0)
table.tail()
sex | F | M |
---|---|---|
year | ||
1880 | 0.091954 | 0.908046 |
1881 | 0.106796 | 0.893204 |
1882 | 0.065693 | 0.934307 |
1883 | 0.053030 | 0.946970 |
1884 | 0.107143 | 0.892857 |
1885 | 0.075758 | 0.924242 |
1886 | 0.055556 | 0.944444 |
1887 | 0.067416 | 0.932584 |
1888 | 0.116162 | 0.883838 |
1889 | 0.129213 | 0.870787 |
1890 | 0.099502 | 0.900498 |
1891 | 0.145833 | 0.854167 |
1892 | 0.096070 | 0.903930 |
1893 | 0.123223 | 0.876777 |
1894 | 0.138996 | 0.861004 |
1895 | 0.085603 | 0.914397 |
1896 | 0.102273 | 0.897727 |
1897 | 0.132812 | 0.867188 |
1898 | 0.092308 | 0.907692 |
1899 | 0.090452 | 0.909548 |
1900 | 0.095238 | 0.904762 |
1901 | 0.124464 | 0.875536 |
1902 | 0.128472 | 0.871528 |
1903 | 0.089552 | 0.910448 |
1904 | 0.109890 | 0.890110 |
1905 | 0.124113 | 0.875887 |
1906 | 0.099315 | 0.900685 |
1907 | 0.110749 | 0.889251 |
1908 | 0.123867 | 0.876133 |
1909 | 0.107034 | 0.892966 |
1910 | 0.137931 | 0.862069 |
1911 | 0.097765 | 0.902235 |
1912 | 0.066038 | 0.933962 |
1913 | 0.058427 | 0.941573 |
1914 | 0.059441 | 0.940559 |
1915 | 0.058824 | 0.941176 |
1916 | 0.069368 | 0.930632 |
1917 | 0.064286 | 0.935714 |
1918 | 0.071630 | 0.928370 |
1919 | 0.069049 | 0.930951 |
1920 | 0.053026 | 0.946974 |
1921 | 0.071106 | 0.928894 |
1922 | 0.070267 | 0.929733 |
1923 | 0.088252 | 0.911748 |
1924 | 0.078246 | 0.921754 |
1925 | 0.081382 | 0.918618 |
1926 | 0.086497 | 0.913503 |
1927 | 0.078541 | 0.921459 |
1928 | 0.068780 | 0.931220 |
1929 | 0.075166 | 0.924834 |
1930 | 0.096140 | 0.903860 |
1931 | 0.078234 | 0.921766 |
1932 | 0.073841 | 0.926159 |
1933 | 0.095423 | 0.904577 |
1934 | 0.090514 | 0.909486 |
1935 | 0.116968 | 0.883032 |
1936 | 0.125526 | 0.874474 |
1937 | 0.124736 | 0.875264 |
1938 | 0.123856 | 0.876144 |
1939 | 0.117841 | 0.882159 |
... | ... |
131 rows × 2 columns
plt.close('all')
table.plot(style = {'M', 'k-', 'F': 'k--'})
File "<ipython-input-166-e72db582d4c7>", line 1 table.plot(style = {'M', 'k-', 'F': 'k--'}) ^ SyntaxError: invalid syntax