from numpy.random import randn
import numpy as np
np.random.seed(123)
import os
import matplotlib.pyplot as plt
import pandas as pd
plt.rc("figure", figsize=(10, 6))
np.set_printoptions(precision=4)
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
path = "datasets/bitly_usagov/example.txt"
import json
with open(path) as f:
    records = [json.loads(line) for line in f]
# Not every record has a "tz" field, so the naive version raises KeyError:
# time_zones = [rec["tz"] for rec in records]
time_zones = [rec["tz"] for rec in records if "tz" in rec]
time_zones[:10]
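# A sketch of an equivalent extraction using dict.get with a default, which
# keeps records lacking "tz" (as empty strings) instead of dropping them;
# the name time_zones_all is hypothetical:
time_zones_all = [rec.get("tz", "") for rec in records]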
def get_counts(sequence):
    counts = {}
    for x in sequence:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts
from collections import defaultdict
def get_counts2(sequence):
    counts = defaultdict(int)  # values will initialize to 0
    for x in sequence:
        counts[x] += 1
    return counts
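# Quick sanity check (a sketch on a made-up sample): both counting functions
# should agree.
_sample = ["a", "b", "a", "c", "a"]
assert get_counts(_sample) == dict(get_counts2(_sample))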
counts = get_counts(time_zones)
counts["America/New_York"]
len(time_zones)
def top_counts(count_dict, n=10):
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]
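# An alternative sketch using heapq.nlargest, which avoids sorting the whole
# list; note it returns pairs in descending order, unlike top_counts:
import heapq
def top_counts_heap(count_dict, n=10):
    return heapq.nlargest(n, ((count, tz) for tz, count in count_dict.items()))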
top_counts(counts)
from collections import Counter
counts = Counter(time_zones)
counts.most_common(10)
frame = pd.DataFrame(records)
frame.info()
frame["tz"].head()
tz_counts = frame["tz"].value_counts()
tz_counts.head()
clean_tz = frame["tz"].fillna("Missing")
clean_tz[clean_tz == ""] = "Unknown"
tz_counts = clean_tz.value_counts()
tz_counts.head()
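# The same cleanup as one chained expression (a sketch, equivalent to the
# fillna and boolean-assignment steps above):
clean_tz2 = frame["tz"].fillna("Missing").replace("", "Unknown")
assert clean_tz2.equals(clean_tz)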
plt.figure(figsize=(10, 4))
import seaborn as sns
subset = tz_counts.head()
sns.barplot(y=subset.index, x=subset.to_numpy())
frame["a"][1]
frame["a"][50]
frame["a"][51][:50] # long line
results = pd.Series([x.split()[0] for x in frame["a"].dropna()])
results.head(5)
results.value_counts().head(8)
cframe = frame[frame["a"].notna()].copy()
cframe["os"] = np.where(cframe["a"].str.contains("Windows"),
"Windows", "Not Windows")
cframe["os"].head(5)
by_tz_os = cframe.groupby(["tz", "os"])
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts.head()
indexer = agg_counts.sum(axis="columns").argsort()
indexer.values[:10]
count_subset = agg_counts.take(indexer[-10:])
count_subset
agg_counts.sum(axis="columns").nlargest(10)
plt.figure()
count_subset = count_subset.stack()
count_subset.name = "total"
count_subset = count_subset.reset_index()
count_subset.head(10)
sns.barplot(x="total", y="tz", hue="os", data=count_subset)
def norm_total(group):
    group["normed_total"] = group["total"] / group["total"].sum()
    return group
results = count_subset.groupby("tz").apply(norm_total)
plt.figure()
sns.barplot(x="normed_total", y="tz", hue="os", data=results)
g = count_subset.groupby("tz")
results2 = count_subset["total"] / g["total"].transform("sum")
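# Sanity check (a sketch): the normalized shares should sum to 1 within each
# time zone, for both the apply-based and transform-based computations.
assert np.allclose(results.groupby("tz")["normed_total"].sum(), 1.0)
assert np.allclose(results2.groupby(count_subset["tz"]).sum(), 1.0)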
unames = ["user_id", "gender", "age", "occupation", "zip"]
users = pd.read_table("datasets/movielens/users.dat", sep="::",
                      header=None, names=unames, engine="python")
rnames = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_table("datasets/movielens/ratings.dat", sep="::",
                        header=None, names=rnames, engine="python")
mnames = ["movie_id", "title", "genres"]
movies = pd.read_table("datasets/movielens/movies.dat", sep="::",
                       header=None, names=mnames, engine="python")
users.head(5)
ratings.head(5)
movies.head(5)
ratings
data = pd.merge(pd.merge(ratings, users), movies)
data
data.iloc[0]
mean_ratings = data.pivot_table("rating", index="title",
                                columns="gender", aggfunc="mean")
mean_ratings.head(5)
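# The same table can be built with groupby plus unstack (a sketch, equivalent
# to the pivot_table call above):
mean_ratings2 = data.groupby(["title", "gender"])["rating"].mean().unstack()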
ratings_by_title = data.groupby("title").size()
ratings_by_title.head()
active_titles = ratings_by_title.index[ratings_by_title >= 250]
active_titles
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings
mean_ratings = mean_ratings.rename(
    index={"Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)":
           "Seven Samurai (Shichinin no samurai) (1954)"})
top_female_ratings = mean_ratings.sort_values("F", ascending=False)
top_female_ratings.head()
mean_ratings["diff"] = mean_ratings["M"] - mean_ratings["F"]
sorted_by_diff = mean_ratings.sort_values("diff")
sorted_by_diff.head()
sorted_by_diff[::-1].head()
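# Equivalent sketch: sort descending directly instead of reversing; rows with
# tied diffs may come out in a slightly different order than the slice above.
mean_ratings.sort_values("diff", ascending=False).head()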
rating_std_by_title = data.groupby("title")["rating"].std()
rating_std_by_title = rating_std_by_title.loc[active_titles]
rating_std_by_title.head()
rating_std_by_title.sort_values(ascending=False)[:10]
movies["genres"].head()
movies["genres"].head().str.split("|")
movies["genre"] = movies.pop("genres").str.split("|")
movies.head()
movies_exploded = movies.explode("genre")
movies_exploded[:10]
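# explode emits one row per list element, so the row count should equal the
# total number of genre entries (a sanity-check sketch; assumes no movie has
# an empty or missing genre list, which holds for this dataset):
assert len(movies_exploded) == movies["genre"].str.len().sum()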
ratings_with_genre = pd.merge(pd.merge(movies_exploded, ratings), users)
ratings_with_genre.iloc[0]
genre_ratings = (ratings_with_genre.groupby(["genre", "age"])["rating"]
                 .mean()
                 .unstack("age"))
genre_ratings[:10]
# Preview the first 10 lines of the raw file (IPython shell command):
# !head -n 10 datasets/babynames/yob1880.txt
names1880 = pd.read_csv("datasets/babynames/yob1880.txt",
                        names=["name", "sex", "births"])
names1880
names1880.groupby("sex")["births"].sum()
pieces = []
for year in range(1880, 2011):
    path = f"datasets/babynames/yob{year}.txt"
    frame = pd.read_csv(path, names=["name", "sex", "births"])
    # Add a column for the year
    frame["year"] = year
    pieces.append(frame)
# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)
names
total_births = names.pivot_table("births", index="year",
                                 columns="sex",
                                 aggfunc="sum")  # string form; the builtin sum as aggfunc is deprecated
total_births.tail()
total_births.plot(title="Total births by sex and year")
def add_prop(group):
    group["prop"] = group["births"] / group["births"].sum()
    return group
names = names.groupby(["year", "sex"], group_keys=False).apply(add_prop)
names
names.groupby(["year", "sex"])["prop"].sum()
def get_top1000(group):
    return group.sort_values("births", ascending=False)[:1000]
grouped = names.groupby(["year", "sex"])
top1000 = grouped.apply(get_top1000)
top1000.head()
top1000 = top1000.reset_index(drop=True)
top1000.head()
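# An equivalent sketch using nlargest, which avoids a full per-group sort
# (tie-breaking may order rows slightly differently):
top1000_alt = (names.groupby(["year", "sex"], group_keys=False)
               .apply(lambda g: g.nlargest(1000, "births")))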
boys = top1000[top1000["sex"] == "M"]
girls = top1000[top1000["sex"] == "F"]
total_births = top1000.pivot_table("births", index="year",
                                   columns="name",
                                   aggfunc="sum")
total_births.info()
subset = total_births[["John", "Harry", "Mary", "Marilyn"]]
subset.plot(subplots=True, figsize=(12, 10),
            title="Number of births per year")
plt.figure()
table = top1000.pivot_table("prop", index="year",
                            columns="sex", aggfunc="sum")
table.plot(title="Sum of top1000.prop by year and sex",
           yticks=np.linspace(0, 1.2, 13))
df = boys[boys["year"] == 2010]
df
prop_cumsum = df["prop"].sort_values(ascending=False).cumsum()
prop_cumsum[:10]
prop_cumsum.searchsorted(0.5)
df = boys[boys.year == 1900]
in1900 = df.sort_values("prop", ascending=False).prop.cumsum()
in1900.searchsorted(0.5) + 1
def get_quantile_count(group, q=0.5):
    group = group.sort_values("prop", ascending=False)
    return group.prop.cumsum().searchsorted(q) + 1
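# Quick check (sketch): applied to the 1900 boys group, this should reproduce
# the manual in1900 computation above.
assert get_quantile_count(boys[boys["year"] == 1900]) == in1900.searchsorted(0.5) + 1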
diversity = top1000.groupby(["year", "sex"]).apply(get_quantile_count)
diversity = diversity.unstack()
fig = plt.figure()
diversity.head()
diversity.plot(title="Number of popular names in top 50%")
def get_last_letter(x):
    return x[-1]
last_letters = names["name"].map(get_last_letter)
last_letters.name = "last_letter"
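# The same result via vectorized string indexing (a sketch):
assert (names["name"].str[-1] == last_letters).all()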
table = names.pivot_table("births", index=last_letters,
                          columns=["sex", "year"], aggfunc="sum")
subtable = table.reindex(columns=[1910, 1960, 2010], level="year")
subtable.head()
subtable.sum()
letter_prop = subtable / subtable.sum()
letter_prop
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop["M"].plot(kind="bar", rot=0, ax=axes[0], title="Male")
letter_prop["F"].plot(kind="bar", rot=0, ax=axes[1], title="Female",
legend=False)
plt.subplots_adjust(hspace=0.25)
letter_prop = table / table.sum()
dny_ts = letter_prop.loc[["d", "n", "y"], "M"].T
dny_ts.head()
plt.close("all")
fig = plt.figure()
dny_ts.plot()
all_names = pd.Series(top1000["name"].unique())
lesley_like = all_names[all_names.str.contains("Lesl")]
lesley_like
filtered = top1000[top1000["name"].isin(lesley_like)]
filtered.groupby("name")["births"].sum()
table = filtered.pivot_table("births", index="year",
                             columns="sex", aggfunc="sum")
table = table.div(table.sum(axis="columns"), axis="index")
table.tail()
fig = plt.figure()
table.plot(style={"M": "k-", "F": "k--"})
with open("datasets/usda_food/database.json") as f:
    db = json.load(f)
len(db)
db[0].keys()
db[0]["nutrients"][0]
nutrients = pd.DataFrame(db[0]["nutrients"])
nutrients.head(7)
info_keys = ["description", "group", "id", "manufacturer"]
info = pd.DataFrame(db, columns=info_keys)
info.head()
info.info()
info["group"].value_counts()[:10]  # pd.value_counts is deprecated; use the Series method
nutrients = []
for rec in db:
    fnuts = pd.DataFrame(rec["nutrients"])
    fnuts["id"] = rec["id"]
    nutrients.append(fnuts)
nutrients = pd.concat(nutrients, ignore_index=True)
nutrients
nutrients.duplicated().sum() # number of duplicates
nutrients = nutrients.drop_duplicates()
col_mapping = {"description" : "food",
"group" : "fgroup"}
info = info.rename(columns=col_mapping, copy=False)
info.info()
col_mapping = {"description" : "nutrient",
"group" : "nutgroup"}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
nutrients
ndata = pd.merge(nutrients, info, on="id")
ndata.info()
ndata.iloc[30000]
fig = plt.figure()
result = ndata.groupby(["nutrient", "fgroup"])["value"].quantile(0.5)
result["Zinc, Zn"].sort_values().plot(kind="barh")
by_nutrient = ndata.groupby(["nutgroup", "nutrient"])
def get_maximum(x):
    return x.loc[x.value.idxmax()]
max_foods = by_nutrient.apply(get_maximum)[["value", "food"]]
# make the food a little smaller
max_foods["food"] = max_foods["food"].str[:50]
max_foods.loc["Amino Acids"]["food"]
fec = pd.read_csv("datasets/fec/P00000001-ALL.csv", low_memory=False)
fec.info()
fec.iloc[123456]
unique_cands = fec["cand_nm"].unique()
unique_cands
unique_cands[2]
parties = {"Bachmann, Michelle": "Republican",
"Cain, Herman": "Republican",
"Gingrich, Newt": "Republican",
"Huntsman, Jon": "Republican",
"Johnson, Gary Earl": "Republican",
"McCotter, Thaddeus G": "Republican",
"Obama, Barack": "Democrat",
"Paul, Ron": "Republican",
"Pawlenty, Timothy": "Republican",
"Perry, Rick": "Republican",
"Roemer, Charles E. 'Buddy' III": "Republican",
"Romney, Mitt": "Republican",
"Santorum, Rick": "Republican"}
fec["cand_nm"][123456:123461]
fec["cand_nm"][123456:123461].map(parties)
# Add it as a column
fec["party"] = fec["cand_nm"].map(parties)
fec["party"].value_counts()
(fec["contb_receipt_amt"] > 0).value_counts()
fec = fec[fec["contb_receipt_amt"] > 0]
fec_mrbo = fec[fec["cand_nm"].isin(["Obama, Barack", "Romney, Mitt"])]
fec["contbr_occupation"].value_counts()[:10]
occ_mapping = {
    "INFORMATION REQUESTED PER BEST EFFORTS": "NOT PROVIDED",
    "INFORMATION REQUESTED": "NOT PROVIDED",
    "INFORMATION REQUESTED (BEST EFFORTS)": "NOT PROVIDED",
    "C.E.O.": "CEO"
}
def get_occ(x):
    # If no mapping provided, return x
    return occ_mapping.get(x, x)
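# Alternative sketch: Series.replace leaves unmapped values untouched, so it
# can stand in for the get_occ pattern without a helper function.
occ_alt = fec["contbr_occupation"].replace(occ_mapping)
assert occ_alt.equals(fec["contbr_occupation"].map(get_occ))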
fec["contbr_occupation"] = fec["contbr_occupation"].map(get_occ)
emp_mapping = {
    "INFORMATION REQUESTED PER BEST EFFORTS": "NOT PROVIDED",
    "INFORMATION REQUESTED": "NOT PROVIDED",
    "SELF": "SELF-EMPLOYED",
    "SELF EMPLOYED": "SELF-EMPLOYED",
}
def get_emp(x):
    # If no mapping provided, return x
    return emp_mapping.get(x, x)
fec["contbr_employer"] = fec["contbr_employer"].map(get_emp)
by_occupation = fec.pivot_table("contb_receipt_amt",
                                index="contbr_occupation",
                                columns="party", aggfunc="sum")
over_2mm = by_occupation[by_occupation.sum(axis="columns") > 2000000]
over_2mm
plt.figure()
over_2mm.plot(kind="barh")
def get_top_amounts(group, key, n=5):
    totals = group.groupby(key)["contb_receipt_amt"].sum()
    return totals.nlargest(n)
grouped = fec_mrbo.groupby("cand_nm")
grouped.apply(get_top_amounts, "contbr_occupation", n=7)
grouped.apply(get_top_amounts, "contbr_employer", n=10)
bins = np.array([0, 1, 10, 100, 1000, 10000,
                 100_000, 1_000_000, 10_000_000])
labels = pd.cut(fec_mrbo["contb_receipt_amt"], bins)
labels
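# pd.cut returns interval-valued categories (right-inclusive by default);
# the bin edges can be inspected via the categorical accessor (a sketch):
labels.cat.categories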
grouped = fec_mrbo.groupby(["cand_nm", labels])
grouped.size().unstack(level=0)
plt.figure()
bucket_sums = grouped["contb_receipt_amt"].sum().unstack(level=0)
normed_sums = bucket_sums.div(bucket_sums.sum(axis="columns"),
                              axis="index")
normed_sums
normed_sums[:-2].plot(kind="barh")
grouped = fec_mrbo.groupby(["cand_nm", "contbr_st"])
totals = grouped["contb_receipt_amt"].sum().unstack(level=0).fillna(0)
totals = totals[totals.sum(axis="columns") > 100000]
totals.head(10)
percent = totals.div(totals.sum(axis="columns"), axis="index")
percent.head(10)