from numpy.random import randn
import numpy as np
np.random.seed(123)
import os
import matplotlib.pyplot as plt
import pandas as pd
plt.rc("figure", figsize=(10, 6))
np.set_printoptions(precision=4)
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
path = "datasets/bitly_usagov/example.txt"
import json
with open(path) as f:
    records = [json.loads(line) for line in f]
# Not every record has a "tz" field, so the naive version raises KeyError:
# time_zones = [rec["tz"] for rec in records]
time_zones = [rec["tz"] for rec in records if "tz" in rec]
time_zones[:10]
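# A sketch of an equivalent extraction using dict.get with a default, which
# keeps records lacking "tz" (as empty strings) instead of dropping them;
# the name time_zones_all is hypothetical:
time_zones_all = [rec.get("tz", "") for rec in records]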
def get_counts(sequence):
    counts = {}
    for x in sequence:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts
from collections import defaultdict
def get_counts2(sequence):
    counts = defaultdict(int)  # values will initialize to 0
    for x in sequence:
        counts[x] += 1
    return counts
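# Quick sanity check (a sketch on a made-up sample): both counting functions
# should agree.
_sample = ["a", "b", "a", "c", "a"]
assert get_counts(_sample) == dict(get_counts2(_sample))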
counts = get_counts(time_zones)
counts["America/New_York"]
len(time_zones)
def top_counts(count_dict, n=10):
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]
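# An alternative sketch using heapq.nlargest, which avoids sorting the whole
# list; note it returns pairs in descending order, unlike top_counts:
import heapq
def top_counts_heap(count_dict, n=10):
    return heapq.nlargest(n, ((count, tz) for tz, count in count_dict.items()))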
top_counts(counts)
from collections import Counter
counts = Counter(time_zones)
counts.most_common(10)
frame = pd.DataFrame(records)
frame.info()
frame["tz"].head()
tz_counts = frame["tz"].value_counts()
tz_counts.head()
clean_tz = frame["tz"].fillna("Missing")
clean_tz[clean_tz == ""] = "Unknown"
tz_counts = clean_tz.value_counts()
tz_counts.head()
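# The same cleanup as one chained expression (a sketch, equivalent to the
# fillna and boolean-assignment steps above):
clean_tz2 = frame["tz"].fillna("Missing").replace("", "Unknown")
assert clean_tz2.equals(clean_tz)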
plt.figure(figsize=(10, 4))
import seaborn as sns
subset = tz_counts.head()
sns.barplot(y=subset.index, x=subset.to_numpy())
frame["a"][1]
frame["a"][50]
frame["a"][51][:50] # long line
results = pd.Series([x.split()[0] for x in frame["a"].dropna()])
results.head(5)
results.value_counts().head(8)
cframe = frame[frame["a"].notna()].copy()
cframe["os"] = np.where(cframe["a"].str.contains("Windows"),
"Windows", "Not Windows")
cframe["os"].head(5)
by_tz_os = cframe.groupby(["tz", "os"])
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts.head()
indexer = agg_counts.sum(axis="columns").argsort()
indexer.values[:10]
count_subset = agg_counts.take(indexer[-10:])
count_subset
agg_counts.sum(axis="columns").nlargest(10)
plt.figure()
count_subset = count_subset.stack()
count_subset.name = "total"
count_subset = count_subset.reset_index()
count_subset.head(10)
sns.barplot(x="total", y="tz", hue="os", data=count_subset)
def norm_total(group):
    group["normed_total"] = group["total"] / group["total"].sum()
    return group
results = count_subset.groupby("tz").apply(norm_total)
plt.figure()
sns.barplot(x="normed_total", y="tz", hue="os", data=results)
g = count_subset.groupby("tz")
results2 = count_subset["total"] / g["total"].transform("sum")
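# Sanity check (a sketch): the normalized shares should sum to 1 within each
# time zone, for both the apply-based and transform-based computations.
assert np.allclose(results.groupby("tz")["normed_total"].sum(), 1.0)
assert np.allclose(results2.groupby(count_subset["tz"]).sum(), 1.0)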
unames = ["user_id", "gender", "age", "occupation", "zip"]
users = pd.read_table("datasets/movielens/users.dat", sep="::",
                      header=None, names=unames, engine="python")
rnames = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_table("datasets/movielens/ratings.dat", sep="::",
                        header=None, names=rnames, engine="python")
mnames = ["movie_id", "title", "genres"]
movies = pd.read_table("datasets/movielens/movies.dat", sep="::",
                       header=None, names=mnames, engine="python")
users.head(5)
ratings.head(5)
movies.head(5)
ratings
data = pd.merge(pd.merge(ratings, users), movies)
data
data.iloc[0]
mean_ratings = data.pivot_table("rating", index="title",
                                columns="gender", aggfunc="mean")
mean_ratings.head(5)
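# The same table can be built with groupby plus unstack (a sketch, equivalent
# to the pivot_table call above):
mean_ratings2 = data.groupby(["title", "gender"])["rating"].mean().unstack()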
ratings_by_title = data.groupby("title").size()
ratings_by_title.head()
active_titles = ratings_by_title.index[ratings_by_title >= 250]
active_titles
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings
mean_ratings = mean_ratings.rename(
    index={"Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)":
           "Seven Samurai (Shichinin no samurai) (1954)"})
top_female_ratings = mean_ratings.sort_values("F", ascending=False)
top_female_ratings.head()
mean_ratings["diff"] = mean_ratings["M"] - mean_ratings["F"]
sorted_by_diff = mean_ratings.sort_values("diff")
sorted_by_diff.head()
sorted_by_diff[::-1].head()
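# Equivalent sketch: sort descending directly instead of reversing; rows with
# tied diffs may come out in a slightly different order than the slice above.
mean_ratings.sort_values("diff", ascending=False).head()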
rating_std_by_title = data.groupby("title")["rating"].std()
rating_std_by_title = rating_std_by_title.loc[active_titles]
rating_std_by_title.head()
rating_std_by_title.sort_values(ascending=False)[:10]
movies["genres"].head()
movies["genres"].head().str.split("|")
movies["genre"] = movies.pop("genres").str.split("|")
movies.head()
movies_exploded = movies.explode("genre")
movies_exploded[:10]
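# explode emits one row per list element, so the row count should equal the
# total number of genre entries (a sanity-check sketch; assumes no movie has
# an empty or missing genre list, which holds for this dataset):
assert len(movies_exploded) == movies["genre"].str.len().sum()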
ratings_with_genre = pd.merge(pd.merge(movies_exploded, ratings), users)
ratings_with_genre.iloc[0]
genre_ratings = (ratings_with_genre.groupby(["genre", "age"])["rating"]
                 .mean()
                 .unstack("age"))
genre_ratings[:10]
# Preview the first 10 lines of the raw file (IPython shell command):
# !head -n 10 datasets/babynames/yob1880.txt
names1880 = pd.read_csv("datasets/babynames/yob1880.txt",
                        names=["name", "sex", "births"])
names1880
names1880.groupby("sex")["births"].sum()
pieces = []
for year in range(1880, 2011):
    path = f"datasets/babynames/yob{year}.txt"
    frame = pd.read_csv(path, names=["name", "sex", "births"])
    # Add a column for the year
    frame["year"] = year
    pieces.append(frame)
# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)
names
total_births = names.pivot_table("births", index="year",
                                 columns="sex",
                                 aggfunc="sum")  # string form; the builtin sum as aggfunc is deprecated
total_births.tail()
total_births.plot(title="Total births by sex and year")
def add_prop(group):
    group["prop"] = group["births"] / group["births"].sum()
    return group
names = names.groupby(["year", "sex"], group_keys=False).apply(add_prop)
names
names.groupby(["year", "sex"])["prop"].sum()
def get_top1000(group):
    return group.sort_values("births", ascending=False)[:1000]
grouped = names.groupby(["year", "sex"])
top1000 = grouped.apply(get_top1000)
top1000.head()
top1000 = top1000.reset_index(drop=True)
top1000.head()
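# An equivalent sketch using nlargest, which avoids a full per-group sort
# (tie-breaking may order rows slightly differently):
top1000_alt = (names.groupby(["year", "sex"], group_keys=False)
               .apply(lambda g: g.nlargest(1000, "births")))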
boys = top1000[top1000["sex"] == "M"]
girls = top1000[top1000["sex"] == "F"]
total_births = top1000.pivot_table("births", index="year",
                                   columns="name",
                                   aggfunc="sum")
total_births.info()
subset = total_births[["John", "Harry", "Mary", "Marilyn"]]
subset.plot(subplots=True, figsize=(12, 10),
            title="Number of births per year")
plt.figure()
table = top1000.pivot_table("prop", index="year",
                            columns="sex", aggfunc="sum")
table.plot(title="Sum of top1000.prop by year and sex",
           yticks=np.linspace(0, 1.2, 13))
df = boys[boys["year"] == 2010]
df
prop_cumsum = df["prop"].sort_values(ascending=False).cumsum()
prop_cumsum[:10]
prop_cumsum.searchsorted(0.5)
df = boys[boys.year == 1900]
in1900 = df.sort_values("prop", ascending=False).prop.cumsum()
in1900.searchsorted(0.5) + 1
def get_quantile_count(group, q=0.5):
    group = group.sort_values("prop", ascending=False)
    return group.prop.cumsum().searchsorted(q) + 1
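# Quick check (sketch): applied to the 1900 boys group, this should reproduce
# the manual in1900 computation above.
assert get_quantile_count(boys[boys["year"] == 1900]) == in1900.searchsorted(0.5) + 1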
diversity = top1000.groupby(["year", "sex"]).apply(get_quantile_count)
diversity = diversity.unstack()
fig = plt.figure()
diversity.head()
diversity.plot(title="Number of popular names in top 50%")
def get_last_letter(x):
    return x[-1]
last_letters = names["name"].map(get_last_letter)
last_letters.name = "last_letter"
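# The same result via vectorized string indexing (a sketch):
assert (names["name"].str[-1] == last_letters).all()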
table = names.pivot_table("births", index=last_letters,
                          columns=["sex", "year"], aggfunc="sum")
subtable = table.reindex(columns=[1910, 1960, 2010], level="year")
subtable.head()
subtable.sum()
letter_prop = subtable / subtable.sum()
letter_prop
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop["M"].plot(kind="bar", rot=0, ax=axes[0], title="Male")
letter_prop["F"].plot(kind="bar", rot=0, ax=axes[1], title="Female",
legend=False)
plt.subplots_adjust(hspace=0.25)
letter_prop = table / table.sum()
dny_ts = letter_prop.loc[["d", "n", "y"], "M"].T
dny_ts.head()
plt.close("all")
fig = plt.figure()
dny_ts.plot()
all_names = pd.Series(top1000["name"].unique())
lesley_like = all_names[all_names.str.contains("Lesl")]
lesley_like
filtered = top1000[top1000["name"].isin(lesley_like)]
filtered.groupby("name")["births"].sum()
table = filtered.pivot_table("births", index="year",
                             columns="sex", aggfunc="sum")
table = table.div(table.sum(axis="columns"), axis="index")
table.tail()
fig = plt.figure()
table.plot(style={"M": "k-", "F": "k--"})
with open("datasets/usda_food/database.json") as f:
    db = json.load(f)
len(db)
db[0].keys()
db[0]["nutrients"][0]
nutrients = pd.DataFrame(db[0]["nutrients"])
nutrients.head(7)
info_keys = ["description", "group", "id", "manufacturer"]
info = pd.DataFrame(db, columns=info_keys)
info.head()
info.info()
info["group"].value_counts()[:10]  # pd.value_counts is deprecated; use the Series method
nutrients = []
for rec in db:
    fnuts = pd.DataFrame(rec["nutrients"])
    fnuts["id"] = rec["id"]
    nutrients.append(fnuts)
nutrients = pd.concat(nutrients, ignore_index=True)
nutrients
nutrients.duplicated().sum() # number of duplicates
nutrients = nutrients.drop_duplicates()
col_mapping = {"description" : "food",
"group" : "fgroup"}
info = info.rename(columns=col_mapping, copy=False)
info.info()
col_mapping = {"description" : "nutrient",
"group" : "nutgroup"}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
nutrients
ndata = pd.merge(nutrients, info, on="id")
ndata.info()
ndata.iloc[30000]
fig = plt.figure()
result = ndata.groupby(["nutrient", "fgroup"])["value"].quantile(0.5)
result["Zinc, Zn"].sort_values().plot(kind="barh")
by_nutrient = ndata.groupby(["nutgroup", "nutrient"])
def get_maximum(x):
    return x.loc[x.value.idxmax()]
max_foods = by_nutrient.apply(get_maximum)[["value", "food"]]
# make the food a little smaller
max_foods["food"] = max_foods["food"].str[:50]
max_foods.loc["Amino Acids"]["food"]
fec = pd.read_csv("datasets/fec/P00000001-ALL.csv", low_memory=False)
fec.info()
fec.iloc[123456]
unique_cands = fec["cand_nm"].unique()
unique_cands
unique_cands[2]
parties = {"Bachmann, Michelle": "Republican",
"Cain, Herman": "Republican",
"Gingrich, Newt": "Republican",
"Huntsman, Jon": "Republican",
"Johnson, Gary Earl": "Republican",
"McCotter, Thaddeus G": "Republican",
"Obama, Barack": "Democrat",
"Paul, Ron": "Republican",
"Pawlenty, Timothy": "Republican",
"Perry, Rick": "Republican",
"Roemer, Charles E. 'Buddy' III": "Republican",
"Romney, Mitt": "Republican",
"Santorum, Rick": "Republican"}
fec["cand_nm"][123456:123461]
fec["cand_nm"][123456:123461].map(parties)
# Add it as a column
fec["party"] = fec["cand_nm"].map(parties)
fec["party"].value_counts()
(fec["contb_receipt_amt"] > 0).value_counts()
fec = fec[fec["contb_receipt_amt"] > 0]
fec_mrbo = fec[fec["cand_nm"].isin(["Obama, Barack", "Romney, Mitt"])]
fec["contbr_occupation"].value_counts()[:10]
occ_mapping = {
    "INFORMATION REQUESTED PER BEST EFFORTS": "NOT PROVIDED",
    "INFORMATION REQUESTED": "NOT PROVIDED",
    "INFORMATION REQUESTED (BEST EFFORTS)": "NOT PROVIDED",
    "C.E.O.": "CEO"
}
def get_occ(x):
    # If no mapping provided, return x
    return occ_mapping.get(x, x)
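# Alternative sketch: Series.replace leaves unmapped values untouched, so it
# can stand in for the get_occ pattern without a helper function.
occ_alt = fec["contbr_occupation"].replace(occ_mapping)
assert occ_alt.equals(fec["contbr_occupation"].map(get_occ))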
fec["contbr_occupation"] = fec["contbr_occupation"].map(get_occ)
emp_mapping = {
    "INFORMATION REQUESTED PER BEST EFFORTS": "NOT PROVIDED",
    "INFORMATION REQUESTED": "NOT PROVIDED",
    "SELF": "SELF-EMPLOYED",
    "SELF EMPLOYED": "SELF-EMPLOYED",
}
def get_emp(x):
    # If no mapping provided, return x
    return emp_mapping.get(x, x)
fec["contbr_employer"] = fec["contbr_employer"].map(get_emp)
by_occupation = fec.pivot_table("contb_receipt_amt",
                                index="contbr_occupation",
                                columns="party", aggfunc="sum")
over_2mm = by_occupation[by_occupation.sum(axis="columns") > 2000000]
over_2mm
plt.figure()
over_2mm.plot(kind="barh")
def get_top_amounts(group, key, n=5):
    totals = group.groupby(key)["contb_receipt_amt"].sum()
    return totals.nlargest(n)
grouped = fec_mrbo.groupby("cand_nm")
grouped.apply(get_top_amounts, "contbr_occupation", n=7)
grouped.apply(get_top_amounts, "contbr_employer", n=10)
bins = np.array([0, 1, 10, 100, 1000, 10000,
                 100_000, 1_000_000, 10_000_000])
labels = pd.cut(fec_mrbo["contb_receipt_amt"], bins)
labels
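# pd.cut returns interval-valued categories (right-inclusive by default);
# the bin edges can be inspected via the categorical accessor (a sketch):
labels.cat.categories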
grouped = fec_mrbo.groupby(["cand_nm", labels])
grouped.size().unstack(level=0)
plt.figure()
bucket_sums = grouped["contb_receipt_amt"].sum().unstack(level=0)
normed_sums = bucket_sums.div(bucket_sums.sum(axis="columns"),
                              axis="index")
normed_sums
normed_sums[:-2].plot(kind="barh")
grouped = fec_mrbo.groupby(["cand_nm", "contbr_st"])
totals = grouped["contb_receipt_amt"].sum().unstack(level=0).fillna(0)
totals = totals[totals.sum(axis="columns") > 100000]
totals.head(10)
percent = totals.div(totals.sum(axis="columns"), axis="index")
percent.head(10)