This notebook provides summary information and descriptive statistics for our data sets.
import sys
import re
from pathlib import Path
import itertools as it
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import plotnine as p
import bookgender.datatools as dt
from bookgender.nbutils import *
def eprint(*args):
    print(*args, file=sys.stderr)
fig_dir = init_figs('DataSummary')
def lbl_pct(fs):
    return ['{:.0f}%'.format(f*100) for f in fs]
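A quick illustration of the label formatter (the expected output is shown in the comment):
lbl_pct([0.0, 0.25, 0.5])  # -> ['0%', '25%', '50%']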
Plots throughout this notebook are produced with the make_plot helper, which is not defined here; it presumably comes in through the wildcard import of bookgender.nbutils above.
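As a rough orientation only, here is a minimal hypothetical sketch of what such a helper might look like; make_plot_sketch is illustrative, not the real implementation, and it assumes fig_dir (from init_figs above) is a pathlib.Path:
def make_plot_sketch(data, *components, file=None, width=7, height=5, **theme_args):
    # start a plotnine plot and add the supplied aesthetics, geoms, scales, facets, and labels
    plot = p.ggplot(data)
    for comp in components:
        plot = plot + comp
    # remaining keyword arguments are treated as theme settings (e.g. legend_position)
    if theme_args:
        plot = plot + p.theme(**theme_args)
    # save to the notebook's figure directory when a file name is given
    if file is not None:
        plot.save(fig_dir / file, width=width, height=height)
    return plot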
Load book author gender info:
datasets = sorted(list(dt.datasets.keys()))
book_gender = pd.read_parquet('data/author-gender.parquet')
book_gender['gender'] = book_gender['gender'].astype('category')
book_gender.info()
book_gender = pd.read_csv('data/author-gender.csv.gz', dtype={'gender': 'category'})
book_gender.info()
Book gender will be more useful if we index it by item; since only the gender column then remains, we convert it to a series.
book_gender = book_gender.set_index('item')['gender']
book_gender
Load the Library of Congress book list:
loc_books = pd.read_csv('data/loc-books.csv.gz')
loc_books.info()
Load rating data sets:
ratings = {}
for ds in datasets:
    eprint('loading ratings for', ds)
    ratings[ds] = pd.read_parquet(f'data/{ds}/ratings.parquet')
For later computations, we want to upgrade the book-gender series so that it has the following properties:

- it covers every item ID that appears in the LOC book list or in any rating data set (with an explicit category for items that have no gender record), and
- its index is unique and monotonic.

This will simplify combining other records with the book gender data later.
Let's start by making a huge array of all available book IDs:
item_lists = [loc_books['item'].unique()]
for rdf in ratings.values():
    item_lists.append(rdf['item'].unique())
all_item_ids = np.unique(np.concatenate(item_lists))
all_item_ids.shape
How does that compare to the book gender frame?
book_gender.count()
Add a no-book category to the gender series for items with no matching book, and put an order on the categories (since book_gender now refers to the series, this code stays simple):
book_gender.cat.add_categories(['no-book'], inplace=True)
book_gender.cat.reorder_categories(['no-book', 'no-loc-author', 'no-viaf-author',
                                    'unknown', 'ambiguous', 'female', 'male'],
                                   inplace=True)
Reindex to match our list of book IDs, and fill in the missing value:
book_gender = book_gender.reindex(all_item_ids, fill_value='no-book')
book_gender
Now the index should be both monotonic and unique - this should simplify later use. Double-check:
book_gender.index.is_unique
book_gender.index.is_monotonic
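If we want a hard check rather than a visual one (optional):
assert book_gender.index.is_unique and book_gender.index.is_monotonic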
Let's take a quick look at a histogram:
sns.countplot(book_gender)
OK. The last thing we need to do here is create a simplified column that collapses our various types of link failure into 'unlinked'. We'll put this in the gender column, and keep the existing series as gender_status:
book_gender = pd.DataFrame({
    'gender_status': book_gender,
    'gender': book_gender.cat.rename_categories({
        'no-book': 'unlinked'
    }).cat.remove_categories([
        'no-loc-author', 'no-viaf-author'
    ]).fillna('unlinked')
})
book_gender
And see that histogram:
sns.countplot(book_gender['gender'])
Next, let's summarize the size and density of each rating data set:
ds_summary = pd.DataFrame.from_dict(dict(
    (n, {'Users': f['user'].nunique(), 'Items': f['item'].nunique(), 'Pairs': len(f)})
    for (n, f) in ratings.items()
), orient='index')
ds_summary['Density'] = ds_summary['Pairs'] / (ds_summary['Users'] * ds_summary['Items'])
ds_summary
def pct_fmt(p):
    return '{:.4f}%'.format(p * 100)
def n_fmt(n):
    return '{:,d}'.format(n)
print(ds_summary.to_latex(formatters={
    'Users': n_fmt,
    'Items': n_fmt,
    'Pairs': n_fmt,
    'Density': pct_fmt
}))
What is the rating distribution for explicit-feedback data sets?
exp_re = re.compile(r'^\w\w(-E|$)')
[ds for ds in ratings.keys() if exp_re.match(ds)]
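A quick sanity check on the pattern, using data set names that appear elsewhere in this notebook (illustrative only):
# bare two-letter names (e.g. 'AZ') and names ending in '-E' are explicit-feedback sets;
# implicit sets such as 'BX-I' and 'GR-I' do not match
{n: bool(exp_re.match(n)) for n in ['AZ', 'BX-E', 'BX-I', 'GR-E', 'GR-I']}
# -> {'AZ': True, 'BX-E': True, 'BX-I': False, 'GR-E': True, 'GR-I': False}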
exp_rate_stats = pd.concat(
    (rates.groupby('rating').item.count().reset_index(name='count').assign(Set=ds)
     for (ds, rates) in ratings.items() if exp_re.match(ds)),
    ignore_index=True)
exp_rate_stats.head()
grid = sns.FacetGrid(col='Set', data=exp_rate_stats, sharex=False, sharey=False)
grid.map(sns.barplot, 'rating', 'count')
user_means = pd.concat(
    (rates.groupby('user').rating.mean().reset_index(name='AvgRating').assign(Set=ds)
     for (ds, rates) in ratings.items() if exp_re.match(ds)),
    ignore_index=True)
user_means.head()
grid = sns.FacetGrid(col='Set', data=user_means, sharey=False, sharex=False)
grid.map(sns.distplot, 'AvgRating')
item_means = pd.concat(
    (rates.groupby('item').rating.mean().reset_index(name='AvgRating').assign(Set=ds)
     for (ds, rates) in ratings.items() if exp_re.match(ds)),
    ignore_index=True)
item_means.head()
grid = sns.FacetGrid(col='Set', data=item_means, sharey=False, sharex=False)
grid.map(sns.distplot, 'AvgRating')
Now that we have the data loaded, we want to summarize books and ratings by author gender for each data set, including how many are unlinked. To start, we'll define a helper function for summarizing a frame of interactions by gender:
def summarize_by_gender(rate_frame, gender_col='gender'):
    # count ratings per book
    i_counts = rate_frame['item'].value_counts().to_frame(name='ratings')
    # join with gender
    books = i_counts.join(book_gender)
    # count by gender
    counts = books.groupby(gender_col)['ratings'].agg(['count', 'sum'])
    counts.rename(columns={
        'count': 'Books',
        'sum': 'Ratings'
    }, inplace=True)
    return counts
Let's see the function in action:
summarize_by_gender(ratings['BX-E'])
Now build up a full frame of everything:
eprint('summarizing LOC')
summaries = {'LOC': summarize_by_gender(loc_books).assign(ratings=np.nan) }
for ds, f in ratings.items():
    eprint('summarizing', ds)
    summaries[ds] = summarize_by_gender(f)
gender_stats = pd.concat(summaries, names=['DataSet'])
gender_stats.info()
And do the same thing with the full gender_status breakdown:
eprint('summarizing LOC')
fsums = {'LOC': summarize_by_gender(loc_books, 'gender_status')}
for ds, f in ratings.items():
    eprint('summarizing', ds)
    fsums[ds] = summarize_by_gender(f, 'gender_status')
full_stats = pd.concat(fsums, names=['DataSet'])
full_stats.info()
book_counts = full_stats['Books'].unstack()
book_counts
book_counts[['no-book', 'no-loc-author', 'no-viaf-author']].sum(axis=1)
book_fracs = book_counts.divide(book_counts.sum(axis=1), axis=0)
book_fracs
# book_counts.divide(book_counts.sum(axis=1), axis=0) * 100
print((book_counts.divide(book_counts.sum(axis=1), axis=0) * 100).to_latex(float_format='%.1f%%'))
To facilitate plotting, we need to do a few more transformations: convert the counts to a tall format with the Books/Ratings distinction in a Scope column, compute fractions within each data set and scope, and clean up the category labels and order.
gs_tall = pd.DataFrame({'Count': gender_stats.stack()})
gs_tall.index.rename(['DataSet', 'Gender', 'Scope'], inplace=True)
gs_tall = gs_tall.reorder_levels(['DataSet', 'Scope', 'Gender']).sort_index()
gs_tall['Fraction'] = gs_tall['Count'] / gs_tall.groupby(level=['DataSet', 'Scope'])['Count'].sum()
gs_tall.drop(('LOC', 'Ratings'), inplace=True)
gs_tall.sort_index(inplace=True)
gs_tall.reset_index(inplace=True)
gs_tall['Gender'].cat.rename_categories({
    'female': 'F',
    'male': 'M',
    'ambiguous': 'Amb.',
    'unknown': 'UnK',
    'unlinked': 'UnL'
}, inplace=True)
gs_tall['Gender'].cat.reorder_categories([
    'F',
    'M',
    'Amb.',
    'UnK',
    'UnL'
], inplace=True)
gs_tall['DataSet'] = gs_tall['DataSet'].astype('category')
gs_tall['DataSet'].cat.reorder_categories(['LOC', 'AZ', 'BX-I', 'BX-E', 'GR-I', 'GR-E'], inplace=True)
gs_tall
Finally, we can plot it:
sns.catplot(x='Gender', y='Fraction', col='DataSet', col_wrap=2, hue='Scope',
            data=gs_tall.reset_index(),
            kind='bar', sharey=False, height=2, aspect=2)
Manual plotting logic for the paper:
make_plot(gs_tall, p.aes('Gender', 'Fraction', fill='Scope'),
          p.geom_bar(stat='identity', position='dodge'),
          p.geom_text(p.aes(label='Fraction*100'), format_string='{:.1f}%', size=5,
                      position=p.position_dodge(width=1), va='bottom'),
          p.facet_wrap('~DataSet', ncol=2),
          p.scale_fill_brewer('qual', 'Dark2'),
          p.scale_y_continuous(labels=lbl_pct),
          p.ylab('% of Books or Ratings'),
          legend_position='top', legend_title=p.element_blank(),
          file='link-stats.pdf', width=7, height=4.5)
Known-gender books:
k_bc = book_counts[['male', 'female']]
k_bf = k_bc.divide(k_bc.sum(axis=1), axis=0)
k_bf = k_bf.loc[['LOC', 'AZ', 'BX-I', 'GR-I']]
k_bf
print((k_bf * 100).to_latex(float_format='%.1f%%'))
k_bf.columns = k_bf.columns.astype('str')
k_bft = k_bf.reset_index().melt(id_vars='DataSet', var_name='gender')
k_bft['gender'] = k_bft.gender.astype('category').cat.reorder_categories(['male', 'female'])
k_bft['DataSet'] = k_bft.DataSet.astype('category').cat.reorder_categories(['LOC', 'AZ', 'BX-I', 'GR-I'])
make_plot(k_bft, p.aes('DataSet', 'value', fill='gender'),
          p.geom_bar(stat='identity'),
          p.scale_fill_brewer('qual', 'Dark2'),
          p.labs(x='Data Set', y='% of Books', fill='Gender'),
          p.scale_y_continuous(labels=lbl_pct),
          file='frac-known-books.pdf', width=4, height=2.5)
And do that again for ratings.
rate_counts = full_stats['Ratings'].unstack()
k_rc = rate_counts[['male', 'female']]
k_rf = k_rc.divide(k_rc.sum(axis=1), axis=0)
k_rf = k_rf.loc[datasets]
k_rf
all_cts = full_stats.reorder_levels([1,0]).loc[['male', 'female']].reorder_levels([1,0]).unstack()
all_cts.sort_index(axis=1, inplace=True)
print(all_cts.divide(all_cts.sum(axis=1, level=0), axis=0, level=0).to_latex(float_format=lambda f: '{:.1f}%'.format(f*100)))
k_rf.columns = k_rf.columns.astype('str')
k_rft = k_rf.reset_index().melt(id_vars='DataSet', var_name='gender')
k_rft['gender'] = k_rft.gender.astype('category').cat.reorder_categories(['male', 'female'])
k_rft['DataSet'] = k_rft.DataSet.astype('category').cat.reorder_categories(datasets)
make_plot(k_rft, p.aes('DataSet', 'value', fill='gender'),
          p.geom_bar(stat='identity'),
          p.scale_fill_brewer('qual', 'Dark2'),
          p.scale_y_continuous(labels=lbl_pct),
          p.labs(x='Data Set', y='% of Ratings', fill='Gender'),
          file='frac-known-rates.pdf', width=4, height=2.5)
We now want to look at popularity and assorted distributions.
We will start by computing item statistics.
def _ds_stats(ds, df):
    eprint('summarizing', ds)
    stats = df.groupby('item').user.count().reset_index(name='nratings')
    stats = stats.join(book_gender, on='item')
    stats['PopRank'] = stats['nratings'].rank()
    stats['PopRank'] = stats['PopRank'] / stats['PopRank'].max()
    stats['PopQ'] = (stats['PopRank'] * 100).round().astype('i4')
    stats['Set'] = ds
    return stats
item_stats = pd.concat(_ds_stats(ds, df) for (ds, df) in ratings.items() if not ds.endswith('-E'))
item_stats['Set'] = item_stats['Set'].astype('category')
item_stats.head()
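To make the popularity-percentile logic in _ds_stats concrete, here is a tiny worked example on made-up rating counts (hypothetical data, not from our sets):
demo = pd.Series([1, 5, 5, 20], name='nratings')  # made-up per-item rating counts
rank = demo.rank()                                # average ranks: [1.0, 2.5, 2.5, 4.0]
pop_rank = rank / rank.max()                      # scale to (0, 1]: [0.25, 0.625, 0.625, 1.0]
(pop_rank * 100).round().astype('i4').tolist()    # percentile buckets -> [25, 62, 62, 100]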
Compute rating count histograms:
nr_hist = item_stats.groupby(['Set', 'nratings'])['item'].count().reset_index(name='items')
make_plot(nr_hist, p.aes(x='nratings', y='items', color='Set'),
          p.geom_point(),
          p.scale_x_log10(),
          p.scale_y_log10())
Let's look at rating count per book by gender resolution:
rate_rates = item_stats.groupby(['Set', 'gender'])['nratings'].agg(['mean', 'median'])
rr_stat = rate_rates.unstack().swaplevel(axis=1).loc[:, ['male', 'female']].sort_index(axis=1)
print(rr_stat.to_latex(float_format='%.2f'))
Now compute gender histograms by percentile so we can stack:
pop_g = item_stats.groupby(['Set', 'PopQ', 'gender'], observed=True)['item'].count().unstack()
pop_g.fillna(0, inplace=True)
pop_g = pop_g.divide(pop_g.sum(axis=1), axis=0)
pop_g.sort_index(inplace=True)
pop_g.head()
Propagate to percentile 0, so we can plot the whole width:
for ds in pop_g.index.levels[0].categories:
    dspg = pop_g.loc[ds, :]
    # copy each set's lowest-percentile row to percentile 0 so the area plot spans the full axis
    pop_g.loc[(ds, 0), :] = dspg.iloc[0, :]
pop_g.sort_index(inplace=True)
Stack for plotting:
pop_g = pop_g.stack().reset_index(name='items')
pop_g.head()
pop_g['gender'].cat.reorder_categories([
    'male', 'female', 'ambiguous',
    'unknown', 'unlinked'
], inplace=True)
And make an area plot.
make_plot(pop_g, p.aes(x='PopQ', y='items', fill='gender'),
          p.geom_area(),
          p.scale_fill_brewer('qual', 'Set2'),
          p.scale_y_continuous(labels=lbl_pct),
          p.scale_x_continuous(expand=(0,0)),
          p.facet_grid('Set ~ .'),
          p.labs(x='Item Popularity Percentile (100 is most popular)',
                 y='% of Books',
                 fill='Gender'),
          file='gender-by-pop', width=8, height=5)