import sys
import re
from pathlib import Path
import itertools as it
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import plotnine as p
import bookgender.datatools as dt
from bookgender.nbutils import *
def eprint(*args, **kwargs):
    """Print a progress/diagnostic message to standard error.

    Positional arguments are printed exactly as :func:`print` would print
    them.  Keyword arguments (``sep``, ``end``, ``flush``) are forwarded to
    :func:`print`; ``file`` defaults to ``sys.stderr`` when not supplied.
    """
    # Look up sys.stderr at call time so stream redirection is honoured.
    kwargs.setdefault('file', sys.stderr)
    print(*args, **kwargs)
# NOTE(review): `init_figs` is not defined in this file; presumably it is
# provided by the star import from bookgender.nbutils — TODO confirm what it
# returns and which side effects it has.
fig_dir = init_figs('DataSummary')
def lbl_pct(fs):
    """Turn an iterable of fractions into whole-percent labels.

    Each value is multiplied by 100 and formatted with no decimals, e.g.
    ``0.25`` becomes ``'25%'``.  Returns a list of strings.
    """
    return [f'{frac * 100:.0f}%' for frac in fs]
# Function to make plots:
# NOTE(review): no function definition follows this heading in this file;
# the plotting helper is presumably supplied by the star import from
# bookgender.nbutils — TODO confirm.
# ## Load Data Files
# Load book author gender info:
# `datasets` is the sorted list of keys of `dt.datasets`; each key is used
# below as the directory name of a ratings file.
datasets = sorted(list(dt.datasets.keys()))
book_gender = pd.read_parquet('data/author-gender.parquet')
book_gender['gender'] = book_gender['gender'].astype('category')
book_gender.info()
# NOTE(review): this second read rebinds `book_gender`, so the Parquet frame
# loaded just above is discarded and only the CSV copy is used from here on.
# One of the two loads is presumably redundant — TODO confirm which file is
# canonical and drop the other.
book_gender = pd.read_csv('data/author-gender.csv.gz', dtype={'gender': 'category'})
book_gender.info()
# Book gender will be more useful if we index it, and it's basically now a series.
book_gender = book_gender.set_index('item')['gender']
book_gender
# Load the Library of Congress book list:
loc_books = pd.read_csv('data/loc-books.csv.gz')
loc_books.info()
# Load rating data sets:
# `ratings` maps each data set name to its ratings frame.
ratings = {}
for ds in datasets:
    eprint('loading ratings for', ds)
    ratings[ds] = pd.read_parquet(f'data/{ds}/ratings.parquet')
# ### Fill and Expand Gender
#
# For later computations, we want to upgrade the book-gender series so it has the following properties:
#
# * All available books have a gender record
# * Both full status and simplified unlinked status are available for each book
#
# This will simplify combining other records with the book gender data later.
#
# Let's start by making a huge array of all available book IDs:
item_lists = [loc_books['item'].unique()]
for rdf in ratings.values():
    item_lists.append(rdf['item'].unique())
# np.unique returns the sorted, de-duplicated union of all item IDs.
all_item_ids = np.unique(np.concatenate(item_lists))
all_item_ids.shape
# How does that compare to the book gender series?
book_gender.count()
# Add a category to `gender` for no-matching-book, and put an order on the categories.
# The results are rebound rather than modified in place: the `inplace=` option of the
# `.cat` accessor methods was deprecated in pandas 1.3 and removed in pandas 2.0.
book_gender = book_gender.cat.add_categories(['no-book'])
book_gender = book_gender.cat.reorder_categories([
    'no-book', 'no-loc-author', 'no-viaf-author',
    'unknown', 'ambiguous', 'female', 'male',
])
# Reindex to match our list of book IDs, and fill in the missing value:
book_gender = book_gender.reindex(all_item_ids, fill_value='no-book')
book_gender
# Now the index should be both monotonic and unique - this should simplify later use. Double-check:

# In[18]:
book_gender.index.is_unique

# In[19]:
# `Index.is_monotonic` was removed in pandas 2.0; `is_monotonic_increasing`
# is the equivalent property and exists in older releases too.
book_gender.index.is_monotonic_increasing

# Let's take a quick look at a histogram:

# In[20]:
# Pass the series as `x=`: the meaning of the first positional parameter of
# countplot differs between seaborn versions.
sns.countplot(x=book_gender)

# Ok. Last thing we need to do here is create a simplified column that collapses our various
# types of link failure into 'unlinked'. We'll put this in the `gender` column, and make the
# existing series `gender_status`:

# In[21]:
# Renaming 'no-book' to 'unlinked' first makes 'unlinked' a valid category, so the
# values blanked by removing the other two link-failure categories can be filled with it.
_simple_gender = book_gender.cat.rename_categories({'no-book': 'unlinked'})
_simple_gender = _simple_gender.cat.remove_categories(['no-loc-author', 'no-viaf-author'])
book_gender = pd.DataFrame({
    'gender_status': book_gender,
    'gender': _simple_gender.fillna('unlinked'),
})
book_gender

# And see that histogram:

# In[22]:
sns.countplot(x=book_gender['gender'])

# ## Basic Data Set Stats

# In[23]:
# One row per data set: distinct users, distinct items, and number of rating rows.
ds_summary = pd.DataFrame.from_dict({
    name: {
        'Users': frame['user'].nunique(),
        'Items': frame['item'].nunique(),
        'Pairs': len(frame),
    }
    for name, frame in ratings.items()
}, orient='index')
ds_summary['Density'] = ds_summary['Pairs'] / (ds_summary['Users'] * ds_summary['Items'])
ds_summary


# In[24]:
def pct_fmt(p):
    """Format the fraction ``p`` as a percentage with four decimals."""
    return '{:.4f}%'.format(p * 100)


def n_fmt(n):
    """Format the integer ``n`` with thousands separators."""
    return '{:,d}'.format(n)


print(ds_summary.to_latex(formatters={
    'Users': n_fmt,
    'Items': n_fmt,
    'Pairs': n_fmt,
    'Density': pct_fmt,
}))

# ### Distributions

# What is the rating distribution for explicit-feedback data sets?
# In[25]:
# Matches two-character data set names, optionally followed by '-E'.
exp_re = re.compile(r'^\w\w(-E|$)')
[name for name in ratings if exp_re.match(name)]

# In[26]:
# Number of ratings at each rating value, per explicit-feedback data set.
exp_rate_stats = pd.concat(
    [
        frame.groupby('rating')['item'].count().reset_index(name='count').assign(Set=name)
        for name, frame in ratings.items()
        if exp_re.match(name)
    ],
    ignore_index=True,
)
exp_rate_stats.head()

# In[27]:
grid = sns.FacetGrid(data=exp_rate_stats, col='Set', sharex=False, sharey=False)
grid.map(sns.barplot, 'rating', 'count')

# In[28]:
# Mean rating given by each user, per explicit-feedback data set.
user_means = pd.concat(
    [
        frame.groupby('user')['rating'].mean().reset_index(name='AvgRating').assign(Set=name)
        for name, frame in ratings.items()
        if exp_re.match(name)
    ],
    ignore_index=True,
)
user_means.head()

# In[29]:
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
# because the replacement (histplot) is not available in older versions.
grid = sns.FacetGrid(data=user_means, col='Set', sharex=False, sharey=False)
grid.map(sns.distplot, 'AvgRating')

# In[30]:
# Mean rating received by each item, per explicit-feedback data set.
item_means = pd.concat(
    [
        frame.groupby('item')['rating'].mean().reset_index(name='AvgRating').assign(Set=name)
        for name, frame in ratings.items()
        if exp_re.match(name)
    ],
    ignore_index=True,
)
item_means.head()

# In[31]:
grid = sns.FacetGrid(data=item_means, col='Set', sharex=False, sharey=False)
grid.map(sns.distplot, 'AvgRating')

# ## Count and Integrate
#
# Now that we have the data loaded, we need to do a few things:
#
# 1. Connect with gender info
# 2. Count books (or ratings) by gender. All kinds of unlinked gender are mapped to `unlinked`.
# 3.
#    Integrate into a single set of lists
#
# To start, we'll define a helper function for summarizing a frame of interactions by gender:


# In[32]:
def summarize_by_gender(rate_frame, gender_col='gender'):
    """Count books and ratings per gender category.

    Args:
        rate_frame: frame with an ``item`` column, one row per interaction.
        gender_col: column of the module-level ``book_gender`` frame to group
            by (``'gender'`` or ``'gender_status'``).

    Returns:
        A frame indexed by the categories of ``gender_col`` with columns
        ``Books`` (number of distinct items) and ``Ratings`` (number of rows).
    """
    # count ratings per book
    i_counts = rate_frame['item'].value_counts().to_frame(name='ratings')
    # join with gender (both sides are indexed by item ID)
    books = i_counts.join(book_gender)
    # count by gender; observed=False keeps every category in the result even
    # when a data set has no books in it, so all summaries share one layout
    counts = books.groupby(gender_col, observed=False)['ratings'].agg(['count', 'sum'])
    return counts.rename(columns={'count': 'Books', 'sum': 'Ratings'})


# Let's see the function in action:

# In[33]:
summarize_by_gender(ratings['BX-E'])

# Now build up a full frame of everything:

# In[34]:
eprint('summarizing LOC')
# The export had `.assign(ratings=np.nan)` here.  Because the column is named
# `Ratings`, that only added a stray all-NaN lowercase column to every summary,
# so it is dropped; the LOC `Ratings` values are removed explicitly when the
# data is reshaped for plotting.
summaries = {'LOC': summarize_by_gender(loc_books)}
for ds, f in ratings.items():
    eprint('summarizing', ds)
    summaries[ds] = summarize_by_gender(f)
gender_stats = pd.concat(summaries, names=['DataSet'])
gender_stats.info()

# In[35]:
eprint('summarizing LOC')
fsums = {'LOC': summarize_by_gender(loc_books, 'gender_status')}
for ds, f in ratings.items():
    eprint('summarizing', ds)
    fsums[ds] = summarize_by_gender(f, 'gender_status')
full_stats = pd.concat(fsums, names=['DataSet'])
full_stats.info()

# In[36]:
book_counts = full_stats['Books'].unstack()
book_counts

# In[37]:
book_counts[['no-book', 'no-loc-author', 'no-viaf-author']].sum(axis=1)

# In[38]:
book_fracs = book_counts.divide(book_counts.sum(axis=1), axis=0)
book_fracs

# In[39]:
# book_counts.divide(book_counts.sum(axis=1), axis=0) * 100

# In[40]:
print((book_fracs * 100).to_latex(float_format='%.1f%%'))

# To facilitate plotting, we need to do a few more transformations:
#
# 1. Shift into a tall format with a `Scope`
# 2. Convert counts to percents
# 3.
#    Drop the LOC Ratings, because it is meaningless

# In[41]:
gs_tall = pd.DataFrame({'Count': gender_stats.stack()})
gs_tall.index.names = ['DataSet', 'Gender', 'Scope']
gs_tall = gs_tall.reorder_levels(['DataSet', 'Scope', 'Gender']).sort_index()
gs_tall['Fraction'] = gs_tall['Count'] / gs_tall.groupby(level=['DataSet', 'Scope'])['Count'].sum()
gs_tall.drop(('LOC', 'Ratings'), inplace=True)
gs_tall.sort_index(inplace=True)
gs_tall.reset_index(inplace=True)
# The `.cat` methods return new series (their `inplace=` option was removed in
# pandas 2.0), so each result is assigned back to its column.
gs_tall['Gender'] = gs_tall['Gender'].cat.rename_categories({
    'female': 'F',
    'male': 'M',
    'ambiguous': 'Amb.',
    'unknown': 'UnK',
    'unlinked': 'UnL',
}).cat.reorder_categories(['F', 'M', 'Amb.', 'UnK', 'UnL'])
gs_tall['DataSet'] = gs_tall['DataSet'].astype('category').cat.reorder_categories(
    ['LOC', 'AZ', 'BX-I', 'BX-E', 'GR-I', 'GR-E']
)
gs_tall

# Finally, we can plot it:

# In[42]:
sns.catplot(x='Gender', y='Fraction', col='DataSet', col_wrap=2, hue='Scope',
            data=gs_tall.reset_index(), kind='bar', sharey=False, height=2, aspect=2)

# Manual plotting logic for the paper:

# In[43]:
make_plot(gs_tall, p.aes('Gender', 'Fraction', fill='Scope'),
          p.geom_bar(stat='identity', position='dodge'),
          p.geom_text(p.aes(label='Fraction*100'), format_string='{:.1f}%', size=5,
                      position=p.position_dodge(width=1), va='bottom'),
          p.facet_wrap('~DataSet', ncol=2),
          p.scale_fill_brewer('qual', 'Dark2'),
          p.scale_y_continuous(labels=lbl_pct),
          p.ylab('% of Books or Ratings'),
          legend_position='top', legend_title=p.element_blank(),
          file='link-stats.pdf', width=7, height=4.5)

# Known-gender books:

# In[44]:
k_bc = book_counts[['male', 'female']]
k_bf = k_bc.divide(k_bc.sum(axis=1), axis=0)
k_bf = k_bf.loc[['LOC', 'AZ', 'BX-I', 'GR-I']]
k_bf

# In[45]:
print((k_bf * 100).to_latex(float_format='%.1f%%'))

# In[46]:
k_bf.columns = k_bf.columns.astype('str')
k_bft = k_bf.reset_index().melt(id_vars='DataSet', var_name='gender')
k_bft['gender'] = k_bft.gender.astype('category').cat.reorder_categories(['male', 'female'])
k_bft['DataSet'] = k_bft.DataSet.astype('category').cat.reorder_categories(['LOC', 'AZ', 'BX-I', 'GR-I'])

# In[47]:
make_plot(k_bft, p.aes('DataSet', 'value', fill='gender'),
          p.geom_bar(stat='identity'),
          p.scale_fill_brewer('qual', 'Dark2'),
          p.labs(x='Data Set', y='% of Books', fill='Gender'),
          p.scale_y_continuous(labels=lbl_pct),
          file='frac-known-books.pdf', width=4, height=2.5)

# And do that again for ratings.

# In[48]:
rate_counts = full_stats['Ratings'].unstack()
k_rc = rate_counts[['male', 'female']]
k_rf = k_rc.divide(k_rc.sum(axis=1), axis=0)
k_rf = k_rf.loc[datasets]
k_rf

# In[49]:
all_cts = full_stats.reorder_levels([1, 0]).loc[['male', 'female']].reorder_levels([1, 0]).unstack()
all_cts.sort_index(axis=1, inplace=True)
# Per-data-set totals for each top-level column group.  `sum(axis=1, level=0)`
# was removed in pandas 2.0; grouping the transposed frame by its first level
# computes the same totals.
_group_totals = all_cts.T.groupby(level=0).sum().T
print(all_cts.divide(_group_totals, axis=0, level=0)
      .to_latex(float_format=lambda f: '{:.1f}%'.format(f*100)))

# In[50]:
k_rf.columns = k_rf.columns.astype('str')
k_rft = k_rf.reset_index().melt(id_vars='DataSet', var_name='gender')
k_rft['gender'] = k_rft.gender.astype('category').cat.reorder_categories(['male', 'female'])
k_rft['DataSet'] = k_rft.DataSet.astype('category').cat.reorder_categories(datasets)

# In[51]:
make_plot(k_rft, p.aes('DataSet', 'value', fill='gender'),
          p.geom_bar(stat='identity'),
          p.scale_fill_brewer('qual', 'Dark2'),
          p.scale_y_continuous(labels=lbl_pct),
          p.labs(x='Data Set', y='% of Ratings', fill='Gender'),
          file='frac-known-rates.pdf', width=4, height=2.5)

# ## Popularity and Gender Distributions
#
# We now want to look at popularity and assorted distributions.
#
# We will start by computing item statistics.
# In[52]:
def _ds_stats(ds, df):
    """Compute per-item popularity statistics for one rating data set.

    Args:
        ds: data set name; logged and stored in the ``Set`` column.
        df: ratings frame with ``item`` and ``user`` columns.

    Returns:
        A frame with one row per item holding ``nratings`` (number of rating
        rows), the joined ``book_gender`` columns, ``PopRank`` (rank of
        ``nratings`` divided by the largest rank), ``PopQ`` (``PopRank`` as a
        rounded integer percentage) and ``Set``.
    """
    eprint('summarizing ', ds)
    stats = df.groupby('item')['user'].count().reset_index(name='nratings')
    stats = stats.join(book_gender, on='item')
    pop_rank = stats['nratings'].rank()
    stats['PopRank'] = pop_rank / pop_rank.max()
    stats['PopQ'] = (stats['PopRank'] * 100).round().astype('i4')
    stats['Set'] = ds
    return stats


# Only the data sets whose name does not end in '-E' are summarized here.
item_stats = pd.concat(_ds_stats(ds, df) for (ds, df) in ratings.items() if not ds.endswith('-E'))
item_stats['Set'] = item_stats['Set'].astype('category')
item_stats.head()

# Compute rating count histograms:

# In[53]:
# observed=False pins the categorical-groupby behaviour this code relied on as the default.
nr_hist = (item_stats.groupby(['Set', 'nratings'], observed=False)['item']
           .count().reset_index(name='items'))
make_plot(nr_hist, p.aes(x='nratings', y='items', color='Set'),
          p.geom_point(),
          p.scale_x_log10(),
          p.scale_y_log10())

# Let's look at rating count per book by gender resolution:

# In[54]:
rate_rates = (item_stats.groupby(['Set', 'gender'], observed=False)['nratings']
              .agg(['mean', 'median']))
rr_stat = rate_rates.unstack().swaplevel(axis=1).loc[:, ['male', 'female']].sort_index(axis=1)
print(rr_stat.to_latex(float_format='%.2f'))

# Now compute gender histograms by percentile so we can stack:

# In[55]:
pop_g = item_stats.groupby(['Set', 'PopQ', 'gender'], observed=True)['item'].count().unstack()
pop_g.fillna(0, inplace=True)
pop_g = pop_g.divide(pop_g.sum(axis=1), axis=0)
pop_g.sort_index(inplace=True)
pop_g.head()

# Propagate to percentile 0, so we can plot the whole width:

# In[56]:
for ds in pop_g.index.levels[0].categories:
    dspg = pop_g.loc[ds, :]
    # copy the first percentile row of this data set to percentile 0
    pop_g.loc[(ds, 0), :] = dspg.iloc[0, :]
pop_g.sort_index(inplace=True)

# Stack for plotting:

# In[57]:
pop_g = pop_g.stack().reset_index(name='items')
pop_g.head()

# In[58]:
# Rebind the result: the `inplace=` option of `.cat.reorder_categories` was
# removed in pandas 2.0.
pop_g['gender'] = pop_g['gender'].cat.reorder_categories([
    'male', 'female', 'ambiguous', 'unknown', 'unlinked'
])

# And make an area plot.
# In[59]:
# Stacked area chart of gender share by popularity percentile, one row per data set.
make_plot(
    pop_g,
    p.aes(x='PopQ', y='items', fill='gender'),
    p.geom_area(),
    p.scale_fill_brewer('qual', 'Set2'),
    p.scale_y_continuous(labels=lbl_pct),
    p.scale_x_continuous(expand=(0, 0)),
    p.facet_grid('Set ~'),
    p.labs(
        x='Item Popularity Percentile (100 is most popular)',
        y='% of Books',
        fill='Gender',
    ),
    file='gender-by-pop',
    width=8,
    height=5,
)

# In[ ]: