#!/usr/bin/env python
# coding: utf-8

# # Data Set Summary Info
#
# This notebook provides summary information and descriptive statistics for our data sets.

# ## Setup

# In[1]:

import sys
import re

# In[2]:

from pathlib import Path

# In[3]:

import itertools as it
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import plotnine as p

# In[4]:

import bookgender.datatools as dt
from bookgender.nbutils import *

# In[5]:

def eprint(*args):
    print(*args, file=sys.stderr)

# In[6]:

fig_dir = init_figs('DataSummary')

# In[7]:

def lbl_pct(fs):
    return ['{:.0f}%'.format(f * 100) for f in fs]

# The function to make plots (`make_plot`) is imported from `bookgender.nbutils`; `lbl_pct` formats fractions as percentage axis labels.

# ## Load Data Files

# Load book author gender info:

# In[8]:

datasets = sorted(list(dt.datasets.keys()))

# In[9]:

book_gender = pd.read_parquet('data/author-gender.parquet')
book_gender['gender'] = book_gender['gender'].astype('category')
book_gender.info()

# In[10]:

book_gender = pd.read_csv('data/author-gender.csv.gz', dtype={'gender': 'category'})
book_gender.info()

# Book gender will be more useful if we index it; it's basically a series now.

# In[11]:

book_gender = book_gender.set_index('item')['gender']
book_gender

# Load the Library of Congress book list:

# In[12]:

loc_books = pd.read_csv('data/loc-books.csv.gz')
loc_books.info()

# Load rating data sets:

# In[13]:

ratings = {}
for ds in datasets:
    eprint('loading ratings for', ds)
    ratings[ds] = pd.read_parquet(f'data/{ds}/ratings.parquet')

# ### Fill and Expand Gender
#
# For later computations, we want to upgrade the book-gender frame so it has the following properties:
#
# * All available books have a gender record
# * Both the full status and the simplified unlinked status are available for each book
#
# This will simplify combining other records with the book gender data later.
#
# Let's start by making a huge array of all available book IDs:

# In[14]:

item_lists = [loc_books['item'].unique()]
for rdf in ratings.values():
    item_lists.append(rdf['item'].unique())
all_item_ids = np.unique(np.concatenate(item_lists))
all_item_ids.shape

# How does that compare to the book gender frame?

# In[15]:

book_gender.count()

# Add a category to `gender` for no-matching-book, and put an order on the categories (we're also going to make `book_gender` refer to the series, to simplify code):

# In[16]:

book_gender.cat.add_categories(['no-book'], inplace=True)
book_gender.cat.reorder_categories(['no-book', 'no-loc-author', 'no-viaf-author', 'unknown',
                                    'ambiguous', 'female', 'male'], inplace=True)

# Reindex to match our list of book IDs, and fill in the missing value:

# In[17]:

book_gender = book_gender.reindex(all_item_ids, fill_value='no-book')
book_gender

# Now the index should be both monotonic and unique - this should simplify later use. Double-check:

# In[18]:

book_gender.index.is_unique

# In[19]:

book_gender.index.is_monotonic

# Let's take a quick look at a histogram:

# In[20]:

sns.countplot(book_gender)
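# The reindex-and-fill step above is the key trick: the fill value has to be an existing category, which is why `no-book` was added before reindexing. A minimal, self-contained sketch of the same pattern on made-up item IDs (illustrative only, not part of the analysis):

# In[ ]:

# toy series: genders known for items 2 and 5 only (made-up data)
_demo = pd.Series(['female', 'male'],
                  index=pd.Index([2, 5], name='item'),
                  dtype='category')
_demo = _demo.cat.add_categories(['no-book'])      # make the fill value a valid category
_demo.reindex([1, 2, 3, 5], fill_value='no-book')  # items 1 and 3 become 'no-book'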
# OK, the last thing we need to do here is create a simplified column that collapses our various types of link failure into `unlinked`. We'll put this in the `gender` column, and make the existing series `gender_status`:

# In[21]:

book_gender = pd.DataFrame({
    'gender_status': book_gender,
    'gender': book_gender.cat.rename_categories({
        'no-book': 'unlinked'
    }).cat.remove_categories([
        'no-loc-author', 'no-viaf-author'
    ]).fillna('unlinked')
})
book_gender

# And see that histogram:

# In[22]:

sns.countplot(book_gender['gender'])

# ## Basic Data Set Stats

# In[23]:

ds_summary = pd.DataFrame.from_dict(dict(
    (n, {'Users': f['user'].nunique(), 'Items': f['item'].nunique(), 'Pairs': len(f)})
    for (n, f) in ratings.items()
), orient='index')
ds_summary['Density'] = ds_summary['Pairs'] / (ds_summary['Users'] * ds_summary['Items'])
ds_summary

# In[24]:

def pct_fmt(p):
    return '{:.4f}%'.format(p * 100)

def n_fmt(n):
    return '{:,d}'.format(n)

print(ds_summary.to_latex(formatters={
    'Users': n_fmt, 'Items': n_fmt, 'Pairs': n_fmt, 'Density': pct_fmt
}))

# ### Distributions

# What is the rating distribution for the explicit-feedback data sets?

# In[25]:

# explicit-feedback sets: two-letter names (AZ) or names ending in -E
exp_re = re.compile(r'^\w\w(-E|$)')
[ds for ds in ratings.keys() if exp_re.match(ds)]

# In[26]:

exp_rate_stats = pd.concat(
    (rates.groupby('rating').item.count().reset_index(name='count').assign(Set=ds)
     for (ds, rates) in ratings.items()
     if exp_re.match(ds)),
    ignore_index=True)
exp_rate_stats.head()

# In[27]:

grid = sns.FacetGrid(col='Set', data=exp_rate_stats, sharex=False, sharey=False)
grid.map(sns.barplot, 'rating', 'count')

# In[28]:

user_means = pd.concat(
    (rates.groupby('user').rating.mean().reset_index(name='AvgRating').assign(Set=ds)
     for (ds, rates) in ratings.items()
     if exp_re.match(ds)),
    ignore_index=True)
user_means.head()

# In[29]:

grid = sns.FacetGrid(col='Set', data=user_means, sharey=False, sharex=False)
grid.map(sns.distplot, 'AvgRating')

# In[30]:

item_means = pd.concat(
    (rates.groupby('item').rating.mean().reset_index(name='AvgRating').assign(Set=ds)
     for (ds, rates) in ratings.items()
     if exp_re.match(ds)),
    ignore_index=True)
item_means.head()

# In[31]:

grid = sns.FacetGrid(col='Set', data=item_means, sharey=False, sharex=False)
grid.map(sns.distplot, 'AvgRating')
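# `distplot` is deprecated in recent seaborn releases (and eventually removed). If this notebook is run against such a version, `displot` is the closest replacement; a sketch under that assumption, not part of the original analysis:

# In[ ]:

# per-set histograms of user mean ratings with the newer seaborn API (seaborn >= 0.11)
sns.displot(user_means, x='AvgRating', col='Set',
            facet_kws={'sharex': False, 'sharey': False})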
# ## Count and Integrate
#
# Now that we have the data loaded, we need to do a few things:
#
# 1. Connect with gender info
# 2. Count books (or ratings) by gender. All kinds of unlinked gender are mapped to `unlinked`.
# 3. Integrate into a single set of lists
#
# To start, we'll define a helper function for summarizing a frame of interactions by gender:

# In[32]:

def summarize_by_gender(rate_frame, gender_col='gender'):
    # count ratings per book
    i_counts = rate_frame['item'].value_counts().to_frame(name='ratings')
    # join with gender
    books = i_counts.join(book_gender)
    # count by gender
    counts = books.groupby(gender_col)['ratings'].agg(['count', 'sum'])
    counts.rename(columns={
        'count': 'Books',
        'sum': 'Ratings'
    }, inplace=True)
    return counts

# Let's see the function in action:

# In[33]:

summarize_by_gender(ratings['BX-E'])

# Now build up a full frame of everything:

# In[34]:

# summarize each data set with the collapsed `gender` column
eprint('summarizing LOC')
summaries = {'LOC': summarize_by_gender(loc_books).assign(ratings=np.nan)}
for ds, f in ratings.items():
    eprint('summarizing', ds)
    summaries[ds] = summarize_by_gender(f)
gender_stats = pd.concat(summaries, names=['DataSet'])
gender_stats.info()

# In[35]:

# the same, but with the full `gender_status` column
eprint('summarizing LOC')
fsums = {'LOC': summarize_by_gender(loc_books, 'gender_status')}
for ds, f in ratings.items():
    eprint('summarizing', ds)
    fsums[ds] = summarize_by_gender(f, 'gender_status')
full_stats = pd.concat(fsums, names=['DataSet'])
full_stats.info()

# In[36]:

book_counts = full_stats['Books'].unstack()
book_counts

# In[37]:

# total books with any kind of link failure, per data set
book_counts[['no-book', 'no-loc-author', 'no-viaf-author']].sum(axis=1)

# In[38]:

book_fracs = book_counts.divide(book_counts.sum(axis=1), axis=0)
book_fracs

# In[39]:

# book_counts.divide(book_counts.sum(axis=1), axis=0) * 100

# In[40]:

print((book_counts.divide(book_counts.sum(axis=1), axis=0) * 100).to_latex(float_format='%.1f%%'))

# To facilitate plotting, we need to do a few more transformations:
#
# 1. Shift into a tall format with a `Scope` level
# 2. Convert counts to percentages
# 3. Drop the LOC ratings, because they are meaningless

# In[41]:

gs_tall = pd.DataFrame({'Count': gender_stats.stack()})
gs_tall.index.rename(['DataSet', 'Gender', 'Scope'], inplace=True)
gs_tall = gs_tall.reorder_levels(['DataSet', 'Scope', 'Gender']).sort_index()
gs_tall['Fraction'] = gs_tall['Count'] / gs_tall.groupby(level=['DataSet', 'Scope'])['Count'].sum()
gs_tall.drop(('LOC', 'Ratings'), inplace=True)
gs_tall.sort_index(inplace=True)
gs_tall.reset_index(inplace=True)
gs_tall['Gender'].cat.rename_categories({
    'female': 'F', 'male': 'M', 'ambiguous': 'Amb.',
    'unknown': 'UnK', 'unlinked': 'UnL'
}, inplace=True)
gs_tall['Gender'].cat.reorder_categories([
    'F', 'M', 'Amb.', 'UnK', 'UnL'
], inplace=True)
gs_tall['DataSet'] = gs_tall['DataSet'].astype('category')
gs_tall['DataSet'].cat.reorder_categories(['LOC', 'AZ', 'BX-I', 'BX-E', 'GR-I', 'GR-E'], inplace=True)
gs_tall

# Finally, we can plot it:

# In[42]:

sns.catplot(x='Gender', y='Fraction', col='DataSet', col_wrap=2, hue='Scope',
            data=gs_tall.reset_index(), kind='bar', sharey=False,
            height=2, aspect=2)

# Manual plotting logic for the paper:

# In[43]:

make_plot(gs_tall, p.aes('Gender', 'Fraction', fill='Scope'),
          p.geom_bar(stat='identity', position='dodge'),
          p.geom_text(p.aes(label='Fraction*100'), format_string='{:.1f}%', size=5,
                      position=p.position_dodge(width=1), va='bottom'),
          p.facet_wrap('~DataSet', ncol=2),
          p.scale_fill_brewer('qual', 'Dark2'),
          p.scale_y_continuous(labels=lbl_pct),
          p.ylab('% of Books or Ratings'),
          legend_position='top',
          legend_title=p.element_blank(),
          file='link-stats.pdf', width=7, height=4.5)
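# `make_plot` is imported from `bookgender.nbutils`, which isn't shown in this notebook. As a rough, hypothetical stand-in (named `_example_make_plot` so it doesn't shadow the real helper, which may behave differently), a wrapper like this would compose the supplied plotnine layers, apply any remaining keywords as theme settings, and save the figure under `fig_dir`:

# In[ ]:

def _example_make_plot(data, *layers, file=None, width=None, height=None, **theme_kw):
    # hypothetical sketch of a make_plot-style helper; assumes fig_dir is a pathlib.Path
    plot = p.ggplot(data)
    for layer in layers:
        plot = plot + layer                # aes, geoms, scales, facets, labels, ...
    if theme_kw:
        plot = plot + p.theme(**theme_kw)  # e.g. legend_position='top'
    if file is not None:
        plot.save(fig_dir / file, width=width, height=height)
    return plot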
# Known-gender books:

# In[44]:

k_bc = book_counts[['male', 'female']]
k_bf = k_bc.divide(k_bc.sum(axis=1), axis=0)
k_bf = k_bf.loc[['LOC', 'AZ', 'BX-I', 'GR-I']]
k_bf

# In[45]:

print((k_bf * 100).to_latex(float_format='%.1f%%'))

# In[46]:

k_bf.columns = k_bf.columns.astype('str')
k_bft = k_bf.reset_index().melt(id_vars='DataSet', var_name='gender')
k_bft['gender'] = k_bft.gender.astype('category').cat.reorder_categories(['male', 'female'])
k_bft['DataSet'] = k_bft.DataSet.astype('category').cat.reorder_categories(['LOC', 'AZ', 'BX-I', 'GR-I'])

# In[47]:

make_plot(k_bft, p.aes('DataSet', 'value', fill='gender'),
          p.geom_bar(stat='identity'),
          p.scale_fill_brewer('qual', 'Dark2'),
          p.labs(x='Data Set', y='% of Books', fill='Gender'),
          p.scale_y_continuous(labels=lbl_pct),
          file='frac-known-books.pdf', width=4, height=2.5)

# And do that again for ratings.

# In[48]:

rate_counts = full_stats['Ratings'].unstack()
k_rc = rate_counts[['male', 'female']]
k_rf = k_rc.divide(k_rc.sum(axis=1), axis=0)
k_rf = k_rf.loc[datasets]
k_rf

# In[49]:

# male/female shares of known-gender books and ratings, formatted for the paper
all_cts = full_stats.reorder_levels([1, 0]).loc[['male', 'female']].reorder_levels([1, 0]).unstack()
all_cts.sort_index(axis=1, inplace=True)
print(all_cts.divide(all_cts.sum(axis=1, level=0), axis=0, level=0)
      .to_latex(float_format=lambda f: '{:.1f}%'.format(f * 100)))

# In[50]:

k_rf.columns = k_rf.columns.astype('str')
k_rft = k_rf.reset_index().melt(id_vars='DataSet', var_name='gender')
k_rft['gender'] = k_rft.gender.astype('category').cat.reorder_categories(['male', 'female'])
k_rft['DataSet'] = k_rft.DataSet.astype('category').cat.reorder_categories(datasets)

# In[51]:

make_plot(k_rft, p.aes('DataSet', 'value', fill='gender'),
          p.geom_bar(stat='identity'),
          p.scale_fill_brewer('qual', 'Dark2'),
          p.scale_y_continuous(labels=lbl_pct),
          p.labs(x='Data Set', y='% of Ratings', fill='Gender'),
          file='frac-known-rates.pdf', width=4, height=2.5)
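# For reference, a toy example (made-up numbers) of the wide-to-tall reshape used for the two known-gender plots above; `melt` turns the gender columns into rows, which is the long format the plotting layers expect:

# In[ ]:

_wide = pd.DataFrame({'DataSet': ['AZ', 'BX-I'],
                      'male': [0.62, 0.55],
                      'female': [0.38, 0.45]})  # illustrative fractions only
_wide.melt(id_vars='DataSet', var_name='gender', value_name='value')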
# ## Popularity and Gender Distributions
#
# We now want to look at popularity and assorted distributions.
#
# We will start by computing item statistics.

# In[52]:

def _ds_stats(ds, df):
    eprint('summarizing', ds)
    # per-item rating counts, joined with gender
    stats = df.groupby('item').user.count().reset_index(name='nratings')
    stats = stats.join(book_gender, on='item')
    # popularity rank, normalized to (0, 1], plus the integer popularity percentile
    stats['PopRank'] = stats['nratings'].rank()
    stats['PopRank'] = stats['PopRank'] / stats['PopRank'].max()
    stats['PopQ'] = (stats['PopRank'] * 100).round().astype('i4')
    stats['Set'] = ds
    return stats

item_stats = pd.concat(_ds_stats(ds, df) for (ds, df) in ratings.items() if not ds.endswith('-E'))
item_stats['Set'] = item_stats['Set'].astype('category')
item_stats.head()

# Compute rating count histograms:

# In[53]:

nr_hist = item_stats.groupby(['Set', 'nratings'])['item'].count().reset_index(name='items')
make_plot(nr_hist, p.aes(x='nratings', y='items', color='Set'),
          p.geom_point(),
          p.scale_x_log10(), p.scale_y_log10())

# Let's look at rating counts per book by gender resolution:

# In[54]:

rate_rates = item_stats.groupby(['Set', 'gender'])['nratings'].agg(['mean', 'median'])
rr_stat = rate_rates.unstack().swaplevel(axis=1).loc[:, ['male', 'female']].sort_index(axis=1)
print(rr_stat.to_latex(float_format='%.2f'))

# Now compute gender histograms by percentile so we can stack:

# In[55]:

pop_g = item_stats.groupby(['Set', 'PopQ', 'gender'], observed=True)['item'].count().unstack()
pop_g.fillna(0, inplace=True)
pop_g = pop_g.divide(pop_g.sum(axis=1), axis=0)
pop_g.sort_index(inplace=True)
pop_g.head()

# Propagate to percentile 0, so we can plot the whole width:

# In[56]:

for ds in pop_g.index.levels[0].categories:
    dspg = pop_g.loc[ds, :]
    pop_g.loc[(ds, 0), :] = dspg.iloc[0, :]
pop_g.sort_index(inplace=True)

# Stack for plotting:

# In[57]:

pop_g = pop_g.stack().reset_index(name='items')
pop_g.head()

# In[58]:

pop_g['gender'].cat.reorder_categories([
    'male', 'female', 'ambiguous', 'unknown', 'unlinked'
], inplace=True)

# And make an area plot.

# In[59]:

make_plot(pop_g, p.aes(x='PopQ', y='items', fill='gender'),
          p.geom_area(),
          p.scale_fill_brewer('qual', 'Set2'),
          p.scale_y_continuous(labels=lbl_pct),
          p.scale_x_continuous(expand=(0, 0)),
          p.facet_grid('Set ~'),
          p.labs(x='Item Popularity Percentile (100 is most popular)', y='% of Books', fill='Gender'),
          file='gender-by-pop', width=8, height=5)

# In[ ]:
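# As a closing illustration, here is the popularity-percentile computation from `_ds_stats` applied to a toy set of rating counts (made-up numbers): rank the counts, normalize the rank to (0, 1], and round to an integer percentile.

# In[ ]:

_counts = pd.Series([1, 5, 5, 20, 100], name='nratings')  # toy per-item rating counts
_poprank = _counts.rank() / _counts.rank().max()           # normalized popularity rank
(_poprank * 100).round().astype('i4')                      # integer popularity percentile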