#!/usr/bin/env python
# coding: utf-8

# # Data Set Summary Info
#
# This notebook provides summary information and descriptive statistics for our data sets.

# ## Setup

# In[1]:

import sys
import re

# In[2]:

from pathlib import Path

# In[3]:

import itertools as it
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import plotnine as p

# In[4]:

import bookgender.datatools as dt
from bookgender.nbutils import *

# In[5]:

def eprint(*args):
    print(*args, file=sys.stderr)

# In[6]:

fig_dir = init_figs('DataSummary')

# In[7]:

def lbl_pct(fs):
    return ['{:.0f}%'.format(f * 100) for f in fs]

# The function to make plots (`make_plot`) is imported from `bookgender.nbutils`; `lbl_pct` formats fractions as percentage axis labels.

# ## Load Data Files

# Load book author gender info:

# In[8]:

datasets = sorted(list(dt.datasets.keys()))

# In[9]:

book_gender = pd.read_parquet('data/author-gender.parquet')
book_gender['gender'] = book_gender['gender'].astype('category')
book_gender.info()

# In[10]:

book_gender = pd.read_csv('data/author-gender.csv.gz', dtype={'gender': 'category'})
book_gender.info()

# Book gender will be more useful if we index it; it's basically a series now.

# In[11]:

book_gender = book_gender.set_index('item')['gender']
book_gender

# Load the Library of Congress book list:

# In[12]:

loc_books = pd.read_csv('data/loc-books.csv.gz')
loc_books.info()

# Load rating data sets:

# In[13]:

ratings = {}
for ds in datasets:
    eprint('loading ratings for', ds)
    ratings[ds] = pd.read_parquet(f'data/{ds}/ratings.parquet')

# ### Fill and Expand Gender
#
# For later computations, we want to upgrade the book-gender frame so it has the following properties:
#
# * All available books have a gender record
# * Both the full status and the simplified unlinked status are available for each book
#
# This will simplify combining other records with the book gender data later.
#
# Let's start by making a huge array of all available book IDs:

# In[14]:

item_lists = [loc_books['item'].unique()]
for rdf in ratings.values():
    item_lists.append(rdf['item'].unique())
all_item_ids = np.unique(np.concatenate(item_lists))
all_item_ids.shape

# How does that compare to the book gender frame?

# In[15]:

book_gender.count()

# Add a category to `gender` for no-matching-book, and put an order on the categories (we're also going to make `book_gender` refer to the series, to simplify code):

# In[16]:

book_gender.cat.add_categories(['no-book'], inplace=True)
book_gender.cat.reorder_categories(['no-book', 'no-loc-author', 'no-viaf-author', 'unknown',
                                    'ambiguous', 'female', 'male'], inplace=True)

# Reindex to match our list of book IDs, and fill in the missing value:

# In[17]:

book_gender = book_gender.reindex(all_item_ids, fill_value='no-book')
book_gender

# Now the index should be both monotonic and unique - this should simplify later use. Double-check:

# In[18]:

book_gender.index.is_unique

# In[19]:

book_gender.index.is_monotonic

# Let's take a quick look at a histogram:

# In[20]:

sns.countplot(book_gender)
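# The reindex-and-fill step above is the key trick: the fill value has to be an existing category, which is why `no-book` was added before reindexing. A minimal, self-contained sketch of the same pattern on made-up item IDs (illustrative only, not part of the analysis):

# In[ ]:

# toy series: genders known for items 2 and 5 only (made-up data)
_demo = pd.Series(['female', 'male'],
                  index=pd.Index([2, 5], name='item'),
                  dtype='category')
_demo = _demo.cat.add_categories(['no-book'])      # make the fill value a valid category
_demo.reindex([1, 2, 3, 5], fill_value='no-book')  # items 1 and 3 become 'no-book'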
# OK, the last thing we need to do here is create a simplified column that collapses our various types of link failure into `unlinked`. We'll put this in the `gender` column, and make the existing series `gender_status`:

# In[21]:

book_gender = pd.DataFrame({
    'gender_status': book_gender,
    'gender': book_gender.cat.rename_categories({
        'no-book': 'unlinked'
    }).cat.remove_categories([
        'no-loc-author', 'no-viaf-author'
    ]).fillna('unlinked')
})
book_gender

# And see that histogram:

# In[22]:

sns.countplot(book_gender['gender'])

# ## Basic Data Set Stats

# In[23]:

ds_summary = pd.DataFrame.from_dict(dict(
    (n, {'Users': f['user'].nunique(), 'Items': f['item'].nunique(), 'Pairs': len(f)})
    for (n, f) in ratings.items()
), orient='index')
ds_summary['Density'] = ds_summary['Pairs'] / (ds_summary['Users'] * ds_summary['Items'])
ds_summary

# In[24]:

def pct_fmt(p):
    return '{:.4f}%'.format(p * 100)

def n_fmt(n):
    return '{:,d}'.format(n)

print(ds_summary.to_latex(formatters={
    'Users': n_fmt, 'Items': n_fmt, 'Pairs': n_fmt, 'Density': pct_fmt
}))

# ### Distributions

# What is the rating distribution for the explicit-feedback data sets?

# In[25]:

# explicit-feedback sets: two-letter names (AZ) or names ending in -E
exp_re = re.compile(r'^\w\w(-E|$)')
[ds for ds in ratings.keys() if exp_re.match(ds)]

# In[26]:

exp_rate_stats = pd.concat(
    (rates.groupby('rating').item.count().reset_index(name='count').assign(Set=ds)
     for (ds, rates) in ratings.items()
     if exp_re.match(ds)),
    ignore_index=True)
exp_rate_stats.head()

# In[27]:

grid = sns.FacetGrid(col='Set', data=exp_rate_stats, sharex=False, sharey=False)
grid.map(sns.barplot, 'rating', 'count')

# In[28]:

user_means = pd.concat(
    (rates.groupby('user').rating.mean().reset_index(name='AvgRating').assign(Set=ds)
     for (ds, rates) in ratings.items()
     if exp_re.match(ds)),
    ignore_index=True)
user_means.head()

# In[29]:

grid = sns.FacetGrid(col='Set', data=user_means, sharey=False, sharex=False)
grid.map(sns.distplot, 'AvgRating')

# In[30]:

item_means = pd.concat(
    (rates.groupby('item').rating.mean().reset_index(name='AvgRating').assign(Set=ds)
     for (ds, rates) in ratings.items()
     if exp_re.match(ds)),
    ignore_index=True)
item_means.head()

# In[31]:

grid = sns.FacetGrid(col='Set', data=item_means, sharey=False, sharex=False)
grid.map(sns.distplot, 'AvgRating')
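# `distplot` is deprecated in recent seaborn releases (and eventually removed). If this notebook is run against such a version, `displot` is the closest replacement; a sketch under that assumption, not part of the original analysis:

# In[ ]:

# per-set histograms of user mean ratings with the newer seaborn API (seaborn >= 0.11)
sns.displot(user_means, x='AvgRating', col='Set',
            facet_kws={'sharex': False, 'sharey': False})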
# ## Count and Integrate
#
# Now that we have the data loaded, we need to do a few things:
#
# 1. Connect with gender info
# 2. Count books (or ratings) by gender. All kinds of unlinked gender are mapped to `unlinked`.
# 3. Integrate into a single set of lists
#
# To start, we'll define a helper function for summarizing a frame of interactions by gender:

# In[32]:

def summarize_by_gender(rate_frame, gender_col='gender'):
    # count ratings per book
    i_counts = rate_frame['item'].value_counts().to_frame(name='ratings')
    # join with gender
    books = i_counts.join(book_gender)
    # count by gender
    counts = books.groupby(gender_col)['ratings'].agg(['count', 'sum'])
    counts.rename(columns={
        'count': 'Books',
        'sum': 'Ratings'
    }, inplace=True)
    return counts

# Let's see the function in action:

# In[33]:

summarize_by_gender(ratings['BX-E'])

# Now build up a full frame of everything:

# In[34]:

# summarize each data set with the collapsed `gender` column
eprint('summarizing LOC')
summaries = {'LOC': summarize_by_gender(loc_books).assign(ratings=np.nan)}
for ds, f in ratings.items():
    eprint('summarizing', ds)
    summaries[ds] = summarize_by_gender(f)
gender_stats = pd.concat(summaries, names=['DataSet'])
gender_stats.info()

# In[35]:

# the same, but with the full `gender_status` column
eprint('summarizing LOC')
fsums = {'LOC': summarize_by_gender(loc_books, 'gender_status')}
for ds, f in ratings.items():
    eprint('summarizing', ds)
    fsums[ds] = summarize_by_gender(f, 'gender_status')
full_stats = pd.concat(fsums, names=['DataSet'])
full_stats.info()

# In[36]:

book_counts = full_stats['Books'].unstack()
book_counts

# In[37]:

# total books with any kind of link failure, per data set
book_counts[['no-book', 'no-loc-author', 'no-viaf-author']].sum(axis=1)

# In[38]:

book_fracs = book_counts.divide(book_counts.sum(axis=1), axis=0)
book_fracs

# In[39]:

# book_counts.divide(book_counts.sum(axis=1), axis=0) * 100

# In[40]:

print((book_counts.divide(book_counts.sum(axis=1), axis=0) * 100).to_latex(float_format='%.1f%%'))

# To facilitate plotting, we need to do a few more transformations:
#
# 1. Shift into a tall format with a `Scope` level
# 2. Convert counts to percentages
# 3. Drop the LOC ratings, because they are meaningless

# In[41]:

gs_tall = pd.DataFrame({'Count': gender_stats.stack()})
gs_tall.index.rename(['DataSet', 'Gender', 'Scope'], inplace=True)
gs_tall = gs_tall.reorder_levels(['DataSet', 'Scope', 'Gender']).sort_index()
gs_tall['Fraction'] = gs_tall['Count'] / gs_tall.groupby(level=['DataSet', 'Scope'])['Count'].sum()
gs_tall.drop(('LOC', 'Ratings'), inplace=True)
gs_tall.sort_index(inplace=True)
gs_tall.reset_index(inplace=True)
gs_tall['Gender'].cat.rename_categories({
    'female': 'F', 'male': 'M', 'ambiguous': 'Amb.',
    'unknown': 'UnK', 'unlinked': 'UnL'
}, inplace=True)
gs_tall['Gender'].cat.reorder_categories([
    'F', 'M', 'Amb.', 'UnK', 'UnL'
], inplace=True)
gs_tall['DataSet'] = gs_tall['DataSet'].astype('category')
gs_tall['DataSet'].cat.reorder_categories(['LOC', 'AZ', 'BX-I', 'BX-E', 'GR-I', 'GR-E'], inplace=True)
gs_tall

# Finally, we can plot it:

# In[42]:

sns.catplot(x='Gender', y='Fraction', col='DataSet', col_wrap=2, hue='Scope',
            data=gs_tall.reset_index(), kind='bar', sharey=False,
            height=2, aspect=2)

# Manual plotting logic for the paper:

# In[43]:

make_plot(gs_tall, p.aes('Gender', 'Fraction', fill='Scope'),
          p.geom_bar(stat='identity', position='dodge'),
          p.geom_text(p.aes(label='Fraction*100'), format_string='{:.1f}%', size=5,
                      position=p.position_dodge(width=1), va='bottom'),
          p.facet_wrap('~DataSet', ncol=2),
          p.scale_fill_brewer('qual', 'Dark2'),
          p.scale_y_continuous(labels=lbl_pct),
          p.ylab('% of Books or Ratings'),
          legend_position='top',
          legend_title=p.element_blank(),
          file='link-stats.pdf', width=7, height=4.5)
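# `make_plot` is imported from `bookgender.nbutils`, which isn't shown in this notebook. As a rough, hypothetical stand-in (named `_example_make_plot` so it doesn't shadow the real helper, which may behave differently), a wrapper like this would compose the supplied plotnine layers, apply any remaining keywords as theme settings, and save the figure under `fig_dir`:

# In[ ]:

def _example_make_plot(data, *layers, file=None, width=None, height=None, **theme_kw):
    # hypothetical sketch of a make_plot-style helper; assumes fig_dir is a pathlib.Path
    plot = p.ggplot(data)
    for layer in layers:
        plot = plot + layer                # aes, geoms, scales, facets, labels, ...
    if theme_kw:
        plot = plot + p.theme(**theme_kw)  # e.g. legend_position='top'
    if file is not None:
        plot.save(fig_dir / file, width=width, height=height)
    return plot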
# Known-gender books:

# In[44]:

k_bc = book_counts[['male', 'female']]
k_bf = k_bc.divide(k_bc.sum(axis=1), axis=0)
k_bf = k_bf.loc[['LOC', 'AZ', 'BX-I', 'GR-I']]
k_bf

# In[45]:

print((k_bf * 100).to_latex(float_format='%.1f%%'))

# In[46]:

k_bf.columns = k_bf.columns.astype('str')
k_bft = k_bf.reset_index().melt(id_vars='DataSet', var_name='gender')
k_bft['gender'] = k_bft.gender.astype('category').cat.reorder_categories(['male', 'female'])
k_bft['DataSet'] = k_bft.DataSet.astype('category').cat.reorder_categories(['LOC', 'AZ', 'BX-I', 'GR-I'])

# In[47]:

make_plot(k_bft, p.aes('DataSet', 'value', fill='gender'),
          p.geom_bar(stat='identity'),
          p.scale_fill_brewer('qual', 'Dark2'),
          p.labs(x='Data Set', y='% of Books', fill='Gender'),
          p.scale_y_continuous(labels=lbl_pct),
          file='frac-known-books.pdf', width=4, height=2.5)

# And do that again for ratings.

# In[48]:

rate_counts = full_stats['Ratings'].unstack()
k_rc = rate_counts[['male', 'female']]
k_rf = k_rc.divide(k_rc.sum(axis=1), axis=0)
k_rf = k_rf.loc[datasets]
k_rf

# In[49]:

# male/female shares of known-gender books and ratings, formatted for the paper
all_cts = full_stats.reorder_levels([1, 0]).loc[['male', 'female']].reorder_levels([1, 0]).unstack()
all_cts.sort_index(axis=1, inplace=True)
print(all_cts.divide(all_cts.sum(axis=1, level=0), axis=0, level=0)
      .to_latex(float_format=lambda f: '{:.1f}%'.format(f * 100)))

# In[50]:

k_rf.columns = k_rf.columns.astype('str')
k_rft = k_rf.reset_index().melt(id_vars='DataSet', var_name='gender')
k_rft['gender'] = k_rft.gender.astype('category').cat.reorder_categories(['male', 'female'])
k_rft['DataSet'] = k_rft.DataSet.astype('category').cat.reorder_categories(datasets)

# In[51]:

make_plot(k_rft, p.aes('DataSet', 'value', fill='gender'),
          p.geom_bar(stat='identity'),
          p.scale_fill_brewer('qual', 'Dark2'),
          p.scale_y_continuous(labels=lbl_pct),
          p.labs(x='Data Set', y='% of Ratings', fill='Gender'),
          file='frac-known-rates.pdf', width=4, height=2.5)
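# For reference, a toy example (made-up numbers) of the wide-to-tall reshape used for the two known-gender plots above; `melt` turns the gender columns into rows, which is the long format the plotting layers expect:

# In[ ]:

_wide = pd.DataFrame({'DataSet': ['AZ', 'BX-I'],
                      'male': [0.62, 0.55],
                      'female': [0.38, 0.45]})  # illustrative fractions only
_wide.melt(id_vars='DataSet', var_name='gender', value_name='value')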
# ## Popularity and Gender Distributions
#
# We now want to look at popularity and assorted distributions.
#
# We will start by computing item statistics.

# In[52]:

def _ds_stats(ds, df):
    eprint('summarizing', ds)
    # per-item rating counts, joined with gender
    stats = df.groupby('item').user.count().reset_index(name='nratings')
    stats = stats.join(book_gender, on='item')
    # popularity rank, normalized to (0, 1], plus the integer popularity percentile
    stats['PopRank'] = stats['nratings'].rank()
    stats['PopRank'] = stats['PopRank'] / stats['PopRank'].max()
    stats['PopQ'] = (stats['PopRank'] * 100).round().astype('i4')
    stats['Set'] = ds
    return stats

item_stats = pd.concat(_ds_stats(ds, df) for (ds, df) in ratings.items() if not ds.endswith('-E'))
item_stats['Set'] = item_stats['Set'].astype('category')
item_stats.head()

# Compute rating count histograms:

# In[53]:

nr_hist = item_stats.groupby(['Set', 'nratings'])['item'].count().reset_index(name='items')
make_plot(nr_hist, p.aes(x='nratings', y='items', color='Set'),
          p.geom_point(),
          p.scale_x_log10(), p.scale_y_log10())

# Let's look at rating counts per book by gender resolution:

# In[54]:

rate_rates = item_stats.groupby(['Set', 'gender'])['nratings'].agg(['mean', 'median'])
rr_stat = rate_rates.unstack().swaplevel(axis=1).loc[:, ['male', 'female']].sort_index(axis=1)
print(rr_stat.to_latex(float_format='%.2f'))

# Now compute gender histograms by percentile so we can stack:

# In[55]:

pop_g = item_stats.groupby(['Set', 'PopQ', 'gender'], observed=True)['item'].count().unstack()
pop_g.fillna(0, inplace=True)
pop_g = pop_g.divide(pop_g.sum(axis=1), axis=0)
pop_g.sort_index(inplace=True)
pop_g.head()

# Propagate to percentile 0, so we can plot the whole width:

# In[56]:

for ds in pop_g.index.levels[0].categories:
    dspg = pop_g.loc[ds, :]
    pop_g.loc[(ds, 0), :] = dspg.iloc[0, :]
pop_g.sort_index(inplace=True)

# Stack for plotting:

# In[57]:

pop_g = pop_g.stack().reset_index(name='items')
pop_g.head()

# In[58]:

pop_g['gender'].cat.reorder_categories([
    'male', 'female', 'ambiguous', 'unknown', 'unlinked'
], inplace=True)

# And make an area plot.

# In[59]:

make_plot(pop_g, p.aes(x='PopQ', y='items', fill='gender'),
          p.geom_area(),
          p.scale_fill_brewer('qual', 'Set2'),
          p.scale_y_continuous(labels=lbl_pct),
          p.scale_x_continuous(expand=(0, 0)),
          p.facet_grid('Set ~'),
          p.labs(x='Item Popularity Percentile (100 is most popular)', y='% of Books', fill='Gender'),
          file='gender-by-pop', width=8, height=5)

# In[ ]:
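# As a closing illustration, here is the popularity-percentile computation from `_ds_stats` applied to a toy set of rating counts (made-up numbers): rank the counts, normalize the rank to (0, 1], and round to an integer percentile.

# In[ ]:

_counts = pd.Series([1, 5, 5, 20, 100], name='nratings')  # toy per-item rating counts
_poprank = _counts.rank() / _counts.rank().max()           # normalized popularity rank
(_poprank * 100).round().astype('i4')                      # integer popularity percentile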