#!/usr/bin/env python
# coding: utf-8

# # Recommendation List Data Prep
# 
# This notebook does the data preparation for the recommendation list analysis.

# ## Setup

# In[1]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import product

# In[2]:

import ujson

# In[3]:

from bookgender.config import data_dir

# ## Load Data

# Load the book gender data and clean it up, collapsing the various 'no-*' codes and 'unlinked' into a single 'unknown' category:

# In[4]:

book_gender = pd.read_csv('data/author-gender.csv.gz')
book_gender = book_gender.set_index('item')['gender']
book_gender.loc[book_gender.str.startswith('no-')] = 'unknown'
book_gender.loc[book_gender == 'unlinked'] = 'unknown'
book_gender = book_gender.astype('category')
book_gender.describe()

# In[5]:

book_gender.head()

# And load the book hashes, deriving a binary dummy code from the last hex digit of each book's MD5:

# In[6]:

book_hash = pd.read_parquet('data/book-hash.parquet').rename(columns={'cluster': 'item'})
book_hash['dcode'] = book_hash['md5'].apply(lambda x: int(x[-1], 16) % 2)
book_hash = book_hash.set_index('item')
book_hash.head()

# Load the user profile data:

# In[7]:

profiles = pd.read_pickle('data/profile-data.pkl')
profiles.head()

# In[8]:

datasets = list(profiles.index.levels[0])
datasets

# And load the recommendations:

# In[9]:

recs = pd.read_parquet('data/study-recs.parquet')
recs.rename(columns={'dataset': 'Set', 'algorithm': 'Algorithm'}, inplace=True)
recs.head()

# The original paper truncated recommendation lists to 50 items. Let's do that too:

# In[10]:

recs = recs[recs['rank'] <= 50]

# In[11]:

recs.Set.unique()

# In[12]:

recs.Algorithm.unique()

# We will need to distinguish implicit from explicit feedback in those names. In the new paper, we are going to separate implicit and explicit data for presentation; these functions will help with that.

# In[13]:

def select_implicit(data, reset=True):
    """Select implicit-feedback rows, stripping the -I/-imp suffixes and renaming wrls to als."""
    if reset:
        data = data.reset_index()
    implicit = data['Algorithm'].str.endswith('-imp') | data['Set'].str.endswith('-I')
    data = data[implicit].assign(Set=data['Set'].str.replace('-I', ''),
                                 Algorithm=data['Algorithm'].str.replace('-imp', ''))
    data['Algorithm'] = data['Algorithm'].str.replace('wrls', 'als')
    return data

# In[14]:

def select_explicit(data, reset=True):
    """Select explicit-feedback rows, stripping the -E suffix from set names."""
    if reset:
        data = data.reset_index()
    implicit = data['Algorithm'].str.endswith('-imp') | data['Set'].str.endswith('-I')
    data = data[~implicit].assign(Set=data['Set'].str.replace('-E', ''))
    return data

# And give ourselves a handy way to relabel algorithms:

# In[15]:

algo_labels = {
    'als': 'ALS',
    'bpr': 'BPR',
    'item-item': 'II',
    'user-user': 'UU'
}
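# As a quick sanity check (not part of the original pipeline), the two selectors should partition a small frame cleanly. The rows below are hypothetical, chosen only to exercise both naming conventions:

# In[ ]:

toy = pd.DataFrame({
    'Set': ['BX-I', 'BX-E', 'GR-I', 'GR-E'],
    'Algorithm': ['wrls', 'als', 'bpr-imp', 'user-user'],
    'rank': [1, 1, 1, 1]
}).set_index(['Set', 'Algorithm'])
# implicit rows come back as (BX, als) and (GR, bpr): suffixes stripped, wrls renamed
print(select_implicit(toy))
# explicit rows come back as (BX, als) and (GR, user-user): the -E suffix is stripped
print(select_explicit(toy))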
# ## Analyze Rec List Composition

# Let's proceed by computing the recommendation list gender data.

# In[16]:

recs.drop(columns=['gender'], errors='ignore', inplace=True)
recs = recs.join(book_gender, on='item', how='left')
recs['gender'] = recs['gender'].fillna('unknown')
recs['gender'].describe()

# And mix in the dummy code data:

# In[17]:

recs.drop(columns=['dcode'], errors='ignore', inplace=True)
recs = recs.join(book_hash['dcode'], on='item', how='left')
recs.head()

# Count up the statistics for each list by gender:

# In[18]:

rec_stats = recs.groupby(['Set', 'Algorithm', 'user'])['gender'].value_counts().unstack(fill_value=0)
rec_stats.columns = rec_stats.columns.astype('object')
rec_stats['Total'] = rec_stats.sum(axis=1)
rec_stats['Known'] = rec_stats['male'].fillna(0) + rec_stats['female'].fillna(0)
rec_stats['PropKnown'] = rec_stats['Known'] / rec_stats['Total']
rec_stats['PropFemale'] = rec_stats['female'] / rec_stats['Known']
rec_stats

# In[19]:

rec_stats.info()

# Mix in info from the dummy codes:

# In[20]:

rec_dc_stats = recs.groupby(['Set', 'Algorithm', 'user'])['dcode'].agg(['count', 'sum', 'mean'])
rec_dc_stats.rename(columns={'count': 'dcknown', 'sum': 'dcyes', 'mean': 'PropDC'}, inplace=True)
rec_dc_stats['dcyes'] = rec_dc_stats['dcyes'].astype('i4')
rec_dc_stats.head()

# In[21]:

rec_stats = rec_stats.join(rec_dc_stats)
rec_stats.head()

# Quick status check on the number of recommendation lists per algorithm, implicit feedback:

# In[22]:

select_implicit(rec_stats).groupby(['Set', 'Algorithm'])['Total'].count().unstack()

# Explicit feedback:

# In[23]:

select_explicit(rec_stats).groupby(['Set', 'Algorithm'])['Total'].count().unstack()

# ## Non-personalized Recommendations
# 
# We also want to compute the makeup of non-personalized recommendations, to get a baseline level for each data set.

# In[24]:

az_ratings = pd.read_parquet('data/AZ/ratings.parquet')
bxi_ratings = pd.read_parquet('data/BX-I/ratings.parquet')
bxe_ratings = pd.read_parquet('data/BX-E/ratings.parquet')
gre_ratings = pd.read_parquet('data/GR-E/ratings.parquet')
gri_ratings = pd.read_parquet('data/GR-I/ratings.parquet')

# ### Popularity

# The 50 most-rated items in each data set form its popularity-based recommendation list:

# In[25]:

istats = pd.concat({
    'AZ': az_ratings.groupby('item')['user'].count().nlargest(50),
    'BX-I': bxi_ratings.groupby('item')['user'].count().nlargest(50),
    'BX-E': bxe_ratings.groupby('item')['user'].count().nlargest(50),
    'GR-I': gri_ratings.groupby('item')['user'].count().nlargest(50),
    'GR-E': gre_ratings.groupby('item')['user'].count().nlargest(50)
}, names=['Set'])
istats = istats.reset_index(name='count')
istats.head()

# In[26]:

irecs = istats.join(book_gender, on='item', how='left')
irecs['gender'] = irecs['gender'].fillna('unknown')
irecs.head()

# In[27]:

pop_gender = irecs.groupby(['Set', 'gender']).item.count().unstack().fillna(0).astype('i4')
pop_gender.columns = pop_gender.columns.astype('object')
pop_gender['Total'] = pop_gender.sum(axis=1)
pop_gender['Known'] = pop_gender['male'] + pop_gender['female']
pop_gender['PropKnown'] = pop_gender['Known'] / pop_gender['Total']
pop_gender['PropFemale'] = pop_gender['female'] / pop_gender['Known']
pop_gender

# ### Highest Average Rating

# The 50 items with the highest mean rating, for the sets with explicit ratings:

# In[28]:

astats = pd.concat({
    'AZ': az_ratings.groupby('item')['rating'].mean().nlargest(50),
    'BX-E': bxe_ratings.groupby('item')['rating'].mean().nlargest(50),
    'GR-E': gre_ratings.groupby('item')['rating'].mean().nlargest(50)
}, names=['Set'])
astats = astats.reset_index(name='count')  # the 'count' column here actually holds the mean rating
astats.head()

# In[29]:

arecs = astats.join(book_gender, on='item', how='left')
arecs['gender'] = arecs['gender'].fillna('unknown')
arecs.head()

# In[30]:

avg_gender = arecs.groupby(['Set', 'gender']).item.count().unstack().fillna(0).astype('i4')
avg_gender.columns = avg_gender.columns.astype('object')
avg_gender['Total'] = avg_gender.sum(axis=1)
avg_gender['Known'] = avg_gender['male'] + avg_gender['female']
avg_gender['PropKnown'] = avg_gender['Known'] / avg_gender['Total']
avg_gender['PropFemale'] = avg_gender['female'] / avg_gender['Known']
avg_gender
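# The same count-then-proportion pattern appears three times above (per-list stats, popularity, and average rating). A toy illustration of how value_counts/unstack produces the gender-count columns; the rows are made up, not project data:

# In[ ]:

demo = pd.DataFrame({
    'user': [1, 1, 1, 2, 2],
    'gender': ['male', 'female', 'unknown', 'male', 'male']
})
demo_stats = demo.groupby('user')['gender'].value_counts().unstack(fill_value=0)
demo_stats['Known'] = demo_stats['male'] + demo_stats['female']
demo_stats['PropFemale'] = demo_stats['female'] / demo_stats['Known']
# user 1: 1 female of 2 known, so PropFemale = 0.5; user 2: 0 of 2, so 0.0
demo_stats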
# ## Recommendation Coverage & Diversity
# 
# We also want to understand the coverage of the recommendation lists: how many recommendations each algorithm produces, and how many distinct items those recommendations contain.

# In[31]:

list_counts = recs.groupby(['Set', 'Algorithm'])['user'].nunique()
list_counts.name = 'Lists'

# In[32]:

item_counts = recs.groupby(['Set', 'Algorithm'])['item'].agg(['count', 'nunique'])
item_counts.rename(columns={'count': 'Recs', 'nunique': 'Distinct'}, inplace=True)
item_counts = item_counts.join(list_counts)
item_counts['FracDistinct'] = item_counts['Distinct'] / item_counts['Recs']

# What does this look like for implicit feedback?

# In[33]:

df = select_implicit(item_counts).set_index(['Algorithm', 'Set']).stack().reorder_levels([0, 2, 1]).unstack().unstack()
df = df.rename(index=algo_labels)
df

# In[34]:

def f_n(n):
    "Format a count with thousands separators."
    return '{:,.0f}'.format(n)

def f_pct(n):
    "Format a fraction as a percentage."
    return '{:.1f}%'.format(n * 100)

# In[35]:

print(df.swaplevel(axis=1).loc[:, ['Recs', 'Distinct', 'FracDistinct']].to_latex(formatters=[
    f_n, f_n, f_pct,
    f_n, f_n, f_pct,
    f_n, f_n, f_pct
]))

# And explicit?

# In[36]:

df = select_explicit(item_counts).set_index(['Algorithm', 'Set']).stack().reorder_levels([0, 2, 1]).unstack().unstack()
df = df.rename(index=algo_labels)
df

# In[37]:

print(df.swaplevel(axis=1).loc[:, ['Recs', 'Distinct', 'FracDistinct']].to_latex(formatters=[
    f_n, f_n, f_pct,
    f_n, f_n, f_pct,
    f_n, f_n, f_pct
]))

# ## Distribution Table

# Mean PropFemale per algorithm and data set, implicit feedback:

# In[38]:

select_implicit(rec_stats).groupby(['Algorithm', 'Set']).PropFemale.mean().unstack()

# And the corresponding standard deviations:

# In[39]:

np.sqrt(select_implicit(rec_stats).groupby(['Algorithm', 'Set']).PropFemale.var()).unstack()

# The same for explicit feedback:

# In[40]:

select_explicit(rec_stats).groupby(['Algorithm', 'Set']).PropFemale.mean().unstack()

# In[41]:

np.sqrt(select_explicit(rec_stats).groupby(['Algorithm', 'Set']).PropFemale.var()).unstack()
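# Note that np.sqrt(series.var()) in the tables above is just the sample standard deviation; pandas' .std() (with its default ddof=1) gives identical numbers. A quick check on made-up values:

# In[ ]:

s = pd.Series([0.2, 0.4, 0.5, 0.9])
np.sqrt(s.var()), s.std()  # both approximately 0.2944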
# ## Rec List Distributions
# 
# Now that we have all of this, we can start to look at recommendation list distributions. How is the proportion of female authors distributed?

# In[42]:

grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_implicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.distplot, 'PropFemale', kde=False, norm_hist=True)

# And the proportion of known-gender authors:

# In[43]:

grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_implicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.distplot, 'PropKnown', kde=False, norm_hist=True)

# The same pair of plots for explicit feedback:

# In[44]:

grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_explicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.distplot, 'PropFemale', kde=False, norm_hist=True)

# In[45]:

grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_explicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.distplot, 'PropKnown', kde=False, norm_hist=True)

# ## Dummy Code Distributions

# In[46]:

grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_implicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.distplot, 'PropDC', kde=False, norm_hist=True)

# In[47]:

grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_explicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.distplot, 'PropDC', kde=False, norm_hist=True)

# ## Prepare for Modeling
# 
# With this analysis done, we need to prepare our recommendation data for modeling.

# Because ALS on BX-E behaves _so_ badly, we can't really use it; drop it from further analysis:

# In[48]:

rec_stats = rec_stats.drop(('BX-E', 'als'))

# In[49]:

rec_stats.to_pickle('data/rec-data.pkl')

# We also want to save this data as input for STAN.

# In[50]:

def inf_dir(sname):
    "Directory holding the inference inputs and outputs for a data set."
    return data_dir / sname / 'inference'

# In[51]:

for sname, frame in rec_stats.groupby('Set'):
    print('preparing STAN input for', sname)
    lists = frame.reset_index().astype({'Algorithm': 'category'})
    algos = lists['Algorithm'].cat.categories
    print(sname, 'has algorithms', algos)
    # set up the users, numbering them from 1 for STAN
    users = profiles.loc[sname, :]
    users = users.assign(unum=np.arange(len(users), dtype='i4') + 1)
    lists = lists.join(users[['unum']], on='user')
    data = {
        'A': len(algos),
        'J': len(users),
        'NL': len(lists),
        'n': users['Known'],
        'y': users['female'],
        'ru': lists['unum'],
        'ra': lists['Algorithm'].cat.codes + 1,
        'rn': lists['Known'],
        'ry': lists['female']
    }
    # and write
    out_dir = inf_dir(sname)
    out_dir.mkdir(exist_ok=True)
    in_fn = out_dir / 'full-inputs.json'
    in_fn.write_text(ujson.dumps(data))
    # in_fn.write_text(ujson.dumps(stan_inputs(frame, 'Known', 'female')))
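# As a final sanity check (a sketch, assuming the loop above has run and that 'AZ' is one of the sets), we can re-read one inputs file and confirm the declared sizes match the vector lengths:

# In[ ]:

check = ujson.loads((inf_dir('AZ') / 'full-inputs.json').read_text())
assert check['J'] == len(check['n']) == len(check['y'])
assert check['NL'] == len(check['rn']) == len(check['ry'])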