#!/usr/bin/env python
# coding: utf-8

# # Recommendation List Data Prep
# 
# This notebook does the data preparation for the recommendation list analysis.

# ## Setup

# In[1]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import product

# In[2]:

import ujson

# In[3]:

from bookgender.config import data_dir

# ## Load Data

# Load the book gender data and clean it up, collapsing the various 'no-*' codes and 'unlinked' into a single 'unknown' category:

# In[4]:

book_gender = pd.read_csv('data/author-gender.csv.gz')
book_gender = book_gender.set_index('item')['gender']
book_gender.loc[book_gender.str.startswith('no-')] = 'unknown'
book_gender.loc[book_gender == 'unlinked'] = 'unknown'
book_gender = book_gender.astype('category')
book_gender.describe()

# In[5]:

book_gender.head()

# And load the book hashes, deriving a binary dummy code from the last hex digit of each book's MD5:

# In[6]:

book_hash = pd.read_parquet('data/book-hash.parquet').rename(columns={'cluster': 'item'})
book_hash['dcode'] = book_hash['md5'].apply(lambda x: int(x[-1], 16) % 2)
book_hash = book_hash.set_index('item')
book_hash.head()

# Load the user profile data:

# In[7]:

profiles = pd.read_pickle('data/profile-data.pkl')
profiles.head()

# In[8]:

datasets = list(profiles.index.levels[0])
datasets

# And load the recommendations:

# In[9]:

recs = pd.read_parquet('data/study-recs.parquet')
recs.rename(columns={'dataset': 'Set', 'algorithm': 'Algorithm'}, inplace=True)
recs.head()

# The original paper truncated recommendation lists to 50 items. Let's do that too:

# In[10]:

recs = recs[recs['rank'] <= 50]

# In[11]:

recs.Set.unique()

# In[12]:

recs.Algorithm.unique()

# We will need to distinguish implicit from explicit feedback in those names. In the new paper, we are going to separate implicit and explicit data for presentation; these functions will help with that.

# In[13]:

def select_implicit(data, reset=True):
    """Select implicit-feedback rows, stripping the -I/-imp suffixes and renaming wrls to als."""
    if reset:
        data = data.reset_index()
    implicit = data['Algorithm'].str.endswith('-imp') | data['Set'].str.endswith('-I')
    data = data[implicit].assign(Set=data['Set'].str.replace('-I', ''),
                                 Algorithm=data['Algorithm'].str.replace('-imp', ''))
    data['Algorithm'] = data['Algorithm'].str.replace('wrls', 'als')
    return data

# In[14]:

def select_explicit(data, reset=True):
    """Select explicit-feedback rows, stripping the -E suffix from set names."""
    if reset:
        data = data.reset_index()
    implicit = data['Algorithm'].str.endswith('-imp') | data['Set'].str.endswith('-I')
    data = data[~implicit].assign(Set=data['Set'].str.replace('-E', ''))
    return data

# And give ourselves a handy way to relabel algorithms:

# In[15]:

algo_labels = {
    'als': 'ALS',
    'bpr': 'BPR',
    'item-item': 'II',
    'user-user': 'UU'
}
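# As a quick sanity check (not part of the original pipeline), the two selectors should partition a small frame cleanly. The rows below are hypothetical, chosen only to exercise both naming conventions:

# In[ ]:

toy = pd.DataFrame({
    'Set': ['BX-I', 'BX-E', 'GR-I', 'GR-E'],
    'Algorithm': ['wrls', 'als', 'bpr-imp', 'user-user'],
    'rank': [1, 1, 1, 1]
}).set_index(['Set', 'Algorithm'])
# implicit rows come back as (BX, als) and (GR, bpr): suffixes stripped, wrls renamed
print(select_implicit(toy))
# explicit rows come back as (BX, als) and (GR, user-user): the -E suffix is stripped
print(select_explicit(toy))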
# ## Analyze Rec List Composition

# Let's proceed by computing the recommendation list gender data.

# In[16]:

recs.drop(columns=['gender'], errors='ignore', inplace=True)
recs = recs.join(book_gender, on='item', how='left')
recs['gender'] = recs['gender'].fillna('unknown')
recs['gender'].describe()

# And mix in the dummy code data:

# In[17]:

recs.drop(columns=['dcode'], errors='ignore', inplace=True)
recs = recs.join(book_hash['dcode'], on='item', how='left')
recs.head()

# Count up the statistics for each list by gender:

# In[18]:

rec_stats = recs.groupby(['Set', 'Algorithm', 'user'])['gender'].value_counts().unstack(fill_value=0)
rec_stats.columns = rec_stats.columns.astype('object')
rec_stats['Total'] = rec_stats.sum(axis=1)
rec_stats['Known'] = rec_stats['male'].fillna(0) + rec_stats['female'].fillna(0)
rec_stats['PropKnown'] = rec_stats['Known'] / rec_stats['Total']
rec_stats['PropFemale'] = rec_stats['female'] / rec_stats['Known']
rec_stats

# In[19]:

rec_stats.info()

# Mix in info from the dummy codes:

# In[20]:

rec_dc_stats = recs.groupby(['Set', 'Algorithm', 'user'])['dcode'].agg(['count', 'sum', 'mean'])
rec_dc_stats.rename(columns={'count': 'dcknown', 'sum': 'dcyes', 'mean': 'PropDC'}, inplace=True)
rec_dc_stats['dcyes'] = rec_dc_stats['dcyes'].astype('i4')
rec_dc_stats.head()

# In[21]:

rec_stats = rec_stats.join(rec_dc_stats)
rec_stats.head()

# Quick status check on the number of recommendation lists per algorithm, implicit feedback:

# In[22]:

select_implicit(rec_stats).groupby(['Set', 'Algorithm'])['Total'].count().unstack()

# Explicit feedback:

# In[23]:

select_explicit(rec_stats).groupby(['Set', 'Algorithm'])['Total'].count().unstack()

# ## Non-personalized Recommendations
# 
# We also want to compute the makeup of non-personalized recommendations, to get a baseline level for each data set.

# In[24]:

az_ratings = pd.read_parquet('data/AZ/ratings.parquet')
bxi_ratings = pd.read_parquet('data/BX-I/ratings.parquet')
bxe_ratings = pd.read_parquet('data/BX-E/ratings.parquet')
gre_ratings = pd.read_parquet('data/GR-E/ratings.parquet')
gri_ratings = pd.read_parquet('data/GR-I/ratings.parquet')

# ### Popularity

# The 50 most-rated items in each data set form its popularity-based recommendation list:

# In[25]:

istats = pd.concat({
    'AZ': az_ratings.groupby('item')['user'].count().nlargest(50),
    'BX-I': bxi_ratings.groupby('item')['user'].count().nlargest(50),
    'BX-E': bxe_ratings.groupby('item')['user'].count().nlargest(50),
    'GR-I': gri_ratings.groupby('item')['user'].count().nlargest(50),
    'GR-E': gre_ratings.groupby('item')['user'].count().nlargest(50)
}, names=['Set'])
istats = istats.reset_index(name='count')
istats.head()

# In[26]:

irecs = istats.join(book_gender, on='item', how='left')
irecs['gender'] = irecs['gender'].fillna('unknown')
irecs.head()

# In[27]:

pop_gender = irecs.groupby(['Set', 'gender']).item.count().unstack().fillna(0).astype('i4')
pop_gender.columns = pop_gender.columns.astype('object')
pop_gender['Total'] = pop_gender.sum(axis=1)
pop_gender['Known'] = pop_gender['male'] + pop_gender['female']
pop_gender['PropKnown'] = pop_gender['Known'] / pop_gender['Total']
pop_gender['PropFemale'] = pop_gender['female'] / pop_gender['Known']
pop_gender

# ### Highest Average Rating

# The 50 items with the highest mean rating, for the sets with explicit ratings:

# In[28]:

astats = pd.concat({
    'AZ': az_ratings.groupby('item')['rating'].mean().nlargest(50),
    'BX-E': bxe_ratings.groupby('item')['rating'].mean().nlargest(50),
    'GR-E': gre_ratings.groupby('item')['rating'].mean().nlargest(50)
}, names=['Set'])
astats = astats.reset_index(name='count')  # the 'count' column here actually holds the mean rating
astats.head()

# In[29]:

arecs = astats.join(book_gender, on='item', how='left')
arecs['gender'] = arecs['gender'].fillna('unknown')
arecs.head()

# In[30]:

avg_gender = arecs.groupby(['Set', 'gender']).item.count().unstack().fillna(0).astype('i4')
avg_gender.columns = avg_gender.columns.astype('object')
avg_gender['Total'] = avg_gender.sum(axis=1)
avg_gender['Known'] = avg_gender['male'] + avg_gender['female']
avg_gender['PropKnown'] = avg_gender['Known'] / avg_gender['Total']
avg_gender['PropFemale'] = avg_gender['female'] / avg_gender['Known']
avg_gender
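# The same count-then-proportion pattern appears three times above (per-list stats, popularity, and average rating). A toy illustration of how value_counts/unstack produces the gender-count columns; the rows are made up, not project data:

# In[ ]:

demo = pd.DataFrame({
    'user': [1, 1, 1, 2, 2],
    'gender': ['male', 'female', 'unknown', 'male', 'male']
})
demo_stats = demo.groupby('user')['gender'].value_counts().unstack(fill_value=0)
demo_stats['Known'] = demo_stats['male'] + demo_stats['female']
demo_stats['PropFemale'] = demo_stats['female'] / demo_stats['Known']
# user 1: 1 female of 2 known, so PropFemale = 0.5; user 2: 0 of 2, so 0.0
demo_stats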
# ## Recommendation Coverage & Diversity
# 
# We also want to understand the coverage of the recommendation lists: how many recommendations each algorithm produces, and how many distinct items those recommendations contain.

# In[31]:

list_counts = recs.groupby(['Set', 'Algorithm'])['user'].nunique()
list_counts.name = 'Lists'

# In[32]:

item_counts = recs.groupby(['Set', 'Algorithm'])['item'].agg(['count', 'nunique'])
item_counts.rename(columns={'count': 'Recs', 'nunique': 'Distinct'}, inplace=True)
item_counts = item_counts.join(list_counts)
item_counts['FracDistinct'] = item_counts['Distinct'] / item_counts['Recs']

# What does this look like for implicit feedback?

# In[33]:

df = select_implicit(item_counts).set_index(['Algorithm', 'Set']).stack().reorder_levels([0, 2, 1]).unstack().unstack()
df = df.rename(index=algo_labels)
df

# In[34]:

def f_n(n):
    "Format a count with thousands separators."
    return '{:,.0f}'.format(n)

def f_pct(n):
    "Format a fraction as a percentage."
    return '{:.1f}%'.format(n * 100)

# In[35]:

print(df.swaplevel(axis=1).loc[:, ['Recs', 'Distinct', 'FracDistinct']].to_latex(formatters=[
    f_n, f_n, f_pct,
    f_n, f_n, f_pct,
    f_n, f_n, f_pct
]))

# And explicit?

# In[36]:

df = select_explicit(item_counts).set_index(['Algorithm', 'Set']).stack().reorder_levels([0, 2, 1]).unstack().unstack()
df = df.rename(index=algo_labels)
df

# In[37]:

print(df.swaplevel(axis=1).loc[:, ['Recs', 'Distinct', 'FracDistinct']].to_latex(formatters=[
    f_n, f_n, f_pct,
    f_n, f_n, f_pct,
    f_n, f_n, f_pct
]))

# ## Distribution Table

# Mean PropFemale per algorithm and data set, implicit feedback:

# In[38]:

select_implicit(rec_stats).groupby(['Algorithm', 'Set']).PropFemale.mean().unstack()

# And the corresponding standard deviations:

# In[39]:

np.sqrt(select_implicit(rec_stats).groupby(['Algorithm', 'Set']).PropFemale.var()).unstack()

# The same for explicit feedback:

# In[40]:

select_explicit(rec_stats).groupby(['Algorithm', 'Set']).PropFemale.mean().unstack()

# In[41]:

np.sqrt(select_explicit(rec_stats).groupby(['Algorithm', 'Set']).PropFemale.var()).unstack()
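# Note that np.sqrt(series.var()) in the tables above is just the sample standard deviation; pandas' .std() (with its default ddof=1) gives identical numbers. A quick check on made-up values:

# In[ ]:

s = pd.Series([0.2, 0.4, 0.5, 0.9])
np.sqrt(s.var()), s.std()  # both approximately 0.2944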
# ## Rec List Distributions
# 
# Now that we have all of this, we can start to look at recommendation list distributions. How is the proportion of female authors distributed?

# In[42]:

grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_implicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.distplot, 'PropFemale', kde=False, norm_hist=True)

# And the proportion of known-gender authors:

# In[43]:

grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_implicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.distplot, 'PropKnown', kde=False, norm_hist=True)

# The same pair of plots for explicit feedback:

# In[44]:

grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_explicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.distplot, 'PropFemale', kde=False, norm_hist=True)

# In[45]:

grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_explicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.distplot, 'PropKnown', kde=False, norm_hist=True)

# ## Dummy Code Distributions

# In[46]:

grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_implicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.distplot, 'PropDC', kde=False, norm_hist=True)

# In[47]:

grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_explicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.distplot, 'PropDC', kde=False, norm_hist=True)

# ## Prepare for Modeling
# 
# With this analysis done, we need to prepare our recommendation data for modeling.

# Because ALS on BX-E behaves _so_ badly, we can't really use it; drop it from further analysis:

# In[48]:

rec_stats = rec_stats.drop(('BX-E', 'als'))

# In[49]:

rec_stats.to_pickle('data/rec-data.pkl')

# We also want to save this data as input for STAN.

# In[50]:

def inf_dir(sname):
    "Directory holding the inference inputs and outputs for a data set."
    return data_dir / sname / 'inference'

# In[51]:

for sname, frame in rec_stats.groupby('Set'):
    print('preparing STAN input for', sname)
    lists = frame.reset_index().astype({'Algorithm': 'category'})
    algos = lists['Algorithm'].cat.categories
    print(sname, 'has algorithms', algos)
    # set up the users, numbering them from 1 for STAN
    users = profiles.loc[sname, :]
    users = users.assign(unum=np.arange(len(users), dtype='i4') + 1)
    lists = lists.join(users[['unum']], on='user')
    data = {
        'A': len(algos),
        'J': len(users),
        'NL': len(lists),
        'n': users['Known'],
        'y': users['female'],
        'ru': lists['unum'],
        'ra': lists['Algorithm'].cat.codes + 1,
        'rn': lists['Known'],
        'ry': lists['female']
    }
    # and write
    out_dir = inf_dir(sname)
    out_dir.mkdir(exist_ok=True)
    in_fn = out_dir / 'full-inputs.json'
    in_fn.write_text(ujson.dumps(data))
    # in_fn.write_text(ujson.dumps(stan_inputs(frame, 'Known', 'female')))
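# As a final sanity check (a sketch, assuming the loop above has run and that 'AZ' is one of the sets), we can re-read one inputs file and confirm the declared sizes match the vector lengths:

# In[ ]:

check = ujson.loads((inf_dir('AZ') / 'full-inputs.json').read_text())
assert check['J'] == len(check['n']) == len(check['y'])
assert check['NL'] == len(check['rn']) == len(check['ry'])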