#!/usr/bin/env python
# coding: utf-8

# # Collaborative filtering with side information
# ***
# This IPython notebook illustrates the usage of the [cmfrec](https://github.com/david-cortes/cmfrec) Python package for collective matrix factorization, using the [MovieLens-100k data](https://grouplens.org/datasets/movielens/100k/), which consists of movie ratings from users, plus user demographic information and movie genres.
#
# Collective matrix factorization is a technique for collaborative filtering with additional information about the users and items, based on a low-rank joint factorization of different matrices with shared factors - for more details see the paper [_Singh, A. P., & Gordon, G. J. (2008, August). Relational learning via collective matrix factorization. In Proceedings of the 14th ACM SIGKDD international conference on Knowledge discovery and data mining (pp. 650-658). ACM._](http://ra.adm.cs.cmu.edu/anon/usr/ftp/ml2008/CMU-ML-08-109.pdf).
#
# **Small note:** if the TOC here is not clickable or the math symbols don't show properly, try visualizing this same notebook from nbviewer following [this link](http://nbviewer.jupyter.org/github/david-cortes/cmfrec/blob/master/example/cmfrec_movielens_sideinfo.ipynb).
# ***
# ## Sections
#
# [1. Basic model - only movie ratings](#p1)
# * [1.1 Loading the ratings data](#p11)
# * [1.2 Train and test split](#p12)
# * [1.3 Fitting and evaluating the model](#p13)
#
# [2. Adding movie genres](#p2)
# * [2.1 Loading the movie genres info](#p21)
# * [2.2 Fitting and evaluating model with genres](#p22)
#
# [3. Adding movie genres and user demographic info](#p3)
# * [3.1 Loading the user demographic info](#p31)
# * [3.2 Getting user region from zip codes](#p32)
# * [3.3 Fitting and evaluating the full model](#p33)
#
# [4. Comparing recommendations](#p4)
# ***

# ## 1. Basic model - only movie ratings
# ***
# As a starting point, I'll first try the basic low-rank factorization model using the ratings data alone - that is, trying to minimize the following function:
#
# $$ Loss\:(U, V) = \lVert X - UV^T \rVert^2 \: + \: \lambda \: (\lVert U \rVert^2 + \lVert V \rVert^2) $$
#
# Where $U$ and $V$ are lower-dimensional matrices mapping users and items into a latent factor space - this is the classic model popularized by Funk. The predicted rating from this model for a given user $i$ and movie $j$ can be calculated as $U[i,:] V[j,:]^T$.

# ### 1.1 Loading the ratings data
#
# Here I'll load the MovieLens-100k ratings data, which can be downloaded from the link presented at the beginning:

# In[1]:

import pandas as pd, time
from datetime import datetime

ratings = pd.read_table('D:\\Downloads\\movielens\\ml-100k\\ml-100k\\u.data', sep='\t', engine='python',
                        names=['UserId','ItemId','Rating','Timestamp'])
# convert the unix timestamps into pandas datetimes
ratings['Timestamp'] = ratings.Timestamp.map(lambda x: pd.to_datetime(datetime(*time.localtime(x)[:6])))
ratings = ratings.sort_values(['UserId','ItemId']).reset_index(drop=True)
ratings.head()

# ### 1.2 Train and test split
#
# In order to evaluate the model, I'll create a train and test split to use throughout the whole notebook. As this kind of model can only recommend items that were in the training set to users who were also in the training set, I'll make the test set contain only users and items that are present in the training set.
#
# In order to make this more realistic, I'll make it a temporal split, i.e. splitting the ratings into those submitted before and after a certain time cutoff.
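# As a quick aside before building the split: the prediction formula above is nothing more than a dot product between a user's and an item's latent vectors. Below is a toy sketch of that idea with made-up factor matrices - plain numpy, purely illustrative, and not the matrices that cmfrec will actually estimate:

# In[ ]:

import numpy as np

np.random.seed(123)
n_users, n_items, n_factors = 4, 6, 3                  # toy sizes, not the real data
U_toy = np.random.normal(size=(n_users, n_factors))    # user-factor matrix
V_toy = np.random.normal(size=(n_items, n_factors))    # item-factor matrix

# predicted rating for user i=0 and movie j=2 is the dot product U[0,:] . V[2,:]
pred_single = U_toy[0, :].dot(V_toy[2, :])
# the full matrix of predicted ratings is X_hat = U V^T
X_hat = U_toy.dot(V_toy.T)
print(pred_single, X_hat.shape)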
# In[2]:

time_cutoff = '1998-01-01'
train = ratings.loc[ratings.Timestamp <= time_cutoff]
test = ratings.loc[ratings.Timestamp > time_cutoff]

# keep only test users and items that also appear in the training set
users_train = set(list(train.UserId))
items_train = set(list(train.ItemId))
test = test.loc[test.UserId.map(lambda x: x in users_train)]
test = test.loc[test.ItemId.map(lambda x: x in items_train)]
print(train.shape)
print(test.shape)

# Note that this is a **very** small sample - in a typical setting you would have 3 or 4 orders of magnitude more ratings. Nevertheless, this smallish data is enough to see a difference between the models.

# In[3]:

print(len(users_train))
print(len(items_train))

# ### 1.3 Fitting and evaluating the model
#
# Traditionally, recommendations have been evaluated by their cross-validated RMSE (root mean squared error), but this is not a very good metric, and a lower RMSE might not translate into better-liked recommendations. There are many additional metrics that can be used, but to keep this example simple, I'll look at the ratings that users actually gave to the Top-5 recommendations from each model, and compare them against non-personalized recommendations based on each movie's average rating.

# In[4]:

from cmfrec import CMF
import numpy as np

# Number of latent factors
k = 40
# Regularization parameter
reg = 10

# Fitting the model
rec = CMF(k=k, reg_param=reg)
rec.fit(train, random_seed=12345)

# Making predictions
test['Predicted'] = test.apply(lambda x: rec.predict(x['UserId'], x['ItemId']), axis=1)

# Evaluating Hold-Out RMSE (the hyperparameters had already been somewhat tuned by cross-validation):

# In[5]:

np.sqrt(np.mean((test.Predicted - test.Rating)**2))

# Basic evaluation of this model:

# In[6]:

avg_ratings = train.groupby('ItemId')['Rating'].mean().to_frame().rename(columns={"Rating": "AvgRating"})
test2 = pd.merge(test, avg_ratings, left_on='ItemId', right_index=True, how='left')

print('Average movie rating:', test2.groupby('UserId')['Rating'].mean().mean())
print('Average rating for top-5 rated by each user:', test2.sort_values(['UserId','Rating'], ascending=False).groupby('UserId')['Rating'].head(5).mean())
print('Average rating for bottom-5 rated by each user:', test2.sort_values(['UserId','Rating'], ascending=True).groupby('UserId')['Rating'].head(5).mean())
print('Average rating for top-5 recommendations of best-rated movies:', test2.sort_values(['UserId','AvgRating'], ascending=False).groupby('UserId')['Rating'].head(5).mean())
print('----------------------')
print('Average rating for top-5 recommendations from this model:', test2.sort_values(['UserId','Predicted'], ascending=False).groupby('UserId')['Rating'].head(5).mean())
print('Average rating for bottom-5 (non-)recommendations from this model:', test2.sort_values(['UserId','Predicted'], ascending=True).groupby('UserId')['Rating'].head(5).mean())

# The recommendations from this model are not bad, but the average rating of its Top-5 doesn't manage to beat the non-personalized best-rated recommendations. This is not surprising given the small size of the ratings data though.

# ## 2. Adding movie genres
# ***
# The previous model can be extended by adding some additional information about the movies - this can be done by also factorizing the movie-genre matrix, sharing the item-factor matrix with the factorization of the user-item ratings. The model now becomes:
#
# $$ Loss\:(U, V, Z) = \lVert X - UV^T \rVert^2 \: + \: \lVert M - VZ^T \rVert^2 \: + \: \lambda \: (\lVert U \rVert^2 + \lVert V \rVert^2 + \lVert Z \rVert^2) $$
#
# Where $U$, $V$ and $Z$ are lower-dimensional matrices mapping users, items and genres, respectively, into a latent factor space.
# The predicted rating from this model for a given user $i$ and movie $j$ is still calculated the same way as before: $U[i,:] V[j,:]^T$. However, we can intuitively expect that an item-factor matrix which also has to reconstruct the genres will generalize better than one that does not, and be less likely to overfit, as its factors are not as free.
#
# The matrix $V$, however, doesn't need to be exactly the same in both terms - we can also add some factors that appear in only one of the factorizations, giving the following formula:
#
# $$ Loss\:(U, V, Z) = \lVert X - UV_{main}^T \rVert^2 \: + \: \lVert M - V_{sec}Z^T \rVert^2 \: + \: \lambda \: (\lVert U \rVert^2 + \lVert V \rVert^2 + \lVert Z \rVert^2) $$
#
# Where $V_{main} = V_{[\,1 \:to\: k_{main} + k_{shared},\: \cdot\,]}$ and $V_{sec} = V_{[\,k_{main}+1 \:to\: k_{main} + k_{shared} + k_{sec},\: \cdot\,]}$, so that $k_{shared}$ of the factors are shared between both factorizations (a short indexing sketch of this split is given at the end of this section).

# ### 2.1 Loading the movie genres info
#
# The MovieLens-100k data also comes with a file containing movie information that we can use to enhance the model - note that the package requires the item side information to have a column named _ItemId_ when you pass it to the API. If your data doesn't require any reindexing, you can also pass it as a numpy array and set the option `reindex` to False.

# In[7]:

colnames = ['ItemId','Title','ReleaseDate','Sep','Link'] + ['genre'+str(i) for i in range(19)]
genres = pd.read_table('D:\\Downloads\\movielens\\ml-100k\\ml-100k\\u.item', sep="|", engine='python', names=colnames)

# will save the movie titles for later
movie_id_to_title = {i.ItemId: i.Title for i in genres.itertuples()}

genres = genres[['ItemId'] + ['genre'+str(i) for i in range(19)]]
genres.head()

# ### 2.2 Fitting and evaluating model with genres
#
# These hyperparameters (number of factors and regularization) were also somewhat tuned beforehand:

# In[8]:

# Number of latent factors
k = 30
k_main = 10
k_sec = 10
# Regularization parameter
reg = 10

# Fitting the model
rec2 = CMF(k=k, k_main=k_main, k_item=k_sec, reg_param=reg)
rec2.fit(train, genres, random_seed=10000)

# Making predictions
test['Predicted'] = test.apply(lambda x: rec2.predict(x['UserId'], x['ItemId']), axis=1)

# RMSE now:

# In[9]:

np.sqrt(np.mean((test.Predicted - test.Rating)**2))

# Same evaluation as before:

# In[10]:

test2 = pd.merge(test, avg_ratings, left_on='ItemId', right_index=True, how='left')

print('Average movie rating:', test2.groupby('UserId')['Rating'].mean().mean())
print('Average rating for top-5 rated by each user:', test2.sort_values(['UserId','Rating'], ascending=False).groupby('UserId')['Rating'].head(5).mean())
print('Average rating for bottom-5 rated by each user:', test2.sort_values(['UserId','Rating'], ascending=True).groupby('UserId')['Rating'].head(5).mean())
print('Average rating for top-5 recommendations of best-rated movies:', test2.sort_values(['UserId','AvgRating'], ascending=False).groupby('UserId')['Rating'].head(5).mean())
print('----------------------')
print('Average rating for top-5 recommendations from this model:', test2.sort_values(['UserId','Predicted'], ascending=False).groupby('UserId')['Rating'].head(5).mean())
print('Average rating for bottom-5 (non-)recommendations from this model:', test2.sort_values(['UserId','Predicted'], ascending=True).groupby('UserId')['Rating'].head(5).mean())

# Now we see a bit of an improvement - it's not large, but it's an improvement nevertheless, and this time the personalized recommendations get overall higher ratings than the non-personalized best-rated recommendations, with as little as 50k ratings.
#
# Knowing these generic genres shouldn't be a complete game changer, so this is expected.
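# As promised earlier in this section, here is a tiny indexing sketch of the $V_{main}$ / $V_{sec}$ split. It uses a made-up numpy array with the factors laid out along the columns, purely to illustrate how the factor counts relate to each other - the toy sizes are arbitrary and this is not how cmfrec stores its parameters:

# In[ ]:

import numpy as np

# toy factor counts mirroring the roles of k_main / k (shared) / k_item above
k_main_toy, k_shared_toy, k_sec_toy = 2, 3, 2
n_items_toy = 5

np.random.seed(456)
# a single item-factor matrix holding all the factors
V_toy = np.random.normal(size=(n_items_toy, k_main_toy + k_shared_toy + k_sec_toy))

V_main_toy = V_toy[:, :k_main_toy + k_shared_toy]   # block used in the ratings factorization
V_sec_toy  = V_toy[:, k_main_toy:]                  # block used in the genre factorization
# the middle k_shared_toy columns belong to both blocks
print(V_main_toy.shape, V_sec_toy.shape)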
# ## 3. Adding movie genres and user demographic info
# ***
# The previous model can be extended to also incorporate user information, in the same way as it added movie genres:
#
# $$ Loss\:(U, V, Z, P) = \lVert X - UV^T \rVert^2 \: + \: \lVert M - VZ^T \rVert^2 \: + \: \lVert Q - UP^T \rVert^2 \: + \: \lambda \: (\lVert U \rVert^2 + \lVert V \rVert^2 + \lVert Z \rVert^2 + \lVert P \rVert^2) $$
#
# Where $Q$ is the user-attribute matrix and $P$ is the new attribute-factor matrix - same as before, some of the factors can be shared and some can be specific to one factorization.
#
# Intuitively, since in a typical setting there are usually more users than items (not in this particular example though), and each user has on average rated fewer movies than each movie has users rating it, it would be logical to expect detailed user information to be more valuable than detailed item information.

# ### 3.1 Loading the user demographic info
#
# The MovieLens-100k data also comes with user demographic information - same as before, the data frame passed to the package API should have a column named _UserId_:

# In[11]:

user_info = pd.read_table('D:\\Downloads\\movielens\\ml-100k\\ml-100k\\u.user', sep="|", engine='python',
                          names=['UserId','Age','Gender','Occupation','Zipcode'])
user_info.head()

# This time, unfortunately, not all the information can be used exactly as it comes in the file. The zip code can still provide valuable information if we link it to a broader geographical area. As these are mostly US users, I'll try to link it to US regions here.
#
# In order to do so, I'm using a [publicly available table](http://federalgovernmentzipcodes.us/) mapping zip codes to states, [another one](http://www.fonz.net/blog/archives/2008/04/06/csv-of-states-and-state-abbreviations/) mapping state names to their abbreviations, and finally classifying the states into regions according to [usual definitions](https://www.infoplease.com/us/states/sizing-states).
# ### 3.2 Getting user region from zip codes

# In[12]:

import re

zipcode_abbs = pd.read_csv("D:\\Downloads\\movielens\\zips\\states.csv")
zipcode_abbs_dct = {z.State: z.Abbreviation for z in zipcode_abbs.itertuples()}

# group the US states into regions
us_regs_table = [
    ('New England', 'Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island, Vermont'),
    ('Middle Atlantic', 'Delaware, Maryland, New Jersey, New York, Pennsylvania'),
    ('South', 'Alabama, Arkansas, Florida, Georgia, Kentucky, Louisiana, Mississippi, Missouri, North Carolina, South Carolina, Tennessee, Virginia, West Virginia'),
    ('Midwest', 'Illinois, Indiana, Iowa, Kansas, Michigan, Minnesota, Nebraska, North Dakota, Ohio, South Dakota, Wisconsin'),
    ('Southwest', 'Arizona, New Mexico, Oklahoma, Texas'),
    ('West', 'Alaska, California, Colorado, Hawaii, Idaho, Montana, Nevada, Oregon, Utah, Washington, Wyoming')
]
us_regs_table = [(x[0], [i.strip() for i in x[1].split(",")]) for x in us_regs_table]

# map state abbreviation -> region name
us_regs_dct = dict()
for r in us_regs_table:
    for s in r[1]:
        us_regs_dct[zipcode_abbs_dct[s]] = r[0]

zipcode_info = pd.read_csv("D:\\Downloads\\movielens\\free-zipcode-database.csv")
zipcode_info = zipcode_info.groupby('Zipcode').first().reset_index()
zipcode_info.loc[zipcode_info.Country != "US", 'State'] = 'UnknownOrNonUS'
zipcode_info['Region'] = zipcode_info['State'].copy()
zipcode_info.loc[zipcode_info.Country == "US", 'Region'] = zipcode_info.Region.loc[zipcode_info.Country == "US"].map(lambda x: us_regs_dct[x] if x in us_regs_dct else 'UsOther')
zipcode_info = zipcode_info[['Zipcode', 'Region']]
zipcode_info.head()

def process_zip(zp):
    # zip codes that aren't plain integers are treated as missing
    try:
        return int(zp)
    except (ValueError, TypeError):
        return np.nan

user_info["Zipcode"] = user_info.Zipcode.map(process_zip)
user_info = pd.merge(user_info, zipcode_info, on='Zipcode', how='left')
user_info['Region'] = user_info.Region.fillna('UnknownOrNonUS')

# one-hot encode the categorical attributes
user_info = pd.get_dummies(user_info[['UserId','Age','Gender','Occupation','Region']])

# keep only the ratings from users for whom there is side information
users_w_side_info = set(list(user_info.UserId))
ratings = ratings.loc[ratings.UserId.map(lambda x: x in users_w_side_info)]
user_info.head()

# ### 3.3 Fitting and evaluating the full model
#
# Adding explicit side information gives the latent factors a more solid base, so fewer of them are needed the more side information is available.
# In[13]:

# Number of latent factors
k = 30
k_main = 5
k_genre = 5
k_demo = 5
# Regularization parameter
reg = 50
# This time I'll weight the ratings matrix higher
w_main = 4

# Fitting the model
rec3 = CMF(k=k, k_main=k_main, k_item=k_genre, k_user=k_demo, w_main=w_main, reg_param=reg)
rec3.fit(train, genres, user_info, random_seed=32545)

# Making predictions
test['Predicted'] = test.apply(lambda x: rec3.predict(x['UserId'], x['ItemId']), axis=1)

# Same metrics as before:

# In[14]:

np.sqrt(np.mean((test.Predicted - test.Rating)**2))

# In[15]:

test2 = pd.merge(test, avg_ratings, left_on='ItemId', right_index=True, how='left')

print('Average movie rating:', test2.groupby('UserId')['Rating'].mean().mean())
print('Average rating for top-5 rated by each user:', test2.sort_values(['UserId','Rating'], ascending=False).groupby('UserId')['Rating'].head(5).mean())
print('Average rating for bottom-5 rated by each user:', test2.sort_values(['UserId','Rating'], ascending=True).groupby('UserId')['Rating'].head(5).mean())
print('Average rating for top-5 recommendations of best-rated movies:', test2.sort_values(['UserId','AvgRating'], ascending=False).groupby('UserId')['Rating'].head(5).mean())
print('----------------------')
print('Average rating for top-5 recommendations from this model:', test2.sort_values(['UserId','Predicted'], ascending=False).groupby('UserId')['Rating'].head(5).mean())
print('Average rating for bottom-5 (non-)recommendations from this model:', test2.sort_values(['UserId','Predicted'], ascending=True).groupby('UserId')['Rating'].head(5).mean())

# This time the improvement is larger, and the average rating of the Top-5 recommendations has increased by a bigger margin - just from adding the most basic demographic information!

# ## 4. Comparing recommendations
# ***
#
# Now let's see what each of these models is recommending to some randomly picked users, along with the overall popularity of each recommended movie:

# In[16]:

# aggregate statistics
avg_movie_rating = train.groupby('ItemId')['Rating'].mean()
num_ratings_per_movie = train.groupby('ItemId')['Rating'].count()

# function to print recommended lists more nicely
def print_reclist(reclist):
    list_w_info = [str(m+1) + ") - " + movie_id_to_title[reclist[m]] +
                   " - Average Rating: " + str(np.round(avg_movie_rating[reclist[m]], 2)) +
                   " - Number of ratings: " + str(num_ratings_per_movie[reclist[m]]) for m in range(len(reclist))]
    print("\n".join(list_w_info))

# In[17]:

# user 1
reclist1 = rec.top_n(UserId=1, n=20)
reclist2 = rec2.top_n(UserId=1, n=20)
reclist3 = rec3.top_n(UserId=1, n=20)

print('Recommendations from ratings-only model:')
print_reclist(reclist1)
print("------")
print('Recommendations from ratings + genre model:')
print_reclist(reclist2)
print("------")
print('Recommendations from ratings + genre + demographics model:')
print_reclist(reclist3)

# In[18]:

# user 943
reclist1 = rec.top_n(UserId=943, n=20)
reclist2 = rec2.top_n(UserId=943, n=20)
reclist3 = rec3.top_n(UserId=943, n=20)

print('Recommendations from ratings-only model:')
print_reclist(reclist1)
print("------")
print('Recommendations from ratings + genre model:')
print_reclist(reclist2)
print("------")
print('Recommendations from ratings + genre + demographics model:')
print_reclist(reclist3)
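# One last remark about the evaluation code: the same Top-5 / Bottom-5 comparison was repeated verbatim for each of the three models. For further experiments, a small helper along these lines avoids the duplication - it reuses the exact pandas logic from the evaluation cells above, and `eval_top_bottom` is just an illustrative name, not part of cmfrec:

# In[ ]:

def eval_top_bottom(test_df, avg_ratings_df, n=5):
    # merge in the average training rating of each movie, as in the evaluation cells above
    df = pd.merge(test_df, avg_ratings_df, left_on='ItemId', right_index=True, how='left')
    out = {
        'avg_rating': df.groupby('UserId')['Rating'].mean().mean(),
        'best_rated_baseline': df.sort_values(['UserId','AvgRating'], ascending=False).groupby('UserId')['Rating'].head(n).mean(),
        'model_top_n': df.sort_values(['UserId','Predicted'], ascending=False).groupby('UserId')['Rating'].head(n).mean(),
        'model_bottom_n': df.sort_values(['UserId','Predicted'], ascending=True).groupby('UserId')['Rating'].head(n).mean(),
    }
    return pd.Series(out)

# example usage with the last model's predictions already stored in `test`
eval_top_bottom(test, avg_ratings)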