#!/usr/bin/env python
# coding: utf-8

# # Retail Product Recommender Engine
# **By [Czarina Luna](https://czarinaluna.com)**
# 
# ### Contents
# * [I. Overview](#I.-Overview)
# * [II. Business Problem](#II.-Business-Problem)
# * [III. Data Understanding](#III.-Data-Understanding)
# * [IV. Recommendation Systems](#IV.-Recommendation-Systems)
# * [Popularity Recommendations](#Popularity-Recommendations)
# * [Content-Based Recommenders](#Content-Based-Recommenders)
# * [Collaborative Filtering Systems](#Collaborative-Filtering-Systems)
# * [V. Results and Recommendations](#V.-Results-and-Recommendations)
# * [VI. Further Research](#VI.-Further-Research)

# ## I. Overview
# 
# A recommender engine is developed to increase the revenue of clothing rental companies by predicting user preferences and recommending products for users to rent. I apply different algorithms to create personalized recommendations using content-based and collaborative filtering systems. The algorithm that attained the lowest mean absolute error, 0.53, is Singular Value Decomposition.

# ## II. Business Problem
# 
# The clothing rental industry is growing as more companies follow the lead of the retailer Rent the Runway, which pioneered online services and subscriptions for designer rentals. To help grow the revenue of clothing rental companies, I develop recommendation systems that predict a set of user preferences and recommend the top preferences for the user. Doing so conveniently exposes users to relevant rental products tailored to their preferences. Using data from Rent the Runway, I conduct an analysis of the product reviews, model the data to predict user ratings, and provide recommendations accordingly.

# ## III. Data Understanding
# 
# The Rent the Runway reviews ([data source](https://cseweb.ucsd.edu/~jmcauley/datasets.html#clothing_fit)) contain 200,000 ratings of 6,000 unique items rented between 2010 and 2018 by over 100,000 unique users.
# 
# A quick look at the data structure:

# In[1]:


import pandas as pd

raw_data = pd.read_csv('data/data.csv')
raw_data.head(2)


# Missing values:

# In[2]:


raw_data.isna().sum()


# Target variable:

# In[3]:


raw_data['rating'].value_counts()


# **Data Cleaning**
# 
# To perform the pre-processing steps, I define the function `preprocess_data` to:
# - Drop missing values for `rating` and change the rating scale from 2-10 to 1-5.
# - Remove the units of measurement from `weight` and `height` to keep only the numerical values.
# - Impute missing values with the median for `age`, `height`, and `weight`, and with the mode for `rented_for`.
# - Impute missing values for `bust_size` and `body_type` based on the user's `size`.
# - Combine `review_summary` and `review_text` into a new feature `review`, and count the words in `review_text` to create `review_length`.
# - Create new features `review_month`, `review_season`, and `review_year` based on `review_date`.

# In[4]:


import numpy as np

def convert_height(x):
    '''
    Converts height from string format as feet and inches to integer in inches.
    '''
    height = [int(i) for i in x.replace('\'', '').replace('"', '').split()]
    return height[0]*12 + height[1]

def preprocess_data(df):
    '''
    Cleans the dataframe using imputation and feature engineering.
''' df.columns = df.columns.str.replace(' ', '_') df = df.dropna(subset=['rating']) df['weight'] = df['weight'].str.replace('lbs', '') df['rating'] = df['rating']/2 df['height'] = df['height'].apply(lambda x: convert_height(x) if pd.notnull(x) else x) to_num = ['rating', 'weight', 'age'] df[to_num] = df[to_num].apply(pd.to_numeric, errors='coerce') for col in ['height', 'age']: df[col] = df[col].fillna(df[col].median()) weight_map = dict(df.groupby('height')['weight'].median()) df['weight'] = df['weight'].fillna(df['height'].map(weight_map)) for col in ['review_text', 'review_summary']: df[col] = df[col].replace('-', np.nan) df['review'] = df['review_summary'] + ' ' + df['review_text'] df['review'] = df['review'].fillna('') df['review_length'] = df['review_text'].fillna('').apply(lambda x: len(x.split())) age_limit = (df['age'] > 60) | (df['age'] < 13) df['age'] = np.where(age_limit==True, df['age'].median(), df['age']) for col in ['bust_size', 'body_type']: to_map = dict(df.groupby('size')[col].last()) df[col] = df[col].fillna(df['size'].map(to_map)) df['rented_for'] = df['rented_for'].fillna(df['rented_for'].value_counts().index[0]) df['review_date'] = pd.to_datetime(df['review_date']) df['review_month'] = pd.DatetimeIndex(df['review_date']).month df['review_season'] = pd.cut(df['review_month'].replace(12, 0), [0, 3, 6, 9, 11], include_lowest=True, labels=['Winter', 'Spring', 'Summer', 'Fall']) df['review_year'] = pd.DatetimeIndex(df['review_date']).year return df import warnings warnings.filterwarnings('ignore') # In[5]: # Create new dataframe for processed data data = preprocess_data(raw_data) # In[6]: pd.options.display.float_format = '{:.2f}'.format # Summary statistics for numerical features data.drop(columns=['user_id', 'item_id']).describe().T # ### Data Visualization # # Let's explore and visualize the processed data! # *** # # **User Data** I create a separate table for user information by grouping the data by `user_id` and adding new features: # - `rating_count` is the total number of items the user rated and reviewed. # - `rating_average` is the average rating of the items reviewed by the user. # - `rented_for_top` is the user's most common reason for renting an item. # - `category_top` is the most common clothing category among the items reviewed by the user. # - `review_length_average` is the average length of text review posted by the user. # - `review_month_top` and `review_season_top` are the most common month and season the user posted the review. # - `rented_for_all` is a list of all the user's reasons for renting the items. # - `category_for_all` is a list of all the clothing categories of the items reviewed by the user. # In[7]: def create_user_data(df): ''' Groups the data by user and returns dataframe containing user information. 
''' user_df = pd.DataFrame(df.groupby('user_id').count().reset_index()['user_id']) for col in df.columns: if col in ['bust_size', 'weight']: feature = df.sort_values('review_date', ascending=False).groupby('user_id')[col].first() user_df = user_df.merge(feature, on='user_id') if col == 'item_id': feature = df.groupby(df['user_id']).count()[col] user_df = user_df.merge(feature, on='user_id') if col == 'rating': feature = df.groupby(df['user_id']).mean()[col] user_df = user_df.merge(feature, on='user_id') if col in ['body_type', 'height', 'size', 'age']: feature = df.sort_values('review_date', ascending=False).groupby('user_id')[col].first() user_df = user_df.merge(feature, on='user_id') if col == 'review_length': feature = df.groupby(df['user_id']).mean()[col] user_df = user_df.merge(feature, on='user_id') if col in ['review_month', 'review_season']: feature = df.sort_values('review_date', ascending=False).groupby('user_id')[col].agg(lambda x: x.value_counts().index[0]) user_df = user_df.merge(feature, on='user_id') if col in ['rented_for', 'category']: feature = df.sort_values('review_date', ascending=False).groupby('user_id')[col].agg(pd.Series.mode).apply(lambda x: x[0] if type(x)==np.ndarray else x) user_df = user_df.merge(feature, on='user_id') else: continue for col in ['rented_for', 'category']: feature = df.groupby('user_id')[col].apply(set).apply(lambda x: list(x)) user_df = user_df.merge(feature, on='user_id') user_df.columns = ['user_id', 'bust_size', 'rating_count', 'weight', 'rating_average', 'rented_for_top', 'body_type', 'category_top', 'height', 'size', 'age', 'review_length_average', 'review_month_top', 'review_season_top', 'rented_for_all', 'category_all'] return user_df # In[8]: # Create new dataframe for user data user_data = create_user_data(data) user_data.head(2) # In[9]: import seaborn as sns import matplotlib.pyplot as plt # Sort data by bust size bust_size_sorted_data = user_data.loc[(user_data['bust_size']>='32a') & (user_data['bust_size']<='38ddd/e')].sort_values('bust_size') sns.set_style('whitegrid') fig, axes = plt.subplots(nrows=2, figsize=(20, 18)) # Plot the distribution of users by bust size sns.countplot(x='bust_size', data=bust_size_sorted_data, palette='twilight', ax=axes[0]) axes[0].set_title('User Count by Bust Size', fontsize=16) axes[0].set_xlabel('Bust Size') axes[0].set_ylabel('User Count') axes[0].set_xticklabels(bust_size_sorted_data['bust_size'].unique(), rotation=90, fontsize=12) # Plot the distribution of users by size sns.countplot(x='size', data=user_data, palette='PuBu_r', ax=axes[1]) axes[1].set_title('User Count by Size', fontsize=16) axes[1].set_xlabel('Size') axes[1].set_ylabel('User Count') plt.savefig('data/images/fig0.png', dpi=200, transparent=True) plt.show() # In[10]: weight_data = user_data.loc[(user_data['weight']>=90) & (user_data['weight']<=210)] sns.set_style('whitegrid') fig, axes = plt.subplots(ncols=3, figsize=(15, 5)) # Plot the distribution of users by weight sns.histplot(x='weight', data=weight_data, bins=24, color='darksalmon', kde=True, ax=axes[0]) axes[0].set_title('User Count by Weight', fontsize=16) axes[0].set_xlabel('Weight') axes[0].set_ylabel('User Count') axes[0].grid(axis='x') # Plot the distribution of users by height sns.histplot(x='height', data=user_data, bins=24, color='midnightblue', ax=axes[1]) axes[1].set_title('User Count by Height', fontsize=16) axes[1].set_xlabel('Height') axes[1].set(ylabel=None) axes[1].grid(axis='x') # Plot the distribution of users by age sns.histplot(x='age', data=user_data, 
bins=24, color='rebeccapurple', kde=True, ax=axes[2]) axes[2].set_title('User Count by Age', fontsize=16) axes[2].set_xlabel('Age') axes[2].set(ylabel=None) axes[2].grid(axis='x') plt.savefig('data/images/fig1.png', dpi=200, transparent=True) plt.show() # Normally distributed and diverse ranges of weight, height, and age above. # In[11]: body_type_values = user_data['body_type'].value_counts().values body_type_names = user_data['body_type'].value_counts().index body_type_circle = plt.Circle((0,0), 0.7, color='white') plt.style.use('seaborn') plt.figure(figsize=(8,8)) colors = ['#DC7F8E', '#E5A1AA', '#F4BFBE', '#FFE0DA', '#F4C4B2', '#E8B08D', '#C68C73'] # Plot a donut chart of user body type plt.pie(body_type_values, labels=body_type_names, colors=colors, autopct='%1.0f%%', startangle=40, pctdistance=0.85) p = plt.gcf() p.gca().add_artist(body_type_circle) plt.title('User Percentage by Body Type', fontsize=16) plt.savefig('data/images/fig2.png', dpi=200, transparent=True) plt.show() # In[12]: sns.set_style('whitegrid') fig, axes = plt.subplots(ncols=2, figsize=(16, 6)) axes[0] = plt.subplot2grid((1, 5), (0, 0)) axes[1] = plt.subplot2grid((1, 5), (0, 1), colspan=4) # Categorize rating count into binary classes item_count_data = user_data.copy() item_count_data['rating_count'] = item_count_data['rating_count'].apply(lambda x: 'Only one' if x==1 else 'More than one') # Plot the distribution of binary classes sns.countplot(x='rating_count', data=item_count_data, palette=['#5d2349', '#861d23'], order=['Only one', 'More than one'], ax=axes[0]) axes[0].set(xlabel=None) axes[0].set_ylabel('User Count') rating_count_data = user_data.loc[(user_data['rating_count']>=2) & (user_data['rating_count']<=10)] # Show the distribution of the second class sns.countplot(x='rating_count', data=rating_count_data, palette='Reds_r', ax=axes[1]) axes[1].set_title('User Count by Number of Items Rated', fontsize=16) axes[1].set_xlabel('Item rating count', fontsize=12) axes[1].set(ylabel=None) plt.savefig('data/images/fig3.png', dpi=200, transparent=True) plt.show() # Overall, two thirds of users rented only one item and the remaining third rented more than one, on the left chart. Majority of those who rented more than once rented exactly two items, on the right chart. # In[13]: sns.set_style('whitegrid') fig, axes = plt.subplots(ncols=2, figsize=(15, 5)) # Plot the distribution of users by average rating sns.histplot(x='rating_average', data=user_data, bins=10, color='thistle', ax=axes[0]) axes[0].set_title('User Count by Average Rating', fontsize=16) axes[0].set_xlabel('Average Rating') axes[0].set_ylabel('User Count') axes[0].grid(axis='x') # Plot the distribution of users by average review length sns.histplot(x='review_length_average', data=user_data, bins=30, color='lightsteelblue', kde=True, ax=axes[1]) axes[1].set_title('User Count by Average Review Length', fontsize=16) axes[1].set_xlabel('Average Review Length') axes[1].set(ylabel=None) axes[1].grid(axis='x') plt.savefig('data/images/fig4.png', dpi=200, transparent=True) plt.show() # A left-skewed distribution for the average rating per user with most of them giving the highest rating, on the left chart. And a right-skewed distribution for the average length of text review per user, on the right chart. 
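
# To quantify what the two histograms suggest, a quick numeric check of the skew is below (a sketch, assuming the `user_data` table built above; a negative value indicates a left skew, a positive value a right skew).

# In[ ]:


# Sample skewness of the two distributions discussed above (sketch; assumes `user_data` exists)
user_data[['rating_average', 'review_length_average']].skew()
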
# In[14]: sns.set_style('whitegrid') fig, axes = plt.subplots(ncols=2, figsize=(15, 5)) # Plot the distribution of users by month of review posted sns.countplot(x='review_month_top', data=user_data, palette='PuBuGn', ax=axes[0]) axes[0].set_title('User Count by Top Month Rated', fontsize=16) axes[0].set_xlabel('Top Month Rated') axes[0].set_ylabel('User Count') # Plot the distribution of users by season of review posted sns.countplot(x='review_season_top', data=user_data, order=['Spring', 'Summer', 'Fall', 'Winter'], palette='PuBuGn', ax=axes[1]) axes[1].set_title('User Count by Top Season Rated', fontsize=16) axes[1].set_xlabel('Top Season Rated') axes[1].set(ylabel=None) plt.savefig('data/images/fig5.png', dpi=200, transparent=True) plt.show() # In[15]: sns.set_style('whitegrid') fig, axes = plt.subplots(ncols=2, figsize=(16, 8)) # Show top five clothing categories and top five reasons for rent category_top_data = user_data.loc[user_data['category_top'].isin(user_data['category_top'].value_counts()[:5].index.tolist())] rented_for_top_data = user_data.loc[user_data['rented_for_top'].isin(user_data['rented_for_top'].value_counts()[:5].index.tolist())] category_top_values = (category_top_data['category_top'].value_counts(normalize=True).values*100).tolist() category_top_labels = category_top_data['category_top'].value_counts().index.tolist() category_colors = ['#E5E4F4', '#E8F1DE', '#FDF9F0', '#F2E6F0', '#D9E4FB'] # Plot the distribution of users by clothing category axes[0].pie(category_top_values, labels=category_top_labels, colors=category_colors, autopct='%.0f%%') axes[0].set_title('User Percentage by Top 5 Clothing Category', fontsize=16) rented_for_top_values = (rented_for_top_data['rented_for_top'].value_counts(normalize=True).values*100).tolist() rented_for_top_labels = rented_for_top_data['rented_for_top'].value_counts().index.tolist() rented_for_colors = ['#FBFBFB', '#FAEDDA', '#D2C1CE', '#E1CEC9', '#F3C0A1'] # Plot the distribution of users by reason for rent axes[1].pie(rented_for_top_values, labels=rented_for_top_labels, colors=rented_for_colors, autopct='%.0f%%') axes[1].set_title('User Percentage by Top 5 Reason for Rent', fontsize=16) plt.savefig('data/images/fig6.png', dpi=200, transparent=True) plt.show() # The most common clothing categories are dresses and gowns that align with the most common reasons for renting which are for wedding, formal affair, and party. # *** # **Item Data** I create a separate table for item information by grouping the data by `item_id` and adding new features: # - `fit_small`, `fit_large`, and `fit` are the count of users who rated the item as too small, too large, or right fit. # - `user_count` is the total numbers of users who rated and reviewed the item. # - `bust_size_top` and `body_type_top` are the most common bust size and body type of the users who rented the item. # - `mean` and `median` of the `weight`, `height`, `size`, and `age` of all users who rented the item. # - `rating_average` is the average of all user ratings of the item. # In[16]: def create_item_data(df): ''' Groups the data by item and returns dataframe containing item details. 
''' item_df = pd.DataFrame(df.groupby('item_id').count().reset_index()['item_id']) for col in df.columns: if col == 'fit': feature_small = df.loc[df[col]=='small'].groupby('item_id').count()[col] feature_fit = df.loc[df[col]=='fit'].groupby('item_id').count()[col] feature_large = df.loc[df[col]=='large'].groupby('item_id').count()[col] for idx, feature in enumerate([feature_small, feature_fit, feature_large]): item_df = item_df.join(feature, on='item_id', rsuffix=idx).fillna(0) if col == 'user_id': feature = df.groupby(df['item_id']).count()[col] item_df = item_df.merge(feature, on='item_id') if col in ['bust_size', 'body_type']: feature = df.sort_values('review_date', ascending=False).groupby('item_id')[col].agg(pd.Series.mode).apply(lambda x: x[0] if type(x)==np.ndarray else x) item_df = item_df.merge(feature, on='item_id') if col in ['weight', 'height', 'size', 'age']: feature_mean = df.groupby(df['item_id']).mean()[col] feature_median = df.groupby(df['item_id']).median()[col] for feature in [feature_mean, feature_median]: item_df = item_df.merge(feature, on='item_id') if col in ['rating', 'review_length']: feature = df.groupby(df['item_id']).mean()[col] item_df = item_df.merge(feature, on='item_id') if col in ['rented_for', 'category']: feature = df.sort_values('review_date', ascending=False).groupby('item_id')[col].agg(pd.Series.mode).apply(lambda x: x[0] if type(x)==np.ndarray else x) item_df = item_df.merge(feature, on='item_id') if col == 'rented_for': feature = df.groupby('item_id')[col].apply(set).apply(lambda x: list(x)) item_df = item_df.merge(feature, on='item_id') if col in ['review_month', 'review_season']: feature = df.sort_values('review_date', ascending=False).groupby('item_id')[col].agg(lambda x: x.value_counts().index[0]) item_df = item_df.merge(feature, on='item_id') else: continue item_df.columns = ['item_id', 'fit_small', 'fit', 'fit_large', 'user_count', 'bust_size_top', 'weight_mean', 'weight_median', 'rating_average', 'rented_for_top', 'rented_for_all', 'body_type_top', 'category_top', 'height_mean', 'height_median', 'size_mean', 'size_median', 'age_mean', 'age_median', 'review_length_average', 'review_month_top', 'review_season_top'] return item_df # Table to use for item to item recommendations later: # In[17]: # Create new dataframe for item data item_data = create_item_data(data) item_data.head(2) # ### Time Series Analysis # In[18]: plt.style.use('seaborn') time_series_data = data.sort_values('review_date').set_index('review_date', drop=True).drop('2010-11-03') # Resample data to yearly count of reviews yearly_data = time_series_data.resample('Y').count() yearly_data = yearly_data.drop(yearly_data.index[-1]) # Plot the aggregated yearly count of reviews yearly_data['rating'].plot(figsize=(8,5), colormap='PRGn', xlabel='') plt.title('Total Count of Reviews By Year', fontsize=16) plt.savefig('data/images/fig7.png', dpi=200, transparent=True) plt.show() # The count of reviews increased over the years from 10,000 in 2013 to almost 70,000 by 2018. 
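
# For the exact figures behind the yearly plot, the same resample can be tabulated directly (a sketch, assuming the `time_series_data` frame defined in the cell above):

# In[ ]:


# Tabulate the yearly review counts plotted above (sketch; assumes `time_series_data` exists)
time_series_data.resample('Y')['rating'].count()
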
# In[19]:


# Resample data to monthly count of reviews
monthly_data = time_series_data[~(time_series_data['review_year']==2011)].resample('MS').count()
monthly_data = monthly_data.drop(monthly_data.index[-1])

# Plot the aggregated monthly count of reviews
monthly_data['rating'].plot(figsize=(8,5), colormap='seismic', xlabel='')

plt.title('Total Count of Reviews By Month', fontsize=16)
plt.savefig('data/images/fig8.png', dpi=200, transparent=True)
plt.show()


# The count of reviews peaks during the spring and fall months, with the highest spike of over 8,000 reviews in October 2017.

# In[20]:


# Resample data to yearly average rating of reviews
yearly_data = time_series_data.resample('Y').mean()
yearly_data = yearly_data.drop(yearly_data.index[-1])

# Plot the aggregated yearly average rating of reviews
yearly_data['rating'].plot(figsize=(8,5), colormap='PRGn', xlabel='')

plt.title('Average Rating of Reviews By Year', fontsize=16)
plt.savefig('data/images/fig9.png', dpi=200, transparent=True)
plt.show()


# The average rating steadily increased from just over 4.45 in 2013 to 4.575 in 2016, then dipped by less than 0.025 in 2017.

# In[21]:


# Resample data to monthly average rating of reviews
monthly_data = time_series_data[~(time_series_data['review_year']==2011)].resample('MS').mean()
monthly_data = monthly_data.drop(monthly_data.index[-1])

# Plot the aggregated monthly average rating of reviews
monthly_data['rating'].plot(figsize=(8,5), colormap='seismic', xlabel='')

plt.title('Average Rating of Reviews By Month', fontsize=16)
plt.savefig('data/images/fig10.png', dpi=200, transparent=True)
plt.show()


# The average ratings peak during the latter months of the year, aligning with the higher rental counts in the fall.

# ## IV. Recommendation Systems
# 
# > "Recommendation Systems are software agents that elicit the interests and preferences of individual consumers […] and make recommendations accordingly. They have the potential to support and improve the quality of the decisions consumers make while searching for and selecting products online." [(Bo Xiao and Izak Benbasat)](https://misq.org/e-commerce-product-recommendation-agents-use-characteristics-and-impact.html)
# 
# To start, I create a set of generalized recommendations based on all the data. For all items, I calculate a weighted rating and return the top 10 highest-rated items across the board. To **personalize the recommendations**, I apply different algorithms for Content-Based Recommenders and Collaborative Filtering Systems, which I implement later using the `surprise` library.
# ***
# 
# ### Popularity Recommendations
# 
# ##### Bayesian Average
# 
# $$W = \left(\frac{v}{v + m} \right)R + \left(\frac{m}{v + m} \right)C$$
# where:
# 
# $W$ = Weighted rating
# $v$ = Number of ratings for the item
# $m$ = Minimum number of ratings required to be listed on top chart
# $R$ = Average rating of the item
# $C$ = Mean rating across the entire data

# In[22]:


m = item_data['user_count'].quantile(0.9)
C = item_data['rating_average'].mean()

def weighted_rating(x, m=m, C=C):
    '''
    Calculates weighted rating based on Bayesian Average.
    '''
    v = x['user_count']
    R = x['rating_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

def popular_recommendation(df=data, n=10):
    '''
    Returns the most popular items according to the highest weighted ratings.
    '''
    item_df = create_item_data(df)
    top_item_ratings = item_df.loc[(item_df['user_count']>=m)]
    top_item_ratings['score'] = top_item_ratings.apply(weighted_rating, axis=1)
    top_item_ratings = top_item_ratings.sort_values('score', ascending=False)
    return top_item_ratings.head(n)


# Top 10 Popularity-Based Recommendations:

# In[23]:


pd.set_option('display.max_columns', 30)

top10_overall = popular_recommendation()
top10_overall


# To simulate the online shopping experience, I can also filter the popularity-based recommendations by data features, such as `dress` for clothing category or `wedding` for reason to rent, using the function `filter_popular_recommendation` defined below.

# In[24]:


column_list = []
operator_list = []
condition_list = []

def append_condition(column, operation, condition):
    '''
    Appends a filter to the column, operator, and condition lists.
    '''
    column_list.append(column)
    operator_list.append(operation)
    condition_list.append(condition)

def filter_popular_recommendation(df=data, n=10, bust_size=None, weight=None, rating=None, rented_for=None,
                                  body_type=None, category=None, height=None, size=None, age=None,
                                  review_month=None, review_season=None, review_year=None):
    '''
    Returns the most popular recommendations filtered by the features passed as arguments.
    '''
    if bust_size:
        append_condition('bust_size', '==', bust_size)
    if weight:
        append_condition('weight', '>=', weight-10)
        append_condition('weight', '<=', weight+10)
    if rented_for:
        append_condition('rented_for', '==', rented_for)
    if body_type:
        append_condition('body_type', '==', body_type)
    if category:
        append_condition('category', '==', category)
    if height:
        append_condition('height', '>=', height-2)
        append_condition('height', '<=', height+2)
    if size:
        append_condition('size', '==', size)
    if age:
        append_condition('age', '>=', age-4)
        append_condition('age', '<=', age+4)
    if review_month:
        append_condition('review_month', '==', review_month)
    if review_season:
        append_condition('review_season', '==', review_season)
    if review_year:
        append_condition('review_year', '==', review_year)
    condition = ' & '.join(f'{col} {op} {repr(cond)}' for col, op, cond in zip(column_list, operator_list, condition_list))
    filtered_df = df.query(condition)
    return popular_recommendation(filtered_df, n)

def reset_condition():
    '''
    Reinitializes the lists used to build the query for the filtered popularity recommender.
    '''
    column_list = []
    operator_list = []
    condition_list = []
    return column_list, operator_list, condition_list


# Top 10 Popular Recommendations for Dress:

# In[25]:


top10_dress = filter_popular_recommendation(category='dress')
column_list, operator_list, condition_list = reset_condition()
top10_dress


# Top 10 Popular Recommendations for Wedding:

# In[26]:


top10_wedding = filter_popular_recommendation(rented_for='wedding')
column_list, operator_list, condition_list = reset_condition()
top10_wedding


# ### Content-Based Recommenders
# 
# Content-based recommendation systems are based on the idea that if a user likes an item, the user will also like items similar to it.
To measure the similarity between the items, I calculate the Pearson correlation using numerical and categorical features from the table `item_data` created earlier. Then, I complete a `similarity_matrix` of all the items to use in the function `content_based_similarity` I define, which generates content-based recommendations for any `item_id`. Lastly, I use the text features later to create a **text review-based recommender** using Natural Language Processing. # In[27]: def item_similarity(item_df): ''' Measures pearson correlation of items from the table of item data and returns a similarity matrix. ''' item_df = item_df.drop(['fit_small', 'fit_large', 'weight_mean', 'rented_for_all', 'height_mean', 'size_mean', 'age_mean', 'review_month_top'], axis=1) similarity_features = item_df[['item_id', 'fit', 'user_count', 'weight_median', 'rating_average', 'rented_for_top', 'body_type_top', 'category_top', 'height_median', 'size_median', 'age_median', 'review_length_average', 'review_season_top']] similarity_features = similarity_features.set_index('item_id') similarity_features = pd.get_dummies(similarity_features, columns=['rented_for_top', 'body_type_top', 'category_top', 'review_season_top']) similarity_matrix = similarity_features.T similarity_matrix = similarity_matrix.corr(method='pearson') return similarity_features, similarity_matrix pd.set_option('display.max_columns', 30) def content_based_similarity(similarity_matrix, item_id, n=20): ''' Returns the most similar item recommendations to the given item based on the similarity matrix. ''' recommendations = similarity_matrix[item_id].sort_values(ascending=False) recommendations = recommendations.drop([item_id], axis=0).index recommendations_list = [] for i in range(n): recommendations_list.append(recommendations[i]) display(item_data.loc[item_data['item_id']==item_id]) print(f'----------------------------------------\nTop {n} Recommendations for Item #{item_id}:') recommendations_df = item_data.loc[item_data['item_id'].isin(recommendations_list)] return recommendations_df # In[28]: similarity_features, similarity_matrix = item_similarity(item_data) # In[29]: similarity_matrix # In[30]: # Example item content_based_similarity(similarity_matrix, 123373) # #### Text Review-Based Recommender # # To recommend items based on text reviews, Natural Language Processing (NLP) is used to: # - Clean the text by removing stopwords and performing lemmatization. # - Create the Term Frequency-Inverse Document Frequency (TF-IDF) vectors for the *documents*, which are the reviews. # - Compute the pairwise cosine similarity from the constructed matrix of TF-IDF scores. # In[31]: # Import the Natural Language Toolkit (nltk) import re import nltk nltk.download('stopwords') nltk.download('wordnet') stopwords = nltk.corpus.stopwords.words('english') # In[32]: from nltk.stem.wordnet import WordNetLemmatizer lemmatizer = nltk.stem.WordNetLemmatizer() def preprocess(text): ''' Text preprocessing to standardize, remove special characters and stopwords, and lemmatize. 
''' text = text.apply(lambda x: x.lower()) text = text.apply(lambda x: re.sub(r'([^A-Za-z0-9|\s|[:punct:]]*)', '', x)) text = text.apply(lambda x: x.replace('[^a-zA-Z#]', ' ')) text = text.apply(lambda x: ' '.join([i for i in x.split() if len(i)>3])) text = text.apply(lambda x: x.split()) text = text.apply(lambda x: [lemmatizer.lemmatize(word) for word in x]) text = text.apply(lambda x: [word for word in x if word not in stopwords]) text = text.apply(lambda x: ' '.join(x)) return text def create_text_df(df=data, item_df=item_data, text_review=True, category=False): ''' Creates new feature combining review summary and review text, to add to item data. ''' if item_df is None: item_df = create_item_df(df) text_df = df.copy() text_df['review'] = text_df['review_summary'] + ' ' + text_df['review_text'] text_df['review'] = text_df['review'].fillna('') text_df['review'] = preprocess(text_df['review']) if text_review: text_df = text_df[['item_id', 'review']].groupby('item_id').agg(' '.join).reset_index() text_item_df = item_df.merge(text_df, on='item_id') if text_review == False and category == True: text_df = text_df[['item_id', 'rented_for']].groupby('item_id').agg(' '.join).reset_index() text_item_df = item_df.merge(text_df, on='item_id') return text_item_df # In[33]: # Create new dataframe for text item data text_item_data = create_text_df() # In[34]: from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer count = CountVectorizer() tfidf = TfidfVectorizer(ngram_range=(1,3)) # > "The TF-IDF score is the frequency of a word occurring in a document, down-weighted by the number of documents in which it occurs." [(Aditya Sharma)](https://www.datacamp.com/community/tutorials/recommender-systems-python]) # # Finally, to compute the cosine similarity score between the text reviews, the dot product between each TF-IDF vector is calculated in the function `text_based_recommendation` below: # In[35]: from sklearn.metrics.pairwise import cosine_similarity, linear_kernel def text_based_recommendation(text_item_df, item_id, n=10, text_review=True, category=False): ''' Returns the most similar item recommendations to the given item based on text reviews. 
    '''
    if text_review:
        tfidf_matrix = tfidf.fit_transform(text_item_df['review'])
        cosine_similarity_ = linear_kernel(tfidf_matrix, tfidf_matrix)
    if text_review == False and category == True:
        count_matrix = count.fit_transform(text_item_df['rented_for'])
        cosine_similarity_ = cosine_similarity(count_matrix, count_matrix)
    indices = pd.Series(text_item_df.index, index=text_item_df['item_id']).drop_duplicates()
    idx = indices[item_id]
    similarity_scores = list(enumerate(cosine_similarity_[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_similarity_scores = similarity_scores[1:n+1]
    item_indices = [i[0] for i in top_similarity_scores]
    top_text_based_recommendations = text_item_df['item_id'].iloc[item_indices]
    display(item_data.loc[item_data['item_id']==item_id])
    print(f'----------------------------------------\nTop {n} Recommendations for Item #{item_id}:')
    recommendations_df = item_data.loc[item_data['item_id'].isin(top_text_based_recommendations)]
    return recommendations_df


# In[36]:


# Same example item
text_based_recommendation(text_item_data, 123373, n=10)


# **Key Differences** between the text-based recommendations and the content-based recommendations for the same item:
# 
# |Feature|Content-based|Text-based|Item|
# |---|---|---:|:---|
# |rating_average|4.38 - 4.69|4.43 - 4.77|4.40|
# |rented_for_top|party, formal affair, wedding|formal affair (across the board)|formal affair|
# |body_type_top|hourglass, athlete|hourglass (across the board)|hourglass|
# |category_top|dress, gown, sheath|gown (across the board)|gown|
# 
# ***
# 
# ### Collaborative Filtering Systems
# 
# Collaborative filtering systems recommend items to a user based on the user's past ratings *and* on the past ratings and preferences of other similar users. I apply different implementations of collaborative filtering recommendation systems using the Python library [`surprise`](https://surprise.readthedocs.io/en/stable/index.html):
# 
# |Prediction Algorithm|Description|
# |:---|:---|
# |[Normal Predictor](https://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor)|Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal.|
# |[Baseline Only](https://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly)|Algorithm predicting the baseline estimate for given user and item.|
# |[KNN Basic](https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBasic)|A basic collaborative filtering algorithm.|
# |[KNN Baseline](https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline)|A basic collaborative filtering algorithm taking into account a *baseline* rating.|
# |[KNN with Means](https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans)|A basic collaborative filtering algorithm, taking into account the mean ratings of each user.|
# |[KNN with Z-Score](https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithZScore)|A basic collaborative filtering algorithm, taking into account the z-score normalization of each user.|
# |[Singular Value Decomposition](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD)|The famous SVD algorithm, as popularized by Simon Funk during the Netflix Prize. When baselines are not used, this is equivalent to Probabilistic Matrix Factorization.|
# |[Singular Value Decomposition ++](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp)|The SVD++ algorithm, an extension of SVD taking into account implicit ratings.|
# |[Non-Negative Matrix Factorization](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.NMF)|A collaborative filtering algorithm based on Non-negative Matrix Factorization.|
# |[SlopeOne](https://surprise.readthedocs.io/en/stable/slope_one.html#surprise.prediction_algorithms.slope_one.SlopeOne)|A simple yet accurate collaborative filtering algorithm.|
# |[CoClustering](https://surprise.readthedocs.io/en/stable/co_clustering.html#surprise.prediction_algorithms.co_clustering.CoClustering)|A collaborative filtering algorithm based on co-clustering.|

# In[37]:


data = data.rename(columns={'user_id': 'userID', 'item_id': 'itemID'})

df_columns = ['userID', 'itemID', 'rating']
df = data[df_columns]

# Only use items with more than 25 ratings
df['reviews'] = df.groupby(['itemID'])['rating'].transform('count')
df = df.loc[df['reviews']>25, df_columns]


# In[38]:


from surprise import Reader, Dataset

reader = Reader(rating_scale=(1,5))
read_data = Dataset.load_from_df(df, reader)


# Data Modeling:

# In[39]:


from surprise import NormalPredictor, BaselineOnly, SVD, SVDpp, NMF, SlopeOne, CoClustering
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import knns

sim_cos = {'name':'cosine', 'user_based':False}

evaluation = []
recommendation_systems = [NormalPredictor(), BaselineOnly(), knns.KNNBasic(sim_options=sim_cos),
                          knns.KNNBaseline(sim_options=sim_cos), knns.KNNWithMeans(sim_options=sim_cos),
                          knns.KNNWithZScore(sim_options=sim_cos), SVD(), SVDpp(), NMF(), SlopeOne(), CoClustering()]

# Evaluate recommendation systems using Mean Absolute Error
for system in recommendation_systems:
    score = cross_validate(system, read_data, measures=['MAE'], cv=3, verbose=False)
    evaluation.append((str(system).split(' ')[0].split('.')[-1], score['test_mae'].mean()))

pd.options.display.float_format = '{:.4f}'.format
evaluation = pd.DataFrame(evaluation, columns=['system', 'mae'])


# To evaluate the systems, I use the mean absolute error, which measures the average absolute difference between the rating predicted by the model and the actual rating given by the user:

# In[40]:


evaluation


# In[41]:


# Switch similarity measure from cosine to pearson
sim_pearson = {'name':'pearson', 'user_based':False}

pearson_evaluation = []
pearson_knns = [knns.KNNBasic(sim_options=sim_pearson), knns.KNNBaseline(sim_options=sim_pearson),
                knns.KNNWithMeans(sim_options=sim_pearson), knns.KNNWithZScore(sim_options=sim_pearson)]

for system in pearson_knns:
    pearson_score = cross_validate(system, read_data, measures=['MAE'], cv=3, verbose=False)
    pearson_evaluation.append((str(system).split(' ')[0].split('.')[-1], pearson_score['test_mae'].mean()))

pearson_evaluation = pd.DataFrame(pearson_evaluation, columns=['system', 'mae'])
pearson_evaluation


# With the Pearson similarity, the mean absolute errors of `KNNBaseline`, `KNNWithMeans`, and `KNNWithZScore` decreased by about 0.01, and `KNNBaseline` became second only to `SVD`.
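
# As a complement to the 3-fold cross-validation above, the stronger systems can be spot-checked on a single held-out split (a sketch, assuming `read_data`, `SVD`, `knns`, and `sim_pearson` from the cells above; `train_test_split` and `accuracy` are part of `surprise`).

# In[ ]:


# Spot-check two of the stronger systems on one 80/20 held-out split
# (sketch; assumes read_data, SVD, knns, and sim_pearson are defined above)
from surprise import accuracy
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(read_data, test_size=0.2, random_state=42)
for system in [SVD(), knns.KNNBaseline(sim_options=sim_pearson)]:
    predictions = system.fit(trainset).test(testset)
    print(type(system).__name__, round(accuracy.mae(predictions, verbose=False), 4))
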
# 
# ***
# 
# #### Hyperparameter Tuning
# 
# `GridSearchCV` is performed to optimize the Singular Value Decomposition models:

# In[42]:


from surprise.model_selection import GridSearchCV

def grid_search(system, params):
    '''
    Implements grid search and returns best cross validation scores and parameters.
    '''
    model = GridSearchCV(system, param_grid=params, n_jobs=-1)
    model.fit(read_data)
    print(model.best_score)
    print(model.best_params)


# In[43]:


params_svd1 = {'n_factors': [10, 50, 100], 'n_epochs': [10, 20, 100],
               'lr_all': [0.001, 0.005, 0.01], 'reg_all': [0.02, 0.05, 0.1]}

grid_search(SVD, params_svd1)


# In[44]:


params_svdpp1 = {'n_factors': [10, 50, 100], 'n_epochs': [10, 20, 100],
                 'lr_all': [0.001, 0.005, 0.01], 'reg_all': [0.02, 0.05, 0.1]}

grid_search(SVDpp, params_svdpp1)


# In[45]:


svd_evaluation = []

# Evaluate tuned Singular Value Decomposition models
for system in [SVD(n_factors=10, n_epochs=20, lr_all=0.01, reg_all=0.02),
               SVDpp(n_factors=10, n_epochs=100, lr_all=0.005, reg_all=0.1)]:
    svd_score = cross_validate(system, read_data, measures=['MAE'], cv=3, verbose=False)
    svd_evaluation.append((str(system).split(' ')[0].split('.')[-1], svd_score['test_mae'].mean()))

svd_evaluation = pd.DataFrame(svd_evaluation, columns=['system', 'mae'])
svd_evaluation


# ## V. Results and Recommendations
# 
# #### Systems Performance:

# In[46]:


all_systems = pd.concat([evaluation, pearson_evaluation, svd_evaluation], ignore_index=True)
all_systems


# The results show that the tuned Singular Value Decomposition algorithm attains the lowest Mean Absolute Error of 0.5302 on a rating scale of 1 to 5.

# In[47]:


sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(20,8))
plt.subplots_adjust(bottom=0.2)

# Plot the Mean Absolute Error of the models
sns.barplot(all_systems.index, all_systems['mae'], palette='tab20b')

ax.set(xlim=[-0.5,16.5], xlabel='Recommendation System', ylabel='Mean Absolute Error')
ax.set_title('Collaborative Filtering and Recommender Systems Evaluation', fontsize=20)

labels = ['Normal Predictor', 'Baseline Only', 'KNN Basic Cosine', 'KNN Baseline Cosine', 'KNN Means Cosine',
          'KNN Z-Score Cosine', 'Default SVD', 'Default SVD++', 'NMF', 'Slope One', 'Co-Clustering',
          'KNN Basic Pearson', 'KNN Baseline Pearson', 'KNN Means Pearson', 'KNN Z-Score Pearson',
          'Tuned SVD', 'Tuned SVD++']

plt.xticks(all_systems.index, labels, rotation=45)
plt.savefig('data/images/fig11.png', dpi=200, transparent=True)
plt.show()


# #### Recommender Engine:

# In[48]:


def svd_recommendation(user_id, n=10):
    '''
    Returns top item recommendations generated by the Singular Value Decomposition model.
''' unique_ids = df['itemID'].unique() item_user_id = df.loc[df['userID']==user_id, 'itemID'] items_to_predict = np.setdiff1d(unique_ids, item_user_id) engine = SVD(n_factors=10, n_epochs=20, lr_all=0.01, reg_all=0.02) engine.fit(read_data.build_full_trainset()) svd_recommendations = [] for i in items_to_predict: svd_recommendations.append((i, engine.predict(uid=user_id, iid=i).est)) display(user_data.loc[user_data['user_id']==user_id]) print(f'----------------------------------------\nTop {n} Recommendations for User #{user_id}:') svd_recommendations = pd.DataFrame(svd_recommendations, columns=['item_id', 'predicted_rating']) svd_recommendations = svd_recommendations.sort_values('predicted_rating', ascending=False).head(n) svd_recommendations = svd_recommendations.merge(item_data, on='item_id') return svd_recommendations # In[49]: # Sample user sample = user_data.sort_values('rating_count', ascending=False) sample.loc[((user_data['bust_size']=='32a') & (user_data['height']==62))].head(5) # In[50]: svd_recommendation(480611) # ## VI. Further Research # # For further research, the data should be updated with more recent rentals, and more features should be added such as prices for the products as well as product description. # # Contact # # Feel free to contact me for any questions and connect with me on [Linkedin](https://www.linkedin.com/in/czarinagluna/).