################################
## STEP 01: Import Libraries ##
################################
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from IPython.display import display
#############################
## STEP 02: Read Data ####
#############################
# Reading ratings file.
# Columns (per the sample shown below): userId, rating, prod_name.
ratings = pd.read_csv('https://raw.githubusercontent.com/aakashgoel12/blogs/master/input/product_ratings_final.csv',\
encoding='latin-1')
# ratings.reset_index(drop=True, inplace=True)
# Reproducible 5-row peek; display() comes from IPython.display (notebook environment).
display(ratings.sample(n=5, random_state=42))
userId | rating | prod_name | |
---|---|---|---|
28266 | daitaliana23 | 5 | Storkcraft Tuscany Glider and Ottoman, Beige C... |
15603 | beverly | 5 | Lysol Concentrate Deodorizing Cleaner, Origina... |
7839 | amy77 | 5 | Clorox Disinfecting Wipes Value Pack Scented 1... |
4850 | dmann10101 | 5 | The Resident Evil Collection 5 Discs (blu-Ray) |
4699 | morenito021582 | 5 | The Resident Evil Collection 5 Discs (blu-Ray) |
#################################
## STEP 03: Data Preparation ####
#################################
def apply_pivot(df, fillby=None):
    """Pivot long-format ratings into a user x product matrix.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'userId', 'prod_name' and 'rating' columns.
    fillby : scalar, optional
        If given, NaN cells (user never rated the product) are replaced
        with this value; if None, NaNs are kept.

    Returns
    -------
    pd.DataFrame
        Index = userId, columns = prod_name, values = mean rating
        (pivot_table's default aggregation).
    """
    # Build the pivot once and fill conditionally — the original duplicated
    # the pivot_table call in each branch.
    pivot = df.pivot_table(index='userId', columns='prod_name', values='rating')
    if fillby is not None:
        pivot = pivot.fillna(fillby)
    return pivot
#3.1 Train/test split of the raw ratings (70/30, fixed seed for reproducibility)
train, test = train_test_split(ratings, test_size=0.30, random_state=42)
# Keep only test users that also appear in train — user-user CF cannot score
# users it has never seen.
test = test[test.userId.isin(train.userId)]
#3.2 Pivot to user x product rating matrices; unrated cells are filled with 0
df_train_pivot = apply_pivot(df=train, fillby=0)
df_test_pivot = apply_pivot(df=test, fillby=0)
#3.3 Dummy (mask) matrices marking rated vs unrated cells
## Train mask: 1 where the user has NOT rated the product (recommendation candidates)
dummy_train = train.copy()
dummy_train['rating'] = np.where(dummy_train['rating'] >= 1, 0, 1)
dummy_train = apply_pivot(df=dummy_train, fillby=1)
## Test mask: 1 where the user HAS rated the product (used for evaluation)
dummy_test = test.copy()
dummy_test['rating'] = np.where(dummy_test['rating'] >= 1, 1, 0)
dummy_test = apply_pivot(df=dummy_test, fillby=0)
# Peek at train users who rated either of two sample products
df_train_pivot[(df_train_pivot['0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. Fire File Chest'] != 0) |
               (df_train_pivot['4C Grated Parmesan Cheese 100% Natural 8oz Shaker'] != 0)]
prod_name | 0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. Fire File Chest | 100:Complete First Season (blu-Ray) | 2017-2018 Brownline174 Duraflex 14-Month Planner 8 1/2 X 11 Black | 2x Ultra Era with Oxi Booster, 50fl oz | 42 Dual Drop Leaf Table with 2 Madrid Chairs" | 4C Grated Parmesan Cheese 100% Natural 8oz Shaker | Africa's Best No-Lye Dual Conditioning Relaxer System Super | Alberto VO5 Salon Series Smooth Plus Sleek Shampoo | All,bran Complete Wheat Flakes, 18 Oz. | Ambi Complexion Cleansing Bar | ... | Vicks Vaporub, Regular, 3.53oz | Voortman Sugar Free Fudge Chocolate Chip Cookies | Wagan Smartac 80watt Inverter With Usb | Wallmount Server Cabinet (450mm, 9 RU) | Way Basics 3-Shelf Eco Narrow Bookcase Storage Shelf, Espresso - Formaldehyde Free - Lifetime Guarantee | Wedding Wishes Wedding Guest Book | Weleda Everon Lip Balm | Windex Original Glass Cleaner Refill 67.6oz (2 Liter) | Yes To Carrots Nourishing Body Wash | Yes To Grapefruit Rejuvenating Body Wash |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
userId | |||||||||||||||||||||
brewno | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
deelee | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
embum | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
erinn | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
rmtarboro | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
smokey bear | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
spicesea | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
7 rows × 231 columns
#####################################
## STEP 04: User-User Similarity ####
#####################################
# To calculate each user's mean, use only the ratings the user actually gave
# (NaN-aware mean) instead of a 0-filled pivot, which would inflate the denominator.
mean = np.nanmean(apply_pivot(df = train), axis = 1)
# Mean-center each user's ratings (subtract the user's own mean row-wise).
df_train_subtracted = (apply_pivot(df = train).T-mean).T
# Make rating=0 where user hasn't given any rating
df_train_subtracted.fillna(0, inplace = True)
# User-user similarity: cosine similarity (1 - cosine distance) on the
# mean-centered matrix; shape is user x user (18025 x 18025 per the run below).
user_correlation = 1 - pairwise_distances(df_train_subtracted, metric='cosine')
# Zero out NaN similarities (arise for users with zero-norm rating vectors).
user_correlation[np.isnan(user_correlation)] = 0
# Optional clipping of negative correlations, left disabled by the author:
# user_correlation[user_correlation<0] = 0
# Convert the user_correlation matrix into a DataFrame indexed and columned by userId.
user_correlation_df = pd.DataFrame(user_correlation)
user_correlation_df['userId'] = df_train_subtracted.index
user_correlation_df.set_index('userId',inplace=True)
user_correlation_df.columns = df_train_subtracted.index.tolist()
# Shapes: (n_users, n_users) and (n_users, n_products).
user_correlation.shape,df_train_pivot.shape
((18025, 18025), (18025, 231))
###########################################
## STEP 05: Predict Rating (User-User) ####
###########################################
# Predicted rating = similarity-weighted sum of all users' ratings, computed for
# both rated and unrated products at once via a matrix product.
user_predicted_ratings = np.dot(user_correlation, df_train_pivot)
# Keep only products the user has NOT yet rated: dummy_train holds 1 for unrated
# cells and 0 for rated ones, so element-wise multiplication zeroes out the latter.
user_final_rating = np.multiply(user_predicted_ratings,dummy_train)
# Optional rescaling of predictions to the 1-5 range, left disabled by the author:
# scaler = MinMaxScaler(feature_range=(1, 5))
# scaler.fit(user_final_rating)
# user_final_rating = scaler.transform(user_final_rating)
################################################################
## STEP 06: Find Top N recommendation for User (User-User) #####
################################################################
def find_top_recommendations(pred_rating_df, userid, topn):
    """Return the `topn` products with the highest predicted rating for `userid`.

    The result is a two-column DataFrame: the product index column and
    'predicted_ratings'.
    """
    # Take the user's row, rank scores high-to-low, keep the first `topn`.
    top_scores = pred_rating_df.loc[userid].sort_values(ascending=False).head(topn)
    # Series.reset_index() yields a frame whose value column is named after the
    # user; rename it to the generic 'predicted_ratings'.
    return top_scores.reset_index().rename(columns={userid: 'predicted_ratings'})
# Prompt for the user id whose recommendations should be generated.
# NOTE(review): input() already returns str, so the str() wrapper is redundant but harmless.
user_input = str(input("Enter your user id"))
# Top-5 recommendations for this user from the final predicted-rating matrix.
recommendation_user_user = find_top_recommendations(user_final_rating, user_input, 5)
# Tag each recommended row with the requesting user's id.
recommendation_user_user['userId'] = user_input
Enter your user idjoshua
# Show recommendations next to the user's historical ratings so the
# suggestions can be sanity-checked against known preferences.
print("Recommended products for user id:{} as below".format(user_input))
display(recommendation_user_user)
print("Earlier rated products by user id:{} as below".format(user_input))
display(train[train['userId']==user_input].sort_values(['rating'],ascending=False))
Recommended products for user id:joshua as below
prod_name | predicted_ratings | userId | |
---|---|---|---|
0 | Clorox Disinfecting Wipes Value Pack Scented 1... | 5.226926 | joshua |
1 | Lysol Concentrate Deodorizing Cleaner, Origina... | 3.750000 | joshua |
2 | Head & Shoulders Dandruff Shampoo Ocean Lift 2... | 3.535534 | joshua |
3 | Bounce Dryer Sheets, Fresh Linen, 160 sheets | 3.535534 | joshua |
4 | The Resident Evil Collection 5 Discs (blu-Ray) | 3.345348 | joshua |
Earlier rated products by user id:joshua as below
userId | rating | prod_name | |
---|---|---|---|
0 | joshua | 5 | Pink Friday: Roman Reloaded Re-Up (w/dvd) |
17718 | joshua | 5 | Smead174 Recycled Letter Size Manila File Back... |
22379 | joshua | 5 | Cheetos Crunchy Flamin' Hot Cheese Flavored Sn... |
1541 | joshua | 3 | Dark Shadows (includes Digital Copy) (ultravio... |
################################################
## STEP 07: Evaluation (User-User) on test #####
################################################
# Filter user correlations to users present in test; test userIds are a subset of train's.
user_correlation_test_df = user_correlation_df[user_correlation_df.index.isin(test.userId)]
user_correlation_test_df = user_correlation_test_df[list(set(test.userId))]
# Optional clipping of negative correlations, left disabled by the author:
# user_correlation_test_df[user_correlation_test_df<0]=0
# Predicted ratings for test users (similarity-weighted sum over test ratings).
test_user_predicted_ratings = np.dot(user_correlation_test_df, df_test_pivot)
# Keep only cells the user actually rated in test (dummy_test is 1 there, 0 elsewhere).
test_user_predicted_ratings = np.multiply(test_user_predicted_ratings,dummy_test)
# Set NaN where the user never rated, so those cells don't contribute to RMSE.
test_user_predicted_ratings = test_user_predicted_ratings[test_user_predicted_ratings>0]
# Rescale predictions to the 1-5 rating range (per product column).
# NOTE(review): columns that are entirely NaN trigger the "All-NaN slice"
# RuntimeWarnings seen in the captured output; those columns stay NaN after scaling.
scaler = MinMaxScaler(feature_range=(1, 5))
scaler.fit(test_user_predicted_ratings)
test_user_predicted_ratings = scaler.transform(test_user_predicted_ratings)
# RMSE over the cells that have a prediction: scaled predictions vs actual test ratings.
total_non_nan = np.count_nonzero(~np.isnan(test_user_predicted_ratings))
rmse = (np.sum(np.sum((apply_pivot(df = test) - test_user_predicted_ratings)**2))/total_non_nan)**0.5
print(rmse)
2.506663023687151
C:\Users\aakashgoel\Anaconda3\envs\forecast_anaconda\lib\site-packages\sklearn\preprocessing\_data.py:464: RuntimeWarning: All-NaN slice encountered data_min = np.nanmin(X, axis=0) C:\Users\aakashgoel\Anaconda3\envs\forecast_anaconda\lib\site-packages\sklearn\preprocessing\_data.py:465: RuntimeWarning: All-NaN slice encountered data_max = np.nanmax(X, axis=0)
############################
## STEP 08: Save Model ####
############################
# Persist the final predicted-rating matrix for the inference script.
# Use a context manager so the file handle is closed (the original passed an
# anonymous open() whose handle was never closed).
# NOTE(review): the './model' directory must already exist.
with open('./model/user_final_rating.pkl', 'wb') as model_file:
    pickle.dump(user_final_rating, model_file)
################################
## STEP 01: Import Libraries ##
################################
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from imblearn import over_sampling
from IPython.display import display
#############################
## STEP 02: Read Data ####
#############################
# Reading product review sentiment file.
# Columns (per the sample shown below): 'Review' (pre-cleaned text) and
# 'user_sentiment' (1 = positive, 0 = negative).
df_prod_review = pd.read_csv('https://raw.githubusercontent.com/aakashgoel12/blogs/master/input/product_review_sentiment.csv',\
encoding='latin-1')
# Reproducible 5-row peek at the data.
display(df_prod_review.sample(n=5, random_state=42))
Review | user_sentiment | |
---|---|---|
9329 | fresh clean smell everything need quick clean ... | 1 |
4160 | great vacuum love lightweight vacuum easy carr... | 1 |
18500 | smell great wipe easy use work smell great | 1 |
8840 | product count use clorox wipe everything trave... | 1 |
5098 | great movie excellent movie add blu ray collec... | 1 |
#################################
## STEP 03: Data Preparation ####
#################################
# Features: raw review text; target: binary sentiment label.
x=df_prod_review['Review']
y=df_prod_review['user_sentiment']
print("Checking distribution of +ve and -ve review sentiment: \n{}".format(y.value_counts(normalize=True)))
# Split the dataset into test and train
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=50)
# The data is imbalanced (~89% positive per the output below); balance the
# TRAINING set only via random over-sampling of the minority class.
ros = over_sampling.RandomOverSampler(random_state=0)
X_train, y_train = ros.fit_resample(pd.DataFrame(X_train), pd.Series(y_train))
print("Checking distribution of +ve and -ve review sentiment after oversampling: \n{}".format(y_train.value_counts(normalize=True)))
# Convert the resampled frame back into a list of review strings.
X_train = X_train['Review'].tolist()
Checking distribution of +ve and -ve review sentiment: 1 0.888401 0 0.111599 Name: user_sentiment, dtype: float64 Checking distribution of +ve and -ve review sentiment after oversampling: 1 0.5 0 0.5 Name: user_sentiment, dtype: float64
################################################################
## STEP 04: Feature Engineering (Convert text into numbers) ####
################################################################
# TF-IDF over word 1-3 grams; English stop words removed; a token is any \w{1,} run;
# max_df/min_df prune terms appearing in >80% or <1% of documents; sublinear_tf
# uses 1+log(tf) instead of raw term frequency.
word_vectorizer = TfidfVectorizer(strip_accents='unicode', token_pattern=r'\w{1,}',\
ngram_range=(1, 3), stop_words='english', sublinear_tf=True, max_df = 0.80, min_df = 0.01)
# Fitting vocabulary/IDF on the (oversampled) training reviews only — avoids test leakage.
word_vectorizer.fit(X_train)
# Transforming train and test into sparse document-term matrices.
X_train_transformed = word_vectorizer.transform(X_train)
X_test_transformed = word_vectorizer.transform(X_test.tolist())
# Shapes: (n_train_docs, n_terms), (n_test_docs, n_terms).
X_train_transformed.shape, X_test_transformed.shape
((33468, 263), (8062, 263))
# print(list(word_vectorizer.get_feature_names()))
###############################################
## STEP 05: ML Model (Logistic Regression) ####
###############################################
def evaluate_model(y_pred, y_actual):
    """Print a classification report plus sensitivity and specificity.

    Parameters
    ----------
    y_pred : array-like
        Predicted binary labels (0/1).
    y_actual : array-like
        Ground-truth binary labels (0/1).

    Prints (does not return) the sklearn classification report,
    sensitivity = TP/(TP+FN) and specificity = TN/(TN+FP), rounded to 2 dp.
    """
    print(classification_report(y_true = y_actual, y_pred = y_pred))
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent from
    # the inputs; the original's bare confusion_matrix would return a 1x1
    # matrix in that case and the indexing below would raise IndexError.
    cm = confusion_matrix(y_true=y_actual, y_pred=y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()
    # Sensitivity (true-positive rate); guard against an empty positive class.
    sensitivity = round(TP / float(FN + TP), 2) if (FN + TP) else float('nan')
    print("sensitivity: {}".format(sensitivity))
    # Specificity (true-negative rate); guard against an empty negative class.
    specificity = round(TN / float(TN + FP), 2) if (TN + FP) else float('nan')
    print("specificity: {}".format(specificity))
#4.1 Model Training
# NOTE(review): the 4.x numbering in these comments doesn't match the STEP 05 header above.
logit = LogisticRegression()
logit.fit(X_train_transformed,y_train)
#4.2 Prediction on Train Data
y_pred_train= logit.predict(X_train_transformed)
#4.3 Prediction on Test Data
y_pred_test = logit.predict(X_test_transformed)
#4.4 Evaluation on Train (balanced by oversampling, so metrics are symmetric)
print("Evaluation on Train dataset ..")
evaluate_model(y_pred = y_pred_train, y_actual = y_train)
print("Evaluation on Test dataset ..")
#4.5 Evaluation on Test (original imbalanced distribution)
evaluate_model(y_pred = y_pred_test, y_actual = y_test)
Evaluation on Train dataset .. precision recall f1-score support 0 0.82 0.83 0.82 16734 1 0.83 0.81 0.82 16734 accuracy 0.82 33468 macro avg 0.82 0.82 0.82 33468 weighted avg 0.82 0.82 0.82 33468 sensitivity: 0.81 specificity: 0.83 Evaluation on Test dataset .. precision recall f1-score support 0 0.35 0.80 0.49 922 1 0.97 0.81 0.88 7140 accuracy 0.81 8062 macro avg 0.66 0.81 0.69 8062 weighted avg 0.90 0.81 0.84 8062 sensitivity: 0.81 specificity: 0.8
############################
## STEP 06: Save Model ####
############################
# Persist the classifier and the fitted vectorizer for the inference script.
# Use context managers so the file handles are closed (the originals passed
# anonymous open() calls whose handles were never closed).
# NOTE(review): the './model' directory must already exist.
with open('./model/logit_model.pkl', 'wb') as model_file:
    pickle.dump(logit, model_file)
with open('./model/word_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(word_vectorizer, vectorizer_file)
################################
## STEP 01: Import Libraries ##
################################
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import MinMaxScaler
#############################
## STEP 02: Read Data ####
#############################
# Reading product review data ('prod_name' + pre-cleaned 'Review' text per the sample below).
df_prod_review = pd.read_csv('https://raw.githubusercontent.com/aakashgoel12/blogs/master/input/product_review.csv',\
encoding='latin-1')
# NOTE(review): display() is not imported in this section's imports (IPython.display);
# this works in a notebook where display is injected — confirm before running as a plain script.
display(df_prod_review.sample(n=5, random_state=42))
prod_name | Review | |
---|---|---|
2501 | Hawaiian Punch Berry Limeade Blast Juice | pretty good stuff much sugar kid like |
21252 | Godzilla 3d Includes Digital Copy Ultraviolet ... | enteraining great interesting version classic ... |
23503 | Godzilla 3d Includes Digital Copy Ultraviolet ... | best godzilla date like previous godzilla film... |
26827 | Storkcraft Tuscany Glider and Ottoman, Beige C... | comfy good put baby sleep calming sister mom n... |
18210 | Clorox Disinfecting Bathroom Cleaner | product easy use product easy use open use har... |
###########################
## STEP 03: Load Model ####
###########################
# Load the artifacts produced by the two training scripts.
# SECURITY: pickle.load can execute arbitrary code — only unpickle trusted files.
# Context managers close the handles (the originals leaked open file objects).
with open('./model/logit_model.pkl', 'rb') as f:
    model = pickle.load(f)
with open('./model/word_vectorizer.pkl', 'rb') as f:
    word_vectorizer = pickle.load(f)
with open('./model/user_final_rating.pkl', 'rb') as f:
    user_final_rating = pickle.load(f)
##########################################################################
## STEP 04: Get positive review Recommendation only for given user id ####
##########################################################################
def find_top_recommendations(pred_rating_df, userid, topn):
    """Return the `topn` products with the highest predicted rating for `userid`.

    The result is a two-column DataFrame: the product index column and
    'predicted_ratings'.
    """
    # Take the user's row, rank scores high-to-low, keep the first `topn`.
    top_scores = pred_rating_df.loc[userid].sort_values(ascending=False).head(topn)
    # Series.reset_index() yields a frame whose value column is named after the
    # user; rename it to the generic 'predicted_ratings'.
    return top_scores.reset_index().rename(columns={userid: 'predicted_ratings'})
def get_sentiment_product(x):
    """Return the mean predicted sentiment for product `x`.

    Since predictions are 0/1, the mean is the fraction of the product's
    reviews classified as positive.

    Relies on module-level globals: df_prod_review, word_vectorizer, model.
    """
    # Collect every review written for this product.
    reviews = df_prod_review.loc[df_prod_review['prod_name'] == x, 'Review'].tolist()
    # Vectorize the reviews into the model's TF-IDF feature space.
    review_features = word_vectorizer.transform(reviews)
    # Share of positive predictions.
    return model.predict(review_features).mean()
def find_top_pos_recommendation(user_final_rating, user_input, df_prod_review, word_vectorizer,\
                                model, no_recommendation, n_candidates=10):
    """Display top product recommendations for a user, re-ranked by review sentiment.

    Combines user-user collaborative-filtering scores with a per-product
    sentiment score: ranking = 1*predicted_rating + 2*scaled_sentiment.

    Parameters
    ----------
    user_final_rating : pd.DataFrame
        User x product matrix of predicted ratings (index = userId).
    user_input : str
        The user id to recommend for.
    df_prod_review, word_vectorizer, model :
        Passed for interface symmetry; NOTE(review): get_sentiment_product
        currently reads these from module-level globals, not these arguments.
    no_recommendation : int
        Number of final recommendations to display.
    n_candidates : int, optional (default 10)
        Size of the initial CF candidate pool. Resolves the original
        "10 is manually coded, need to change" TODO backward-compatibly.
    """
    ## Generate top candidates using the user-user recommender (without sentiment).
    recommendation_user_user = find_top_recommendations(user_final_rating, user_input, n_candidates)
    recommendation_user_user['userId'] = user_input
    ## Drop candidates whose predicted rating is zero (no usable signal).
    recommendation_user_user = recommendation_user_user[recommendation_user_user['predicted_ratings']!=0]
    print("Recommended products for user id:{} without using sentiment".format(user_input))
    display(recommendation_user_user)
    ## Overall sentiment score (fraction of positive reviews) per candidate product.
    recommendation_user_user['sentiment_score'] = recommendation_user_user['prod_name'].apply(get_sentiment_product)
    ## Rescale sentiment to 1-5 so it is commensurate with predicted ratings.
    scaler = MinMaxScaler(feature_range=(1, 5))
    scaler.fit(recommendation_user_user[['sentiment_score']])
    recommendation_user_user['sentiment_score'] = scaler.transform(recommendation_user_user[['sentiment_score']])
    ## Final score: 1*predicted rating + 2*scaled sentiment (sentiment weighted double).
    recommendation_user_user['product_ranking_score'] = 1*recommendation_user_user['predicted_ratings'] + \
        2*recommendation_user_user['sentiment_score']
    print("Recommended products for user id:{} after using sentiment".format(user_input))
    ## Show only the top `no_recommendation` rows by final ranking score.
    display(recommendation_user_user.sort_values(by = ['product_ranking_score'],ascending = False).head(no_recommendation))
# Prompt for the user id and show sentiment-aware recommendations for them.
user_input = str(input("Enter your user id"))
find_top_pos_recommendation(user_final_rating, user_input, df_prod_review, word_vectorizer,\
model, no_recommendation = 5)
Enter your user idjoshua Recommended products for user id:joshua without using sentiment
prod_name | predicted_ratings | userId | |
---|---|---|---|
0 | Clorox Disinfecting Wipes Value Pack Scented 1... | 5.226926 | joshua |
1 | Lysol Concentrate Deodorizing Cleaner, Origina... | 3.750000 | joshua |
2 | Head & Shoulders Dandruff Shampoo Ocean Lift 2... | 3.535534 | joshua |
3 | Bounce Dryer Sheets, Fresh Linen, 160 sheets | 3.535534 | joshua |
4 | The Resident Evil Collection 5 Discs (blu-Ray) | 3.345348 | joshua |
5 | Hormel Chili, No Beans | 3.286511 | joshua |
6 | Chester's Cheese Flavored Puffcorn Snacks | 2.204404 | joshua |
7 | Mike Dave Need Wedding Dates (dvd + Digital) | 0.720898 | joshua |
8 | Storkcraft Tuscany Glider and Ottoman, Beige C... | 0.708318 | joshua |
9 | Ceiling Fan With Light White 14.2 X 29.9 X 9.2... | 0.708318 | joshua |
Recommended products for user id:joshua after using sentiment
prod_name | predicted_ratings | userId | sentiment_score | product_ranking_score | |
---|---|---|---|---|---|
0 | Clorox Disinfecting Wipes Value Pack Scented 1... | 5.226926 | joshua | 5.000000 | 15.226926 |
3 | Bounce Dryer Sheets, Fresh Linen, 160 sheets | 3.535534 | joshua | 4.390329 | 12.316191 |
8 | Storkcraft Tuscany Glider and Ottoman, Beige C... | 0.708318 | joshua | 4.978562 | 10.665442 |
5 | Hormel Chili, No Beans | 3.286511 | joshua | 3.202279 | 9.691070 |
6 | Chester's Cheese Flavored Puffcorn Snacks | 2.204404 | joshua | 3.641906 | 9.488215 |