################################
## STEP 01: Import Libraries ##
################################
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from IPython.display import display
#############################
## STEP 02: Read Data ####
#############################
# Reading ratings file.
# Columns (per the sample shown below): userId, rating, prod_name.
ratings = pd.read_csv('https://raw.githubusercontent.com/aakashgoel12/blogs/master/input/product_ratings_final.csv',\
encoding='latin-1')
# ratings.reset_index(drop=True, inplace=True)
# Reproducible 5-row peek; display() comes from IPython.display (notebook environment).
display(ratings.sample(n=5, random_state=42))
userId | rating | prod_name | |
---|---|---|---|
28266 | daitaliana23 | 5 | Storkcraft Tuscany Glider and Ottoman, Beige C... |
15603 | beverly | 5 | Lysol Concentrate Deodorizing Cleaner, Origina... |
7839 | amy77 | 5 | Clorox Disinfecting Wipes Value Pack Scented 1... |
4850 | dmann10101 | 5 | The Resident Evil Collection 5 Discs (blu-Ray) |
4699 | morenito021582 | 5 | The Resident Evil Collection 5 Discs (blu-Ray) |
#################################
## STEP 03: Data Preparation ####
#################################
def apply_pivot(df, fillby=None):
    """Pivot long-format ratings into a user x product matrix.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'userId', 'prod_name' and 'rating' columns.
    fillby : scalar, optional
        If given, NaN cells (user never rated the product) are replaced
        with this value; if None, NaNs are kept.

    Returns
    -------
    pd.DataFrame
        Index = userId, columns = prod_name, values = mean rating
        (pivot_table's default aggregation).
    """
    # Build the pivot once and fill conditionally — the original duplicated
    # the pivot_table call in each branch.
    pivot = df.pivot_table(index='userId', columns='prod_name', values='rating')
    if fillby is not None:
        pivot = pivot.fillna(fillby)
    return pivot
#3.1 Train/test split of the raw ratings (70/30, fixed seed for reproducibility)
train, test = train_test_split(ratings, test_size=0.30, random_state=42)
# Keep only test users that also appear in train — user-user CF cannot score
# users it has never seen.
test = test[test.userId.isin(train.userId)]
#3.2 Pivot to user x product rating matrices; unrated cells are filled with 0
df_train_pivot = apply_pivot(df=train, fillby=0)
df_test_pivot = apply_pivot(df=test, fillby=0)
#3.3 Dummy (mask) matrices marking rated vs unrated cells
## Train mask: 1 where the user has NOT rated the product (recommendation candidates)
dummy_train = train.copy()
dummy_train['rating'] = np.where(dummy_train['rating'] >= 1, 0, 1)
dummy_train = apply_pivot(df=dummy_train, fillby=1)
## Test mask: 1 where the user HAS rated the product (used for evaluation)
dummy_test = test.copy()
dummy_test['rating'] = np.where(dummy_test['rating'] >= 1, 1, 0)
dummy_test = apply_pivot(df=dummy_test, fillby=0)
# Peek at train users who rated either of two sample products
df_train_pivot[(df_train_pivot['0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. Fire File Chest'] != 0) |
               (df_train_pivot['4C Grated Parmesan Cheese 100% Natural 8oz Shaker'] != 0)]
prod_name | 0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. Fire File Chest | 100:Complete First Season (blu-Ray) | 2017-2018 Brownline174 Duraflex 14-Month Planner 8 1/2 X 11 Black | 2x Ultra Era with Oxi Booster, 50fl oz | 42 Dual Drop Leaf Table with 2 Madrid Chairs" | 4C Grated Parmesan Cheese 100% Natural 8oz Shaker | Africa's Best No-Lye Dual Conditioning Relaxer System Super | Alberto VO5 Salon Series Smooth Plus Sleek Shampoo | All,bran Complete Wheat Flakes, 18 Oz. | Ambi Complexion Cleansing Bar | ... | Vicks Vaporub, Regular, 3.53oz | Voortman Sugar Free Fudge Chocolate Chip Cookies | Wagan Smartac 80watt Inverter With Usb | Wallmount Server Cabinet (450mm, 9 RU) | Way Basics 3-Shelf Eco Narrow Bookcase Storage Shelf, Espresso - Formaldehyde Free - Lifetime Guarantee | Wedding Wishes Wedding Guest Book | Weleda Everon Lip Balm | Windex Original Glass Cleaner Refill 67.6oz (2 Liter) | Yes To Carrots Nourishing Body Wash | Yes To Grapefruit Rejuvenating Body Wash |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
userId | |||||||||||||||||||||
brewno | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
deelee | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
embum | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
erinn | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
rmtarboro | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
smokey bear | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
spicesea | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
7 rows × 231 columns
#####################################
## STEP 04: User-User Similarity ####
#####################################
# To calculate each user's mean, use only the ratings the user actually gave
# (NaN-aware mean) instead of a 0-filled pivot, which would inflate the denominator.
mean = np.nanmean(apply_pivot(df = train), axis = 1)
# Mean-center each user's ratings (subtract the user's own mean row-wise).
df_train_subtracted = (apply_pivot(df = train).T-mean).T
# Make rating=0 where user hasn't given any rating
df_train_subtracted.fillna(0, inplace = True)
# User-user similarity: cosine similarity (1 - cosine distance) on the
# mean-centered matrix; shape is user x user (18025 x 18025 per the run below).
user_correlation = 1 - pairwise_distances(df_train_subtracted, metric='cosine')
# Zero out NaN similarities (arise for users with zero-norm rating vectors).
user_correlation[np.isnan(user_correlation)] = 0
# Optional clipping of negative correlations, left disabled by the author:
# user_correlation[user_correlation<0] = 0
# Convert the user_correlation matrix into a DataFrame indexed and columned by userId.
user_correlation_df = pd.DataFrame(user_correlation)
user_correlation_df['userId'] = df_train_subtracted.index
user_correlation_df.set_index('userId',inplace=True)
user_correlation_df.columns = df_train_subtracted.index.tolist()
# Shapes: (n_users, n_users) and (n_users, n_products).
user_correlation.shape,df_train_pivot.shape
((18025, 18025), (18025, 231))
###########################################
## STEP 05: Predict Rating (User-User) ####
###########################################
# Predicted rating = similarity-weighted sum of all users' ratings, computed for
# both rated and unrated products at once via a matrix product.
user_predicted_ratings = np.dot(user_correlation, df_train_pivot)
# Keep only products the user has NOT yet rated: dummy_train holds 1 for unrated
# cells and 0 for rated ones, so element-wise multiplication zeroes out the latter.
user_final_rating = np.multiply(user_predicted_ratings,dummy_train)
# Optional rescaling of predictions to the 1-5 range, left disabled by the author:
# scaler = MinMaxScaler(feature_range=(1, 5))
# scaler.fit(user_final_rating)
# user_final_rating = scaler.transform(user_final_rating)
################################################################
## STEP 06: Find Top N recommendation for User (User-User) #####
################################################################
def find_top_recommendations(pred_rating_df, userid, topn):
    """Return the `topn` products with the highest predicted rating for `userid`.

    The result is a two-column DataFrame: the product index column and
    'predicted_ratings'.
    """
    # Take the user's row, rank scores high-to-low, keep the first `topn`.
    top_scores = pred_rating_df.loc[userid].sort_values(ascending=False).head(topn)
    # Series.reset_index() yields a frame whose value column is named after the
    # user; rename it to the generic 'predicted_ratings'.
    return top_scores.reset_index().rename(columns={userid: 'predicted_ratings'})
# Prompt for the user id whose recommendations should be generated.
# NOTE(review): input() already returns str, so the str() wrapper is redundant but harmless.
user_input = str(input("Enter your user id"))
# Top-5 recommendations for this user from the final predicted-rating matrix.
recommendation_user_user = find_top_recommendations(user_final_rating, user_input, 5)
# Tag each recommended row with the requesting user's id.
recommendation_user_user['userId'] = user_input
Enter your user idjoshua
# Show recommendations next to the user's historical ratings so the
# suggestions can be sanity-checked against known preferences.
print("Recommended products for user id:{} as below".format(user_input))
display(recommendation_user_user)
print("Earlier rated products by user id:{} as below".format(user_input))
display(train[train['userId']==user_input].sort_values(['rating'],ascending=False))
Recommended products for user id:joshua as below
prod_name | predicted_ratings | userId | |
---|---|---|---|
0 | Clorox Disinfecting Wipes Value Pack Scented 1... | 5.226926 | joshua |
1 | Lysol Concentrate Deodorizing Cleaner, Origina... | 3.750000 | joshua |
2 | Head & Shoulders Dandruff Shampoo Ocean Lift 2... | 3.535534 | joshua |
3 | Bounce Dryer Sheets, Fresh Linen, 160 sheets | 3.535534 | joshua |
4 | The Resident Evil Collection 5 Discs (blu-Ray) | 3.345348 | joshua |
Earlier rated products by user id:joshua as below
userId | rating | prod_name | |
---|---|---|---|
0 | joshua | 5 | Pink Friday: Roman Reloaded Re-Up (w/dvd) |
17718 | joshua | 5 | Smead174 Recycled Letter Size Manila File Back... |
22379 | joshua | 5 | Cheetos Crunchy Flamin' Hot Cheese Flavored Sn... |
1541 | joshua | 3 | Dark Shadows (includes Digital Copy) (ultravio... |
################################################
## STEP 07: Evaluation (User-User) on test #####
################################################
# Filter user correlations to users present in test; test userIds are a subset of train's.
user_correlation_test_df = user_correlation_df[user_correlation_df.index.isin(test.userId)]
user_correlation_test_df = user_correlation_test_df[list(set(test.userId))]
# Optional clipping of negative correlations, left disabled by the author:
# user_correlation_test_df[user_correlation_test_df<0]=0
# Predicted ratings for test users (similarity-weighted sum over test ratings).
test_user_predicted_ratings = np.dot(user_correlation_test_df, df_test_pivot)
# Keep only cells the user actually rated in test (dummy_test is 1 there, 0 elsewhere).
test_user_predicted_ratings = np.multiply(test_user_predicted_ratings,dummy_test)
# Set NaN where the user never rated, so those cells don't contribute to RMSE.
test_user_predicted_ratings = test_user_predicted_ratings[test_user_predicted_ratings>0]
# Rescale predictions to the 1-5 rating range (per product column).
# NOTE(review): columns that are entirely NaN trigger the "All-NaN slice"
# RuntimeWarnings seen in the captured output; those columns stay NaN after scaling.
scaler = MinMaxScaler(feature_range=(1, 5))
scaler.fit(test_user_predicted_ratings)
test_user_predicted_ratings = scaler.transform(test_user_predicted_ratings)
# RMSE over the cells that have a prediction: scaled predictions vs actual test ratings.
total_non_nan = np.count_nonzero(~np.isnan(test_user_predicted_ratings))
rmse = (np.sum(np.sum((apply_pivot(df = test) - test_user_predicted_ratings)**2))/total_non_nan)**0.5
print(rmse)
2.506663023687151
C:\Users\aakashgoel\Anaconda3\envs\forecast_anaconda\lib\site-packages\sklearn\preprocessing\_data.py:464: RuntimeWarning: All-NaN slice encountered data_min = np.nanmin(X, axis=0) C:\Users\aakashgoel\Anaconda3\envs\forecast_anaconda\lib\site-packages\sklearn\preprocessing\_data.py:465: RuntimeWarning: All-NaN slice encountered data_max = np.nanmax(X, axis=0)
############################
## STEP 08: Save Model ####
############################
# Persist the final predicted-rating matrix for the inference script.
# Use a context manager so the file handle is closed (the original passed an
# anonymous open() whose handle was never closed).
# NOTE(review): the './model' directory must already exist.
with open('./model/user_final_rating.pkl', 'wb') as model_file:
    pickle.dump(user_final_rating, model_file)
################################
## STEP 01: Import Libraries ##
################################
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from imblearn import over_sampling
from IPython.display import display
#############################
## STEP 02: Read Data ####
#############################
# Reading product review sentiment file.
# Columns (per the sample shown below): 'Review' (pre-cleaned text) and
# 'user_sentiment' (1 = positive, 0 = negative).
df_prod_review = pd.read_csv('https://raw.githubusercontent.com/aakashgoel12/blogs/master/input/product_review_sentiment.csv',\
encoding='latin-1')
# Reproducible 5-row peek at the data.
display(df_prod_review.sample(n=5, random_state=42))
Review | user_sentiment | |
---|---|---|
9329 | fresh clean smell everything need quick clean ... | 1 |
4160 | great vacuum love lightweight vacuum easy carr... | 1 |
18500 | smell great wipe easy use work smell great | 1 |
8840 | product count use clorox wipe everything trave... | 1 |
5098 | great movie excellent movie add blu ray collec... | 1 |
#################################
## STEP 03: Data Preparation ####
#################################
# Features: raw review text; target: binary sentiment label.
x=df_prod_review['Review']
y=df_prod_review['user_sentiment']
print("Checking distribution of +ve and -ve review sentiment: \n{}".format(y.value_counts(normalize=True)))
# Split the dataset into test and train
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=50)
# The data is imbalanced (~89% positive per the output below); balance the
# TRAINING set only via random over-sampling of the minority class.
ros = over_sampling.RandomOverSampler(random_state=0)
X_train, y_train = ros.fit_resample(pd.DataFrame(X_train), pd.Series(y_train))
print("Checking distribution of +ve and -ve review sentiment after oversampling: \n{}".format(y_train.value_counts(normalize=True)))
# Convert the resampled frame back into a list of review strings.
X_train = X_train['Review'].tolist()
Checking distribution of +ve and -ve review sentiment: 1 0.888401 0 0.111599 Name: user_sentiment, dtype: float64 Checking distribution of +ve and -ve review sentiment after oversampling: 1 0.5 0 0.5 Name: user_sentiment, dtype: float64
################################################################
## STEP 04: Feature Engineering (Convert text into numbers) ####
################################################################
# TF-IDF over word 1-3 grams; English stop words removed; a token is any \w{1,} run;
# max_df/min_df prune terms appearing in >80% or <1% of documents; sublinear_tf
# uses 1+log(tf) instead of raw term frequency.
word_vectorizer = TfidfVectorizer(strip_accents='unicode', token_pattern=r'\w{1,}',\
ngram_range=(1, 3), stop_words='english', sublinear_tf=True, max_df = 0.80, min_df = 0.01)
# Fitting vocabulary/IDF on the (oversampled) training reviews only — avoids test leakage.
word_vectorizer.fit(X_train)
# Transforming train and test into sparse document-term matrices.
X_train_transformed = word_vectorizer.transform(X_train)
X_test_transformed = word_vectorizer.transform(X_test.tolist())
# Shapes: (n_train_docs, n_terms), (n_test_docs, n_terms).
X_train_transformed.shape, X_test_transformed.shape
((33468, 263), (8062, 263))
# print(list(word_vectorizer.get_feature_names()))
###############################################
## STEP 05: ML Model (Logistic Regression) ####
###############################################
def evaluate_model(y_pred, y_actual):
    """Print a classification report plus sensitivity and specificity.

    Parameters
    ----------
    y_pred : array-like
        Predicted binary labels (0/1).
    y_actual : array-like
        Ground-truth binary labels (0/1).

    Prints (does not return) the sklearn classification report,
    sensitivity = TP/(TP+FN) and specificity = TN/(TN+FP), rounded to 2 dp.
    """
    print(classification_report(y_true = y_actual, y_pred = y_pred))
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent from
    # the inputs; the original's bare confusion_matrix would return a 1x1
    # matrix in that case and the indexing below would raise IndexError.
    cm = confusion_matrix(y_true=y_actual, y_pred=y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()
    # Sensitivity (true-positive rate); guard against an empty positive class.
    sensitivity = round(TP / float(FN + TP), 2) if (FN + TP) else float('nan')
    print("sensitivity: {}".format(sensitivity))
    # Specificity (true-negative rate); guard against an empty negative class.
    specificity = round(TN / float(TN + FP), 2) if (TN + FP) else float('nan')
    print("specificity: {}".format(specificity))
#4.1 Model Training
# NOTE(review): the 4.x numbering in these comments doesn't match the STEP 05 header above.
logit = LogisticRegression()
logit.fit(X_train_transformed,y_train)
#4.2 Prediction on Train Data
y_pred_train= logit.predict(X_train_transformed)
#4.3 Prediction on Test Data
y_pred_test = logit.predict(X_test_transformed)
#4.4 Evaluation on Train (balanced by oversampling, so metrics are symmetric)
print("Evaluation on Train dataset ..")
evaluate_model(y_pred = y_pred_train, y_actual = y_train)
print("Evaluation on Test dataset ..")
#4.5 Evaluation on Test (original imbalanced distribution)
evaluate_model(y_pred = y_pred_test, y_actual = y_test)
Evaluation on Train dataset .. precision recall f1-score support 0 0.82 0.83 0.82 16734 1 0.83 0.81 0.82 16734 accuracy 0.82 33468 macro avg 0.82 0.82 0.82 33468 weighted avg 0.82 0.82 0.82 33468 sensitivity: 0.81 specificity: 0.83 Evaluation on Test dataset .. precision recall f1-score support 0 0.35 0.80 0.49 922 1 0.97 0.81 0.88 7140 accuracy 0.81 8062 macro avg 0.66 0.81 0.69 8062 weighted avg 0.90 0.81 0.84 8062 sensitivity: 0.81 specificity: 0.8
############################
## STEP 06: Save Model ####
############################
# Persist the classifier and the fitted vectorizer for the inference script.
# Use context managers so the file handles are closed (the originals passed
# anonymous open() calls whose handles were never closed).
# NOTE(review): the './model' directory must already exist.
with open('./model/logit_model.pkl', 'wb') as model_file:
    pickle.dump(logit, model_file)
with open('./model/word_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(word_vectorizer, vectorizer_file)
################################
## STEP 01: Import Libraries ##
################################
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import MinMaxScaler
#############################
## STEP 02: Read Data ####
#############################
# Reading product review data ('prod_name' + pre-cleaned 'Review' text per the sample below).
df_prod_review = pd.read_csv('https://raw.githubusercontent.com/aakashgoel12/blogs/master/input/product_review.csv',\
encoding='latin-1')
# NOTE(review): display() is not imported in this section's imports (IPython.display);
# this works in a notebook where display is injected — confirm before running as a plain script.
display(df_prod_review.sample(n=5, random_state=42))
prod_name | Review | |
---|---|---|
2501 | Hawaiian Punch Berry Limeade Blast Juice | pretty good stuff much sugar kid like |
21252 | Godzilla 3d Includes Digital Copy Ultraviolet ... | enteraining great interesting version classic ... |
23503 | Godzilla 3d Includes Digital Copy Ultraviolet ... | best godzilla date like previous godzilla film... |
26827 | Storkcraft Tuscany Glider and Ottoman, Beige C... | comfy good put baby sleep calming sister mom n... |
18210 | Clorox Disinfecting Bathroom Cleaner | product easy use product easy use open use har... |
###########################
## STEP 03: Load Model ####
###########################
# Load the artifacts produced by the two training scripts.
# SECURITY: pickle.load can execute arbitrary code — only unpickle trusted files.
# Context managers close the handles (the originals leaked open file objects).
with open('./model/logit_model.pkl', 'rb') as f:
    model = pickle.load(f)
with open('./model/word_vectorizer.pkl', 'rb') as f:
    word_vectorizer = pickle.load(f)
with open('./model/user_final_rating.pkl', 'rb') as f:
    user_final_rating = pickle.load(f)
##########################################################################
## STEP 04: Get positive review Recommendation only for given user id ####
##########################################################################
def find_top_recommendations(pred_rating_df, userid, topn):
    """Return the `topn` products with the highest predicted rating for `userid`.

    The result is a two-column DataFrame: the product index column and
    'predicted_ratings'.
    """
    # Take the user's row, rank scores high-to-low, keep the first `topn`.
    top_scores = pred_rating_df.loc[userid].sort_values(ascending=False).head(topn)
    # Series.reset_index() yields a frame whose value column is named after the
    # user; rename it to the generic 'predicted_ratings'.
    return top_scores.reset_index().rename(columns={userid: 'predicted_ratings'})
def get_sentiment_product(x):
    """Return the mean predicted sentiment for product `x`.

    Since predictions are 0/1, the mean is the fraction of the product's
    reviews classified as positive.

    Relies on module-level globals: df_prod_review, word_vectorizer, model.
    """
    # Collect every review written for this product.
    reviews = df_prod_review.loc[df_prod_review['prod_name'] == x, 'Review'].tolist()
    # Vectorize the reviews into the model's TF-IDF feature space.
    review_features = word_vectorizer.transform(reviews)
    # Share of positive predictions.
    return model.predict(review_features).mean()
def find_top_pos_recommendation(user_final_rating, user_input, df_prod_review, word_vectorizer,\
                                model, no_recommendation, n_candidates=10):
    """Display top product recommendations for a user, re-ranked by review sentiment.

    Combines user-user collaborative-filtering scores with a per-product
    sentiment score: ranking = 1*predicted_rating + 2*scaled_sentiment.

    Parameters
    ----------
    user_final_rating : pd.DataFrame
        User x product matrix of predicted ratings (index = userId).
    user_input : str
        The user id to recommend for.
    df_prod_review, word_vectorizer, model :
        Passed for interface symmetry; NOTE(review): get_sentiment_product
        currently reads these from module-level globals, not these arguments.
    no_recommendation : int
        Number of final recommendations to display.
    n_candidates : int, optional (default 10)
        Size of the initial CF candidate pool. Resolves the original
        "10 is manually coded, need to change" TODO backward-compatibly.
    """
    ## Generate top candidates using the user-user recommender (without sentiment).
    recommendation_user_user = find_top_recommendations(user_final_rating, user_input, n_candidates)
    recommendation_user_user['userId'] = user_input
    ## Drop candidates whose predicted rating is zero (no usable signal).
    recommendation_user_user = recommendation_user_user[recommendation_user_user['predicted_ratings']!=0]
    print("Recommended products for user id:{} without using sentiment".format(user_input))
    display(recommendation_user_user)
    ## Overall sentiment score (fraction of positive reviews) per candidate product.
    recommendation_user_user['sentiment_score'] = recommendation_user_user['prod_name'].apply(get_sentiment_product)
    ## Rescale sentiment to 1-5 so it is commensurate with predicted ratings.
    scaler = MinMaxScaler(feature_range=(1, 5))
    scaler.fit(recommendation_user_user[['sentiment_score']])
    recommendation_user_user['sentiment_score'] = scaler.transform(recommendation_user_user[['sentiment_score']])
    ## Final score: 1*predicted rating + 2*scaled sentiment (sentiment weighted double).
    recommendation_user_user['product_ranking_score'] = 1*recommendation_user_user['predicted_ratings'] + \
        2*recommendation_user_user['sentiment_score']
    print("Recommended products for user id:{} after using sentiment".format(user_input))
    ## Show only the top `no_recommendation` rows by final ranking score.
    display(recommendation_user_user.sort_values(by = ['product_ranking_score'],ascending = False).head(no_recommendation))
# Prompt for the user id and show sentiment-aware recommendations for them.
user_input = str(input("Enter your user id"))
find_top_pos_recommendation(user_final_rating, user_input, df_prod_review, word_vectorizer,\
model, no_recommendation = 5)
Enter your user idjoshua Recommended products for user id:joshua without using sentiment
prod_name | predicted_ratings | userId | |
---|---|---|---|
0 | Clorox Disinfecting Wipes Value Pack Scented 1... | 5.226926 | joshua |
1 | Lysol Concentrate Deodorizing Cleaner, Origina... | 3.750000 | joshua |
2 | Head & Shoulders Dandruff Shampoo Ocean Lift 2... | 3.535534 | joshua |
3 | Bounce Dryer Sheets, Fresh Linen, 160 sheets | 3.535534 | joshua |
4 | The Resident Evil Collection 5 Discs (blu-Ray) | 3.345348 | joshua |
5 | Hormel Chili, No Beans | 3.286511 | joshua |
6 | Chester's Cheese Flavored Puffcorn Snacks | 2.204404 | joshua |
7 | Mike Dave Need Wedding Dates (dvd + Digital) | 0.720898 | joshua |
8 | Storkcraft Tuscany Glider and Ottoman, Beige C... | 0.708318 | joshua |
9 | Ceiling Fan With Light White 14.2 X 29.9 X 9.2... | 0.708318 | joshua |
Recommended products for user id:joshua after using sentiment
prod_name | predicted_ratings | userId | sentiment_score | product_ranking_score | |
---|---|---|---|---|---|
0 | Clorox Disinfecting Wipes Value Pack Scented 1... | 5.226926 | joshua | 5.000000 | 15.226926 |
3 | Bounce Dryer Sheets, Fresh Linen, 160 sheets | 3.535534 | joshua | 4.390329 | 12.316191 |
8 | Storkcraft Tuscany Glider and Ottoman, Beige C... | 0.708318 | joshua | 4.978562 | 10.665442 |
5 | Hormel Chili, No Beans | 3.286511 | joshua | 3.202279 | 9.691070 |
6 | Chester's Cheese Flavored Puffcorn Snacks | 2.204404 | joshua | 3.641906 | 9.488215 |