The objective of this project is to perform binary sentiment analysis (positive vs. negative) on an imbalanced hotel review dataset.
This project covers:
The final LSTM model achieved ~81% accuracy on the test set (75:25 train/test split).
NB: This project also serves as my assignment for the course below -
!nvidia-smi
%%capture
!pip install tensorflow_text
!pip install tqdm
import os
# session crash issue
# https://stackoverflow.com/a/54927279/11105356
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from numpy import newaxis
from wordcloud import WordCloud, STOPWORDS
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import xgboost as xgb
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation, GRU, BatchNormalization
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.layers import Embedding  # keras.layers.embeddings no longer exists in newer Keras releases
from keras.utils import np_utils
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
plt.rcParams['figure.figsize'] = 12, 8
RANDOM_SEED = 42
nltk.download('stopwords')
stop_words = stopwords.words('english')
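RANDOM_SEED is defined above; for reproducible runs it can also be applied explicitly (a minimal sketch, assuming downstream code relies on the default global NumPy and TensorFlow generators):
# apply the seed to the global RNGs (assumption: the defaults are used downstream)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)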
tf.test.gpu_device_name()
tf.__version__, hub.__version__, tensorflow_text.__version__
!pip freeze | grep hub
!pip freeze | grep tensorflow-text
!pip freeze | grep keras
!pip freeze | grep scikit-learn
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'
use = hub.load(module_url)
txt_1 = ["the bedroom is cozy"]
txt_2 = ["comfortable bedroom"]
emb_1 = use(txt_1)
emb_2 = use(txt_2)
print(emb_1.shape)
The Universal Sentence Encoder (USE) is trained on a number of tasks, but one of the main ones is identifying the similarity between pairs of sentences. The authors note that the task was to identify "semantic textual similarity (STS) between sentence pairs scored by Pearson correlation with human judgments".
print(np.inner(emb_1, emb_2).flatten()[0])
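USE embeddings are approximately unit-length, which is why np.inner can serve as a similarity score here. As a quick sanity check (the third sentence below is a hypothetical example, not part of the original pair), an unrelated sentence should score noticeably lower than the paraphrase pair above:
txt_3 = ["the parking lot was far away"]  # hypothetical unrelated sentence
emb_3 = use(txt_3)
print(np.inner(emb_1, emb_3).flatten()[0])  # expected to be well below the score above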
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d jiashenliu/515k-hotel-reviews-data-in-europe
!unzip /content/515k-hotel-reviews-data-in-europe.zip
df_hotel_reviews = pd.read_csv("/content/Hotel_Reviews.csv")
df_hotel_reviews.head()
f"{df_hotel_reviews.shape[0]} rows, {df_hotel_reviews.shape[1]} columns"
df_hotel_reviews.columns
df_hotel_reviews.info()
df_hotel_reviews.describe().T
df_hotel_reviews.describe(include='object').T
df_hotel_reviews.Reviewer_Score.describe().T
df_hotel_reviews.Reviewer_Score.hist()
plt.title('Review Score Distribution');
df_hotel_reviews.plot(kind='scatter',
                      x='Review_Total_Positive_Word_Counts',
                      y='Review_Total_Negative_Word_Counts',
                      label='Total reviews',
                      s=df_hotel_reviews.Total_Number_of_Reviews/100,
                      c='Reviewer_Score',
                      cmap=plt.get_cmap('jet'),
                      colorbar=True,
                      alpha=0.4, figsize=(15, 12),
                      sharex=False,  # label not showing up otherwise
                      # https://stackoverflow.com/a/69661993/11105356
                      )
font_size = 15
plt.title("Review Sentiment Distribution", fontsize=font_size)
plt.xlabel("Total Positive Word Counts", fontsize=font_size)
plt.ylabel("Total Negative Word Counts", fontsize=font_size)
plt.legend()
plt.show()
df_hotel_reviews.Reviewer_Nationality.value_counts()[:20]
df_hotel_reviews.Average_Score.hist()
plt.title('Review Average Score Distribution');
abs(df_hotel_reviews.Review_Total_Positive_Word_Counts - df_hotel_reviews.Review_Total_Negative_Word_Counts).hist()
plt.title('Difference Between Total Positive and Negative Word Count Among Hotel Reviews');
df_hotel_reviews['Negative_Review'][1]
df_hotel_reviews.loc[:, 'Positive_Review'] = df_hotel_reviews.Positive_Review.apply(lambda x: x.replace('No Positive', ''))
df_hotel_reviews.loc[:, 'Negative_Review'] = df_hotel_reviews.Negative_Review.apply(lambda x: x.replace('No Negative', ''))
df_hotel_reviews['Negative_Review'][1]
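One way to confirm the placeholders were stripped is to count reviews that are left empty after the replacement (a quick check; the exact counts depend on the dataset snapshot):
# reviews that contained only the 'No Positive' / 'No Negative' placeholder are now empty
print(df_hotel_reviews.Positive_Review.str.strip().eq('').sum())
print(df_hotel_reviews.Negative_Review.str.strip().eq('').sum())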
df_hotel_reviews['review'] = df_hotel_reviews.Positive_Review + df_hotel_reviews.Negative_Review
df_hotel_reviews["review_type"] = df_hotel_reviews["Reviewer_Score"].apply(
lambda x: "bad" if x < 7 else "good")
df_reviews = df_hotel_reviews[["review", "review_type"]]
df_reviews
df_reviews.review_type.hist();
# imbalanced distribution
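The histogram only hints at the skew; value_counts makes the imbalance explicit (a quick check on the df_reviews frame built above; the exact ratio depends on the dataset snapshot):
print(df_reviews.review_type.value_counts())
print(df_reviews.review_type.value_counts(normalize=True).round(3))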
df_reviews[df_reviews.review_type == 'good'].review.value_counts()
df_reviews[df_reviews.review_type == 'bad'].review.value_counts()
good_reviews = df_reviews[df_reviews.review_type == "good"]
bad_reviews = df_reviews[df_reviews.review_type == "bad"]
good_reviews_text = " ".join(good_reviews.review.to_numpy().tolist())
bad_reviews_text = " ".join(bad_reviews.review.to_numpy().tolist())
# generate Word Cloud
def gen_wc(txt):
    stopwords = set(STOPWORDS)
    # crisp wordcloud: https://stackoverflow.com/a/28795577/11105356
    wc = WordCloud(width=800, height=400, background_color="white",
                   max_font_size=300, stopwords=stopwords).generate(txt)
    plt.figure(figsize=(14, 10))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis('off')
    plt.show()
gen_wc(good_reviews_text)