The objective of this project is to perform binary sentiment analysis (positive vs. negative) on an imbalanced hotel review dataset.
This project covers:
The final LSTM model achieved an accuracy of ~81% on the test dataset (75:25 split).
NB: This project also serves as my assignment for the course below:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory use).
!nvidia-smi
Tue Jan 4 11:28:52 2022 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 495.44 Driver Version: 460.32.03 CUDA Version: 11.2 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 Tesla K80 Off | 00000000:00:04.0 Off | 0 | | N/A 73C P8 33W / 149W | 0MiB / 11441MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | No running processes found | +-----------------------------------------------------------------------------+
%%capture
# Install the extra packages this notebook needs; the %%capture cell magic on
# the first line hides pip's output for the whole cell.
!pip install tensorflow_text
!pip install tqdm
import os
# Work around the session crash described in
# https://stackoverflow.com/a/54927279/11105356
# by asking TensorFlow to grow its GPU memory allocation on demand.
# The variable is set here, before TensorFlow is imported further down.
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from numpy import newaxis
from wordcloud import WordCloud, STOPWORDS
from tqdm import tqdm
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import xgboost as xgb
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation, GRU, BatchNormalization
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.utils import np_utils
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global plot styling: seaborn whitegrid theme, muted palette, larger fonts.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# Default figure size as (width, height) in inches.
plt.rcParams['figure.figsize'] = 12, 8
# Seed constant; it is only defined here and not applied in this cell.
RANDOM_SEED = 42
# Download the NLTK stop-word corpus and keep the English list
# (presumably for text preprocessing later -- confirm against later cells).
nltk.download('stopwords')
stop_words = stopwords.words('english')
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip.
# Confirm TensorFlow can see a GPU; the call returns the device name.
tf.test.gpu_device_name()
'/device:GPU:0'
# Record the TensorFlow, TF Hub and TF Text versions used for this run.
tf.__version__, hub.__version__, tensorflow_text.__version__
('2.7.0', '0.12.0', '2.7.3')
!pip freeze | grep hub
!pip freeze | grep tensorflow_text
!pip freeze | grep keras
!pip freeze | grep scikit-learn
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz tensorflow-hub==0.12.0 keras==2.7.0 keras-vis==0.4.1 scikit-learn==0.22.2.post1
# Load the multilingual Universal Sentence Encoder (large, v3) from TF Hub.
# NOTE(review): the TF Hub model page asks for tensorflow_text to be imported
# before loading; it is imported with the other dependencies -- confirm.
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'
use = hub.load(module_url)

# Two short phrases with similar meaning, each wrapped in a one-element batch.
txt_1 = ["the bedroom is cozy"]
txt_2 = ["comfortable bedroom"]

# Encode both batches; each call returns one embedding row per input sentence.
emb_1, emb_2 = use(txt_1), use(txt_2)

print(emb_1.shape)
(1, 512)
The Universal Sentence Encoder (USE) is trained on a number of tasks, but one of the main ones is to identify the similarity between pairs of sentences. The authors note that the task was to identify "semantic textual similarity (STS) between sentence pairs scored by Pearson correlation with human judgments".
# Inner product of the two sentence embeddings, used here as a similarity
# score. With one sentence per batch the result is a 1x1 matrix, so take its
# only element.
print(np.inner(emb_1, emb_2)[0, 0])
0.8467271
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d jiashenliu/515k-hotel-reviews-data-in-europe
mkdir: cannot create directory ‘/root/.kaggle’: File exists Downloading 515k-hotel-reviews-data-in-europe.zip to /content 73% 33.0M/45.1M [00:02<00:00, 16.0MB/s] 100% 45.1M/45.1M [00:02<00:00, 22.9MB/s]
# Extract the downloaded archive.
!unzip /content/515k-hotel-reviews-data-in-europe.zip
Archive: /content/515k-hotel-reviews-data-in-europe.zip inflating: Hotel_Reviews.csv
# Read the extracted CSV into a DataFrame and preview its first five rows.
df_hotel_reviews = pd.read_csv(filepath_or_buffer="/content/Hotel_Reviews.csv")
df_hotel_reviews.head(n=5)
Hotel_Address | Additional_Number_of_Scoring | Review_Date | Average_Score | Hotel_Name | Reviewer_Nationality | Negative_Review | Review_Total_Negative_Word_Counts | Total_Number_of_Reviews | Positive_Review | Review_Total_Positive_Word_Counts | Total_Number_of_Reviews_Reviewer_Has_Given | Reviewer_Score | Tags | days_since_review | lat | lng | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | s Gravesandestraat 55 Oost 1092 AA Amsterdam ... | 194 | 8/3/2017 | 7.7 | Hotel Arena | Russia | I am so angry that i made this post available... | 397 | 1403 | Only the park outside of the hotel was beauti... | 11 | 7 | 2.9 | [' Leisure trip ', ' Couple ', ' Duplex Double... | 0 days | 52.360576 | 4.915968 |
1 | s Gravesandestraat 55 Oost 1092 AA Amsterdam ... | 194 | 8/3/2017 | 7.7 | Hotel Arena | Ireland | No Negative | 0 | 1403 | No real complaints the hotel was great great ... | 105 | 7 | 7.5 | [' Leisure trip ', ' Couple ', ' Duplex Double... | 0 days | 52.360576 | 4.915968 |
2 | s Gravesandestraat 55 Oost 1092 AA Amsterdam ... | 194 | 7/31/2017 | 7.7 | Hotel Arena | Australia | Rooms are nice but for elderly a bit difficul... | 42 | 1403 | Location was good and staff were ok It is cut... | 21 | 9 | 7.1 | [' Leisure trip ', ' Family with young childre... | 3 days | 52.360576 | 4.915968 |
3 | s Gravesandestraat 55 Oost 1092 AA Amsterdam ... | 194 | 7/31/2017 | 7.7 | Hotel Arena | United Kingdom | My room was dirty and I was afraid to walk ba... | 210 | 1403 | Great location in nice surroundings the bar a... | 26 | 1 | 3.8 | [' Leisure trip ', ' Solo traveler ', ' Duplex... | 3 days | 52.360576 | 4.915968 |
4 | s Gravesandestraat 55 Oost 1092 AA Amsterdam ... | 194 | 7/24/2017 | 7.7 | Hotel Arena | New Zealand | You When I booked with your company on line y... | 140 | 1403 | Amazing location and building Romantic setting | 8 | 3 | 6.7 | [' Leisure trip ', ' Couple ', ' Suite ', ' St... | 10 days | 52.360576 | 4.915968 |
# Dataset size as a readable string (rows, columns).
f"{df_hotel_reviews.shape[0]} rows, {df_hotel_reviews.shape[1]} columns"
'515738 rows, 17 columns'
# List the column names.
df_hotel_reviews.columns
Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date', 'Average_Score', 'Hotel_Name', 'Reviewer_Nationality', 'Negative_Review', 'Review_Total_Negative_Word_Counts', 'Total_Number_of_Reviews', 'Positive_Review', 'Review_Total_Positive_Word_Counts', 'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score', 'Tags', 'days_since_review', 'lat', 'lng'], dtype='object')
# Per-column dtypes and non-null counts (reveals which columns have missing values).
df_hotel_reviews.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 515738 entries, 0 to 515737 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Hotel_Address 515738 non-null object 1 Additional_Number_of_Scoring 515738 non-null int64 2 Review_Date 515738 non-null object 3 Average_Score 515738 non-null float64 4 Hotel_Name 515738 non-null object 5 Reviewer_Nationality 515738 non-null object 6 Negative_Review 515738 non-null object 7 Review_Total_Negative_Word_Counts 515738 non-null int64 8 Total_Number_of_Reviews 515738 non-null int64 9 Positive_Review 515738 non-null object 10 Review_Total_Positive_Word_Counts 515738 non-null int64 11 Total_Number_of_Reviews_Reviewer_Has_Given 515738 non-null int64 12 Reviewer_Score 515738 non-null float64 13 Tags 515738 non-null object 14 days_since_review 515738 non-null object 15 lat 512470 non-null float64 16 lng 512470 non-null float64 dtypes: float64(4), int64(5), object(8) memory usage: 66.9+ MB
# Summary statistics for the numeric columns, transposed to one row per column.
df_hotel_reviews.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Additional_Number_of_Scoring | 515738.0 | 498.081836 | 500.538467 | 1.000000 | 169.000000 | 341.000000 | 660.000000 | 2682.000000 |
Average_Score | 515738.0 | 8.397487 | 0.548048 | 5.200000 | 8.100000 | 8.400000 | 8.800000 | 9.800000 |
Review_Total_Negative_Word_Counts | 515738.0 | 18.539450 | 29.690831 | 0.000000 | 2.000000 | 9.000000 | 23.000000 | 408.000000 |
Total_Number_of_Reviews | 515738.0 | 2743.743944 | 2317.464868 | 43.000000 | 1161.000000 | 2134.000000 | 3613.000000 | 16670.000000 |
Review_Total_Positive_Word_Counts | 515738.0 | 17.776458 | 21.804185 | 0.000000 | 5.000000 | 11.000000 | 22.000000 | 395.000000 |
Total_Number_of_Reviews_Reviewer_Has_Given | 515738.0 | 7.166001 | 11.040228 | 1.000000 | 1.000000 | 3.000000 | 8.000000 | 355.000000 |
Reviewer_Score | 515738.0 | 8.395077 | 1.637856 | 2.500000 | 7.500000 | 8.800000 | 9.600000 | 10.000000 |
lat | 512470.0 | 49.442439 | 3.466325 | 41.328376 | 48.214662 | 51.499981 | 51.516288 | 52.400181 |
lng | 512470.0 | 2.823803 | 4.579425 | -0.369758 | -0.143372 | 0.010607 | 4.834443 | 16.429233 |
# Summary of the object (string) columns: count, unique values, most frequent value and its frequency.
df_hotel_reviews.describe(include='object').T
count | unique | top | freq | |
---|---|---|---|---|
Hotel_Address | 515738 | 1493 | 163 Marsh Wall Docklands Tower Hamlets London ... | 4789 |
Review_Date | 515738 | 731 | 8/2/2017 | 2585 |
Hotel_Name | 515738 | 1492 | Britannia International Hotel Canary Wharf | 4789 |
Reviewer_Nationality | 515738 | 227 | United Kingdom | 245246 |
Negative_Review | 515738 | 330011 | No Negative | 127890 |
Positive_Review | 515738 | 412601 | No Positive | 35946 |
Tags | 515738 | 55242 | [' Leisure trip ', ' Couple ', ' Double Room '... | 5101 |
days_since_review | 515738 | 731 | 1 days | 2585 |
# Summary statistics for the reviewer score column on its own.
df_hotel_reviews.Reviewer_Score.describe().T
count 515738.000000 mean 8.395077 std 1.637856 min 2.500000 25% 7.500000 50% 8.800000 75% 9.600000 max 10.000000 Name: Reviewer_Score, dtype: float64
# Histogram of the individual reviewer scores (pandas default binning).
df_hotel_reviews['Reviewer_Score'].hist()
# The trailing semicolon keeps the notebook from echoing the returned object.
plt.title('Review Score Distribution');
# Scatter of positive vs. negative word counts, one point per review.
# Marker size comes from the Total_Number_of_Reviews column divided by 100 and
# colour encodes Reviewer_Score on the 'jet' colormap.
font_size = 15

df_hotel_reviews.plot.scatter(
    x='Review_Total_Positive_Word_Counts',
    y='Review_Total_Negative_Word_Counts',
    s=df_hotel_reviews['Total_Number_of_Reviews'] / 100,
    c='Reviewer_Score',
    cmap='jet',
    colorbar=True,
    alpha=0.4,
    figsize=(15, 12),
    label='Total reviews',
    # sharex=False works around the x-axis label not showing up:
    # https://stackoverflow.com/a/69661993/11105356
    sharex=False,
)

# Title and axis labels share one font size.
plt.title("Review Sentiment Distribution", fontsize=font_size)
plt.xlabel("Total Positive Word Counts", fontsize=font_size)
plt.ylabel("Total Negative Word Counts", fontsize=font_size)

plt.legend()
plt.show()