# Hotel-review sentiment analysis notebook: environment setup, package
# installation, all imports, global plotting config, and the Universal
# Sentence Encoder model load.
!nvidia-smi
%%capture
!pip install tensorflow_text
!pip install tqdm
import os
# session crash issue
# https://stackoverflow.com/a/54927279/11105356
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from numpy import newaxis
from wordcloud import WordCloud, STOPWORDS
# NOTE(review): duplicate import — tqdm is already imported above.
from tqdm import tqdm
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import xgboost as xgb
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation, GRU, BatchNormalization
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.utils import np_utils
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
%matplotlib inline
# Global plotting style and reproducibility seed.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
plt.rcParams['figure.figsize'] = 12, 8
RANDOM_SEED = 42
# NLTK English stop-word list (downloaded at runtime).
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Confirm GPU availability and installed library versions.
tf.test.gpu_device_name()
tf.__version__, hub.__version__, tensorflow_text.__version__
!pip freeze | grep hub
!pip freeze | grep tensorflow_text
!pip freeze | grep keras
!pip freeze | grep scikit-learn
# Load the multilingual Universal Sentence Encoder (USE) from TF Hub.
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'
use = hub.load(module_url)
txt_1 = ["the bedroom is cozy"] txt_2 = ["comfortable bedroom"] emb_1 = use(txt_1) emb_2 = use(txt_2) print(emb_1.shape) print(np.inner(emb_1, emb_2).flatten()[0]) !mkdir ~/.kaggle !cp kaggle.json ~/.kaggle/ !chmod 600 ~/.kaggle/kaggle.json !kaggle datasets download -d jiashenliu/515k-hotel-reviews-data-in-europe !unzip /content/515k-hotel-reviews-data-in-europe.zip df_hotel_reviews = pd.read_csv("/content/Hotel_Reviews.csv") df_hotel_reviews.head() f"{df_hotel_reviews.shape[0]} rows, {df_hotel_reviews.shape[1]} columns" df_hotel_reviews.columns df_hotel_reviews.info() df_hotel_reviews.describe().T df_hotel_reviews.describe(include='object').T df_hotel_reviews.Reviewer_Score.describe().T df_hotel_reviews.Reviewer_Score.hist() plt.title('Review Score Distribution'); df_hotel_reviews.plot(kind='scatter', x='Review_Total_Positive_Word_Counts', y='Review_Total_Negative_Word_Counts', label='Total reviews', s=df_hotel_reviews.Total_Number_of_Reviews/100, c='Reviewer_Score', cmap=plt.get_cmap('jet'), colorbar=True, alpha=0.4, figsize=(15,12), sharex=False, # label not showing up # https://stackoverflow.com/a/69661993/11105356 ) font_size = 15 plt.title("Review Sentiment Distribution", fontsize=font_size) plt.xlabel("Total Positive Word Counts", fontsize=font_size) plt.ylabel("Total Negative Word Counts", fontsize=font_size) plt.legend() plt.show() df_hotel_reviews.Reviewer_Nationality.value_counts()[:20] df_hotel_reviews.Average_Score.hist() plt.title('Review Average Score Distribution'); abs(df_hotel_reviews.Review_Total_Positive_Word_Counts - df_hotel_reviews.Review_Total_Negative_Word_Counts).hist() plt.title('Difference Between Total Positive and Negative Word Count Among Hotel Reviews'); df_hotel_reviews['Negative_Review'][1] df_hotel_reviews.loc[:, 'Positive_Review'] = df_hotel_reviews.Positive_Review.apply(lambda x: x.replace('No Positive', '')) df_hotel_reviews.loc[:, 'Negative_Review'] = df_hotel_reviews.Negative_Review.apply(lambda x: x.replace('No Negative', 
'')) df_hotel_reviews['Negative_Review'][1] df_hotel_reviews['review'] = df_hotel_reviews.Positive_Review + df_hotel_reviews.Negative_Review df_hotel_reviews["review_type"] = df_hotel_reviews["Reviewer_Score"].apply( lambda x: "bad" if x < 7 else "good") df_reviews = df_hotel_reviews[["review", "review_type"]] df_reviews df_reviews.review_type.hist(); # imbalanced distribution df_reviews[df_reviews.review_type == 'good'].review.value_counts() df_reviews[df_reviews.review_type == 'bad'].review.value_counts() good_reviews = df_reviews[df_reviews.review_type == "good"] bad_reviews = df_reviews[df_reviews.review_type == "bad"] good_reviews_text = " ".join(good_reviews.review.to_numpy().tolist()) bad_reviews_text = " ".join(bad_reviews.review.to_numpy().tolist()) # generate Word Cloud def gen_wc(txt): stopwords = set(STOPWORDS) # crisp wordcloud : https://stackoverflow.com/a/28795577/11105356 wc = WordCloud(width=800, height=400,background_color="white", max_font_size=300, stopwords = stopwords).generate(txt) plt.figure(figsize=(14,10)) plt.imshow(wc, interpolation="bilinear") plt.axis('off') plt.show() gen_wc(good_reviews_text) gen_wc(bad_reviews_text) good_df = good_reviews.sample(n=len(bad_reviews), random_state=RANDOM_SEED) df_review_resampled = good_df.append(bad_reviews).reset_index(drop=True) df_review_resampled.shape df_review_resampled.head() sns.countplot( x='review_type', data=df_review_resampled, order=df_review_resampled.review_type.value_counts().index ) plt.xlabel("type") plt.title("Review type (resampled)"); # one_hot_encoder = preprocessing.OneHotEncoder(sparse=False) # encoded_review = one_hot_encoder.fit_transform(df_review_resampled.review_type.to_numpy().reshape(-1, 1)) label_enc = preprocessing.LabelEncoder() encoded_review = label_enc.fit_transform(df_review_resampled.review_type.values) train_reviews, test_reviews, y_train, y_test = train_test_split( df_review_resampled.review, encoded_review, test_size=0.25, random_state=RANDOM_SEED ) %%time tfv 
= TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}', ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1, stop_words = 'english') tfv.fit(list(train_reviews) + list(test_reviews)) X_train_tfv = tfv.transform(train_reviews) X_test_tfv = tfv.transform(test_reviews) def plot_LSA(test_data, test_labels): lsa = TruncatedSVD(n_components=2) lsa.fit(test_data) lsa_scores = lsa.transform(test_data) colors = ['orange','blue'] plt.scatter(lsa_scores[:,0], lsa_scores[:,1], s=8, alpha=.8, c=test_labels, cmap=matplotlib.colors.ListedColormap(colors)) orange_patch = mpatches.Patch(color='orange', label='Positive') blue_patch = mpatches.Patch(color='blue', label='Negative') plt.legend(handles=[orange_patch, blue_patch], prop={'size': 30}) fig = plt.figure(figsize=(12, 12)) plot_LSA(X_train_tfv, y_train) plt.show() %%time # issue: https://stackoverflow.com/a/66560912/11105356 clf = LogisticRegression(C=1.0, solver='lbfgs', max_iter=1000) clf.fit(X_train_tfv, y_train) predictions = clf.predict_proba(X_test_tfv) print (f"logloss: {metrics.log_loss(y_test, predictions):0.3f}") print (f"ROC AUC: {metrics.roc_auc_score(y_test, predictions[:, 1]):0.3f}") %%time ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}', ngram_range=(1, 3), stop_words = 'english') # Fitting Count Vectorizer to both training and test sets (semi-supervised learning) ctv.fit(list(train_reviews) + list(test_reviews)) X_train_ctv= ctv.transform(train_reviews) X_test_ctv = ctv.transform(test_reviews) fig = plt.figure(figsize=(12, 12)) plot_LSA(X_train_ctv, y_train) plt.show() %%time clf = LogisticRegression(C=1.0, solver='lbfgs', max_iter=1000) clf.fit(X_train_ctv, y_train) predictions = clf.predict_proba(X_test_ctv) print (f"logloss: {metrics.log_loss(y_test, predictions):0.3f}") print (f"ROC AUC: {metrics.roc_auc_score(y_test, predictions[:, 1]):0.3f}") %%time clf = MultinomialNB() clf.fit(X_train_tfv, y_train) predictions = 
clf.predict_proba(X_test_tfv) print (f"logloss: {metrics.log_loss(y_test, predictions):0.3f}") print (f"ROC AUC: {metrics.roc_auc_score(y_test, predictions[:, 1]):0.3f}") %%time clf = MultinomialNB() clf.fit(X_train_ctv, y_train) predictions = clf.predict_proba(X_test_ctv) print (f"logloss: {metrics.log_loss(y_test, predictions):0.3f}") print (f"ROC AUC: {metrics.roc_auc_score(y_test, predictions[:, 1]):0.3f}") svd = decomposition.TruncatedSVD(n_components=120) # 120-200 is a good range svd.fit(X_train_tfv) X_train_svd = svd.transform(X_train_tfv) X_test_svd = svd.transform(X_test_tfv) scl = preprocessing.StandardScaler() scl.fit(X_train_svd) X_train_svd_scl = scl.transform(X_train_svd) X_test_svd_scl = scl.transform(X_test_svd) %%time # it never ends :v clf = SVC(C=1.0, probability=True) # since we need probabilities clf.fit(X_train_svd_scl, y_train) predictions = clf.predict_proba(X_test_svd_scl) print (f"logloss: {metrics.log_loss(y_test, predictions):0.3f}") print (f"ROC AUC: {metrics.roc_auc_score(y_test, predictions[:, 1]):0.3f}") %%time clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, subsample=0.8, nthread=10, learning_rate=0.1) clf.fit(X_train_tfv, y_train) predictions = clf.predict_proba(X_test_tfv) print (f"logloss: {metrics.log_loss(y_test, predictions):0.3f}") print (f"ROC AUC: {metrics.roc_auc_score(y_test, predictions[:, 1]):0.3f}") model_scorer = metrics.make_scorer(metrics.log_loss, greater_is_better=False, needs_proba=True) # model_scorer = metrics.make_scorer(metrics.roc_auc_score, greater_is_better=True, needs_proba=True) svd = TruncatedSVD() scl = preprocessing.StandardScaler() # https://stackoverflow.com/a/60868685/11105356 lr_model = LogisticRegression(solver='liblinear') # Solver lbfgs supports only 'l2' or 'none' penalties clf = pipeline.Pipeline([('svd', svd), ('scl', scl), ('lr', lr_model)]) param_grid = {'svd__n_components' : [120, 200], 'lr__C': [0.1, 1.0, 10], 'lr__penalty': ['l1', 'l2']} model = 
GridSearchCV(estimator=clf, param_grid=param_grid, scoring=model_scorer, verbose=10, n_jobs=-1, refit=True, cv=2) %%time model.fit(X_train_tfv, y_train) # we can use the full data here but im only using xtrain print(f"Best score: {model.best_score_:0.3f}") print("Best parameters set:") best_parameters = model.best_estimator_.get_params() for param_name in sorted(param_grid.keys()): print("\t%s: %r" % (param_name, best_parameters[param_name])) X_train = [] for r in tqdm(train_reviews): emb = use(r) review_emb = tf.reshape(emb, [-1]).numpy() X_train.append(review_emb) X_train = np.array(X_train) X_test = [] for r in tqdm(test_reviews): emb = use(r) review_emb = tf.reshape(emb, [-1]).numpy() X_test.append(review_emb) X_test = np.array(X_test) print(X_train.shape, X_test.shape) print(y_train.shape, y_test.shape) # plot accuracy and loss def plot_history(history): accuracy = history.history['accuracy'] val_accuracy= history.history['val_accuracy'] loss = history.history['loss'] val_loss = history.history['val_loss'] epochs = range(1, len(accuracy) + 1) plt.plot(epochs, accuracy, 'bo', label='Training accuracy') plt.plot(epochs, val_accuracy, 'b', label='Validation accuracy') plt.title('Training and validation accuracy') plt.legend() plt.figure() plt.plot(epochs, loss, 'bo', label='Training loss') plt.plot(epochs, val_loss, 'b', label='Validation loss') plt.title('Training and validation loss') plt.legend() plt.show() # plot model architecture def plot_model(model): model.summary() return tf.keras.utils.plot_model( model, to_file="model.png", show_shapes=True, show_dtype=False, show_layer_names=True, rankdir="TB", expand_nested=True, dpi=96, layer_range=None, ) def build_model_1(): model = Sequential() model.add(Dense(256, activation='relu', input_shape=(X_train.shape[1],))) model.add(Dropout(0.25)) model.add(Dense(128, activation='relu')) model.add(Dropout(0.25)) model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy', metrics=['accuracy'], 
optimizer=Adam(learning_rate=0.001)) return model model_1 = build_model_1() plot_model(model_1) history = model_1.fit( X_train, y_train, epochs=10, batch_size=16, validation_split=0.1, verbose=1, shuffle=True ) plt.plot(history.history['loss'], label='train loss') plt.plot(history.history['val_loss'], label='val loss') plt.xlabel("epoch") plt.ylabel("Cross-entropy loss") plt.legend(); plt.plot(history.history['accuracy'], label='train accuracy') plt.plot(history.history['val_accuracy'], label='val accuracy') plt.xlabel("epoch") plt.ylabel("accuracy") plt.legend(); model_1.evaluate(X_test, y_test) # predict_classes() was removed in tf 2.6+, predict_proba(): use predict() # https://keras.rstudio.com/reference/predict_proba.html#details # # https://github.com/keras-team/keras/blob/f0eb8d538c82798944346b4b2df917a06bf5e9d4/keras/engine/sequential.py#L254 y_pred_probas = model_1.predict(X_test) y_pred_classes = (y_preds > 0.5).astype('int32') print(test_reviews.iloc[0]) print("Bad" if y_pred_classes[0] == 0 else "Good", y_pred_probas[0]) print(test_reviews.iloc[5]) print("Bad" if y_test[5] == 0 else "Good") print(y_pred_probas[5]) "Bad" if y_pred_classes[5] == 0 else "Good" y_train.shape X_train_reshaped = X_train[:, newaxis,:] X_test_reshaped = X_test[:, newaxis,:] y_train_reshaped = y_train[:, newaxis, newaxis] y_test_reshaped = y_test[:, newaxis, newaxis] y_train_reshaped.shape 1,X_train_reshaped.shape[2] def build_model_lstm(): model = Sequential() model.add(LSTM(256, activation='sigmoid', return_sequences=True, input_shape=(1,X_train_reshaped.shape[2]) )) model.add(Dense(128, activation='relu')) model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=Adam(learning_rate=0.0005)) return model model_lstm_1 = build_model_lstm() plot_model(model_lstm_1) %%time history_lstm_1 = model_lstm_1.fit( X_train_reshaped, y_train_reshaped, epochs=10, batch_size=16, validation_split=0.1, verbose=1, shuffle=True ) 
model_lstm_1.evaluate(X_test_reshaped, y_test_reshaped) plot_history(history_lstm_1) # https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM def build_model_lstm_2(): model = Sequential() model.add(LSTM(256, activation='relu', return_sequences=True, input_shape=(1,X_train_reshaped.shape[2]) )) model.add(LSTM(128, dropout=0.2, activation='relu', return_sequences=True)) model.add(LSTM(64, dropout=0.2, activation='relu', return_sequences=True)) model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=Adam(learning_rate=0.0005)) return model model_lstm_2 = build_model_lstm_2() plot_model(model_lstm_2) %%time history = model_lstm_2.fit( X_train_reshaped, y_train_reshaped, epochs=10, batch_size=16, validation_split=0.1, verbose=1, shuffle=True ) plot_history(history) model_lstm_2.evaluate(X_test_reshaped, y_test_reshaped) model_lstm_2.save("lstm_sentiment_model.h5") imported_model = tf.keras.models.load_model('/content/lstm_final_model.h5') imported_model.summary() text = 'This room is great' emb_test = use(text) emb_test.shape emb_test_reshaped = emb_test[:, newaxis, :] emb_test_reshaped.shape imported_model.predict(emb_test_reshaped) (imported_model.predict(emb_test_reshaped)[0][0][0] > 0.5).astype('int32') sentiment_val = (imported_model.predict(emb_test_reshaped) > 0.5).astype('int32') "Good" if sentiment_val == 1 else "Bad" text = 'This room is small' emb_test = use(text) emb_test_reshaped = emb_test[:, newaxis, :] emb_test_reshaped.shape imported_model.predict(emb_test_reshaped) sentimen_val = (imported_model.predict(emb_test_reshaped) > 0.5).astype('int32') "Good" if sentimen_val == 1 else "Bad" def predict_sentiment(txt): emb_txt = use(txt) emb_test_reshaped = emb_txt[:, newaxis, :] sentiment_val = (imported_model.predict(emb_test_reshaped) > 0.5).astype('int32') print("Score:", imported_model.predict(emb_test_reshaped).flatten()[0]) return "Positive" if sentiment_val == 1 else "Negative" 
# Spot-check the classifier on a few hand-written reviews; each bare
# f-string expression echoes the verdict in the notebook output.
review_text = "I love the room service"
f"The sentiment of this sentence is : {predict_sentiment(review_text)}"

# Mixed sentiment: one positive and one negative clause.
review_text = "I like the room service but the bathroom is small"
f"The sentiment of this sentence is : {predict_sentiment(review_text)}"

# Antonym pair: identical sentence structure, opposite polarity.
review_text = "The windows is so big. it has good air circulation"
f"The sentiment of this sentence is : {predict_sentiment(review_text)}"

review_text = "The windows is so small. it has bad air circulation"
f"The sentiment of this sentence is : {predict_sentiment(review_text)}"