# install kaggle to use kaggle sample data sets
!pip install kaggle

!mkdir ~/.kaggle

# import the main libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Download the VADER lexicon that NLTK's sentiment analyzer works with.
import nltk
nltk.download('vader_lexicon')

# Import the sentiment analyzer and create the instance used to categorise reviews.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiments = SentimentIntensityAnalyzer()

# Load the sample Hotel_Reviews.csv file directly from GitHub into `data`.
# The file can also be downloaded (from Kaggle or from
# https://github.com/spribylova/Python_Hotels) and read locally instead:
# data = pd.read_csv("Hotel_Reviews.csv")

import pandas as pd
url = "https://raw.githubusercontent.com/spribylova/Python_Text_Sentiment/main/Hotel_Reviews.csv"
data = pd.read_csv(url)
data

# List the distinct countries; the sample CSV file only contains the US.
data["country"].unique()

# List the distinct hotel names; there are many unique US hotel names in the file.
data["name"].unique()

# List the distinct values found in the rating column.
data["reviews.rating"].unique()

# Replace missing hotel ratings with 0 so the column contains no nulls.
data['reviews.rating'] = data['reviews.rating'].fillna(0)

# Bring every rating into the 0-5 range: a value above 5 is rescaled with
# (value / 10) * 5, e.g. 8 becomes 4 (presumably those reviews were scored
# on a 10-point scale). The boolean mask updates all affected rows at once
# and, unlike a positional loop, does not depend on a 0..n-1 row index.
above_five = data['reviews.rating'] > 5
data.loc[above_five, 'reviews.rating'] = (
    data.loc[above_five, 'reviews.rating'] / 10
) * 5

# Show the number of reviews for each distinct rating value.
ratings = data["reviews.rating"].value_counts()
ratings

# Round the ratings to whole numbers, count them, and order the result from
# rating 5 down to rating 1. reindex() reports a count of 0 for a rating level
# with no reviews instead of raising KeyError. Ratings that round to 0 (the
# filled-in missing values) are left out, as only levels 1-5 are selected.
ratings = (
    data['reviews.rating']
    .round(0)
    .value_counts()
    .reindex([5.0, 4.0, 3.0, 2.0, 1.0], fill_value=0)
)
ratings

# Rating levels (pie labels) and their review counts (pie wedge sizes).
numbers = ratings.index
quantity = ratings.values

# Draw a donut chart showing the share of each rating value, one color per
# rating in the order of `numbers`.
# Author's observation on the sample data: most people give a rating of 5 and
# only a small share of people were not satisfied.
custom_colors = ["tan", "beige", 'silver', "grey", "black"]

# Set the default font size BEFORE any text is created; matplotlib reads the
# rc value when a text object is built, so setting it after plt.pie() would
# leave this chart's labels at the previous size.
plt.rc('font', size=12)

plt.figure(figsize=(5, 5))
plt.pie(quantity, labels=numbers, colors=custom_colors)

# A white circle over the centre turns the pie into a donut.
central_circle = plt.Circle((0, 0), 0.5, color='white')
fig = plt.gcf()
fig.gca().add_artist(central_circle)

plt.title("Hotel Reviews Ratings", fontsize=20)
plt.show()

# you can use various libraries to work with reviews: SentimentIntensityAnalyzer, transformers and textblob

pip install -q transformers

# Build `rt` (reviews text): a one-column data frame holding the review text,
# one review per row, with the "reviews.text" column converted to string type.
rt = data[["reviews.text"]].astype({"reviews.text": str})
rt

# The transformers library offers various pretrained models that classify a
# text and calculate a score for it.
# Well-known model names are e.g. XLNet, BERT, XLM, RoBERTa, ...
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

# Load the tokenizer and the classification model published under model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Wrap both in a sentiment-analysis pipeline and try it on one sample review.
classifier = pipeline(task='sentiment-analysis', model=model, tokenizer=tokenizer)
sample_review = "Really lovely hotel. The best one. Tip top."
classifier(sample_review)

# Set up the VADER (Valence Aware Dictionary for sEntiment Reasoning) analyzer:
# import NLTK and the analyzer class, and download the pretrained lexicon.
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download("vader_lexicon")
import pandas as pd

# Check the data types of the columns in the rt data frame.
rt.dtypes

# Analyzer instance whose polarity_scores() is used to score the review texts.
sentiments = SentimentIntensityAnalyzer()

# Score every review once with the analyzer and keep the resulting dicts, so
# each text is analysed a single time instead of once per output column.
polarity = [sentiments.polarity_scores(text) for text in rt["reviews.text"]]

# Spread the positive, negative, neutral and compound scores of each review
# into their own columns.
rt["Positive"] = [scores["pos"] for scores in polarity]
rt["Negative"] = [scores["neg"] for scores in polarity]
rt["Neutral"] = [scores["neu"] for scores in polarity]
rt['Compound'] = [scores["compound"] for scores in polarity]
rt.head()

def _sentiment_label(compound):
    """Return the sentiment label for one compound score.

    Scores of 0.05 or more are 'Positive', scores of -0.05 or less are
    'Negative', and everything else is 'Neutral'.
    """
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'


# Label every review according to its value in the Compound column.
score = rt["Compound"].values
sentiment = [_sentiment_label(value) for value in score]
rt["Sentiment"] = sentiment
rt.head()

# Number of reviews per sentiment label.
print(rt["Sentiment"].value_counts())

rt

# Import the plotting libraries and define the wedge colors for the pie chart.
import matplotlib.pyplot as plt
import seaborn as sns
cmap = ("beige", "silver", "grey")

# Plot the share of each sentiment label. Sizes and labels are both taken from
# the same value_counts() result so every wedge carries its own label;
# value_counts() orders by count while unique() orders by first appearance, so
# mixing the two can attach labels to the wrong wedges.
sentiment_counts = rt["Sentiment"].value_counts()
plt.pie(
    x=sentiment_counts.values,
    labels=sentiment_counts.index,
    colors=cmap,
    autopct='%1.0f%%',
)
plt.show()

# Save the rt data frame to the file sentiment.csv (path relative to the
# current working directory).
rt.to_csv("sentiment.csv")