# Sentiment analysis of hotel reviews: rating distribution, a pretrained
# transformer classifier demo, and VADER polarity scores for every review.
#
# Environment setup. These are notebook/shell commands, not Python statements,
# so they are kept here as instructions; run them once before this script:
#   pip install kaggle           # only needed to fetch Kaggle sample data sets
#   mkdir ~/.kaggle
#   pip install -q transformers

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    pipeline,
)

# download the VADER lexicon (Valence Aware Dictionary for sEntiment Reasoning)
# that the sentiment analyzer needs, then build the analyzer once
nltk.download('vader_lexicon')
sentiments = SentimentIntensityAnalyzer()

# use e.g. the available Hotel_Reviews.csv file, download it from kaggle or
# from https://github.com/spribylova/Python_Hotels
# local alternative: data = pd.read_csv("Hotel_Reviews.csv")
url = "https://raw.githubusercontent.com/spribylova/Python_Text_Sentiment/main/Hotel_Reviews.csv"
data = pd.read_csv(url)
data

# quick exploration (bare expressions only render in a notebook):
# the sample csv file only contains US country
data["country"].unique()

# there are many unique US hotel names in the file
data["name"].unique()

# see the unique values in the rating column
data["reviews.rating"].unique()

# fill the null ratings of the hotels with 0
data["reviews.rating"] = data["reviews.rating"].fillna(0)

# bring every rating onto a 5-point scale: values above 5 are treated as
# 10-point ratings and rescaled (e.g. 8 -> 4); done with a boolean mask so it
# does not depend on the frame having a 0..n-1 index
over_scale = data["reviews.rating"] > 5
data.loc[over_scale, "reviews.rating"] = (
    data.loc[over_scale, "reviews.rating"] / 10
) * 5

# see the number of reviews for each raw rating value
ratings = data["reviews.rating"].value_counts()
ratings

# round ratings to whole numbers and order the counts from 5 down to 1;
# reindex (instead of label selection) yields 0 for a level with no reviews
# rather than raising KeyError.
# NOTE(review): float labels assume the rating column is float — confirm.
RATING_LEVELS = [5.0, 4.0, 3.0, 2.0, 1.0]
ratings = (
    data["reviews.rating"]
    .round(0)
    .value_counts()
    .reindex(RATING_LEVELS, fill_value=0)
)
ratings

# pie chart inputs: rating levels as labels, review counts as wedge sizes
numbers = ratings.index
quantity = ratings.values

# assign a color to each rating and draw a donut chart of the rating shares
# (original author's observation: the majority of people give rating 5, only
# a small share of people were not satisfied)
custom_colors = ["tan", "beige", 'silver', "grey", "black"]
plt.figure(figsize=(5, 5))
plt.pie(quantity, labels=numbers, colors=custom_colors)
# a white circle over the centre turns the pie into a donut
central_circle = plt.Circle((0, 0), 0.5, color='white')
fig = plt.gcf()
fig.gca().add_artist(central_circle)
plt.rc('font', size=12)
plt.title("Hotel Reviews Ratings", fontsize=20)
plt.show()

# you can use various libraries to work with reviews:
# SentimentIntensityAnalyzer, transformers and textblob

# create the review-text data frame rt, one review per row, with the
# "reviews.text" column converted to string type
rt = data[["reviews.text"]].astype({"reviews.text": str})
rt

# the transformers library uses various pretrained models to classify text and
# calculate a score; well known model names are e.g. XLNet, BERT, XLM, RoBERTa
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
classifier("Really lovely hotel. The best one. Tip top.")

# revise data types in the rt data frame
rt.dtypes

# score every review once with VADER and spread the result over four columns:
# positive, negative, neutral and compound polarity
polarity = [sentiments.polarity_scores(text) for text in rt["reviews.text"]]
rt["Positive"] = [scores["pos"] for scores in polarity]
rt["Negative"] = [scores["neg"] for scores in polarity]
rt["Neutral"] = [scores["neu"] for scores in polarity]
rt['Compound'] = [scores["compound"] for scores in polarity]
rt.head()


def _label_compound(value):
    """Return 'Positive', 'Negative' or 'Neutral' for a compound score.

    Scores of 0.05 and above are positive, -0.05 and below are negative,
    everything in between is neutral.
    """
    if value >= 0.05:
        return 'Positive'
    if value <= -0.05:
        return 'Negative'
    return 'Neutral'


# classify column Compound
score = rt["Compound"].values
sentiment = [_label_compound(value) for value in score]
rt["Sentiment"] = sentiment
rt.head()

print(rt["Sentiment"].value_counts())
rt

# define colors and plot the share of each sentiment class; labels come from
# the same value_counts() Series as the wedge sizes so each label is attached
# to its own wedge
cmap = ("beige", "silver", "grey")
sentiment_counts = rt["Sentiment"].value_counts()
plt.pie(
    x=sentiment_counts,
    labels=sentiment_counts.index,
    colors=cmap,
    autopct='%1.0f%%',
)
plt.show()

# save data frame to csv
rt.to_csv("sentiment.csv")