import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import nltk
# Fetch the NLTK resources used further down. When a package is already in the
# local nltk_data directory the call just reports it as up-to-date.
# Tokenizer data — presumably what nltk.word_tokenize relies on.
nltk.download('punkt')
# Part-of-speech tagger — presumably what nltk.pos_tag relies on.
nltk.download('averaged_perceptron_tagger')
# Named-entity chunker and its word list — presumably what nltk.chunk.ne_chunk relies on.
nltk.download('maxent_ne_chunker')
nltk.download('words')
# Lexicon for the VADER SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')
[nltk_data] Downloading package punkt to [nltk_data] /Users/amitabhadey/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] /Users/amitabhadey/nltk_data... [nltk_data] Package averaged_perceptron_tagger is already up-to- [nltk_data] date! [nltk_data] Downloading package maxent_ne_chunker to [nltk_data] /Users/amitabhadey/nltk_data... [nltk_data] Package maxent_ne_chunker is already up-to-date! [nltk_data] Downloading package words to [nltk_data] /Users/amitabhadey/nltk_data... [nltk_data] Package words is already up-to-date! [nltk_data] Downloading package vader_lexicon to [nltk_data] /Users/amitabhadey/nltk_data... [nltk_data] Package vader_lexicon is already up-to-date!
True
# Load the complete review file, report its size, then keep only the first
# SAMPLE_SIZE rows so the rest of the notebook runs quickly.
SAMPLE_SIZE = 500
df = pd.read_csv('data/Reviews.csv')
print(df.shape)
df = df.iloc[:SAMPLE_SIZE]
print(df.shape)
(568454, 10) (500, 10)
# Preview the first rows of the sampled reviews
df.head()
Id | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | B001E4KFG0 | A3SGXH7AUHU8GW | delmartian | 1 | 1 | 5 | 1303862400 | Good Quality Dog Food | I have bought several of the Vitality canned d... |
1 | 2 | B00813GRG4 | A1D87F6ZCVE5NK | dll pa | 0 | 0 | 1 | 1346976000 | Not as Advertised | Product arrived labeled as Jumbo Salted Peanut... |
2 | 3 | B000LQOCH0 | ABXLMWJIXXAIN | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | 1219017600 | "Delight" says it all | This is a confection that has been around a fe... |
3 | 4 | B000UA0QIQ | A395BORC6FGVXV | Karl | 3 | 3 | 2 | 1307923200 | Cough Medicine | If you are looking for the secret ingredient i... |
4 | 5 | B006K2ZZ7K | A1UQRSCLF8GW1T | Michael D. Bigham "M. Wassir" | 0 | 0 | 5 | 1350777600 | Great taffy | Great taffy at a great price. There was a wid... |
# Number of reviews per star rating, ordered by rating (1..5) rather than by count
star_counts = df['Score'].value_counts().sort_index()
ax = star_counts.plot(
    kind='bar',
    title='Count of Reviews by Stars',
    figsize=(10, 5),
)
ax.set_xlabel('Review Stars')
plt.show()
# Use the review text at row label 50 as the running example
example = df.loc[50, 'Text']
print(example)
This oatmeal is not good. Its mushy, soft, I don't like it. Quaker Oats is the way to go.
# Split the example review into word and punctuation tokens; preview the first ten
tokens = nltk.word_tokenize(example)
tokens[:10]
['This', 'oatmeal', 'is', 'not', 'good', '.', 'Its', 'mushy', ',', 'soft']
# Attach a part-of-speech tag to each token; preview the first ten (token, tag) pairs
tagged = nltk.pos_tag(tokens)
tagged[:10]
[('This', 'DT'), ('oatmeal', 'NN'), ('is', 'VBZ'), ('not', 'RB'), ('good', 'JJ'), ('.', '.'), ('Its', 'PRP$'), ('mushy', 'NN'), (',', ','), ('soft', 'JJ')]
# Group the tagged tokens into named-entity chunks and pretty-print the resulting tree
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()
(S This/DT oatmeal/NN is/VBZ not/RB good/JJ ./. Its/PRP$ mushy/NN ,/, soft/JJ ,/, I/PRP do/VBP n't/RB like/VB it/PRP ./. (ORGANIZATION Quaker/NNP Oats/NNPS) is/VBZ the/DT way/NN to/TO go/VB ./.)
We will use NLTK's SentimentIntensityAnalyzer to get the neg/neu/pos scores of the text.
This uses a "bag of words" approach: stop words are removed, then each remaining word is scored and the word scores are combined into a total score.
# VADER-based sentiment scorer from NLTK, plus a notebook progress bar used by
# the scoring loops further down.
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
# One analyzer instance is reused for every polarity_scores call in this notebook
sia = SentimentIntensityAnalyzer()
# Sanity check on a clearly positive sentence
sia.polarity_scores('I am so happy!')
{'neg': 0.0, 'neu': 0.318, 'pos': 0.682, 'compound': 0.6468}
# Sanity check on a clearly negative sentence
sia.polarity_scores('This is the worst thing ever.')
{'neg': 0.451, 'neu': 0.549, 'pos': 0.0, 'compound': -0.6249}
# VADER scores for the example review selected earlier
sia.polarity_scores(example)
{'neg': 0.22, 'neu': 0.78, 'pos': 0.0, 'compound': -0.5448}
# Score every sampled review with VADER.
# `res` maps each review's Id to its dict of polarity scores.
res = {
    review['Id']: sia.polarity_scores(review['Text'])
    for _, review in tqdm(df.iterrows(), total=len(df))
}
0%| | 0/500 [00:00<?, ?it/s]
# Turn the {Id: scores} mapping into one row per review, expose the Id as a
# regular column, and attach the original review columns by Id.
vaders = (
    pd.DataFrame(res).T
    .rename_axis('Id')
    .reset_index()
    .merge(df, how='left', on='Id')
)
# Sentiment scores and review metadata now sit side by side
vaders.head()
Id | neg | neu | pos | compound | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.000 | 0.695 | 0.305 | 0.9441 | B001E4KFG0 | A3SGXH7AUHU8GW | delmartian | 1 | 1 | 5 | 1303862400 | Good Quality Dog Food | I have bought several of the Vitality canned d... |
1 | 2 | 0.138 | 0.862 | 0.000 | -0.5664 | B00813GRG4 | A1D87F6ZCVE5NK | dll pa | 0 | 0 | 1 | 1346976000 | Not as Advertised | Product arrived labeled as Jumbo Salted Peanut... |
2 | 3 | 0.091 | 0.754 | 0.155 | 0.8265 | B000LQOCH0 | ABXLMWJIXXAIN | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | 1219017600 | "Delight" says it all | This is a confection that has been around a fe... |
3 | 4 | 0.000 | 1.000 | 0.000 | 0.0000 | B000UA0QIQ | A395BORC6FGVXV | Karl | 3 | 3 | 2 | 1307923200 | Cough Medicine | If you are looking for the secret ingredient i... |
4 | 5 | 0.000 | 0.552 | 0.448 | 0.9468 | B006K2ZZ7K | A1UQRSCLF8GW1T | Michael D. Bigham "M. Wassir" | 0 | 0 | 5 | 1350777600 | Great taffy | Great taffy at a great price. There was a wid... |
# VADER compound score aggregated per star rating (seaborn's barplot shows the
# per-group mean by default). Title typo "Compund" corrected.
ax = sns.barplot(data=vaders, x='Score', y='compound')
ax.set_title('Compound Score by Amazon Star Review')
plt.show()
# One panel per VADER component, side by side, each broken down by star rating
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = (('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative'))
for panel_ax, (component, heading) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=component, ax=panel_ax)
    panel_ax.set_title(heading)
plt.tight_layout()
plt.show()
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
# Name of the pretrained checkpoint; going by the name, a RoBERTa sentiment model
# trained on tweets. The code below treats its three outputs as neg / neu / pos,
# in that order — verify against the model card.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# Load the tokenizer and the sequence-classification model for that checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# VADER results on the example review, shown again as a baseline for the
# RoBERTa scores computed next
print(example)
sia.polarity_scores(example)
This oatmeal is not good. Its mushy, soft, I don't like it. Quaker Oats is the way to go.
{'neg': 0.22, 'neu': 0.78, 'pos': 0.0, 'compound': -0.5448}
# Score the example review with the RoBERTa model
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
# First element of the model output, first row; softmax normalises it so the
# three values sum to one.
scores = softmax(output[0][0].detach().numpy())
roberta_labels = ('roberta_neg', 'roberta_neu', 'roberta_pos')
scores_dict = dict(zip(roberta_labels, scores))
print(scores_dict)
{'roberta_neg': 0.97635514, 'roberta_neu': 0.02068747, 'roberta_pos': 0.0029573722}
def polarity_scores_roberta(example, max_length=512):
    """Score a piece of text with the module-level RoBERTa model.

    Args:
        example: Text to classify.
        max_length: Maximum number of tokens passed to the model. Longer text
            is truncated to this length instead of making the model raise a
            RuntimeError. The default of 512 is assumed to match the input
            limit of RoBERTa-base checkpoints — confirm for other models.

    Returns:
        dict with keys 'roberta_neg', 'roberta_neu' and 'roberta_pos' holding
        the softmax of the model's three outputs. The mapping of output
        positions to negative / neutral / positive is assumed, not checked.
    """
    # Local import keeps this definition self-contained; the file does not
    # import torch at the top level.
    import torch

    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: no gradients are needed, so skip building the graph
    with torch.no_grad():
        output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    return {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2],
    }
# Score every sampled review with both VADER and RoBERTa.
# `res` maps review Id -> combined dict of vader_* and roberta_* scores.
res = {}
for _, review in tqdm(df.iterrows(), total=len(df)):
    try:
        text = review['Text']
        myid = review['Id']
        # Prefix the VADER keys so they are distinguishable from the RoBERTa keys
        vader_scores = {
            f"vader_{name}": score
            for name, score in sia.polarity_scores(text).items()
        }
        res[myid] = {**vader_scores, **polarity_scores_roberta(text)}
    except RuntimeError:
        # Presumably raised by the model for reviews it cannot process (e.g. too
        # long); such reviews are reported and left out of `res`.
        print(f'Broke for id {myid}')
0%| | 0/500 [00:00<?, ?it/s]
Broke for id 83 Broke for id 187
# One row per scored review: combined sentiment scores joined to the original
# review columns by Id.
results_df = (
    pd.DataFrame(res).T
    .rename_axis('Id')
    .reset_index()
    .merge(df, how='left', on='Id')
)
results_df.columns
Index(['Id', 'vader_neg', 'vader_neu', 'vader_pos', 'vader_compound', 'roberta_neg', 'roberta_neu', 'roberta_pos', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'], dtype='object')
# Pairwise comparison of the VADER and RoBERTa components, coloured by star rating
score_columns = [
    'vader_neg', 'vader_neu', 'vader_pos',
    'roberta_neg', 'roberta_neu', 'roberta_pos',
]
sns.pairplot(data=results_df, vars=score_columns, hue='Score', palette='tab10')
plt.show()
Let's look at some examples where the model's sentiment score and the review's star rating disagree the most.
# One-star review that RoBERTa rates as most positive
one_star = results_df[results_df['Score'] == 1]
one_star.sort_values('roberta_pos', ascending=False)['Text'].iloc[0]
'I felt energized within five minutes, but it lasted for about 45 minutes. I paid $3.99 for this drink. I could have just drunk a cup of coffee and saved my money.'
# One-star review that VADER rates as most positive
one_star = results_df[results_df['Score'] == 1]
one_star.sort_values('vader_pos', ascending=False)['Text'].iloc[0]
'So we cancelled the order. It was cancelled without any problem. That is a positive note...'
# Five-star review that RoBERTa rates as most negative
five_star = results_df[results_df['Score'] == 5]
five_star.sort_values('roberta_neg', ascending=False)['Text'].iloc[0]
'this was sooooo deliscious but too bad i ate em too fast and gained 2 pds! my fault'
# Five-star review that VADER rates as most negative
five_star = results_df[results_df['Score'] == 5]
five_star.sort_values('vader_neg', ascending=False)['Text'].iloc[0]
'this was sooooo deliscious but too bad i ate em too fast and gained 2 pds! my fault'
from transformers import pipeline
# Name the checkpoint explicitly instead of leaving the choice to pipeline()
model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
# Ready-made sentiment-analysis pipeline backed by that checkpoint
sent_pipeline = pipeline("sentiment-analysis", model=model_name)
# Quick check on a clearly positive sentence
sent_pipeline('I love sentiment analysis!')
[{'label': 'POSITIVE', 'score': 0.9997853636741638}]
# A neutral-sounding call to action, to see which label the pipeline assigns
sent_pipeline('Make sure to like and subscribe!')
[{'label': 'POSITIVE', 'score': 0.9991742968559265}]
# A single informal, out-of-vocabulary-looking word
sent_pipeline('booo')
[{'label': 'NEGATIVE', 'score': 0.9936267137527466}]