We perform text classification on the data. The tweets were pulled from Twitter and then tagged manually. Names and usernames have been replaced with codes to avoid any privacy concerns.
!wget https://raw.githubusercontent.com/tylerneylon/explacy/master/explacy.py
! pip install emoji
# visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from PIL import Image
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
# handle table-like data and matrices
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import re
import emoji
# natural language processing
import nltk
from nltk.corpus import stopwords
import string
import spacy
from spacy import displacy
import explacy
from wordcloud import WordCloud
# ignore warnings
import warnings
# model
from keras import backend as K
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.utils import plot_model
# to display the total number of rows and columns present in the dataset
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download datatattle/covid-19-nlp-text-classification
! unzip /content/covid-19-nlp-text-classification.zip
data_train = pd.read_csv('Corona_NLP_train.csv', encoding='latin1')
data_test = pd.read_csv('Corona_NLP_test.csv', encoding='latin1')
UserName 0 ScreenName 0 Location 8590 TweetAt 0 OriginalTweet 0 Sentiment 0 dtype: int64
UserName 0 ScreenName 0 Location 834 TweetAt 0 OriginalTweet 0 Sentiment 0 dtype: int64
# Summarise missing values per column of the training frame.
null = data_train.isnull().sum().sort_values(ascending=False)
total = data_train.shape[0]
# Dividing the already-sorted counts keeps both columns in the same row order
# and avoids scanning the frame for nulls a second time.
percent_missing = null / total
missing_data = pd.concat([null, percent_missing], axis=1, keys=['Total missing', 'Percent missing'])
# After concat the column names live in the index; reset_index() turns them
# into an 'index' column so that the rename below has something to rename.
missing_data = missing_data.reset_index().rename(columns={'index': ' column name'})
print ('Missing Values in Each Column:\n', missing_data)
Missing Values in Each Column: column name Total missing Percent missing 0 Location 8590 0.208713 1 UserName 0 0.000000 2 ScreenName 0 0.000000 3 TweetAt 0 0.000000 4 OriginalTweet 0 0.000000 5 Sentiment 0 0.000000
msno.matrix(data_train, figsize=(10, 5));
There are missing values in the Location column. Since we don't use the location for prediction, this doesn't matter.
Let's check whether we have duplicate rows.
UserName | ScreenName | Location | TweetAt | OriginalTweet | Sentiment | |
0 | 3799 | 48751 | London | 16-03-2020 | @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i... | Neutral |
1 | 3800 | 48752 | UK | 16-03-2020 | advice Talk to your neighbours family to excha... | Positive |
2 | 3801 | 48753 | Vagabonds | 16-03-2020 | Coronavirus Australia: Woolworths to give elde... | Positive |
3 | 3802 | 48754 | NaN | 16-03-2020 | My food stock is not the only one which is emp... | Positive |
4 | 3803 | 48755 | NaN | 16-03-2020 | Me, ready to go at supermarket during the #COV... | Extremely Negative |
data_train.shape, data_test.shape
((41157, 6), (3798, 6))
<class 'pandas.core.frame.DataFrame'> RangeIndex: 41157 entries, 0 to 41156 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 UserName 41157 non-null int64 1 ScreenName 41157 non-null int64 2 Location 32567 non-null object 3 TweetAt 41157 non-null object 4 OriginalTweet 41157 non-null object 5 Sentiment 41157 non-null object dtypes: int64(2), object(4) memory usage: 1.9+ MB
count | unique | top | freq | |
Location | 32567 | 12220 | London | 540 |
TweetAt | 41157 | 30 | 20-03-2020 | 3448 |
OriginalTweet | 41157 | 41157 | @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i... | 1 |
Sentiment | 41157 | 5 | Positive | 11422 |
print('Unique Sentiments in Train Data:', data_train.Sentiment.unique(), '\n')
Unique Sentiments in Train Data: ['Neutral' 'Positive' 'Extremely Negative' 'Negative' 'Extremely Positive'] Positive 11422 Negative 9917 Neutral 7713 Extremely Positive 6624 Extremely Negative 5481 Name: Sentiment, dtype: int64
print('Unique Sentiments in Test Data:', data_test.Sentiment.unique(), '\n')
Unique Sentiments in Test Data: ['Extremely Negative' 'Positive' 'Extremely Positive' 'Negative' 'Neutral'] Negative 1041 Positive 947 Neutral 619 Extremely Positive 599 Extremely Negative 592 Name: Sentiment, dtype: int64
# Fold the two "Extremely ..." classes into their base polarity, leaving three
# classes. Both frames go through the same lookup table.
sentiment_groups = {'Extremely Positive':'Positive','Extremely Negative':'Negative','Neutral':'Neutral','Positive':'Positive','Negative':'Negative'}
for frame in (data_train, data_test):
    frame['Sentiment'] = frame['Sentiment'].map(sentiment_groups)
print('Unique Sentiments in Train Data:', data_train.Sentiment.unique(), '\n')
# Tweets per class in the training frame, most frequent class first.
tweets_by_class = data_train.groupby('Sentiment')['OriginalTweet'].count()
class_d = tweets_by_class.reset_index().sort_values(by='OriginalTweet', ascending=False)
Unique Sentiments in Train Data: ['Neutral' 'Positive' 'Negative']
Sentiment | OriginalTweet | |
2 | Positive | 18046 |
0 | Negative | 15398 |
1 | Neutral | 7713 |
print('Unique Sentiments in Test Data:', data_test.Sentiment.unique(), '\n')
class_d = data_test.groupby('Sentiment').count()['OriginalTweet'].reset_index().sort_values(by='OriginalTweet',ascending=False)
Unique Sentiments in Test Data: ['Negative' 'Positive' 'Neutral']
Sentiment | OriginalTweet | |
0 | Negative | 1633 |
2 | Positive | 1546 |
1 | Neutral | 619 |
array(['Neutral', 'Positive', 'Negative'], dtype=object)
Negative 1633 Positive 1546 Neutral 619 Name: Sentiment, dtype: int64
# Donut charts comparing the class balance of the train and test splits.
sentiment_labels = ['Positive', 'Negative', 'Neutral']
sentiment_colors = ['gold', 'mediumturquoise', '#EC7063']
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
# reindex() fetches each count by its label, so the slices stay correct
# whatever order value_counts() returns (it sorts by frequency, which differs
# between the two splits) and no positional Series indexing is needed.
fig.add_trace(go.Pie(labels=sentiment_labels,
                     values=data_train['Sentiment'].value_counts().reindex(sentiment_labels),
                     name='Train Sentiment', marker_colors=sentiment_colors), 1, 1)
fig.add_trace(go.Pie(labels=sentiment_labels,
                     values=data_test['Sentiment'].value_counts().reindex(sentiment_labels),
                     name='Test Sentiment', marker_colors=sentiment_colors), 1, 2)
fig.update_traces(hole=0.5, textfont_size=20, marker=dict(line=dict(color='black', width=2)))
fig.update_layout(
    title_text='<b>Sentiment Distribution in Train and Test Data<b>',
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='Sentiment Train', x=0.135, y=0.5, font_size=20, showarrow=False),
                 dict(text='Sentiment Test', x=0.86, y=0.5, font_size=20, showarrow=False)])
# Locations that appear in more than 102 tweets, with their tweet counts.
# rename_axis('index') pins the name of the label column: pandas >= 2.0 names
# it after the source column ('Location') instead of 'index', which would make
# the tweets_per_country['index'] lookup below raise KeyError.
tweets_per_country = (
    data_train['Location']
    .value_counts()
    .loc[lambda x : x > 102]
    .rename_axis('index')
    .reset_index(name='counts')
)
fig = px.bar(x=tweets_per_country['index'], y=tweets_per_country['counts'], color_continuous_scale=px.colors.sequential.Teal_r,
             title='<b>Tweets Count by Country<b>', text_auto=True, color=tweets_per_country['counts'])
fig.update_traces(marker=dict(line=dict(color='black', width=2)))
fig.update_layout({'yaxis': {'title':'Count'}, 'xaxis': {'title':'Country'}})
# Character length of every training tweet, split by sentiment class.
tweet_chars = data_train['OriginalTweet'].str.len()
row_sentiment = data_train['Sentiment']
tweet_len_pos = tweet_chars[row_sentiment == 'Positive']
tweet_len_neg = tweet_chars[row_sentiment == 'Negative']
tweet_len_neu = tweet_chars[row_sentiment == 'Neutral']
# One histogram per class, side by side.
fig = make_subplots(rows=1, cols=3)
char_panels = [
    ('Positive', tweet_len_pos, '#EB89B5'),
    ('Negative', tweet_len_neg, '#E9F00B'),
    ('Neutral', tweet_len_neu, '#5DADE2'),
]
for panel_col, (panel_name, panel_values, panel_color) in enumerate(char_panels, start=1):
    fig.add_trace(go.Histogram(x = panel_values, name=panel_name, marker_color=panel_color) , 1, panel_col)
fig.update_layout(title_text='<b>Characters in Tweets<b>')
# Number of whitespace-separated tokens in every training tweet, per class.
tokens_per_tweet = data_train['OriginalTweet'].str.split().map(len)
row_sentiment = data_train['Sentiment']
tweet_words_pos = tokens_per_tweet[row_sentiment == 'Positive']
tweet_words_neg = tokens_per_tweet[row_sentiment == 'Negative']
tweet_words_neu = tokens_per_tweet[row_sentiment == 'Neutral']
# One histogram per class, side by side.
fig = make_subplots(rows=1, cols=3)
word_panels = [
    ('Positive', tweet_words_pos, '#00CC96'),
    ('Negative', tweet_words_neg, '#FFA15A'),
    ('Neutral', tweet_words_neu, '#E5527A'),
]
for panel_col, (panel_name, panel_values, panel_color) in enumerate(word_panels, start=1):
    fig.add_trace(go.Histogram(x = panel_values, name=panel_name, marker_color=panel_color) , 1, panel_col)
fig.update_layout(title_text='<b>Number of words in Tweets<b>')
def _mean_word_length(words):
    """Return the mean character length of the tokens in *words*.

    An empty token list yields NaN so that the tweet is simply left out of the
    histogram instead of triggering a mean-of-empty warning.
    """
    if not words:
        return np.nan
    return np.mean([len(word) for word in words])


# The average has to be taken over the length of each token. Taking np.mean of
# the per-tweet word count (a scalar) just returns that count and would redraw
# the "Number of words" histogram under a different title.
tweet_tokens = data_train['OriginalTweet'].str.split()
row_sentiment = data_train['Sentiment']
tweet_words_pos = tweet_tokens[row_sentiment == 'Positive'].map(len)
tweet_avgwords_pos = tweet_tokens[row_sentiment == 'Positive'].map(_mean_word_length)
tweet_words_neg = tweet_tokens[row_sentiment == 'Negative'].map(len)
tweet_avgwords_neg = tweet_tokens[row_sentiment == 'Negative'].map(_mean_word_length)
tweet_words_neu = tweet_tokens[row_sentiment == 'Neutral'].map(len)
tweet_avgwords_neu = tweet_tokens[row_sentiment == 'Neutral'].map(_mean_word_length)
fig = make_subplots(rows=1, cols=3)
fig.add_trace(go.Histogram(x = tweet_avgwords_pos, name='Positive', marker_color='purple') , 1, 1)
fig.add_trace(go.Histogram(x = tweet_avgwords_neg, name='Negative', marker_color='gold') , 1, 2)
fig.add_trace(go.Histogram(x = tweet_avgwords_neu, name='Neutral', marker_color='#C82735') , 1, 3)
fig.update_layout(title_text='<b>Average Word Length in Tweets<b>')
stop = set(stopwords.words('english'))
def create_corpus(target):
    """Return every whitespace-separated token of the tweets with one sentiment.

    Args:
        target: Value of the ``Sentiment`` column to select, e.g. 'Positive'.

    Returns:
        A flat list of tokens taken from ``data_train['OriginalTweet']`` for
        the selected rows, in row order; repeated tokens are kept.
    """
    corpus = []
    for tokens in data_train[data_train['Sentiment']==target]['OriginalTweet'].str.split():
        # The inner loop had lost its body; every token belongs in the corpus.
        corpus.extend(tokens)
    return corpus
# Count how often each stop word occurs in the positive tweets.
corpus_pos = create_corpus('Positive')
dic_pos = defaultdict(int)
for word in corpus_pos:
    if word in stop:
        dic_pos[word] += 1
# Ten most frequent stop words, highest count first.
top_stop_pos = sorted(dic_pos.items(), key=lambda x:x[1], reverse=True)[:10]
# Split the (word, count) pairs into the bar labels and bar heights.
w_pos = [word for word, _ in top_stop_pos]
n_pos = [count for _, count in top_stop_pos]
fig = px.bar(x=w_pos, y=n_pos, color_continuous_scale=px.colors.sequential.Bluyl,
             title='<b>Most Common Stop Words in Positive Sentences<b>', text_auto=True, color=n_pos)
fig.update_traces(marker=dict(line=dict(color='black', width=2)))
fig.update_layout({'yaxis': {'title':'Count'}, 'xaxis': {'title':'Stop Words'}})
# Count how often each stop word occurs in the negative tweets.
corpus_neg = create_corpus('Negative')
dic_neg = defaultdict(int)
for word in corpus_neg:
    if word in stop:
        dic_neg[word] += 1
# Ten most frequent stop words, highest count first.
top_stop_neg = sorted(dic_neg.items(), key=lambda x:x[1], reverse=True)[:10]
# Split the (word, count) pairs into the bar labels and bar heights.
w_neg = [word for word, _ in top_stop_neg]
n_neg = [count for _, count in top_stop_neg]
fig = px.bar(x=w_neg, y=n_neg, color_continuous_scale=px.colors.sequential.dense,
             title='<b>Most Common Stop Words in Negative Sentences<b>', text_auto=True, color=n_neg)
fig.update_traces(marker=dict(line=dict(color='black', width=2)))
fig.update_layout({'yaxis': {'title':'Count'}, 'xaxis': {'title':'Stop Words'}})
# Count how often each stop word occurs in the neutral tweets.
corpus_neu = create_corpus('Neutral')
dic_neu = defaultdict(int)
for word in corpus_neu:
    if word in stop:
        dic_neu[word] += 1
# Ten most frequent stop words, highest count first.
top_stop_neu = sorted(dic_neu.items(), key=lambda x:x[1], reverse=True)[:10]
# Split the (word, count) pairs into the bar labels and bar heights.
w_neu = [word for word, _ in top_stop_neu]
n_neu = [count for _, count in top_stop_neu]
fig = px.bar(x=w_neu, y=n_neu, color_continuous_scale=px.colors.sequential.PuRd,
             title='<b>Most Common Stop Words in Neutral Sentences<b>', text_auto=True, color=n_neu)
fig.update_traces(marker=dict(line=dict(color='black', width=2)))
fig.update_layout({'yaxis': {'title':'Count'}, 'xaxis': {'title':'Stop Words'}})
# Count stand-alone punctuation tokens in the positive tweets.
corpus_pos = create_corpus('Positive')
dic_pos = defaultdict(int)
# Membership in a set of single characters: `word in string.punctuation` is a
# substring test and would also accept multi-character runs such as '()'.
punctuation_marks = set(string.punctuation)
for word in corpus_pos:
    if word in punctuation_marks:
        dic_pos[word] += 1
# Ten most frequent punctuation marks, highest count first.
top_stop_pos = sorted(dic_pos.items(), key=lambda x:x[1], reverse=True)[:10]
# Split the (mark, count) pairs into the bar labels and bar heights.
w_pos = [mark for mark, _ in top_stop_pos]
n_pos = [count for _, count in top_stop_pos]
fig = px.bar(x=w_pos, y=n_pos, color_continuous_scale=px.colors.sequential.Cividis,
             title='<b>Most Common Punctuations in Positive Sentences<b>', text_auto=True, color=n_pos)
fig.update_traces(marker=dict(line=dict(color='black', width=2)))
fig.update_layout({'yaxis': {'title':'Count'}, 'xaxis': {'title':'Punctuations'}})
# Count stand-alone punctuation tokens in the negative tweets.
corpus_neg = create_corpus('Negative')
dic_neg = defaultdict(int)
# Membership in a set of single characters: `word in string.punctuation` is a
# substring test and would also accept multi-character runs such as '()'.
punctuation_marks = set(string.punctuation)
for word in corpus_neg:
    if word in punctuation_marks:
        dic_neg[word] += 1
# Ten most frequent punctuation marks, highest count first.
top_stop_neg = sorted(dic_neg.items(), key=lambda x:x[1], reverse=True)[:10]
# Split the (mark, count) pairs into the bar labels and bar heights.
w_neg = [mark for mark, _ in top_stop_neg]
n_neg = [count for _, count in top_stop_neg]
fig = px.bar(x=w_neg, y=n_neg, color_continuous_scale=px.colors.sequential.Plotly3,
             title='<b>Most Common Punctuations in Negative Sentences<b>', text_auto=True, color=n_neg)
fig.update_traces(marker=dict(line=dict(color='black', width=2)))
fig.update_layout({'yaxis': {'title':'Count'}, 'xaxis': {'title':'Punctuations'}})
# Count stand-alone punctuation tokens in the neutral tweets.
corpus_neu = create_corpus('Neutral')
dic_neu = defaultdict(int)
# Membership in a set of single characters: `word in string.punctuation` is a
# substring test and would also accept multi-character runs such as '()'.
punctuation_marks = set(string.punctuation)
for word in corpus_neu:
    if word in punctuation_marks:
        dic_neu[word] += 1
# Ten most frequent punctuation marks, highest count first.
top_stop_neu = sorted(dic_neu.items(), key=lambda x:x[1], reverse=True)[:10]
# Split the (mark, count) pairs into the bar labels and bar heights.
w_neu = [mark for mark, _ in top_stop_neu]
n_neu = [count for _, count in top_stop_neu]
fig = px.bar(x=w_neu, y=n_neu, color_continuous_scale=px.colors.sequential.haline,
             title='<b>Most Common Punctuations in Neutral Sentences<b>', text_auto=True, color=n_neu)
fig.update_traces(marker=dict(line=dict(color='black', width=2)))
fig.update_layout({'yaxis': {'title':'Count'}, 'xaxis': {'title':'Punctuations'}})
def create_corpus():
    """Return every whitespace-separated token of all training tweets.

    Returns:
        A flat list of tokens taken from ``data_train['OriginalTweet']`` in
        row order; repeated tokens are kept.
    """
    corpus = []
    for tokens in data_train['OriginalTweet'].str.split():
        # The inner loop had lost its body; every token belongs in the corpus.
        corpus.extend(tokens)
    return corpus
# Most frequent tokens across all training tweets, stop words excluded.
counter = Counter(create_corpus())
common = counter.most_common()
# x collects the words and y their counts; both were never initialised or
# filled because the loop body was missing.
x = []
y = []
for word,count in common[:40]:
    if (word not in stop) :
        x.append(word)
        y.append(count)
# Reversed so that the most frequent word ends up at the top of the chart.
fig = px.bar(x=y[::-1], y=x[::-1], color_continuous_scale=px.colors.sequential.Reds,
             title='<b>Most Common Words<b>', text_auto=True, color=y[::-1])
fig.update_traces(marker=dict(line=dict(color='black', width=2)))
fig.update_layout({'yaxis': {'title':'Words'}, 'xaxis': {'title':'Count'}})
# Every token that starts with '#', across all training tweets.
hashtag_list = [word for word in create_corpus() if word.startswith('#')]
counter_hash = Counter(hashtag_list)
common_hash = counter_hash.most_common()
# x collects the hashtags and y their counts; both were never initialised or
# filled because the loop body was missing.
x = []
y = []
for word,count in common_hash[:12]:
    if (word not in stop) :
        x.append(word)
        y.append(count)
# Reversed so that the most frequent hashtag ends up at the top of the chart.
fig = px.bar(x=y[::-1], y=x[::-1], color_continuous_scale=px.colors.sequential.Greens,
             title='<b>Most Common Hashtags<b>', text_auto=True, color=y[::-1])
fig.update_traces(marker=dict(line=dict(color='black', width=2)))
fig.update_layout({'yaxis': {'title':'Hashtags'}, 'xaxis': {'title':'Count'}})
# Every token that starts with '@', across all training tweets.
mention_list = [word for word in create_corpus() if word.startswith('@')]
counter_mention = Counter(mention_list)
common_mention = counter_mention.most_common()
# x collects the mentions and y their counts; both were never initialised or
# filled because the loop body was missing.
x = []
y = []
# The slice starts at 1, i.e. the single most frequent '@' token is skipped.
# NOTE(review): presumably that entry is a bare '@' - confirm against the data.
for word,count in common_mention[1:13]:
    if (word not in stop) :
        x.append(word)
        y.append(count)
# Reversed so that the most frequent mention ends up at the top of the chart.
fig = px.bar(x=y[::-1], y=x[::-1], color_continuous_scale=px.colors.sequential.Blues,
             title='<b>Most Common Mentions<b>', text_auto=True, color=y[::-1])
fig.update_traces(marker=dict(line=dict(color='black', width=2)))
fig.update_layout({'yaxis': {'title':'Mentions'}, 'xaxis': {'title':'Count'}})
nlp = spacy.load('en_core_web_sm')
explacy.print_parse_info(nlp, data_train['OriginalTweet'][8])
Dep tree Token Dep type Lemma Part of Sp ───────────────── ─────────────────────── ──────── ─────────────────────── ────────── ┌────►┌┬── Due prep due ADP │ │└─► to pcomp to PART │ └──► COVID-19 pobj covid-19 VERB │ ┌──► our poss our PRON │ │┌─► retail amod retail ADJ │┌─►┌─┼┼── store nsubj store NOUN ││ │ │└─► and cc and CCONJ ││ │ └──► classroom conj classroom NOUN ││ └─►┌── in prep in ADP ││ └─► Atlanta pobj Atlanta PROPN ││ ┌──► will aux will AUX ││ │┌─► not neg not PART ┌┬┬┬┬──┴┴────┴┼── be ROOT be AUX │││││ └─► open acomp open ADJ ││││└─►┌───────── for prep for ADP ││││ │ ┌─►┌┬── walk amod walk NOUN ││││ │ │ │└─► - punct - PUNCT ││││ │ │ └──► in prt in ADP ││││ └─►└──┬┬── business pobj business NOUN ││││ │└─► or cc or CCONJ ││││ └──► classes conj class NOUN │││└────►┌─────── for prep for ADP │││ │ ┌───► the det the DET │││ │ │┌──► next amod next ADJ │││ │ ││┌─► two nummod two NUM │││ └─►└┴┴── weeks pobj week NOUN ││└─────────────► , punct , PUNCT │└─────►┌──────── beginning advcl begin VERB │ └─►┌──┬── Monday npadvmod Monday PROPN │ │ └─► , punct , PUNCT │ └─►┌── March appos March PROPN │ └─► 16 nummod 16 NUM └───────────────► . punct . PUNCT ┌───► dep SPACE │┌──► We nsubj we PRON ││┌─► will aux will AUX ┌┬───────┴┴┴── continue ROOT continue VERB ││ ┌─► to aux to PART │└─►┌┬─────┴── process xcomp process VERB │ ││ ┌─►┌── online amod online ADV │ ││ │ └─► and cc and CCONJ │ ││ │ ┌─► phone compound phone NOUN │ │└─►└──┴── orders dobj order NOUN │ └─────►┌── as prep as ADP │ └─► normal amod normal ADJ └────────────► ! punct ! PUNCT ┌┬─────┬── Thank ROOT thank VERB ││ └─► you dobj you PRON │└─►┌───── for prep for ADP │ │ ┌─► your poss your PRON │ └─►└── understanding pobj understanding NOUN └────────► ! punct ! PUNCT https://t.co/kw91zJ5O5i ROOT https://t.co/kw91zJ5O5i PROPN
# Parse one sample tweet and tally its coarse-grained part-of-speech tags.
doc = nlp(data_train['OriginalTweet'][8])
POS_counts = doc.count_by(spacy.attrs.POS)
# The tally is keyed by integer ids; the vocab lookup in the f-string turns
# each id back into its text label. Ids are printed in ascending order.
for k in sorted(POS_counts):
    v = POS_counts[k]
    print(f'{k}. {doc.vocab[k].text:{5}}: {v}')
84. ADJ : 4 85. ADP : 7 86. ADV : 1 87. AUX : 3 89. CCONJ: 3 90. DET : 1 92. NOUN : 9 93. NUM : 2 94. PART : 3 95. PRON : 4 96. PROPN: 4 97. PUNCT: 6 100. VERB : 5 103. SPACE: 1
# Render the dependency parse
displacy.render(doc, style='dep', jupyter=True, options={'distance': 100, 'color': 'black', 'bg': '#e6ffff'})
colors = {'GPE': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', 'DATE': 'radial-gradient(yellow, green)'}
options = {'ents': ['GPE', 'DATE'], 'colors':colors}
for sent in doc.sents:
displacy.render(nlp(sent.text), style='ent', jupyter=True, options=options)
Lower Case
# Lower-case every tweet with the vectorised string accessor. Unlike calling
# x.lower() per element, .str.lower() passes missing values through untouched
# instead of raising AttributeError on them.
data_train['OriginalTweet'] = data_train['OriginalTweet'].str.lower()
data_test['OriginalTweet'] = data_test['OriginalTweet'].str.lower()
Remove Stopwords
# From here on the stop-word list is spaCy's English default set.
nlp = spacy.load('en_core_web_sm')
stop = set(nlp.Defaults.stop_words)


def _without_stop_words(text):
    """Return *text* with every whitespace-separated stop word dropped."""
    kept = (token for token in text.split() if token not in stop)
    return ' '.join(kept)


for frame in (data_train, data_test):
    frame['OriginalTweet'] = frame['OriginalTweet'].apply(_without_stop_words)
Remove Urls and Mentions
# Matches an '@' or an http(s):// prefix together with the non-space run that
# follows it; every match is deleted from the tweet.
url_or_mention = re.compile(r"(?:\@|https?\://)\S+")
for frame in (data_train, data_test):
    frame['OriginalTweet'] = frame['OriginalTweet'].apply(lambda text: url_or_mention.sub("", text))
Remove Emoji
# Strip emoji with emoji.replace_emoji(). The previous emoji.get_emoji_regexp()
# helper was removed in emoji 2.x, which is what an unpinned `pip install emoji`
# installs, so the old call raises AttributeError there.
data_train['OriginalTweet'] = data_train['OriginalTweet'].apply(lambda x: emoji.replace_emoji(x, replace=''))
data_test['OriginalTweet'] = data_test['OriginalTweet'].apply(lambda x: emoji.replace_emoji(x, replace=''))
Remove Hashtags
# Clean hashtags at the end of the sentence, and keep those in the middle of
# the sentence by removing just the '#' symbol (underscores become spaces too).
#
# Both patterns are compiled once from raw strings. The original literal was
# not raw, so its '\b' was a backspace character rather than a word boundary,
# and '\w' / '\s' were invalid escape sequences. The '(?!(?:hashtag)\b)'
# lookahead could therefore never exclude anything and is dropped here, which
# keeps the effective behaviour: every trailing hashtag is removed.
trailing_hashtags = re.compile(r'#[\w-]+(?=(?:\s+#[\w-]+)*\s*$)')
hash_or_underscore = re.compile(r'#|_')


def _split_and_rejoin(pattern, text):
    """Split *text* on *pattern* and join the stripped pieces with one space."""
    return ' '.join(piece.strip() for piece in pattern.split(text))


for frame in (data_train, data_test):
    frame['OriginalTweet'] = frame['OriginalTweet'].apply(lambda x: _split_and_rejoin(trailing_hashtags, x))
    frame['OriginalTweet'] = frame['OriginalTweet'].apply(lambda x: _split_and_rejoin(hash_or_underscore, x))
Remove non-ASCII characters such as '\x9a\x91\x97\x9a\x97'
# Anything outside the 7-bit range \x00-\x7f is deleted from the tweet.
outside_ascii = re.compile(r'[^\x00-\x7f]')
for frame in (data_train, data_test):
    frame['OriginalTweet'] = frame['OriginalTweet'].apply(lambda text: outside_ascii.sub(r'', text))
Remove Punctuations
# Deletion table built once: ASCII punctuation plus a few mis-decoded
# (mojibake) characters that show up in the tweets.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§')


def remove_punctuations(text):
    """Return *text* with every punctuation character deleted.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* without any character of ``string.punctuation`` and
        without the characters 'Ã', '±', 'ã', '¼', 'â', '»' and '§'.
    """
    # A single translate() pass replaces one str.replace() call per character.
    return text.translate(_PUNCTUATION_TABLE)
data_train['OriginalTweet'] = data_train['OriginalTweet'].apply(remove_punctuations)
data_test['OriginalTweet'] = data_test['OriginalTweet'].apply(remove_punctuations)
Remove Special Characters
# Delete every '$' and '&' from the tweets of both frames.
for frame in (data_train, data_test):
    frame['OriginalTweet'] = frame['OriginalTweet'].apply(
        lambda text: text.replace('$', '').replace('&', ''))
Remove multiple Spaces
# Collapse every run of two or more whitespace characters into one space.
# The pattern is a raw string: in a plain literal '\s' is an invalid escape
# sequence, which newer Python versions warn about.
whitespace_run = re.compile(r"\s\s+")
data_train['OriginalTweet'] = data_train['OriginalTweet'].apply(lambda x: whitespace_run.sub(" ", x))
data_test['OriginalTweet'] = data_test['OriginalTweet'].apply(lambda x: whitespace_run.sub(" ", x))
Remove Empty Rows
len(data_train[data_train['OriginalTweet'] == ''])
len(data_test[data_test['OriginalTweet'] == ''])
data_train.shape, data_test.shape
((41157, 6), (3798, 6))
train = data_train[data_train['OriginalTweet'] != '']
test = data_test[data_test['OriginalTweet'] != '']
train.shape, test.shape
((41134, 6), (3796, 6))
X_train = train['OriginalTweet']
y_train = train['Sentiment']
X_test = test['OriginalTweet']
y_test = test['Sentiment']
# Word clouds of the cleaned training tweets, one panel per sentiment class,
# drawn inside the shape of the Twitter logo.
# The context manager closes the image file once it has been copied to an array.
with Image.open('/content/twitter_logo.png') as logo:
    mask = np.array(logo)
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=[31, 16])


def _sentiment_cloud(sentiment):
    """Build the word cloud for all cleaned training tweets of one sentiment."""
    text = ' '.join(train[train['Sentiment'] == sentiment]['OriginalTweet'])
    return WordCloud(width = 800, height = 800,
                     background_color = 'black',
                     colormap = 'Set2',
                     min_font_size = 10,
                     mask = mask).generate(text)


# The imshow() calls were missing: without them the clouds are generated but
# never drawn onto their axes.
wordcloud1 = _sentiment_cloud('Positive')
ax1.imshow(wordcloud1)
ax1.set_title('Positive Sentiment',fontsize=35)
wordcloud2 = _sentiment_cloud('Negative')
ax2.imshow(wordcloud2)
ax2.set_title('Negative Sentiment',fontsize=35)
wordcloud3 = _sentiment_cloud('Neutral')
ax3.imshow(wordcloud3)
ax3.set_title('Neutral Sentiment',fontsize=35);
def create_corpus(target):
    """Return every whitespace-separated token of the tweets with one sentiment.

    Args:
        target: Value of the ``Sentiment`` column to select, e.g. 'Positive'.

    Returns:
        A flat list of tokens taken from ``data_train['OriginalTweet']`` for
        the selected rows, in row order; repeated tokens are kept.
    """
    corpus = []
    for tokens in data_train[data_train['Sentiment']==target]['OriginalTweet'].str.split():
        # The inner loop had lost its body; every token belongs in the corpus.
        corpus.extend(tokens)
    return corpus
# Ten most frequent tokens per sentiment class after cleaning. The x_* lists
# hold the words and the y_* lists their counts; they were never built because
# the loop bodies were missing.
counter_p = Counter(create_corpus('Positive'))
common_p = counter_p.most_common()
x_p = [word for word, _ in common_p[:10]]
y_p = [count for _, count in common_p[:10]]
counter_neg = Counter(create_corpus('Negative'))
common_neg = counter_neg.most_common()
x_neg = [word for word, _ in common_neg[:10]]
y_neg = [count for _, count in common_neg[:10]]
counter_neu = Counter(create_corpus('Neutral'))
common_neu = counter_neu.most_common()
x_neu = [word for word, _ in common_neu[:10]]
y_neu = [count for _, count in common_neu[:10]]
# Horizontal bars; the lists are reversed so the most frequent word is on top.
fig = make_subplots(rows=1, cols=3)
fig.add_trace(go.Bar(x = y_p[::-1], y = x_p[::-1], name='Positive', marker_color='#cc0000', orientation='h',
                     text=y_p[::-1], textposition='auto') , 1, 1)
fig.add_trace(go.Bar(x = y_neg[::-1], y = x_neg[::-1], name='Negative', marker_color='#80ff00', orientation='h',
                     text=y_neg[::-1], textposition='auto') , 1, 2)
fig.add_trace(go.Bar(x = y_neu[::-1], y = x_neu[::-1], name='Neutral', marker_color='#00b8e6', orientation='h',
                     text=y_neu[::-1], textposition='auto') , 1, 3)
fig.update_layout(title_text='<b>Common Words in Tweets After Cleaning<b>')
# Tokenisation hyperparameters.
vocab_size = 1000000      # upper bound on the number of word ids kept
max_length = 100          # every sequence is padded / truncated to this length
embedding_dim = 64
trunc_type = 'post'       # over-long sequences lose their tail
oov_tok = '<OOV>'         # placeholder for words unseen during fitting
#Initialize the Tokenizer class
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok)
#Generate the word index dictionary for the training sentences
# The tokenizer has to be fitted first: without fit_on_texts() word_index only
# holds the OOV token and every word of every tweet is encoded as OOV.
# It is fitted on the training tweets only, so no test vocabulary leaks in.
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
#Generate and pad the training sequences
sequences = tokenizer.texts_to_sequences(X_train)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)
#Generate and pad the test sequences
testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)
# Integer class ids, as expected by sparse_categorical_crossentropy.
y_train = y_train.map({'Negative':0, 'Neutral':1, 'Positive':2})
y_test = y_test.map({'Negative':0, 'Neutral':1, 'Positive':2})
training_padded = np.array(padded)
training_labels = np.array(y_train)
testing_padded = np.array(testing_padded)
testing_labels = np.array(y_test)
# Build the model: embedded tokens are averaged over the sequence and fed to a
# small dense classifier with one softmax output per sentiment class.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(), # or use flatten()
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])  # the closing bracket of the layer list was missing
# Print the model summary
model.summary()
Model: "sequential_2" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_2 (Embedding) (None, 100, 100) 200000000 global_average_pooling1d (G (None, 100) 0 lobalAveragePooling1D) dense_4 (Dense) (None, 64) 6464 dense_5 (Dense) (None, 3) 195 ================================================================= Total params: 200,006,659 Trainable params: 200,006,659 Non-trainable params: 0 _________________________________________________________________
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
#filepath = 'my_best_model.epoch{epoch:02d}-loss{val_accuracy:.2f}.hdf5'
# Keep only the weights of the epoch with the highest validation accuracy.
checkpoint = ModelCheckpoint(filepath='DNN_Best_Model', monitor='val_accuracy', verbose=1,
                             save_best_only=True, mode='max', save_freq='epoch')
callbacks = [checkpoint]
#Compile the model
# metrics=['accuracy'] is required: the checkpoint monitors 'val_accuracy' and
# the history plots read 'accuracy' / 'val_accuracy'. The call was also left
# unclosed.
model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              metrics=['accuracy'])
# NOTE(review): the test split doubles as validation data, so the checkpoint
# is selected on the same data the model is evaluated on afterwards.
history = model.fit(training_padded, training_labels, epochs = 16, validation_data=(testing_padded, testing_labels),
                    verbose = 1, callbacks=callbacks)
# Retrieve a list of list results on training and test data
# sets for each training epoch
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(len(acc)) # Get number of epochs
# Plot training and validation accuracy per epoch
# The marker format is 'o' rather than 'bo': 'bo' also sets a colour, which
# conflicts with the explicit color= argument and makes matplotlib warn.
plt.plot (epochs, acc, 'o', color = '#ff0066')
plt.plot (epochs, val_acc, color = '#00ccff')
plt.title ('Training and validation accuracy')
plt.legend(['Train', 'Test'], loc = 'upper left')
# Plot training and validation loss per epoch
# A new figure keeps the loss curves off the accuracy axes; otherwise all four
# series share one plot and the second title/legend overwrite the first.
plt.figure()
plt.plot (epochs, loss, 'o', color = '#ff0066')
plt.plot (epochs, val_loss, color = '#00ccff')
plt.legend(['Train', 'Test'], loc = 'upper left')
plt.title ('Train and Test Loss');
best_DNN_model = tf.keras.models.load_model('DNN_Best_Model')
best_DNN_model.evaluate(testing_padded, testing_labels, verbose=1)
119/119 [==============================] - 0s 2ms/step - loss: 0.5976 - accuracy: 0.7750
[0.5976424217224121, 0.7750263214111328]
predictions = best_DNN_model.predict(testing_padded)
y_pred = np.argmax(predictions, axis = 1)
target_names = ['Negative', 'Neutral', 'Positive']
print(classification_report(testing_labels, y_pred, target_names=target_names))
precision recall f1-score support Negative 0.84 0.74 0.79 1633 Neutral 0.62 0.64 0.63 617 Positive 0.78 0.86 0.82 1546 accuracy 0.78 3796 macro avg 0.75 0.75 0.75 3796 weighted avg 0.78 0.78 0.77 3796
cm = confusion_matrix(testing_labels, y_pred)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='');
# Hyperparameters
lstm1_dim = 64
lstm2_dim = 32
dense_dim = 64
# Build the model: two stacked bidirectional LSTMs followed by a dense
# classifier with one softmax output per sentiment class.
# The second recurrent layer, the dropout layer and the closing bracket were
# missing; they are restored from the printed model summary (a second
# bidirectional layer with 64 outputs = 2 * lstm2_dim, then a dropout layer).
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    # return_sequences=True hands the full sequence on to the next LSTM.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm1_dim, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm2_dim)),
    # NOTE(review): the dropout rate is not recoverable from the summary;
    # 0.5 is an assumption - confirm against the original notebook.
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(dense_dim, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])
model.summary()
Model: "sequential_3" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_3 (Embedding) (None, None, 100) 200000000 bidirectional_4 (Bidirectio (None, None, 128) 84480 nal) bidirectional_5 (Bidirectio (None, 64) 41216 nal) dropout_2 (Dropout) (None, 64) 0 dense_6 (Dense) (None, 64) 4160 dense_7 (Dense) (None, 3) 195 ================================================================= Total params: 200,130,051 Trainable params: 200,130,051 Non-trainable params: 0 _________________________________________________________________
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
#filepath = 'my_best_model.epoch{epoch:02d}-loss{val_accuracy:.2f}.hdf5'
# Keep only the weights of the epoch with the highest validation accuracy.
checkpoint = ModelCheckpoint(filepath='LSTM_Best_Model', monitor='val_accuracy', verbose=1,
                             save_best_only=True, mode='max', save_freq='epoch')
callbacks = [checkpoint]
#Compile the model
# metrics=['accuracy'] is required: the checkpoint monitors 'val_accuracy' and
# the history plots read 'accuracy' / 'val_accuracy'. The call was also left
# unclosed.
model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              metrics=['accuracy'])
# NOTE(review): the test split doubles as validation data, so the checkpoint
# is selected on the same data the model is evaluated on afterwards.
history = model.fit(training_padded, training_labels, epochs = 5, validation_data=(testing_padded, testing_labels),
                    verbose = 1, callbacks=callbacks)
Epoch 1/5 1286/1286 [==============================] - ETA: 0s - loss: 0.6065 - accuracy: 0.7524 Epoch 1: val_accuracy improved from -inf to 0.83061, saving model to LSTM_Best_Model
1286/1286 [==============================] - 104s 77ms/step - loss: 0.6065 - accuracy: 0.7524 - val_loss: 0.4667 - val_accuracy: 0.8306 Epoch 2/5 1286/1286 [==============================] - ETA: 0s - loss: 0.3170 - accuracy: 0.8973 Epoch 2: val_accuracy improved from 0.83061 to 0.83483, saving model to LSTM_Best_Model
1286/1286 [==============================] - 96s 75ms/step - loss: 0.3170 - accuracy: 0.8973 - val_loss: 0.4928 - val_accuracy: 0.8348 Epoch 3/5 1286/1286 [==============================] - ETA: 0s - loss: 0.2153 - accuracy: 0.9328 Epoch 3: val_accuracy did not improve from 0.83483 1286/1286 [==============================] - 67s 52ms/step - loss: 0.2153 - accuracy: 0.9328 - val_loss: 0.5586 - val_accuracy: 0.8222 Epoch 4/5 1286/1286 [==============================] - ETA: 0s - loss: 0.1489 - accuracy: 0.9526 Epoch 4: val_accuracy did not improve from 0.83483 1286/1286 [==============================] - 67s 52ms/step - loss: 0.1489 - accuracy: 0.9526 - val_loss: 0.6227 - val_accuracy: 0.8145 Epoch 5/5 1285/1286 [============================>.] - ETA: 0s - loss: 0.1065 - accuracy: 0.9660 Epoch 5: val_accuracy did not improve from 0.83483 1286/1286 [==============================] - 67s 52ms/step - loss: 0.1065 - accuracy: 0.9660 - val_loss: 0.7465 - val_accuracy: 0.8048
# Retrieve a list of list results on training and test data
# sets for each training epoch
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(len(acc)) # Get number of epochs
# Plot training and validation accuracy per epoch
# The marker format is 'o' rather than 'bo': 'bo' also sets a colour, which
# conflicts with the explicit color= argument and makes matplotlib warn.
plt.plot (epochs, acc, 'o', color = '#ff0066')
plt.plot (epochs, val_acc, color = '#00ccff')
plt.title ('Training and validation accuracy')
plt.legend(['Train', 'Test'], loc = 'upper left')
# Plot training and validation loss per epoch
# A new figure keeps the loss curves off the accuracy axes; otherwise all four
# series share one plot and the second title/legend overwrite the first.
plt.figure()
plt.plot (epochs, loss, 'o', color = '#ff0066')
plt.plot (epochs, val_loss, color = '#00ccff')
plt.legend(['Train', 'Test'], loc = 'upper left')
plt.title ('Train and Test Loss');
best_LSTM_model = tf.keras.models.load_model('LSTM_Best_Model')
best_LSTM_model.evaluate(testing_padded, testing_labels, verbose=1)
119/119 [==============================] - 3s 10ms/step - loss: 0.4928 - accuracy: 0.8348
[0.4927993714809418, 0.8348261117935181]
predictions2 = best_LSTM_model.predict(testing_padded)
y_pred2 = np.argmax(predictions2, axis = 1)
target_names = ['Negative', 'Neutral', 'Positive']
print(classification_report(testing_labels, y_pred2, target_names=target_names))
precision recall f1-score support Negative 0.85 0.83 0.84 1633 Neutral 0.79 0.75 0.77 617 Positive 0.84 0.88 0.86 1546 accuracy 0.83 3796 macro avg 0.82 0.82 0.82 3796 weighted avg 0.83 0.83 0.83 3796
cm = confusion_matrix(testing_labels, y_pred2)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='');