path = '/content/drive/My Drive/Playground/DAF0AC92369C4F74A4AAA2AE089DFDB2'
!unzip '/content/drive/My Drive/Playground/DAF0AC92369C4F74A4AAA2AE089DFDB2/Sarcasm_Headlines_Dataset_v2.zip'
import os
import re
import json
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings("ignore")
tqdm.pandas()
%reload_ext autoreload
%autoreload 2
%reload_ext google.colab.data_table
%config InlineBackend.figure_format = 'retina'
plt.style.use('fivethirtyeight')
plt.style.use('seaborn-notebook')
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D
def parse_data(file):
    for line in open(file, 'r'):
        yield json.loads(line)
data = list(parse_data('Sarcasm_Headlines_Dataset_v2.json'))
df = pd.DataFrame(data)
df.head()
|   | is_sarcastic | headline | article_link |
|---|---|---|---|
| 0 | 1 | thirtysomething scientists unveil doomsday clo... | https://www.theonion.com/thirtysomething-scien... |
| 1 | 0 | dem rep. totally nails why congress is falling... | https://www.huffingtonpost.com/entry/donna-edw... |
| 2 | 0 | eat your veggies: 9 deliciously different recipes | https://www.huffingtonpost.com/entry/eat-your-... |
| 3 | 1 | inclement weather prevents liar from getting t... | https://local.theonion.com/inclement-weather-p... |
| 4 | 1 | mother comes pretty close to using word 'strea... | https://www.theonion.com/mother-comes-pretty-c... |
df.is_sarcastic.value_counts()
0    14985
1    13634
Name: is_sarcastic, dtype: int64
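The classes are roughly balanced. Since seaborn is already imported, a quick optional sketch of the class balance (this plot cell is illustrative and not part of the original run):
sns.countplot(x='is_sarcastic', data=df)
plt.title('Class balance: 0 = not sarcastic, 1 = sarcastic')
plt.show()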
def clean_text(corpus):
    cleaned_corpus = []
    for row in corpus:
        qs = []
        for word in row.split():
            # keep only alphanumeric characters, then lowercase
            word = re.sub(pattern='[^a-zA-Z0-9]', repl=' ', string=word)
            qs.append(word.lower())
        cleaned_corpus.append(' '.join(qs))
    return pd.Series(cleaned_corpus)
def stopwords_removal(corpus):
    stop = set(stopwords.words('english'))
    return [[token for token in sentence.split() if token not in stop] for sentence in corpus]
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    return [[lem.lemmatize(token, pos='v') for token in sentence] for sentence in corpus]
def stem(corpus, stem_type=None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language='english')
    else:
        stemmer = PorterStemmer()
    return [[stemmer.stem(token) for token in sentence] for sentence in corpus]
def preprocess(corpus, cleaning=True, stemming=False, stem_type=None, lemmatization=False, remove_stopwords=True):
    if cleaning:
        corpus = clean_text(corpus)
    if remove_stopwords:
        corpus = stopwords_removal(corpus)
    else:
        corpus = [sentence.split() for sentence in corpus]
    if lemmatization:
        corpus = lemmatize(corpus)
    if stemming:
        corpus = stem(corpus, stem_type)
    return [' '.join(sentence) for sentence in corpus]
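A quick sanity check of the pipeline on a made-up headline (the sample string and the commented output are illustrative, not from the dataset):
sample = pd.Series(['Dogs are running in the park!'])
print(preprocess(sample, lemmatization=True, remove_stopwords=True))
# expected to yield something like ['dog run park']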
headlines = preprocess(df['headline'], lemmatization = True, remove_stopwords = True)
# !wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# named w2v_model so the Keras model defined later does not shadow it
w2v_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
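A brief sanity check that the embeddings loaded correctly (the probe word 'king' is arbitrary; the nearest neighbours returned will vary with the model file):
print(w2v_model['king'].shape)             # (300,)
print(w2v_model.most_similar('king', topn=3))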
MAX_LENGTH = 10
VECTOR_SIZE = 300
def vectorize_data(data):
    vectors = []
    padding_vector = [0.0] * VECTOR_SIZE
    for data_point in data:
        data_point_vectors = []
        count = 0
        for token in data_point.split():
            if count >= MAX_LENGTH:
                break
            if token in w2v_model:  # skip out-of-vocabulary tokens
                data_point_vectors.append(w2v_model[token])
                count += 1
        # pad short headlines with zero vectors up to MAX_LENGTH
        while len(data_point_vectors) < MAX_LENGTH:
            data_point_vectors.append(padding_vector)
        vectors.append(data_point_vectors)
    return vectors
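A shape check on a single made-up headline; because vectorize_data both truncates and zero-pads to MAX_LENGTH, the result should always be (n_headlines, MAX_LENGTH, VECTOR_SIZE):
sample_vectors = np.array(vectorize_data(['scientists unveil doomsday clock']))
print(sample_vectors.shape)                # (1, 10, 300)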
vectorized_headlines = vectorize_data(headlines)
train_div = math.floor(0.7 * len(vectorized_headlines))
X_train = vectorized_headlines[:train_div]
y_train = df['is_sarcastic'][:train_div]
X_test = vectorized_headlines[train_div:]
y_test = df['is_sarcastic'][train_div:]
print('The size of X_train is:', len(X_train), '\nThe size of y_train is:', len(y_train),
'\nThe size of X_test is:', len(X_test), '\nThe size of y_test is:', len(y_test))
X_train = np.reshape(X_train, (len(X_train), MAX_LENGTH, VECTOR_SIZE))
X_test = np.reshape(X_test, (len(X_test), MAX_LENGTH, VECTOR_SIZE))
y_train = np.array(y_train)
y_test = np.array(y_test)
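Note that the split above is a plain 70/30 cut in dataset order. If the file is not already shuffled, a stratified shuffled split is safer; a sketch using scikit-learn (train_test_split is not used in the original, and X_tr/X_te/y_tr/y_te are placeholder names):
from sklearn.model_selection import train_test_split

X_all = np.reshape(vectorized_headlines, (len(vectorized_headlines), MAX_LENGTH, VECTOR_SIZE))
X_tr, X_te, y_tr, y_te = train_test_split(
    X_all, df['is_sarcastic'].values,
    test_size=0.3, stratify=df['is_sarcastic'], random_state=42)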
The size of X_train is: 20033
The size of y_train is: 20033
The size of X_test is: 8586
The size of y_test is: 8586
FILTERS=8
KERNEL_SIZE=3
HIDDEN_LAYER_1_NODES=10
HIDDEN_LAYER_2_NODES=5
DROPOUT_PROB=0.35
NUM_EPOCHS=10
BATCH_SIZE=50
model = Sequential()
model.add(Conv1D(FILTERS,
                 KERNEL_SIZE,
                 padding='same',
                 strides=1,
                 activation='relu',
                 input_shape=(MAX_LENGTH, VECTOR_SIZE)))
model.add(GlobalMaxPooling1D())
model.add(Dense(HIDDEN_LAYER_1_NODES, activation='relu'))
model.add(Dropout(DROPOUT_PROB))
model.add(Dense(HIDDEN_LAYER_2_NODES, activation='relu'))
model.add(Dropout(DROPOUT_PROB))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= conv1d_1 (Conv1D) (None, 10, 8) 7208 _________________________________________________________________ global_max_pooling1d_1 (Glob (None, 8) 0 _________________________________________________________________ dense_1 (Dense) (None, 10) 90 _________________________________________________________________ dropout_1 (Dropout) (None, 10) 0 _________________________________________________________________ dense_2 (Dense) (None, 5) 55 _________________________________________________________________ dropout_2 (Dropout) (None, 5) 0 _________________________________________________________________ dense_3 (Dense) (None, 1) 6 ================================================================= Total params: 7,359 Trainable params: 7,359 Non-trainable params: 0 _________________________________________________________________ None
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
training_history = model.fit(X_train, y_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)
Epoch 1/10
20033/20033 [==============================] - 3s 143us/step - loss: 0.6554 - accuracy: 0.5961
Epoch 2/10
20033/20033 [==============================] - 2s 118us/step - loss: 0.5766 - accuracy: 0.6954
Epoch 3/10
20033/20033 [==============================] - 2s 118us/step - loss: 0.5371 - accuracy: 0.7318
Epoch 4/10
20033/20033 [==============================] - 2s 117us/step - loss: 0.5071 - accuracy: 0.7501
Epoch 5/10
20033/20033 [==============================] - 2s 118us/step - loss: 0.4790 - accuracy: 0.7658
Epoch 6/10
20033/20033 [==============================] - 2s 118us/step - loss: 0.4640 - accuracy: 0.7804
Epoch 7/10
20033/20033 [==============================] - 2s 119us/step - loss: 0.4421 - accuracy: 0.7919
Epoch 8/10
20033/20033 [==============================] - 2s 114us/step - loss: 0.4265 - accuracy: 0.8007
Epoch 9/10
20033/20033 [==============================] - 2s 115us/step - loss: 0.4081 - accuracy: 0.8050
Epoch 10/10
20033/20033 [==============================] - 2s 115us/step - loss: 0.3939 - accuracy: 0.8148
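The per-epoch history returned by fit can be plotted directly; a small sketch using the history keys seen in the log above:
plt.figure(figsize=(8, 4))
plt.plot(training_history.history['loss'], label='loss')
plt.plot(training_history.history['accuracy'], label='accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()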
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
Testing Accuracy: 0.7600
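Accuracy alone hides per-class behaviour; an optional sketch of a fuller report using scikit-learn (thresholding the sigmoid output at 0.5; the class names are illustrative labels):
from sklearn.metrics import classification_report

y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()
print(classification_report(y_test, y_pred, target_names=['not sarcastic', 'sarcastic']))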
model_structure = model.to_json()
with open("sarcasm_detection_model_cnn.json", "w") as json_file:
json_file.write(model_structure)
model.save_weights("sarcasm_detection_model_cnn.h5")
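To restore the model later, the saved architecture and weights can be reloaded with the standard Keras calls (a sketch; 'restored' is a placeholder name):
from tensorflow.keras.models import model_from_json

with open('sarcasm_detection_model_cnn.json') as json_file:
    restored = model_from_json(json_file.read())
restored.load_weights('sarcasm_detection_model_cnn.h5')
restored.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])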