path = '/content/drive/My Drive/Playground/DAF0AC92369C4F74A4AAA2AE089DFDB2'
!unzip '/content/drive/My Drive/Playground/DAF0AC92369C4F74A4AAA2AE089DFDB2/Sarcasm_Headlines_Dataset_v2.zip'
import os
import re
import json
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings("ignore")
tqdm.pandas()
%reload_ext autoreload
%autoreload 2
%reload_ext google.colab.data_table
%config InlineBackend.figure_format = 'retina'
plt.style.use('fivethirtyeight')
plt.style.use('seaborn-notebook')
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D
def parse_data(file):
    for line in open(file, 'r'):
        yield json.loads(line)
data = list(parse_data('Sarcasm_Headlines_Dataset_v2.json'))
df = pd.DataFrame(data)
df.head()
|   | is_sarcastic | headline | article_link |
|---|---|---|---|
| 0 | 1 | thirtysomething scientists unveil doomsday clo... | https://www.theonion.com/thirtysomething-scien... |
| 1 | 0 | dem rep. totally nails why congress is falling... | https://www.huffingtonpost.com/entry/donna-edw... |
| 2 | 0 | eat your veggies: 9 deliciously different recipes | https://www.huffingtonpost.com/entry/eat-your-... |
| 3 | 1 | inclement weather prevents liar from getting t... | https://local.theonion.com/inclement-weather-p... |
| 4 | 1 | mother comes pretty close to using word 'strea... | https://www.theonion.com/mother-comes-pretty-c... |
df.is_sarcastic.value_counts()
0    14985
1    13634
Name: is_sarcastic, dtype: int64
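The classes are roughly balanced. Since seaborn is already imported, a quick optional sketch of the class balance (this plot cell is illustrative and not part of the original run):
sns.countplot(x='is_sarcastic', data=df)
plt.title('Class balance: 0 = not sarcastic, 1 = sarcastic')
plt.show()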
def clean_text(corpus):
    cleaned_corpus = []
    for row in corpus:
        qs = []
        for word in row.split():
            # keep only alphanumeric characters, then lowercase
            word = re.sub(pattern='[^a-zA-Z0-9]', repl=' ', string=word)
            qs.append(word.lower())
        cleaned_corpus.append(' '.join(qs))
    return pd.Series(cleaned_corpus)
def stopwords_removal(corpus):
    stop = set(stopwords.words('english'))
    return [[token for token in sentence.split() if token not in stop] for sentence in corpus]
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    return [[lem.lemmatize(token, pos='v') for token in sentence] for sentence in corpus]
def stem(corpus, stem_type=None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language='english')
    else:
        stemmer = PorterStemmer()
    return [[stemmer.stem(token) for token in sentence] for sentence in corpus]
def preprocess(corpus, cleaning=True, stemming=False, stem_type=None, lemmatization=False, remove_stopwords=True):
    if cleaning:
        corpus = clean_text(corpus)
    if remove_stopwords:
        corpus = stopwords_removal(corpus)
    else:
        corpus = [sentence.split() for sentence in corpus]
    if lemmatization:
        corpus = lemmatize(corpus)
    if stemming:
        corpus = stem(corpus, stem_type)
    return [' '.join(sentence) for sentence in corpus]
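A quick sanity check of the pipeline on a made-up headline (the sample string and the commented output are illustrative, not from the dataset):
sample = pd.Series(['Dogs are running in the park!'])
print(preprocess(sample, lemmatization=True, remove_stopwords=True))
# expected to yield something like ['dog run park']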
headlines = preprocess(df['headline'], lemmatization = True, remove_stopwords = True)
# !wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# named w2v_model so the Keras model defined later does not shadow it
w2v_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
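A brief sanity check that the embeddings loaded correctly (the probe word 'king' is arbitrary; the nearest neighbours returned will vary with the model file):
print(w2v_model['king'].shape)             # (300,)
print(w2v_model.most_similar('king', topn=3))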
MAX_LENGTH = 10
VECTOR_SIZE = 300
def vectorize_data(data):
    vectors = []
    padding_vector = [0.0] * VECTOR_SIZE
    for data_point in data:
        data_point_vectors = []
        count = 0
        for token in data_point.split():
            if count >= MAX_LENGTH:
                break
            if token in w2v_model:  # skip out-of-vocabulary tokens
                data_point_vectors.append(w2v_model[token])
                count += 1
        # pad short headlines with zero vectors up to MAX_LENGTH
        while len(data_point_vectors) < MAX_LENGTH:
            data_point_vectors.append(padding_vector)
        vectors.append(data_point_vectors)
    return vectors
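A shape check on a single made-up headline; because vectorize_data both truncates and zero-pads to MAX_LENGTH, the result should always be (n_headlines, MAX_LENGTH, VECTOR_SIZE):
sample_vectors = np.array(vectorize_data(['scientists unveil doomsday clock']))
print(sample_vectors.shape)                # (1, 10, 300)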
vectorized_headlines = vectorize_data(headlines)
train_div = math.floor(0.7 * len(vectorized_headlines))
X_train = vectorized_headlines[:train_div]
y_train = df['is_sarcastic'][:train_div]
X_test = vectorized_headlines[train_div:]
y_test = df['is_sarcastic'][train_div:]
print('The size of X_train is:', len(X_train), '\nThe size of y_train is:', len(y_train),
'\nThe size of X_test is:', len(X_test), '\nThe size of y_test is:', len(y_test))
X_train = np.reshape(X_train, (len(X_train), MAX_LENGTH, VECTOR_SIZE))
X_test = np.reshape(X_test, (len(X_test), MAX_LENGTH, VECTOR_SIZE))
y_train = np.array(y_train)
y_test = np.array(y_test)
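Note that the split above is a plain 70/30 cut in dataset order. If the file is not already shuffled, a stratified shuffled split is safer; a sketch using scikit-learn (train_test_split is not used in the original, and X_tr/X_te/y_tr/y_te are placeholder names):
from sklearn.model_selection import train_test_split

X_all = np.reshape(vectorized_headlines, (len(vectorized_headlines), MAX_LENGTH, VECTOR_SIZE))
X_tr, X_te, y_tr, y_te = train_test_split(
    X_all, df['is_sarcastic'].values,
    test_size=0.3, stratify=df['is_sarcastic'], random_state=42)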
The size of X_train is: 20033
The size of y_train is: 20033
The size of X_test is: 8586
The size of y_test is: 8586
FILTERS=8
KERNEL_SIZE=3
HIDDEN_LAYER_1_NODES=10
HIDDEN_LAYER_2_NODES=5
DROPOUT_PROB=0.35
NUM_EPOCHS=10
BATCH_SIZE=50
model = Sequential()
model.add(Conv1D(FILTERS,
                 KERNEL_SIZE,
                 padding='same',
                 strides=1,
                 activation='relu',
                 input_shape=(MAX_LENGTH, VECTOR_SIZE)))
model.add(GlobalMaxPooling1D())
model.add(Dense(HIDDEN_LAYER_1_NODES, activation='relu'))
model.add(Dropout(DROPOUT_PROB))
model.add(Dense(HIDDEN_LAYER_2_NODES, activation='relu'))
model.add(Dropout(DROPOUT_PROB))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= conv1d_1 (Conv1D) (None, 10, 8) 7208 _________________________________________________________________ global_max_pooling1d_1 (Glob (None, 8) 0 _________________________________________________________________ dense_1 (Dense) (None, 10) 90 _________________________________________________________________ dropout_1 (Dropout) (None, 10) 0 _________________________________________________________________ dense_2 (Dense) (None, 5) 55 _________________________________________________________________ dropout_2 (Dropout) (None, 5) 0 _________________________________________________________________ dense_3 (Dense) (None, 1) 6 ================================================================= Total params: 7,359 Trainable params: 7,359 Non-trainable params: 0 _________________________________________________________________ None
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
training_history = model.fit(X_train, y_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)
Epoch 1/10
20033/20033 [==============================] - 3s 143us/step - loss: 0.6554 - accuracy: 0.5961
Epoch 2/10
20033/20033 [==============================] - 2s 118us/step - loss: 0.5766 - accuracy: 0.6954
Epoch 3/10
20033/20033 [==============================] - 2s 118us/step - loss: 0.5371 - accuracy: 0.7318
Epoch 4/10
20033/20033 [==============================] - 2s 117us/step - loss: 0.5071 - accuracy: 0.7501
Epoch 5/10
20033/20033 [==============================] - 2s 118us/step - loss: 0.4790 - accuracy: 0.7658
Epoch 6/10
20033/20033 [==============================] - 2s 118us/step - loss: 0.4640 - accuracy: 0.7804
Epoch 7/10
20033/20033 [==============================] - 2s 119us/step - loss: 0.4421 - accuracy: 0.7919
Epoch 8/10
20033/20033 [==============================] - 2s 114us/step - loss: 0.4265 - accuracy: 0.8007
Epoch 9/10
20033/20033 [==============================] - 2s 115us/step - loss: 0.4081 - accuracy: 0.8050
Epoch 10/10
20033/20033 [==============================] - 2s 115us/step - loss: 0.3939 - accuracy: 0.8148
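The per-epoch history returned by fit can be plotted directly; a small sketch using the history keys seen in the log above:
plt.figure(figsize=(8, 4))
plt.plot(training_history.history['loss'], label='loss')
plt.plot(training_history.history['accuracy'], label='accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()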
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
Testing Accuracy: 0.7600
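Accuracy alone hides per-class behaviour; an optional sketch of a fuller report using scikit-learn (thresholding the sigmoid output at 0.5; the class names are illustrative labels):
from sklearn.metrics import classification_report

y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()
print(classification_report(y_test, y_pred, target_names=['not sarcastic', 'sarcastic']))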
model_structure = model.to_json()
with open("sarcasm_detection_model_cnn.json", "w") as json_file:
json_file.write(model_structure)
model.save_weights("sarcasm_detection_model_cnn.h5")
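To restore the model later, the saved architecture and weights can be reloaded with the standard Keras calls (a sketch; 'restored' is a placeholder name):
from tensorflow.keras.models import model_from_json

with open('sarcasm_detection_model_cnn.json') as json_file:
    restored = model_from_json(json_file.read())
restored.load_weights('sarcasm_detection_model_cnn.h5')
restored.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])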