import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json
vocab_size = 1000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
training_size = 20000
with open("/tmp/sarcasm.json", 'r') as f:
    datastore = json.load(f)
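# Quick sanity check (illustrative, not part of the original script): each record
# in sarcasm.json should carry 'headline', 'is_sarcastic', and 'article_link' fields.
print(datastore[0].keys())  # expect 'headline', 'is_sarcastic', 'article_link'
print(len(datastore))       # roughly 27k headlines, so training_size=20000 leaves ~7k for testing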
sentences = []
labels = []
urls = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])  # each record also carries the source URL
# tf.keras in TF 2.x expects numpy arrays (not plain Python lists) for the labels;
# leaving them as lists makes model.fit fail with "Failed to find data adapter ..."
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = np.array(labels[0:training_size])
testing_labels = np.array(labels[training_size:])
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
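# Note (illustrative check): word_index always holds the full vocabulary seen by
# fit_on_texts; the num_words=vocab_size cap is only applied later, in texts_to_sequences.
print(len(word_index))      # full vocabulary, typically far larger than vocab_size=1000
print(word_index[oov_tok])  # 1 -- the OOV token is always assigned index 1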
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
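# Illustration on a made-up headline (the sentence is hypothetical; the shape is
# what matters): words outside the top 1000 map to the OOV index, and the row is
# zero-padded at the end up to max_length.
sample_seq = tokenizer.texts_to_sequences(["granny starting to fear spiders in the garden"])
sample_padded = pad_sequences(sample_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(sample_padded.shape)  # (1, 120)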
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
num_epochs = 50
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=1)
--2019-11-24 12:33:45--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
... 200 OK, 5,643,545 bytes (5.4M) saved to '/tmp/sarcasm.json'

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
embedding (Embedding)        (None, 120, 16)           16000
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                12544
_________________________________________________________________
dense (Dense)                (None, 24)                1560
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25
=================================================================
Total params: 30,129
Trainable params: 30,129
Non-trainable params: 0
_________________________________________________________________
Note: with the labels left as plain Python lists, this is the point where
training failed under TF 2.x:

    ValueError: Failed to find data adapter that can handle input:
    <class 'numpy.ndarray'>, (<class 'list'> containing values of types {"<class 'int'>"})

model.fit received a numpy array for the features but a Python list for the
labels, and the Keras data adapter cannot mix the two; converting the labels
with np.array() (as done above) resolves the error.
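Fifty epochs is likely more than this small model needs, and validation loss
will typically start rising well before the end. One optional refinement (a
sketch; the patience value is an arbitrary choice, not from the original) is to
stop training automatically with the standard tf.keras EarlyStopping callback:

# Stop when val_loss has not improved for 3 consecutive epochs, keeping the best weights
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3,
                                              restore_best_weights=True)
history = model.fit(training_padded, training_labels,
                    epochs=num_epochs,
                    validation_data=(testing_padded, testing_labels),
                    callbacks=[early_stop],
                    verbose=1)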
import matplotlib.pyplot as plt

def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_' + string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_' + string])
    plt.show()
plot_graphs(history, 'accuracy')  # TF 2.x records the metric as 'accuracy', not 'acc'
plot_graphs(history, 'loss')
model.save("test.h5")
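# To verify the saved file, reload it with the matching tf.keras loader
# (same path as the save above) and confirm the architecture round-trips.
reloaded = tf.keras.models.load_model("test.h5")
reloaded.summary()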