import warnings
warnings.filterwarnings('ignore')
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import gzip
from collections import Counter
from pprint import pprint
import pandas as pd
import numpy as np
np.random.seed(42)
import matplotlib
import matplotlib.pyplot as plt
import tqdm
from tqdm.notebook import tqdm
import sklearn
from sklearn.manifold import TSNE
import watermark
import keras
import tensorflow as tf
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Embedding
from keras_preprocessing import sequence
%load_ext watermark
%matplotlib inline
We start by printing out the versions of the libraries we're using, for future reference
%watermark -n -v -m -g -iv
Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.12.3

Compiler    : Clang 14.0.6
OS          : Darwin
Release    : 23.3.0
Machine    : arm64
Processor  : arm
CPU cores  : 16
Architecture: 64bit

Git hash: 3022f3d8aeb2c24ebf2d47d4e9999181a7ad777d

sklearn            : 1.3.0
keras              : 2.12.0
json               : 2.0.9
watermark          : 2.4.3
pandas             : 2.0.3
keras_preprocessing: 1.1.2
numpy              : 1.23.5
matplotlib         : 3.7.2
tensorflow         : 2.12.0
Load default figure style
plt.style.use('d4sci.mplstyle')
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
Next, we load the pre-computed GloVe word embeddings (50-dimensional vectors for a vocabulary of 400,000 words)
word_dict = {}
word_list = []
embeddings = np.zeros((400_000, 50), dtype='float32')
count = 0
with gzip.open('data/glove.6B.50d.txt.gz', 'rt') as fp:
    for line in tqdm(fp, total=400_000):
        fields = line.split()
        word = fields[0]
        word_list.append(word)
        word_dict[word] = count
        embeddings[count] = np.asarray(fields[1:], dtype='float32')
        count += 1
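As a quick sanity check (an addition to the original notebook), we can look up the vector for an arbitrary word, say 'king', using the `word_dict` and `embeddings` we just built:
vec = embeddings[word_dict['king']]
print(vec.shape)  # (50,) - one 50-dimensional vector per word
print(vec[:5])    # first few components of the vector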
fig, ax = plt.subplots(1, figsize=(10, 10))
ax.imshow(embeddings.T, aspect=4000, cmap=plt.cm.seismic)
ax.grid(None)
ax.set_xticks([0, 100_000, 200_000, 300_000, 400_000])
ax.set_xlabel("Vocabulary")
ax.set_ylabel("Dimensions")
A 2D projection makes it clear that semantic structure is also encoded in the representation
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
plot_only = 500 # Plot only 500 words
low_dim_embs = tsne.fit_transform(np.array(embeddings)[:plot_only, :])
labels = [word_list[i] for i in range(plot_only)]
plt.figure(figsize=(18, 18))
for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom',
                 fontsize=12)
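The projection above is suggestive, but we can also probe the semantic structure directly. The small sketch below (an addition, with 'france' as an arbitrary query) ranks the vocabulary by cosine similarity to a given word:
def nearest_words(query, k=5):
    # Cosine similarity of every embedding against the query vector
    query_vec = embeddings[word_dict[query]]
    sims = embeddings @ query_vec / (np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_vec) + 1e-8)
    best = np.argsort(-sims)[1:k + 1]  # skip position 0, which is the query word itself
    return [word_list[i] for i in best]

nearest_words('france')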
top_words = 10_000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
word_dict = imdb.get_word_index()
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
Recall our original feed-forward (FF) network model
model = Sequential()
model.add(Embedding(top_words, 50, input_shape=(max_words,)))
## <==> ##
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, 500, 50) 500000 flatten (Flatten) (None, 25000) 0 dense (Dense) (None, 32) 800032 dense_1 (Dense) (None, 1) 33 ================================================================= Total params: 1,300,065 Trainable params: 1,300,065 Non-trainable params: 0 _________________________________________________________________
We're going to build an embedding matrix from the values we have on disk
embedding_matrix = np.zeros((top_words, 50), dtype='float32')
word_dict = imdb.get_word_index()
count = 0
with gzip.open('data/glove.6B.50d.txt.gz', 'rt') as fp:
    for line in tqdm(fp, total=400_000):
        fields = line.split()
        word = fields[0]
        if word in word_dict:
            pos = word_dict[word]
            if pos < top_words:
                count += 1
                embedding_matrix[pos] = np.asarray(fields[1:], dtype='float32')
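Before plugging this matrix into the model, it's worth checking how many of the 10,000 most frequent IMDB words actually received a GloVe vector. The check below is an addition; note also that `imdb.load_data` shifts word indices by `index_from=3` by default, so this simple mapping is an approximation.
# How many rows of the embedding matrix were filled with a pre-trained vector?
filled = np.count_nonzero(np.abs(embedding_matrix).sum(axis=1))
print(f"{filled:,} of {top_words:,} words have a pre-trained vector ({filled / top_words:.1%})")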
And assign this matrix to the embedding layer
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False # Don't update the embeddings
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, 500, 50) 500000 flatten (Flatten) (None, 25000) 0 dense (Dense) (None, 32) 800032 dense_1 (Dense) (None, 1) 33 ================================================================= Total params: 1,300,065 Trainable params: 800,065 Non-trainable params: 500,000 _________________________________________________________________
Now you can see that there are 500,000 parameters we will not need to train, corresponding to the 10,000 words × 50 dimensions of the frozen embedding layer
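As a quick verification (added here), the frozen parameter count is exactly the size of the embedding matrix:
print(top_words * 50)                  # 500000
print(model.layers[0].count_params())  # 500000 weights in the (now frozen) Embedding layer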
history = model.fit(X_train, y_train,
validation_data=(X_test, y_test), # The testing data for validation
epochs=10, # How many epochs to train for
batch_size=128, # The batch size. Gradients are updated after each batch
verbose=1) # The level of detail in the output
Epoch 1/10
196/196 [==============================] - 1s 6ms/step - loss: 0.6994 - accuracy: 0.5250 - val_loss: 0.6904 - val_accuracy: 0.5280
Epoch 2/10
196/196 [==============================] - 1s 6ms/step - loss: 0.6707 - accuracy: 0.5875 - val_loss: 0.6884 - val_accuracy: 0.5596
Epoch 3/10
196/196 [==============================] - 1s 6ms/step - loss: 0.5948 - accuracy: 0.6834 - val_loss: 0.7180 - val_accuracy: 0.5610
Epoch 4/10
196/196 [==============================] - 1s 6ms/step - loss: 0.5245 - accuracy: 0.7350 - val_loss: 0.7575 - val_accuracy: 0.5584
Epoch 5/10
196/196 [==============================] - 1s 6ms/step - loss: 0.4608 - accuracy: 0.7798 - val_loss: 0.8245 - val_accuracy: 0.5566
Epoch 6/10
196/196 [==============================] - 1s 6ms/step - loss: 0.4092 - accuracy: 0.8118 - val_loss: 0.8768 - val_accuracy: 0.5565
Epoch 7/10
196/196 [==============================] - 1s 6ms/step - loss: 0.3536 - accuracy: 0.8473 - val_loss: 0.9932 - val_accuracy: 0.5550
Epoch 8/10
196/196 [==============================] - 1s 6ms/step - loss: 0.3130 - accuracy: 0.8668 - val_loss: 1.0569 - val_accuracy: 0.5534
Epoch 9/10
196/196 [==============================] - 1s 6ms/step - loss: 0.2703 - accuracy: 0.8882 - val_loss: 1.1486 - val_accuracy: 0.5526
Epoch 10/10
196/196 [==============================] - 1s 6ms/step - loss: 0.2327 - accuracy: 0.9080 - val_loss: 1.2022 - val_accuracy: 0.5504
def plot_history(history):
    fig, ax_lst = plt.subplots(1, 2, sharex=True, sharey=True)
    epochs = np.array(history.epoch) + 1

    ax_lst[0].plot(epochs, history.history['loss'], label='Training')
    ax_lst[0].plot(epochs, history.history['val_loss'], label='Testing')
    ax_lst[0].set_ylabel('Loss')
    ax_lst[0].set_xlabel('Epoch')
    ax_lst[0].set_xticks(epochs)  # tick at each 1-based epoch so ticks line up with the curves
    best_epoch = np.argmin(history.history['val_loss']) + 1
    ax_lst[0].axvline(x=best_epoch, linestyle=':', color=colors[2])
    ax_lst[0].legend()

    ax_lst[1].plot(epochs, history.history['accuracy'], label='Training')
    ax_lst[1].plot(epochs, history.history['val_accuracy'], label='Testing')
    ax_lst[1].set_ylabel('Accuracy')
    ax_lst[1].set_xlabel('Epoch')
    ax_lst[1].set_xticks(epochs)
    ax_lst[1].axvline(x=best_epoch, linestyle=':', color=colors[2])

    fig.tight_layout()
plot_history(history)
model = Sequential()
model.add(Embedding(top_words, 50, input_length=max_words))
## <==> ##
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
## <==> ##
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_1 (Embedding) (None, 500, 50) 500000 conv1d (Conv1D) (None, 500, 32) 4832 max_pooling1d (MaxPooling1D (None, 250, 32) 0 ) flatten_1 (Flatten) (None, 8000) 0 dense_2 (Dense) (None, 32) 256032 dense_3 (Dense) (None, 1) 33 ================================================================= Total params: 760,897 Trainable params: 760,897 Non-trainable params: 0 _________________________________________________________________
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False # Don't update the embeddings
model.summary()
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_1 (Embedding) (None, 500, 50) 500000 conv1d (Conv1D) (None, 500, 32) 4832 max_pooling1d (MaxPooling1D (None, 250, 32) 0 ) flatten_1 (Flatten) (None, 8000) 0 dense_2 (Dense) (None, 32) 256032 dense_3 (Dense) (None, 1) 33 ================================================================= Total params: 760,897 Trainable params: 260,897 Non-trainable params: 500,000 _________________________________________________________________
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
history = model.fit(X_train, y_train,
validation_data=(X_test, y_test), # The testing data for validation
epochs=10, # How many epochs to train for
batch_size=128, # The batch size. Gradients are updated after each batch
verbose=1) # The level of detail in the output
Epoch 1/10
196/196 [==============================] - 3s 16ms/step - loss: 0.6962 - accuracy: 0.5203 - val_loss: 0.6892 - val_accuracy: 0.5335
Epoch 2/10
196/196 [==============================] - 3s 16ms/step - loss: 0.6611 - accuracy: 0.6042 - val_loss: 0.6630 - val_accuracy: 0.6007
Epoch 3/10
196/196 [==============================] - 3s 16ms/step - loss: 0.6098 - accuracy: 0.6648 - val_loss: 0.6526 - val_accuracy: 0.6197
Epoch 4/10
196/196 [==============================] - 3s 16ms/step - loss: 0.5568 - accuracy: 0.7132 - val_loss: 0.6512 - val_accuracy: 0.6316
Epoch 5/10
196/196 [==============================] - 3s 16ms/step - loss: 0.4994 - accuracy: 0.7552 - val_loss: 0.6535 - val_accuracy: 0.6490
Epoch 6/10
196/196 [==============================] - 3s 16ms/step - loss: 0.4575 - accuracy: 0.7828 - val_loss: 0.6844 - val_accuracy: 0.6450
Epoch 7/10
196/196 [==============================] - 3s 16ms/step - loss: 0.4220 - accuracy: 0.8040 - val_loss: 0.6984 - val_accuracy: 0.6498
Epoch 8/10
196/196 [==============================] - 3s 16ms/step - loss: 0.3846 - accuracy: 0.8249 - val_loss: 0.7266 - val_accuracy: 0.6511
Epoch 9/10
196/196 [==============================] - 3s 16ms/step - loss: 0.3426 - accuracy: 0.8474 - val_loss: 0.7571 - val_accuracy: 0.6535
Epoch 10/10
196/196 [==============================] - 3s 16ms/step - loss: 0.3006 - accuracy: 0.8722 - val_loss: 0.8105 - val_accuracy: 0.6530
plot_history(history)