#!/usr/bin/env python
# coding: utf-8
#
#
#
# Natural Language Processing
#
# Text Classification
#
# Bruno Gonçalves
# www.data4sci.com
# @bgoncalves, @data4sci
#
# In[1]:
import warnings
warnings.filterwarnings('ignore')
import gzip
from collections import Counter
from pprint import pprint
import string
import pandas as pd
import numpy as np
np.random.seed(123456)
import matplotlib
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
tqdm.pandas()
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
import torchview
from torchview import draw_graph
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
stopwords = set(stopwords.words('english'))
import sklearn
from sklearn.manifold import TSNE
import watermark
get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('matplotlib', 'inline')
# We start by printing out the versions of the libraries we're using, for future reference
# In[2]:
get_ipython().run_line_magic('watermark', '-n -v -m -g -iv')
# Load default figure style
# In[3]:
plt.style.use('d4sci.mplstyle')
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
# # Word Embeddings
# We start by loading pre-computed word embeddings (50-dimensional GloVe vectors)
# In[4]:
word_dict = {}
word_list = []
embeddings = np.zeros((400_000, 50), dtype='float32')
count = 0
# Windows users may have to add encoding="utf8" to the call to gzip.open when reading the dataset
with gzip.open('data/glove.6B.50d.txt.gz', 'rt') as fp:
for line in tqdm(fp, total=400_000):
fields = line.split()
word = fields[0]
word_list.append(word)
word_dict[word] = count
embeddings[count] = np.asarray(fields[1:], dtype='float32')
count += 1
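# As a quick sanity check (a minimal sketch using the `word_dict` and `embeddings` arrays we just loaded),
# we can confirm that related words have a higher cosine similarity than unrelated ones
# In[ ]:
def cosine_similarity(u, v):
    # cosine of the angle between two embedding vectors
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

for word_a, word_b in [('king', 'queen'), ('king', 'banana')]:
    sim = cosine_similarity(embeddings[word_dict[word_a]], embeddings[word_dict[word_b]])
    print(f'{word_a} / {word_b}: {sim:.3f}')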
# In[5]:
fig, ax = plt.subplots(1, figsize=(10, 10))
ax.imshow(embeddings.T, aspect=4000, cmap=plt.cm.seismic)
ax.grid(None)
ax.set_xticks([0, 100_000, 200_000, 300_000, 400_000])
ax.set_xlabel("Vocabulary")
ax.set_ylabel("Dimensions")
# A 2D projection makes it clear that semantics is also encoded in the representation
# In[6]:
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
plot_only = 500 # Plot only 500 words
low_dim_embs = tsne.fit_transform(np.array(embeddings)[:plot_only, :])
# In[7]:
labels = [word_list[i] for i in range(plot_only)]
# In[8]:
plt.figure(figsize=(18, 18))
for i, label in enumerate(labels):
x, y = low_dim_embs[i, :]
plt.scatter(x, y)
plt.annotate(label,
xy=(x, y),
xytext=(5, 2),
textcoords='offset points',
ha='right',
va='bottom',
fontsize=12)
#
# # IMDB Dataset
# The IMDB dataset can be downloaded from [Kaggle](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews). We load it directly from the data directory in the repository
# In[9]:
data = pd.read_csv('data/IMDB Dataset.csv')
# We transform the sentiment label into a numeric value
# In[10]:
data['label'] = data['sentiment'].progress_apply(lambda x: 1 if x=='positive' else 0)
# In[11]:
def keep_words(text):
    # Keep only tokens made up entirely of ASCII letters (drops punctuation, numbers, etc.)
    if len(set(text) & set(string.ascii_letters)) != len(set(text)):
        return None
    return text

def preprocess_pipeline(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)                                          # tokenize
    no_stopwords = [token for token in tokens if token not in stopwords]  # remove stopwords
    lemmas = [lemmatizer.lemmatize(t) for t in no_stopwords]              # lemmatize
    words = [word for word in lemmas if keep_words(word) is not None]     # keep alphabetic tokens only
    return ' '.join(words)
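# A quick, illustrative check of the pipeline on a toy sentence (exact output depends on the
# installed NLTK models): stopwords, punctuation and other non-alphabetic tokens are dropped
# In[ ]:
print(preprocess_pipeline("The movies were absolutely wonderful, weren't they?"))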
# In[12]:
data['processed'] = data['review'].progress_apply(preprocess_pipeline)
data.head()
# Extract all the tokens
# In[13]:
reviews = data.processed.values
words = ' '.join(reviews)
words = words.split()
# Build vocabulary
# In[14]:
counter = Counter(words)
vocab = sorted(counter, key=counter.get, reverse=True)
word_dict = dict(zip(vocab, range(1, len(vocab)+1)))
word_dict[''] = 0
word_list = ['']
word_list.extend(vocab)
vocab_size = len(word_list)  # vocabulary size including the padding token (id 0)
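# A quick look at the vocabulary we just built: the most frequent tokens get the smallest ids,
# with id 0 reserved for the padding token (illustrative)
# In[ ]:
print(vocab_size, 'tokens in the vocabulary')
print(counter.most_common(10))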
# Tokenize reviews
# In[15]:
reviews_enc = [[word_dict[word] for word in review.split()] for review in tqdm(reviews)]
# Make sure all the reviews have the same length. We truncate reviews that are too long and pad the ones that are too short
# In[16]:
def pad_features(reviews, pad_id, seq_length=128):
features = np.full((len(reviews), seq_length), pad_id, dtype=int)
for i, row in enumerate(reviews):
features[i, :len(row)] = np.array(row)[:seq_length]
return features
max_words = 500
features = pad_features(reviews_enc, pad_id=word_dict[''], seq_length=max_words)
# Now we have a matrix where each row corresponds to a single review and each column to one token position
# In[17]:
features.shape
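# To see what one row looks like, we can map the ids in the first padded review back to words
# (id 0 is the padding token and maps back to the empty string). Purely illustrative:
# In[ ]:
print(features[0][:20])
print(' '.join(word_list[idx] for idx in features[0] if idx != 0)[:200])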
# Train, test and validation splits
# In[18]:
train_size = .7 # 70% used for training
val_size = .5 # 50% of test data set used for validation
# In[19]:
labels = data.label.to_numpy()
# Training dataset
split_id = int(len(features) * train_size)
X_train, X_remain = features[:split_id], features[split_id:]
y_train, y_remain = labels[:split_id], labels[split_id:]
# Testing and Validation
split_val_id = int(len(X_remain) * val_size)
X_validation, X_test = X_remain[:split_val_id], X_remain[split_val_id:]
y_validation, y_test = y_remain[:split_val_id], y_remain[split_val_id:]
# In[20]:
print('Shape of training data:', X_train.shape)
print('Shape of test data:', X_test.shape)
print('Shape of validation data:', X_validation.shape)
# # PyTorch
# ## Tensor datasets
# In[21]:
trainset = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
validset = TensorDataset(torch.from_numpy(X_validation), torch.from_numpy(y_validation))
testset = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
# ## Data Loaders
# In[22]:
batch_size = 128
hidden_size = 32
output_size = 1
# In[23]:
trainloader = DataLoader(trainset, shuffle=True, batch_size=batch_size)
valloader = DataLoader(validset, shuffle=True, batch_size=batch_size)
testloader = DataLoader(testset, shuffle=True, batch_size=batch_size)
# Finally, we are ready to define our models
# # Feed Forward Model
# We'll start by training a simple feed-forward model without any fanciness.
# Our model will have an embedding layer to turn our numerical ids into vectors, a hidden dense layer of 32 neurons with ReLU activation, followed by an output layer with a single neuron and a sigmoid activation
# In[24]:
# The same model could also be defined directly with nn.Sequential (equivalent to the FFNet class below):
# FF_model = nn.Sequential(
#     nn.Embedding(vocab_size, 50),
#     nn.Flatten(),
#     nn.Linear(50*max_words, hidden_size),
#     nn.ReLU(),
#     nn.Linear(hidden_size, output_size),
#     nn.Sigmoid()
# )
# In[25]:
class FFNet(nn.Module):
def __init__(self):
super(FFNet, self).__init__()
self.word_embeddings = nn.Embedding(vocab_size, 50)
self.flatten = nn.Flatten(1, -1)
self.linear1 = nn.Linear(50*max_words, hidden_size)
self.relu = nn.ReLU()
self.linear2 = nn.Linear(hidden_size, output_size)
self.sigmoid = nn.Sigmoid()
def forward(self, x):
emb = self.word_embeddings(x)
flat = self.flatten(emb)
line1 = self.linear1(flat)
relu = self.relu(line1)
line2 = self.linear2(relu)
output = self.sigmoid(line2)
return output
# In[26]:
FF_model = FFNet()
# In[27]:
draw_graph(FF_model, input_data=torch.as_tensor(X_test[:128])).visual_graph
# Now we generate and load the embedding matrix
# In[28]:
embedding_matrix = np.zeros((vocab_size, 50), dtype='float32')
count = 0

with gzip.open('data/glove.6B.50d.txt.gz', 'rt') as fp:
    for line in tqdm(fp, total=400_000):
        fields = line.split()
        word = fields[0]

        # Only keep vectors for words that appear in our vocabulary
        if word in word_dict:
            pos = word_dict[word]

            if pos < vocab_size:
                count += 1
                embedding_matrix[pos] = np.asarray(fields[1:], dtype='float32')
# In[29]:
embedding_matrix.shape
# Copy embedding matrix into Embedding layer weights
# In[30]:
FF_model.word_embeddings.weight.data.copy_(torch.from_numpy(embedding_matrix))
# "Freeze" the layer (don't allow the weights to be updated)
# In[31]:
FF_model.word_embeddings.weight.requires_grad=False
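# The copy and freeze steps above can also be done in a single call when building the layer.
# A sketch of this alternative (not what the rest of the notebook uses):
# In[ ]:
frozen_embedding = nn.Embedding.from_pretrained(torch.from_numpy(embedding_matrix), freeze=True)
print(frozen_embedding.weight.requires_grad)  # False, the pre-trained weights stay fixed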
# In[33]:
print(FF_model)
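# With the embedding layer frozen, only the two Linear layers are actually trained.
# A quick (illustrative) count of trainable vs. total parameters:
# In[ ]:
trainable = sum(p.numel() for p in FF_model.parameters() if p.requires_grad)
total = sum(p.numel() for p in FF_model.parameters())
print(f'{trainable:,} trainable parameters out of {total:,}')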
# Define training device
# In[34]:
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(device)
# Hyperparameters
# In[35]:
vocab_size = len(word_list)
output_size = 1
embedding_size = 50
hidden_size = 32
# Then we define the training configuration for our model. Since this is a *binary classification* task, we will use **Binary Cross Entropy Loss (BCELoss)** as our loss function, and **Adam** as the optimizer, since it is well known to converge quickly.
# In[36]:
lr = 0.001
criterion = nn.BCELoss()  # Binary Cross Entropy, since this is a binary classification problem
optim = Adam(FF_model.parameters(), lr=lr)
grad_clip = 5
epochs = 8
print_every = 1
es_limit = 5
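# Since the model already ends in a Sigmoid, BCELoss is the right choice here. An alternative
# (sketch only, not used below) is to drop the final Sigmoid from the model and use
# BCEWithLogitsLoss, which applies the sigmoid internally and is more numerically stable
# In[ ]:
logits_criterion = nn.BCEWithLogitsLoss()  # expects raw logits rather than probabilities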
# **This is the main part!**
#
# Here we define the training and validation loop. We train for a handful of epochs and check whether we need more or fewer to obtain good performance. We log the loss and accuracy for each epoch so that we can plot them later and see how the model performs. Inspired by https://github.com/LukeDitria/pytorch_tutorials
# In[37]:
def train_model(model, epochs, optim):
history = {
'train_loss': [],
'train_acc': [],
'val_loss': [],
'val_acc': [],
'epochs': epochs
}
# train loop
model = model.to(device)
epochloop = tqdm(range(epochs), position=0, desc='Training', leave=True)
# early stop trigger
es_trigger = 0
val_loss_min = torch.inf
for e in epochloop:
#################
# training mode #
#################
model.train()
train_loss = 0
train_acc = 0
for id, (feature, target) in enumerate(trainloader):
# add epoch meta info
epochloop.set_postfix_str(f'Training batch {id}/{len(trainloader)}')
# move to device
feature, target = feature.to(device), target.to(device)
# reset optimizer
optim.zero_grad()
# forward pass
out = model(feature)
# acc
            predicted = (out.squeeze() > 0.5).long()  # threshold the sigmoid output at 0.5
equals = predicted == target
acc = torch.mean(equals.type(torch.FloatTensor))
train_acc += acc.item()
# loss
loss = criterion(out.squeeze(), target.float())
train_loss += loss.item()
loss.backward()
# clip grad
nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
# update optimizer
optim.step()
# free some memory
del feature, target, predicted
history['train_loss'].append(train_loss / len(trainloader))
history['train_acc'].append(train_acc / len(trainloader))
        ###################
        # validation mode #
        ###################
model.eval()
val_loss = 0
val_acc = 0
with torch.no_grad():
for id, (feature, target) in enumerate(valloader):
# add epoch meta info
epochloop.set_postfix_str(f'Validation batch {id}/{len(valloader)}')
# move to device
feature, target = feature.to(device), target.to(device)
# forward pass
out = model(feature)
# acc
                predicted = (out.squeeze() > 0.5).long()  # threshold the sigmoid output at 0.5
equals = predicted == target
acc = torch.mean(equals.type(torch.FloatTensor))
val_acc += acc.item()
# loss
loss = criterion(out.squeeze(), target.float())
val_loss += loss.item()
# free some memory
del feature, target, predicted
history['val_loss'].append(val_loss / len(valloader))
history['val_acc'].append(val_acc / len(valloader))
# reset model mode
model.train()
# add epoch meta info
epochloop.set_postfix_str(f'Val Loss: {val_loss / len(valloader):.3f} | Val Acc: {val_acc / len(valloader):.3f}')
# print epoch
if (e+1) % print_every == 0:
epochloop.write(f'Epoch {e+1}/{epochs} | Train Loss: {train_loss / len(trainloader):.3f} Train Acc: {train_acc / len(trainloader):.3f} | Val Loss: {val_loss / len(valloader):.3f} Val Acc: {val_acc / len(valloader):.3f}')
epochloop.update()
return history
# In[38]:
def plot_history(history):
    fig, ax_lst = plt.subplots(1, 2, sharex=True, sharey=True)

    epochs = range(history['epochs'])

    ax_lst[0].plot(epochs, history['train_loss'], label='Training')
    ax_lst[0].plot(epochs, history['val_loss'], label='Validation')
    ax_lst[0].set_ylabel('Loss')
    ax_lst[0].set_xlabel('Epoch')
    ax_lst[0].set_xticks(epochs)

    ax_lst[1].plot(epochs, history['train_acc'], label='Training')
    ax_lst[1].plot(epochs, history['val_acc'], label='Validation')
    ax_lst[1].set_ylabel('Accuracy')
    ax_lst[1].set_xlabel('Epoch')
    ax_lst[1].set_xticks(epochs)

    ax_lst[1].legend()

    fig.tight_layout()
# In[39]:
get_ipython().run_cell_magic('time', '', 'history = train_model(FF_model, epochs, optim)\n')
# In[40]:
plot_history(history)
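# The held-out test set is never used above. A minimal evaluation sketch, reusing the same
# 0.5 threshold as the training loop (assumes the trained FF_model and testloader defined above):
# In[ ]:
FF_model.eval()
test_acc = 0

with torch.no_grad():
    for feature, target in testloader:
        feature, target = feature.to(device), target.to(device)
        out = FF_model(feature)
        predicted = (out.squeeze() > 0.5).long()
        test_acc += (predicted == target).float().mean().item()

print(f'Test accuracy: {test_acc / len(testloader):.3f}')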
#
#
#