Before we start doing anything, it's important to understand the intuitive pipeline we follow when we process the text in the IMDB dataset for NLP:
"PyTorch seems really easy to use!"
["PyTorch", "seems", "really", "easy", "to", "use", "!"]
["PyTorch", "seems", "really", "easy", "to", "use", "!"]
{"Pytorch: 0, "seems": 1, "really": 2, ...}
{"Pytorch: 0, "seems": 1, "really": 2, ...}
[0, 1, 2, ...]
[[0.1,
[0.8, 0.1, 0.5],
...]```
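To make this concrete, here is a minimal, self-contained sketch of those steps; the tokens, indices and embedding size are illustrative toy values, not what torchtext will actually produce for the IMDB data.
import torch
import torch.nn as nn

# 1. Tokenize the raw string into tokens (toy example)
tokens = ["pytorch", "seems", "really", "easy", "to", "use", "!"]
# 2. Build a vocabulary mapping each token to an integer index
vocab = {token: idx for idx, token in enumerate(tokens)}
# 3. Numericalize: replace each token with its index
indices = [vocab[token] for token in tokens]   # [0, 1, 2, 3, 4, 5, 6]
# 4. Embed: each index selects one row of a (vocab_size x embedding_dim) matrix
embedding = nn.Embedding(num_embeddings=len(vocab), embedding_dim=3)
vectors = embedding(torch.tensor(indices))     # shape (7, 3): one 3-dim vector per token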
# Critical plotting imports
import matplotlib.pyplot as plt
%matplotlib inline
# Checking for module version
from cmp_version import cmp_version
# PyTorch imports
from torchtext import __version__ as ttver
# https://github.com/pytorch/text/releases/tag/v0.9.0-rc5
if cmp_version(ttver,"0.9.0")>=0:
from torchtext.legacy import data, datasets
else:
from torchtext import data, datasets
import torch
import torch.nn as nn
import torch.nn.functional as F
# For checking whether an object is iterable
import collections.abc
import random
# Set seed
torch.manual_seed(1337)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(1337)
# Set plotting style
plt.style.use(('dark_background', 'bmh'))
plt.rc('axes', facecolor='none')
plt.rc('figure', figsize=(16, 4))
# Create instances of fields
# The important argument here is fix_length: all examples using this field will be padded to this length (None would allow flexible sequence lengths)
# We fix the length because we will be using an FNN, not an LSTM/RNN/GRU, which could handle uneven sequence lengths (see the short sketch after the field definitions below)
max_len = 80
text = data.Field(sequential=True, fix_length=max_len, batch_first=True, lower=True, dtype=torch.long)
label = data.LabelField(sequential=False, dtype=torch.float)
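As a rough illustration of what fix_length does (plain Python, not torchtext's internal implementation): sequences shorter than max_len are padded with a pad token, longer ones are truncated.
# Illustrative only: roughly what fixing the sequence length does
def pad_or_truncate(tokens, length, pad_token='<pad>'):
    if len(tokens) >= length:
        return tokens[:length]
    return tokens + [pad_token] * (length - len(tokens))

print(pad_or_truncate(['pytorch', 'seems', 'easy'], length=5))
# ['pytorch', 'seems', 'easy', '<pad>', '<pad>']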
# Calling the splits() class method of datasets.IMDB to return the train and test torchtext.data.Dataset objects
datasets.IMDB.download('./')
ds_train, ds_test = datasets.IMDB.splits(text, label, path='./imdb/aclImdb/')
# Training and test sets have 25k samples each
# Each example has 2 fields (text and label), matching the fields we passed to splits() above
print('train : ', len(ds_train))
print('test : ', len(ds_test))
print('train.fields :', ds_train.fields)
# Get validation set
seed_num = 1337
ds_train, ds_valid = ds_train.split(random_state=random.seed(seed_num))
# Now we have training, validation and test sets
print('train : ', len(ds_train))
print('valid : ', len(ds_valid))
print('test  : ', len(ds_test))
# Build vocabulary
# num_words = 25000
num_words = 1000
text.build_vocab(ds_train, max_size=num_words)
label.build_vocab(ds_train)
# Print vocab size
print('Vocabulary size: {}'.format(len(text.vocab)))
print('Label size: {}'.format(len(label.vocab)))
# Print the most common tokens in the vocabulary
most_common_samples = 10
print(text.vocab.freqs.most_common(most_common_samples))
# Print most common labels
print(label.vocab.freqs.most_common())
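The vocab objects built above also expose the token-to-index (stoi) and index-to-token (itos) mappings used for numericalization; the exact indices depend on token frequencies in the training split.
# Inspect the mappings (exact values depend on the data); in the legacy Field,
# index 0 is '<unk>' and index 1 is '<pad>', followed by tokens in frequency order
print(text.vocab.itos[:10])    # first 10 entries of the index-to-token list
print(text.vocab.stoi['the'])  # index assigned to a frequent word
print(label.vocab.stoi)        # label-to-index mapping, e.g. {'neg': 0, 'pos': 1}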
# Sample 0 label
ds_train[0].label
# Sample 0 text: broken down into individual tokens
ds_train[0].text
# Sample 0 text: human-readable sample
def show_text(sample):
print(' '.join(word for word in sample))
show_text(ds_train[0].text)
# Create an iterable object for our training, validation and test datasets
# BucketIterator batches examples of similar lengths together, which minimizes the amount of padding needed
batch_size = 64
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
(ds_train, ds_valid, ds_test), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False
)
# Check that the iterator above is an iterable, which should show True
isinstance(train_loader, collections.abc.Iterable)
# What's inside this iterable object? Our text and label, except now everything is numericalized (indices instead of "words")!
# The text we saw above becomes a matrix of size batch_size x 80, where 80 is the fixed length we defined earlier
list(train_loader)[0]
# Alternative to the above: this is much faster, but the code above is easier to understand and implement
next(iter(train_loader))
test_batch = next(iter(train_loader))
# What attributes can we access on this batch object? text and label
test_batch.fields
# Let's break this down to check what's in a batch
test_batch.text
# Each row is one review, limited to the length of 80 we defined, so the batch is of size batch_size x 80
test_batch.text.size()
test_batch.label
# A quirk of torchtext: BucketIterator returns a Batch object rather than a simple tuple of tensors containing our text indices and labels
# So let's fix this with a new class, FixBatchGenerator, that unpacks each Batch into an (X, y) tuple
class FixBatchGenerator:
def __init__(self, dl, x_field, y_field):
self.dl, self.x_field, self.y_field = dl, x_field, y_field
def __len__(self):
return len(self.dl)
def __iter__(self):
for batch in self.dl:
X = getattr(batch, self.x_field)
y = getattr(batch, self.y_field)
yield (X,y)
train_loader, valid_loader, test_loader = FixBatchGenerator(train_loader, 'text', 'label'), FixBatchGenerator(valid_loader, 'text', 'label'), FixBatchGenerator(test_loader, 'text', 'label')
# Text indices
print(next(iter(train_loader))[0])
# Text labels
print(next(iter(train_loader))[1])
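A quick sanity check on the shapes the fixed generator yields, assuming the batch size of 64 and fix_length of 80 set above:
# Each batch should now be a plain (X, y) tuple of tensors
X, y = next(iter(train_loader))
print(X.size())  # torch.Size([64, 80]) - batch_size x fix_length token indices
print(y.size())  # torch.Size([64])     - one label per review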
class FeedforwardNeuralNetModel(nn.Module):
def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
super(FeedforwardNeuralNetModel, self).__init__()
# Embedding layer
self.embedding = nn.Embedding(input_dim, embedding_dim)
        # Linear function: takes the flattened embedding as input
        # (max_len * embedding_dim values per sample; both equal 80 here, hence embedding_dim*embedding_dim)
        self.fc1 = nn.Linear(embedding_dim*embedding_dim, hidden_dim)
# Linear function (readout)
self.fc2 = nn.Linear(hidden_dim, output_dim)
def forward(self, x):
        # Embedding: (batch_size, max_len) -> (batch_size, max_len, embedding_dim)
        embedded = self.embedding(x)
        # Flatten to (batch_size, max_len * embedding_dim) for the first linear layer
        embedded = embedded.view(embedded.size(0), -1)
# Linear function
out = self.fc1(embedded)
# Non-linearity
out = torch.relu(out)
        # Toggle 3: Dropout
        # out = F.dropout(out, p=0.8, training=self.training)
        # Linear function (readout)
        # Take note: we apply a final sigmoid here, so the loss should not pass the output through a sigmoid again.
        # BCELoss is the right class to use, as it does not apply a sigmoid to your output.
        # In multi-class problems you are used to softmax, which simplifies to the logistic (sigmoid)
        # function when you have a two-class problem. (A small numerical check of this loss choice follows after the class definition.)
out = self.fc2(out)
out = torch.sigmoid(out)
return out
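As a side note (not something this notebook switches to), the sigmoid-then-BCELoss combination used here is numerically equivalent to feeding the raw pre-sigmoid outputs into nn.BCEWithLogitsLoss, which is the more numerically stable formulation.
# Tiny check that sigmoid + BCELoss matches BCEWithLogitsLoss on raw logits
logits = torch.tensor([0.3, -1.2, 2.0])
targets = torch.tensor([1., 0., 1.])
loss_sigmoid_bce = nn.BCELoss()(torch.sigmoid(logits), targets)
loss_with_logits = nn.BCEWithLogitsLoss()(logits, targets)
print(loss_sigmoid_bce.item(), loss_with_logits.item())  # the two values match up to floating-point error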
# Vocabulary size plus 2 for the <unk> and <pad> tokens that torchtext adds
input_dim = num_words + 2
# Embedding dimension; it happens to equal max_len (80) here, but it is an independent choice
embedding_dim = max_len
hidden_dim = 32
output_dim = 1
# Instantiate model class and assign to object
model = FeedforwardNeuralNetModel(input_dim, embedding_dim, hidden_dim, output_dim)
# Push model to CUDA device if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# Loss function
criterion = nn.BCELoss()
# Optimizer
# Toggle 2: L2 Norm option - this is called weight decay
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.005)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
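For reference, the weight_decay argument in Toggle 2 plays the role of an L2 penalty on the parameters. A hand-written version would look roughly like the sketch below (the coefficient is arbitrary; note also that Adam applies weight_decay to the gradients rather than the loss, so the decoupled AdamW variant behaves slightly differently).
# Illustrative only: an explicit L2 penalty term (squared L2 norm of all parameters)
l2_lambda = 0.005
l2_penalty = sum(p.pow(2).sum() for p in model.parameters())
print('Example L2 penalty term:', (l2_lambda * l2_penalty).item())
# Inside the training loop one would add it to the loss: loss = criterion(outputs, labels) + l2_lambda * l2_penalty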
# Number of groups of parameters
print('Number of groups of parameters {}'.format(len(list(model.parameters()))))
print('-'*50)
# Print parameters
for i in range(len(list(model.parameters()))):
print(list(model.parameters())[i].size())
print('-'*50)
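# Sanity check (derived from the layer definitions above, not from a saved run):
# we expect 5 parameter groups with shapes embedding (1002, 80), fc1 weight (32, 6400),
# fc1 bias (32,), fc2 weight (1, 32), fc2 bias (1,)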
iter = 0
num_epochs = 10
history_train_acc, history_val_acc, history_train_loss, history_val_loss = [], [], [], []
best_accuracy = 0
for epoch in range(num_epochs):
# print('-'*50)
for i, (samples, labels) in enumerate(train_loader):
# Training mode
model.train()
# Load samples
samples = samples.view(-1, max_len).to(device)
labels = labels.view(-1, 1).to(device)
# Clear gradients w.r.t. parameters
optimizer.zero_grad()
# Forward pass to get output/logits
outputs = model(samples)
        # Calculate loss: sigmoid output --> binary cross entropy loss
loss = criterion(outputs, labels)
# Toggle 1: L1 norm, add to original loss
# fc1_params = torch.cat([x.view(-1) for x in model.fc1.parameters()])
# loss += 0.001 * torch.norm(fc1_params, 1)
# Getting gradients w.r.t. parameters
loss.backward()
# Updating parameters
optimizer.step()
iter += 1
if iter % 100 == 0:
# Get training statistics
train_loss = loss.data.item()
# Testing mode
model.eval()
# Calculate Accuracy
correct = 0
total = 0
# Iterate through test dataset
for samples, labels in valid_loader:
# Load samples
samples = samples.view(-1, max_len).to(device)
labels = labels.view(-1).to(device)
# Forward pass only to get logits/output
outputs = model(samples)
# Val loss
val_loss = criterion(outputs.view(-1, 1), labels.view(-1, 1))
                # We use a threshold of 0.5 on the sigmoid output to decide the predicted class.
                # There is another way to do this with one-hot labels; feel free to explore the pros and cons of each.
                # This opens up a whole topic on why plain integer labels become problematic when we expand beyond 2 classes to 10 classes.
                # Why do we one-hot encode? Why can't we just use 0, 1, 2, 3, 4, etc.?
predicted = outputs.ge(0.5).view(-1)
# Total number of labels
total += labels.size(0)
# Total correct predictions
correct += (predicted.type(torch.FloatTensor).cpu() == labels.type(torch.FloatTensor)).sum().item()
# correct = (predicted == labels.byte()).int().sum().item()
accuracy = 100. * correct / total
# Print Loss
print('Iter: {} | Train Loss: {} | Val Loss: {} | Val Accuracy: {}'.format(iter, train_loss, val_loss.item(), round(accuracy, 2)))
# Append to history
history_val_loss.append(val_loss.data.item())
history_val_acc.append(round(accuracy, 2))
history_train_loss.append(train_loss)
# Save model when accuracy beats best accuracy
if accuracy > best_accuracy:
best_accuracy = accuracy
# We can load this best model on the validation set later
torch.save(model.state_dict(), 'best_model.pth')
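Once training has finished, the best checkpoint saved above can be restored into a fresh model instance before evaluating; a minimal sketch using the same class and hyperparameters defined earlier:
# Reload the best checkpoint saved during training and switch to evaluation mode
best_model = FeedforwardNeuralNetModel(input_dim, embedding_dim, hidden_dim, output_dim)
best_model.load_state_dict(torch.load('best_model.pth'))
best_model = best_model.to(device)
best_model.eval()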
# Plotting loss graph
plt.plot(history_train_loss, label='Train')
plt.plot(history_val_loss, label='Validation')
plt.title('Loss Graph')
plt.legend()
plt.show()
# Plotting validation accuracy graph
plt.plot(history_val_acc)
plt.title('Validation Accuracy')
weights = torch.Tensor().to(device)
for param_group in list(model.parameters()):
weights = torch.cat((param_group.view(-1), weights))
print(param_group.size())
# Toggle 0: No regularization
weights_nothing = weights.cpu().detach().numpy()
# Toggle 1: L1 norm on FC1
# weights_L1 = weights.cpu().detach().numpy()
# Toggle 2: L2 norm
# weights_L2 = weights.cpu().detach().numpy()
# Toggle 3: dropout
# weights_dropout = weights.cpu().detach().numpy()
# plt.hist(weights_L1.reshape(-1), range=(-.5, .5), bins=20)
# plt.hist(weights_nothing.reshape(-1), range=(-.5, .5), bins=20)
# Show weight distribution
# Note: weights_L1 and weights_L2 are only defined after re-running the notebook with
# Toggle 1 / Toggle 2 enabled above and saving the weights under those names
plt.hist((
weights_nothing.reshape(-1),
weights_L1.reshape(-1),
weights_L2.reshape(-1),
), 49, range=(-.5, .5), label=(
'No-reg',
'L1',
'L2',
))
plt.legend();