Before we start doing anything, it's important to understand the intuitive pipeline we follow when we process the text in the IMDB dataset for NLP:
"PyTorch seems really easy to use!"
["PyTorch", "seems", "really", "easy", "to", "use", "!"]
["PyTorch", "seems", "really", "easy", "to", "use", "!"]
{"Pytorch: 0, "seems": 1, "really": 2, ...}
{"Pytorch: 0, "seems": 1, "really": 2, ...}
[0, 1, 2, ...]
[[0.1,
[0.8, 0.1, 0.5],
...]```
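To make this concrete, here is a minimal, self-contained sketch of those steps; the tokens, indices and embedding size are illustrative toy values, not what torchtext will actually produce for the IMDB data.
import torch
import torch.nn as nn

# 1. Tokenize the raw string into tokens (toy example)
tokens = ["pytorch", "seems", "really", "easy", "to", "use", "!"]
# 2. Build a vocabulary mapping each token to an integer index
vocab = {token: idx for idx, token in enumerate(tokens)}
# 3. Numericalize: replace each token with its index
indices = [vocab[token] for token in tokens]   # [0, 1, 2, 3, 4, 5, 6]
# 4. Embed: each index selects one row of a (vocab_size x embedding_dim) matrix
embedding = nn.Embedding(num_embeddings=len(vocab), embedding_dim=3)
vectors = embedding(torch.tensor(indices))     # shape (7, 3): one 3-dim vector per token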
# Critical plotting imports
import matplotlib.pyplot as plt
%matplotlib inline
# Checking for module version
from cmp_version import cmp_version
# PyTorch imports
from torchtext import __version__ as ttver
# https://github.com/pytorch/text/releases/tag/v0.9.0-rc5
if cmp_version(ttver,"0.9.0")>=0:
from torchtext.legacy import data, datasets
else:
from torchtext import data, datasets
import torch
import torch.nn as nn
import torch.nn.functional as F
# For checking whether an object is iterable
import collections.abc
import random
# Set seed
torch.manual_seed(1337)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(1337)
# Set plotting style
plt.style.use(('dark_background', 'bmh'))
plt.rc('axes', facecolor='none')
plt.rc('figure', figsize=(16, 4))
# Create instances of fields
# The important argument here is fix_length: all examples using this field will be padded to this length (None would allow flexible sequence lengths)
# We fix the length because we will be using an FNN, not an LSTM/RNN/GRU, which could handle uneven sequence lengths (see the short sketch after the field definitions below)
max_len = 80
text = data.Field(sequential=True, fix_length=max_len, batch_first=True, lower=True, dtype=torch.long)
label = data.LabelField(sequential=False, dtype=torch.float)
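As a rough illustration of what fix_length does (plain Python, not torchtext's internal implementation): sequences shorter than max_len are padded with a pad token, longer ones are truncated.
# Illustrative only: roughly what fixing the sequence length does
def pad_or_truncate(tokens, length, pad_token='<pad>'):
    if len(tokens) >= length:
        return tokens[:length]
    return tokens + [pad_token] * (length - len(tokens))

print(pad_or_truncate(['pytorch', 'seems', 'easy'], length=5))
# ['pytorch', 'seems', 'easy', '<pad>', '<pad>']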
# Calling the splits() class method of datasets.IMDB to return the train and test torchtext.data.Dataset objects
datasets.IMDB.download('./')
ds_train, ds_test = datasets.IMDB.splits(text, label, path='./imdb/aclImdb/')
# Training and test sets have 25k samples each
# Each example has 2 fields (text and label), matching the fields we passed to splits() above
print('train : ', len(ds_train))
print('test : ', len(ds_test))
print('train.fields :', ds_train.fields)
# Get validation set
seed_num = 1337
ds_train, ds_valid = ds_train.split(random_state=random.seed(seed_num))
# Now we have training, validation and test sets
print('train : ', len(ds_train))
print('valid : ', len(ds_valid))
print('test  : ', len(ds_test))
# Build vocabulary
# num_words = 25000
num_words = 1000
text.build_vocab(ds_train, max_size=num_words)
label.build_vocab(ds_train)
# Print vocab size
print('Vocabulary size: {}'.format(len(text.vocab)))
print('Label size: {}'.format(len(label.vocab)))
# Print the most common tokens in the vocabulary
most_common_samples = 10
print(text.vocab.freqs.most_common(most_common_samples))
# Print most common labels
print(label.vocab.freqs.most_common())
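The vocab objects built above also expose the token-to-index (stoi) and index-to-token (itos) mappings used for numericalization; the exact indices depend on token frequencies in the training split.
# Inspect the mappings (exact values depend on the data); in the legacy Field,
# index 0 is '<unk>' and index 1 is '<pad>', followed by tokens in frequency order
print(text.vocab.itos[:10])    # first 10 entries of the index-to-token list
print(text.vocab.stoi['the'])  # index assigned to a frequent word
print(label.vocab.stoi)        # label-to-index mapping, e.g. {'neg': 0, 'pos': 1}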
# Sample 0 label
ds_train[0].label
# Sample 0 text: broken down into individual tokens
ds_train[0].text
# Sample 0 text: human-readable sample
def show_text(sample):
print(' '.join(word for word in sample))
show_text(ds_train[0].text)
# Create an iterable object for our training, validation and test datasets
# BucketIterator batches examples of similar lengths together, which minimizes the amount of padding needed
batch_size = 64
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
(ds_train, ds_valid, ds_test), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False
)
# Check that the iterator above is an iterable, which should show True
isinstance(train_loader, collections.abc.Iterable)
# What's inside this iterable object? Our text and label, except now everything is numericalized (indices instead of "words")!
# The text we saw above becomes a matrix of size batch_size x 80, where 80 is the fixed length we defined earlier
list(train_loader)[0]
# Alternative to the above: this is much faster, but the code above is easier to understand and implement
next(iter(train_loader))
test_batch = next(iter(train_loader))
# What attributes can we access on this batch object? text and label
test_batch.fields
# Let's break this down to check what's in a batch
test_batch.text
# Each row is one review, limited to the length of 80 we defined, so the batch is of size batch_size x 80
test_batch.text.size()
test_batch.label
# A quirk of torchtext: BucketIterator returns a Batch object rather than a simple tuple of tensors containing our text indices and labels
# So let's fix this with a new class, FixBatchGenerator, that unpacks each Batch into an (X, y) tuple
class FixBatchGenerator:
def __init__(self, dl, x_field, y_field):
self.dl, self.x_field, self.y_field = dl, x_field, y_field
def __len__(self):
return len(self.dl)
def __iter__(self):
for batch in self.dl:
X = getattr(batch, self.x_field)
y = getattr(batch, self.y_field)
yield (X,y)
train_loader, valid_loader, test_loader = FixBatchGenerator(train_loader, 'text', 'label'), FixBatchGenerator(valid_loader, 'text', 'label'), FixBatchGenerator(test_loader, 'text', 'label')
# Text indices
print(next(iter(train_loader))[0])
# Text labels
print(next(iter(train_loader))[1])
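A quick sanity check on the shapes the fixed generator yields, assuming the batch size of 64 and fix_length of 80 set above:
# Each batch should now be a plain (X, y) tuple of tensors
X, y = next(iter(train_loader))
print(X.size())  # torch.Size([64, 80]) - batch_size x fix_length token indices
print(y.size())  # torch.Size([64])     - one label per review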
class FeedforwardNeuralNetModel(nn.Module):
def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
super(FeedforwardNeuralNetModel, self).__init__()
# Embedding layer
self.embedding = nn.Embedding(input_dim, embedding_dim)
        # Linear function: takes the flattened embedding as input
        # (max_len * embedding_dim values per sample; both equal 80 here, hence embedding_dim*embedding_dim)
        self.fc1 = nn.Linear(embedding_dim*embedding_dim, hidden_dim)
# Linear function (readout)
self.fc2 = nn.Linear(hidden_dim, output_dim)
def forward(self, x):
        # Embedding: (batch_size, max_len) -> (batch_size, max_len, embedding_dim)
        embedded = self.embedding(x)
        # Flatten to (batch_size, max_len * embedding_dim) for the first linear layer
        embedded = embedded.view(embedded.size(0), -1)
# Linear function
out = self.fc1(embedded)
# Non-linearity
out = torch.relu(out)
        # Toggle 3: Dropout
        # out = F.dropout(out, p=0.8, training=self.training)
        # Linear function (readout)
        # Take note: we apply a final sigmoid here, so the loss should not pass the output through a sigmoid again.
        # BCELoss is the right class to use, as it does not apply a sigmoid to your output.
        # In multi-class problems you are used to softmax, which simplifies to the logistic (sigmoid)
        # function when you have a two-class problem. (A small numerical check of this loss choice follows after the class definition.)
out = self.fc2(out)
out = torch.sigmoid(out)
return out
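As a side note (not something this notebook switches to), the sigmoid-then-BCELoss combination used here is numerically equivalent to feeding the raw pre-sigmoid outputs into nn.BCEWithLogitsLoss, which is the more numerically stable formulation.
# Tiny check that sigmoid + BCELoss matches BCEWithLogitsLoss on raw logits
logits = torch.tensor([0.3, -1.2, 2.0])
targets = torch.tensor([1., 0., 1.])
loss_sigmoid_bce = nn.BCELoss()(torch.sigmoid(logits), targets)
loss_with_logits = nn.BCEWithLogitsLoss()(logits, targets)
print(loss_sigmoid_bce.item(), loss_with_logits.item())  # the two values match up to floating-point error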
# Vocabulary size plus 2 for the <unk> and <pad> tokens that torchtext adds
input_dim = num_words + 2
# Embedding dimension; it happens to equal max_len (80) here, but it is an independent choice
embedding_dim = max_len
hidden_dim = 32
output_dim = 1
# Instantiate model class and assign to object
model = FeedforwardNeuralNetModel(input_dim, embedding_dim, hidden_dim, output_dim)
# Push model to CUDA device if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# Loss function
criterion = nn.BCELoss()
# Optimizer
# Toggle 2: L2 Norm option - this is called weight decay
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.005)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
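For reference, the weight_decay argument in Toggle 2 plays the role of an L2 penalty on the parameters. A hand-written version would look roughly like the sketch below (the coefficient is arbitrary; note also that Adam applies weight_decay to the gradients rather than the loss, so the decoupled AdamW variant behaves slightly differently).
# Illustrative only: an explicit L2 penalty term (squared L2 norm of all parameters)
l2_lambda = 0.005
l2_penalty = sum(p.pow(2).sum() for p in model.parameters())
print('Example L2 penalty term:', (l2_lambda * l2_penalty).item())
# Inside the training loop one would add it to the loss: loss = criterion(outputs, labels) + l2_lambda * l2_penalty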
# Number of groups of parameters
print('Number of groups of parameters {}'.format(len(list(model.parameters()))))
print('-'*50)
# Print parameters
for i in range(len(list(model.parameters()))):
print(list(model.parameters())[i].size())
print('-'*50)
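# Sanity check (derived from the layer definitions above, not from a saved run):
# we expect 5 parameter groups with shapes embedding (1002, 80), fc1 weight (32, 6400),
# fc1 bias (32,), fc2 weight (1, 32), fc2 bias (1,)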
iter = 0
num_epochs = 10
history_train_acc, history_val_acc, history_train_loss, history_val_loss = [], [], [], []
best_accuracy = 0
for epoch in range(num_epochs):
# print('-'*50)
for i, (samples, labels) in enumerate(train_loader):
# Training mode
model.train()
# Load samples
samples = samples.view(-1, max_len).to(device)
labels = labels.view(-1, 1).to(device)
# Clear gradients w.r.t. parameters
optimizer.zero_grad()
# Forward pass to get output/logits
outputs = model(samples)
        # Calculate loss: sigmoid output --> binary cross entropy loss
loss = criterion(outputs, labels)
# Toggle 1: L1 norm, add to original loss
# fc1_params = torch.cat([x.view(-1) for x in model.fc1.parameters()])
# loss += 0.001 * torch.norm(fc1_params, 1)
# Getting gradients w.r.t. parameters
loss.backward()
# Updating parameters
optimizer.step()
iter += 1
if iter % 100 == 0:
# Get training statistics
train_loss = loss.data.item()
# Testing mode
model.eval()
# Calculate Accuracy
correct = 0
total = 0
# Iterate through test dataset
for samples, labels in valid_loader:
# Load samples
samples = samples.view(-1, max_len).to(device)
labels = labels.view(-1).to(device)
# Forward pass only to get logits/output
outputs = model(samples)
# Val loss
val_loss = criterion(outputs.view(-1, 1), labels.view(-1, 1))
                # We use a threshold of 0.5 on the sigmoid output to decide the predicted class.
                # There is another way to do this with one-hot labels; feel free to explore the pros and cons of each.
                # This opens up a whole topic on why plain integer labels become problematic when we expand beyond 2 classes to 10 classes.
                # Why do we one-hot encode? Why can't we just use 0, 1, 2, 3, 4, etc.?
predicted = outputs.ge(0.5).view(-1)
# Total number of labels
total += labels.size(0)
# Total correct predictions
correct += (predicted.type(torch.FloatTensor).cpu() == labels.type(torch.FloatTensor)).sum().item()
# correct = (predicted == labels.byte()).int().sum().item()
accuracy = 100. * correct / total
# Print Loss
print('Iter: {} | Train Loss: {} | Val Loss: {} | Val Accuracy: {}'.format(iter, train_loss, val_loss.item(), round(accuracy, 2)))
# Append to history
history_val_loss.append(val_loss.data.item())
history_val_acc.append(round(accuracy, 2))
history_train_loss.append(train_loss)
# Save model when accuracy beats best accuracy
if accuracy > best_accuracy:
best_accuracy = accuracy
# We can load this best model on the validation set later
torch.save(model.state_dict(), 'best_model.pth')
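Once training has finished, the best checkpoint saved above can be restored into a fresh model instance before evaluating; a minimal sketch using the same class and hyperparameters defined earlier:
# Reload the best checkpoint saved during training and switch to evaluation mode
best_model = FeedforwardNeuralNetModel(input_dim, embedding_dim, hidden_dim, output_dim)
best_model.load_state_dict(torch.load('best_model.pth'))
best_model = best_model.to(device)
best_model.eval()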
# Plotting loss graph
plt.plot(history_train_loss, label='Train')
plt.plot(history_val_loss, label='Validation')
plt.title('Loss Graph')
plt.legend()
plt.show()
# Plotting validation accuracy graph
plt.plot(history_val_acc)
plt.title('Validation Accuracy')
weights = torch.Tensor().to(device)
for param_group in list(model.parameters()):
weights = torch.cat((param_group.view(-1), weights))
print(param_group.size())
# Toggle 0: No regularization
weights_nothing = weights.cpu().detach().numpy()
# Toggle 1: L1 norm on FC1
# weights_L1 = weights.cpu().detach().numpy()
# Toggle 2: L2 norm
# weights_L2 = weights.cpu().detach().numpy()
# Toggle 3: dropout
# weights_dropout = weights.cpu().detach().numpy()
# plt.hist(weights_L1.reshape(-1), range=(-.5, .5), bins=20)
# plt.hist(weights_nothing.reshape(-1), range=(-.5, .5), bins=20)
# Show weight distribution
# Note: weights_L1 and weights_L2 are only defined after re-running the notebook with
# Toggle 1 / Toggle 2 enabled above and saving the weights under those names
plt.hist((
weights_nothing.reshape(-1),
weights_L1.reshape(-1),
weights_L2.reshape(-1),
), 49, range=(-.5, .5), label=(
'No-reg',
'L1',
'L2',
))
plt.legend();