#!/usr/bin/env python
# coding: utf-8

# ## Regularisation in NNs
#
# Before we start, it is important to understand the intuitive preprocessing pipeline we follow for NLP when working with the IMDB dataset (a minimal sketch of these four steps appears further below, right after we inspect the dataset):
#
# 1. Tokenization: break the sentence into individual words
#     - Before: `"PyTorch seems really easy to use!"`
#     - After: `["PyTorch", "seems", "really", "easy", "to", "use", "!"]`
# 2. Building vocabulary: build an index that associates each unique word with a unique number
#     - Before: `["PyTorch", "seems", "really", "easy", "to", "use", "!"]`
#     - After: `{"PyTorch": 0, "seems": 1, "really": 2, ...}`
# 3. Convert to numerals: map each word to its index in the vocabulary
#     - Before: `["PyTorch", "seems", "really", "easy", "to", "use", "!"]`
#     - After: `[0, 1, 2, ...]`
# 4. Embedding look-up: map each sequence of indices to a fixed-size matrix of word vectors
#     - ```[[0.1, 0.4, 0.3],
#        [0.8, 0.1, 0.5],
#        ...]```

# In[ ]:


# Critical plotting imports
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# PyTorch imports
from torchtext import data, datasets
import torch
import torch.nn as nn
import torch.nn.functional as F

# Checking for iterable objects
import collections.abc
import random


# In[ ]:


# Set seed for reproducibility
torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(1337)


# In[ ]:


# Set plotting style
plt.style.use(('dark_background', 'bmh'))
plt.rc('axes', facecolor='none')
plt.rc('figure', figsize=(16, 4))


# In[ ]:


# Create instances of fields
# The important argument here is fix_length: a fixed length that all examples using this field
# will be padded to (use None for flexible sequence lengths).
# We fix the length because we will be using an FNN, not an RNN/LSTM/GRU, which could handle
# variable-length sequences.
max_len = 80
text = data.Field(sequential=True, fix_length=max_len, batch_first=True, lower=True, dtype=torch.long)
label = data.LabelField(sequential=False, dtype=torch.float)


# In[ ]:


# Call the splits() class method of datasets.IMDB to return torchtext.data.Dataset objects
datasets.IMDB.download('./')
ds_train, ds_test = datasets.IMDB.splits(text, label, path='./imdb/aclImdb/')


# In[ ]:


# Training and test sets each have 25k samples
# 2 fields (text and label) because of how we called splits() above
print('train : ', len(ds_train))
print('test : ', len(ds_test))
print('train.fields :', ds_train.fields)


# In[ ]:


# Carve a validation set out of the training set
seed_num = 1337
ds_train, ds_valid = ds_train.split(random_state=random.seed(seed_num))


# In[ ]:


# Now we have training, validation and test sets
print('train : ', len(ds_train))
print('valid : ', len(ds_valid))
print('test  : ', len(ds_test))


# In[ ]:


# Build vocabulary
# num_words = 25000
num_words = 1000
text.build_vocab(ds_train, max_size=num_words)
label.build_vocab(ds_train)


# In[ ]:


# Print vocab size
print('Vocabulary size: {}'.format(len(text.vocab)))
print('Label size: {}'.format(len(label.vocab)))


# In[ ]:


# Print the most common words in the vocabulary
most_common_samples = 10
print(text.vocab.freqs.most_common(most_common_samples))


# In[ ]:


# Print most common labels
print(label.vocab.freqs.most_common())


# In[ ]:


# Sample 0 label
ds_train[0].label


# In[ ]:


# Sample 0 text: broken down into individual tokens
ds_train[0].text


# In[ ]:


# Sample 0 text: human-readable sample
def show_text(sample):
    print(' '.join(word for word in sample))

show_text(ds_train[0].text)
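# In[ ]:


# To make the four preprocessing steps from the introduction concrete, here is a minimal,
# self-contained sketch of what torchtext is doing for us under the hood. The toy sentence,
# vocabulary and embedding size below are illustrative assumptions, not values taken from
# the IMDB pipeline above.

# 1. Tokenization: break the sentence into individual words
toy_sentence = "pytorch seems really easy to use !"
toy_tokens = toy_sentence.split()

# 2. Building vocabulary: associate each unique word with a unique index
toy_vocab = {word: idx for idx, word in enumerate(sorted(set(toy_tokens)))}

# 3. Convert to numerals: replace every token with its index
toy_indices = torch.tensor([toy_vocab[word] for word in toy_tokens], dtype=torch.long)

# 4. Embedding look-up: each index selects one row of a learnable weight matrix
toy_embedding = nn.Embedding(num_embeddings=len(toy_vocab), embedding_dim=3)
print(toy_embedding(toy_indices).shape)  # (number of tokens, 3)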
# In[ ]:


# Create an iterable object for our training, validation and test datasets
# BucketIterator batches examples of similar lengths together, which minimizes the amount of padding needed
batch_size = 64  # Change batch size from 1 to a bigger number once the explanation is done
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (ds_train, ds_valid, ds_test), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False
)


# In[ ]:


# Check that the iterator above is an iterable, which should print True
isinstance(train_loader, collections.abc.Iterable)


# In[ ]:


# What's inside this iterable object? Our text and label, except everything is now in machine
# format: numbers instead of "words"!
# The text we saw above becomes a matrix with 80 columns, the fixed length we defined earlier
list(train_loader)[0]


# In[ ]:


# Alternative to the above: this is much faster because it does not materialize every batch,
# but the code above is easier to understand
next(iter(train_loader))


# In[ ]:


test_batch = next(iter(train_loader))


# In[ ]:


# Which fields can we access on this batch object? Text and label
test_batch.fields


# In[ ]:


# Let's break this down to check what's in a batch
test_batch.text


# In[ ]:


# One row per comment in the batch, each comment limited to the fixed length of 80 we defined
test_batch.text.size()


# In[ ]:


test_batch.label


# In[ ]:


# A quirk of torchtext is that BucketIterator returns a Batch object instead of a simple tuple
# of tensors containing our text indices and labels.
# Let's fix this with a small wrapper class, FixBatchGenerator.

class FixBatchGenerator:
    def __init__(self, dl, x_field, y_field):
        self.dl, self.x_field, self.y_field = dl, x_field, y_field

    def __len__(self):
        return len(self.dl)

    def __iter__(self):
        for batch in self.dl:
            X = getattr(batch, self.x_field)
            y = getattr(batch, self.y_field)
            yield (X, y)

train_loader, valid_loader, test_loader = FixBatchGenerator(train_loader, 'text', 'label'), FixBatchGenerator(valid_loader, 'text', 'label'), FixBatchGenerator(test_loader, 'text', 'label')


# In[ ]:


# Text indices
print(next(iter(train_loader))[0])

# Text labels
print(next(iter(train_loader))[1])


# In[ ]:


class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # Linear function
        # Note: this relies on embedding_dim == max_len (both 80 below), so each flattened
        # sample has max_len * embedding_dim = embedding_dim * embedding_dim features
        self.fc1 = nn.Linear(embedding_dim * embedding_dim, hidden_dim)

        # Linear function (readout)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Embedding: (batch_size, max_len) -> (batch_size, max_len, embedding_dim)
        embedded = self.embedding(x)
        # Flatten each sample into a single vector
        embedded = embedded.view(-1, self.fc1.in_features)

        # Linear function
        out = self.fc1(embedded)

        # Non-linearity
        out = torch.relu(out)

        # Toggle 3: Dropout
        # out = F.dropout(out, p=0.8, training=self.training)

        # Linear function (readout)
        # Take note that we apply a final sigmoid here, so the loss should not pass the output
        # through a sigmoid again.
        # BCELoss is the right class to use, as it does not apply a sigmoid to the output.
        # In multi-class problems you are used to softmax, which simplifies to the logistic
        # (sigmoid) function when you have a two-class problem.
        out = self.fc2(out)
        out = torch.sigmoid(out)

        return out
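# In[ ]:


# The comment in forward() above explains why the model ends with a sigmoid and why BCELoss is
# the matching criterion. Here is a small sketch of the two equivalent pairings. The logits and
# targets below are made-up values purely for illustration, not outputs of the model above.

example_logits = torch.tensor([[0.3], [-1.2], [2.0]])   # raw, unbounded scores
example_targets = torch.tensor([[1.0], [0.0], [1.0]])

# Pairing used in this notebook: sigmoid inside the model + BCELoss outside
loss_bce = nn.BCELoss()(torch.sigmoid(example_logits), example_targets)

# Numerically more stable alternative: raw logits + BCEWithLogitsLoss, which applies the
# sigmoid internally
loss_bce_logits = nn.BCEWithLogitsLoss()(example_logits, example_targets)

print(loss_bce.item(), loss_bce_logits.item())  # the two values match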
# In[ ]:


input_dim = num_words + 2  # +2 for the <unk> and <pad> tokens torchtext adds to the vocabulary
embedding_dim = max_len
hidden_dim = 32
output_dim = 1

# Instantiate model class and assign to object
model = FeedforwardNeuralNetModel(input_dim, embedding_dim, hidden_dim, output_dim)

# Push model to CUDA device if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss function
criterion = nn.BCELoss()

# Optimizer
# Toggle 2: L2 norm option - this is called weight decay
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.005)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


# In[ ]:


# Number of groups of parameters
print('Number of groups of parameters {}'.format(len(list(model.parameters()))))
print('-'*50)

# Print parameter shapes
for i in range(len(list(model.parameters()))):
    print(list(model.parameters())[i].size())
print('-'*50)


# In[ ]:


n_iter = 0
num_epochs = 10
history_train_acc, history_val_acc, history_train_loss, history_val_loss = [], [], [], []
best_accuracy = 0

for epoch in range(num_epochs):
    # print('-'*50)
    for i, (samples, labels) in enumerate(train_loader):
        # Training mode
        model.train()

        # Load samples
        samples = samples.view(-1, max_len).to(device)
        labels = labels.view(-1, 1).to(device)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get outputs (probabilities after the sigmoid)
        outputs = model(samples)

        # Calculate loss: sigmoid output --> binary cross entropy loss
        loss = criterion(outputs, labels)

        # Toggle 1: L1 norm, add to original loss
        # fc1_params = torch.cat([x.view(-1) for x in model.fc1.parameters()])
        # loss += 0.001 * torch.norm(fc1_params, 1)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        n_iter += 1

        if n_iter % 100 == 0:
            # Get training statistics
            train_loss = loss.data.item()

            # Testing mode
            model.eval()

            # Calculate accuracy
            correct = 0
            total = 0

            # Iterate through the validation dataset
            for samples, labels in valid_loader:
                # Load samples
                samples = samples.view(-1, max_len).to(device)
                labels = labels.view(-1).to(device)

                # Forward pass only to get outputs
                outputs = model(samples)

                # Validation loss
                val_loss = criterion(outputs.view(-1, 1), labels.view(-1, 1))

                # We use a threshold of 0.5 on the sigmoid output to decide the predicted class.
                # There is another way to do this with one-hot labels. Feel free to explore and
                # understand the pros/cons of each.
                # This opens up a whole topic on why it becomes problematic when we expand
                # beyond 2 classes to 10 classes.
                # Why do we encode? Why can't we use 0, 1, 2, 3, 4, etc. without one-hot encoding?
                predicted = outputs.ge(0.5).view(-1)

                # Total number of labels
                total += labels.size(0)

                # Total correct predictions
                correct += (predicted.type(torch.FloatTensor).cpu() == labels.type(torch.FloatTensor)).sum().item()
                # correct = (predicted == labels.byte()).int().sum().item()

            accuracy = 100. * correct / total

            # Print loss and accuracy
            print('Iter: {} | Train Loss: {} | Val Loss: {} | Val Accuracy: {}'.format(n_iter, train_loss, val_loss.item(), round(accuracy, 2)))

            # Append to history
            history_val_loss.append(val_loss.data.item())
            history_val_acc.append(round(accuracy, 2))
            history_train_loss.append(train_loss)

            # Save model whenever accuracy beats the best accuracy so far
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                # We can load this best model again later
                torch.save(model.state_dict(), 'best_model.pth')
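# In[ ]:


# The loop above checkpoints the weights to 'best_model.pth' whenever validation accuracy
# improves. Here is a minimal sketch of how you could restore that checkpoint later; the
# variable name best_model is an illustrative choice, not part of the original notebook.

best_model = FeedforwardNeuralNetModel(input_dim, embedding_dim, hidden_dim, output_dim)
best_model.load_state_dict(torch.load('best_model.pth', map_location=device))
best_model.to(device)
best_model.eval()  # switch off dropout, etc., before evaluation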
# In[ ]:


# Plot the loss curves
plt.plot(history_train_loss, label='Train')
plt.plot(history_val_loss, label='Validation')
plt.title('Loss Graph')
plt.legend()
plt.show()


# In[ ]:


# Plot the validation accuracy curve
plt.plot(history_val_acc)
plt.title('Validation Accuracy')


# In[ ]:


# Flatten all of the model's parameters into a single vector
weights = torch.Tensor().to(device)

for param_group in list(model.parameters()):
    weights = torch.cat((param_group.view(-1), weights))
    print(param_group.size())

# Toggle 0: no regularization
weights_nothing = weights.cpu().detach().numpy()

# Toggle 1: L1 norm on FC1
# weights_L1 = weights.cpu().detach().numpy()

# Toggle 2: L2 norm
# weights_L2 = weights.cpu().detach().numpy()

# Toggle 3: dropout
# weights_dropout = weights.cpu().detach().numpy()


# In[ ]:


# plt.hist(weights_L1.reshape(-1), range=(-.5, .5), bins=20)


# In[ ]:


# plt.hist(weights_nothing.reshape(-1), range=(-.5, .5), bins=20)


# In[ ]:


# Show the weight distributions side by side.
# Note: this cell assumes you have re-run the notebook once per toggle, so that
# weights_nothing, weights_L1 and weights_L2 all exist in the current session.
plt.hist((
    weights_nothing.reshape(-1),
    weights_L1.reshape(-1),
    weights_L2.reshape(-1),
), 49, range=(-.5, .5), label=(
    'No-reg',
    'L1',
    'L2',
))
plt.legend();


# In[ ]:
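# In[ ]:


# If you have only done a single run, weights_L1 / weights_L2 will not exist yet and the
# comparison cell above raises a NameError. Here is a small defensive sketch (an addition,
# not part of the original workflow) that plots whichever toggled runs are available in the
# current session:

available_runs = {name: arr for name, arr in [
    ('No-reg', globals().get('weights_nothing')),
    ('L1', globals().get('weights_L1')),
    ('L2', globals().get('weights_L2')),
    ('Dropout', globals().get('weights_dropout')),
] if arr is not None}

if available_runs:
    plt.hist([arr.reshape(-1) for arr in available_runs.values()],
             49, range=(-.5, .5), label=list(available_runs.keys()))
    plt.legend()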