Adapted from: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/2_lstm.ipynb
This notebook requires the following packages:
- tqdm
- torch
- torchtext
- datasets
- matplotlib

Run the following cell to install the packages.
#
# Required Packages
# Run this cell to install required packages.
#
%pip install "datasets>=2.2" "matplotlib>=2.0" "torch>=1.9" "torchtext>=0.12" "tqdm>=4.64"
import functools
import sys
import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm
from datasets import Dataset, DatasetDict
# Seed PyTorch's global random number generator; the returned object is
# discarded by binding it to `_`.
# NOTE(review): `seed` is not defined in the visible part of this file —
# confirm it is set earlier (e.g. in a parameters cell).
_ = torch.manual_seed(seed)
# Load the "train" and "test" splits of the "imdb" dataset.
train_data, test_data = datasets.load_dataset("imdb", split=["train", "test"])
# torchtext's "basic_english" tokenizer; it is applied to each example's
# "text" field by tokenize_data.
tokenizer = torchtext.data.utils.get_tokenizer("basic_english")
def tokenize_data(example, tokenizer, max_length):
    """Tokenize one example's text and truncate it to ``max_length`` tokens.

    Args:
        example: Mapping with a ``"text"`` entry.
        tokenizer: Callable that turns the text into a sequence of tokens.
        max_length: Upper bound of the slice applied to the token sequence.

    Returns:
        Dict with ``"tokens"`` (the truncated tokens) and ``"length"``
        (the number of tokens kept).
    """
    truncated = tokenizer(example["text"])[:max_length]
    return {"tokens": truncated, "length": len(truncated)}
# Tokenize every example and record its (truncated) length.
train_data = train_data.map(tokenize_data, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length})
test_data = test_data.map(tokenize_data, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length})

# Keep a fixed-size random subset of each split to shorten training.
# `random_state=seed` makes the draw repeatable (torch.manual_seed does not
# seed pandas), and `preserve_index=False` stops the shuffled pandas index
# from being stored as an extra `__index_level_0__` column.
train_data_df = Dataset.to_pandas(train_data).sample(n=3000, random_state=seed)
train_data = Dataset.from_pandas(train_data_df, preserve_index=False)
test_data_df = Dataset.to_pandas(test_data).sample(n=2000, random_state=seed)
test_data = Dataset.from_pandas(test_data_df, preserve_index=False)

# Hold out a `test_size` share of the training subset for validation; the
# split is seeded so that reruns produce the same partition.
train_valid_data = train_data.train_test_split(test_size=test_size, seed=seed)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]
# Special tokens handed to the vocabulary builder via `specials`.
special_tokens = ["<unk>", "<pad>"]
# Build the vocabulary from the training split's tokens only; `min_freq`
# is forwarded as the minimum token frequency.
vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)
# Look up the indices assigned to the two special tokens.
unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]
# Tokens that are not in the vocabulary resolve to the "<unk>" index.
vocab.set_default_index(unk_index)
def numericalize_data(example, vocab):
    """Map every token of one example to its vocabulary index.

    Args:
        example: Mapping with a ``"tokens"`` entry (iterable of tokens).
        vocab: Object supporting ``vocab[token]`` lookups.

    Returns:
        Dict with a single ``"ids"`` entry: the list of looked-up indices,
        in token order.
    """
    return {"ids": [vocab[tok] for tok in example["tokens"]]}
# For each split: add the "ids" column, then expose only the columns the
# model consumes ("ids", "label", "length") as torch tensors.
train_data, valid_data, test_data = (
    split.map(numericalize_data, fn_kwargs={"vocab": vocab}).with_format(
        type="torch", columns=["ids", "label", "length"]
    )
    for split in (train_data, valid_data, test_data)
)
def collate(batch, pad_index):
    """Merge a list of examples into a single padded batch.

    Args:
        batch: List of mappings, each with tensor entries ``"ids"``,
            ``"length"`` and ``"label"``.
        pad_index: Value used to right-pad shorter ``"ids"`` sequences.

    Returns:
        Dict with ``"ids"`` (padded, batch-first), ``"length"`` and
        ``"label"`` (each stacked along a new leading dimension).
    """
    padded_ids = nn.utils.rnn.pad_sequence(
        [example["ids"] for example in batch],
        padding_value=pad_index,
        batch_first=True,
    )
    lengths = torch.stack([example["length"] for example in batch])
    labels = torch.stack([example["label"] for example in batch])
    return {"ids": padded_ids, "length": lengths, "label": labels}
# Bind the padding index so a DataLoader can call collate(batch) with a single
# argument. Note that this rebinds the name `collate` to the partial object.
collate = functools.partial(collate, pad_index=pad_index)
# One loader per split; only the training loader shuffles its examples.
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, collate_fn=collate, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, collate_fn=collate)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, collate_fn=collate)
class RNN(nn.Module):
    """Single-layer RNN classifier over batches of token-id sequences.

    The ids are embedded, fed through ``nn.RNN`` as a packed sequence (so
    padded positions are skipped), and the final hidden state is projected
    to ``output_dim`` logits.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the RNN hidden state.
        output_dim: Number of output logits.
        pad_index: Index of the padding token; its embedding row is kept at
            zero and receives no gradient.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, pad_index):
        super().__init__()
        # padding_idx ties the otherwise-unused pad_index argument to the
        # embedding so that the <pad> row is a fixed zero vector.
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_index)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, ids, length):
        """Return logits of shape [batch size, output dim].

        Args:
            ids: Batch-first tensor of token ids, [batch size, seq len].
            length: Number of real (non-padding) tokens per sequence; per
                the PyTorch docs for ``pack_padded_sequence`` a tensor
                passed here must live on the CPU.
        """
        # ids = [batch size, seq len]
        embedded = self.embedding(ids)
        # embedded = [batch size, seq len, emb dim]
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, length, batch_first=True, enforce_sorted=False
        )
        # Only the final hidden state is needed, so the per-step outputs are
        # not unpacked.
        _, hidden = self.rnn(packed_embedded)
        # hidden = [1, batch size, hid dim]
        return self.fc(hidden.squeeze(0))
# The embedding table needs one row per vocabulary entry.
vocab_size = len(vocab)
# Number of logits produced by the classifier head.
output_dim = 2
# NOTE(review): `embedding_dim` and `hidden_dim` are not defined in the
# visible part of this file — confirm they are set earlier.
model = RNN(vocab_size, embedding_dim, hidden_dim, output_dim, pad_index)
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Parameters whose ``requires_grad`` is false are skipped.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Report the trainable-parameter count, formatted with thousands separators.
print(f"The model has {count_parameters(model):,} trainable parameters")
The model has 1,809,398 trainable parameters
def initialize_weights(m):
    """Initialise one module in place; intended for ``model.apply``.

    * ``nn.Linear``: Xavier-normal weight, zero bias (when a bias exists).
    * Recurrent layers (``nn.RNN``, ``nn.LSTM``, ``nn.GRU`` — anything
      derived from ``nn.RNNBase``): orthogonal weights, zero biases.

    Modules of any other type are left untouched.

    Args:
        m: The module to initialise.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        # A Linear built with bias=False has m.bias set to None.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    # nn.RNNBase rather than nn.LSTM: the model in this file uses nn.RNN,
    # which an nn.LSTM check would silently skip.
    elif isinstance(m, nn.RNNBase):
        for name, param in m.named_parameters():
            if "bias" in name:
                nn.init.zeros_(param)
            elif "weight" in name:
                nn.init.orthogonal_(param)
# Run initialize_weights on the model and, recursively, on each submodule.
model.apply(initialize_weights)
def train(dataloader, model, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: Iterable of batches with ``"ids"``, ``"length"`` and
            ``"label"`` entries.
        model: Module called as ``model(ids, length)``.
        criterion: Loss called as ``criterion(prediction, label)``.
        optimizer: Optimiser stepped once per batch.
        device: Device the ids and labels are moved to.

    Returns:
        Tuple ``(losses, accuracies)`` of per-batch Python floats.
    """
    model.train()
    batch_losses, batch_accs = [], []
    progress = tqdm.tqdm(dataloader, desc="training...", file=sys.stdout)
    for batch in progress:
        ids, label = batch["ids"].to(device), batch["label"].to(device)
        # The lengths are handed to the model without a device move.
        length = batch["length"]

        prediction = model(ids, length)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accs.append(accuracy.item())
    return batch_losses, batch_accs
def evaluate(dataloader, model, criterion, device):
    """Score the model on every batch of ``dataloader`` without training.

    The model is switched to eval mode and the whole pass runs under
    ``torch.no_grad()``.

    Args:
        dataloader: Iterable of batches with ``"ids"``, ``"length"`` and
            ``"label"`` entries.
        model: Module called as ``model(ids, length)``.
        criterion: Loss called as ``criterion(prediction, label)``.
        device: Device the ids and labels are moved to.

    Returns:
        Tuple ``(losses, accuracies)`` of per-batch Python floats.
    """
    model.eval()
    batch_losses, batch_accs = [], []
    with torch.no_grad():
        progress = tqdm.tqdm(dataloader, desc="evaluating...", file=sys.stdout)
        for batch in progress:
            ids, label = batch["ids"].to(device), batch["label"].to(device)
            # The lengths are handed to the model without a device move.
            length = batch["length"]

            prediction = model(ids, length)
            batch_losses.append(criterion(prediction, label).item())
            batch_accs.append(get_accuracy(prediction, label).item())
    return batch_losses, batch_accs
def get_accuracy(prediction, label):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        prediction: Two-dimensional tensor of scores; the first dimension
            is the batch, the arg-max is taken over the last.
        label: Tensor of target class indices, compared element-wise with
            the predicted classes.

    Returns:
        Scalar tensor: number of matches divided by the batch size.
    """
    n_rows, _ = prediction.shape
    hits = (prediction.argmax(dim=-1) == label).sum()
    return hits / n_rows
# Optimiser, loss function and target device.
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Lowest mean validation loss seen so far, plus per-batch histories that
# accumulate across all epochs.
best_valid_loss = float("inf")
train_losses, train_accs = [], []
valid_losses, valid_accs = [], []

for epoch in range(n_epochs):
    train_loss, train_acc = train(train_dataloader, model, criterion, optimizer, device)
    valid_loss, valid_acc = evaluate(valid_dataloader, model, criterion, device)

    # Append this epoch's per-batch values to the running histories.
    train_losses += train_loss
    train_accs += train_acc
    valid_losses += valid_loss
    valid_accs += valid_acc

    # Epoch-level summaries are plain means over the batches.
    epoch_train_loss, epoch_train_acc = np.mean(train_loss), np.mean(train_acc)
    epoch_valid_loss, epoch_valid_acc = np.mean(valid_loss), np.mean(valid_acc)

    # Checkpoint the weights whenever the mean validation loss improves.
    if epoch_valid_loss < best_valid_loss:
        best_valid_loss = epoch_valid_loss
        torch.save(model.state_dict(), "rnn.pt")

    print(f"epoch: {epoch+1}")
    print(f"train_loss: {epoch_train_loss:.3f}, train_acc: {epoch_train_acc:.3f}")
    print(f"valid_loss: {epoch_valid_loss:.3f}, valid_acc: {epoch_valid_acc:.3f}")
training...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:58<00:00, 23.71s/it] evaluating...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.60it/s] epoch: 1 train_loss: 0.788, train_acc: 0.521 valid_loss: 0.749, valid_acc: 0.545 training...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:55<00:00, 23.05s/it] evaluating...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.60it/s] epoch: 2 train_loss: 0.682, train_acc: 0.592 valid_loss: 0.736, valid_acc: 0.534 training...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:55<00:00, 23.04s/it] evaluating...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.59it/s] epoch: 3 train_loss: 0.641, train_acc: 0.637 valid_loss: 0.708, valid_acc: 0.553 training...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:55<00:00, 23.19s/it] evaluating...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.48it/s] epoch: 4 train_loss: 0.604, train_acc: 0.671 valid_loss: 0.707, valid_acc: 0.586 training...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:55<00:00, 23.16s/it] evaluating...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.59it/s] epoch: 5 train_loss: 0.567, train_acc: 0.706 valid_loss: 
0.717, valid_acc: 0.576
# Figure 1: per-batch training and validation loss.
# Both lists hold one value per batch, so the two curves can have different
# lengths when the loaders yield different numbers of batches.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
ax.plot(train_losses, label="train loss")
ax.plot(valid_losses, label="valid loss")
plt.legend()
ax.set_xlabel("updates")
ax.set_ylabel("loss")
# Figure 2: per-batch training and validation accuracy, laid out the same way.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
ax.plot(train_accs, label="train accuracy")
ax.plot(valid_accs, label="valid accuracy")
plt.legend()
ax.set_xlabel("updates")
ax.set_ylabel("accuracy");
Text(17.200000000000003, 0.5, 'accuracy')
# NOTE(review): the checkpoint load below is commented out, so the test
# split is scored with the weights the model currently holds rather than
# with the best-validation checkpoint saved to "rnn.pt" — confirm that this
# is intended.
# model.load_state_dict(torch.load('rnn.pt'))
# Score the held-out test split and average the per-batch metrics.
test_loss, test_acc = evaluate(test_dataloader, model, criterion, device)
epoch_test_loss = np.mean(test_loss)
epoch_test_acc = np.mean(test_acc)
print(f"test_loss: {epoch_test_loss:.3f}, test_acc: {epoch_test_acc:.3f}")
evaluating...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00, 1.17it/s] test_loss: 0.748, test_acc: 0.532
def predict_sentiment(text, model, tokenizer, vocab, device):
    """Classify a single piece of text with the given model.

    Args:
        text: Raw input string.
        tokenizer: Callable that splits ``text`` into tokens.
        vocab: Object supporting ``vocab[token]`` lookups.
        model: Callable invoked as ``model(ids, length)`` with a batch of
            one sequence; expected to return one row of class scores.
        device: Device the id tensor is moved to.

    Returns:
        Tuple ``(predicted_class, predicted_probability)``: the arg-max
        class index and its softmax probability, both as Python scalars.
    """
    tokens = tokenizer(text)
    ids = [vocab[t] for t in tokens]
    # The length tensor is created on the CPU and is not moved to `device`.
    length = torch.LongTensor([len(ids)])
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
    # Pure inference: no autograd graph is needed for a prediction.
    with torch.no_grad():
        prediction = model(tensor, length).squeeze(dim=0)
        probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    return predicted_class, predicted_probability
# Try the model on a short example; the call returns
# (predicted_class, predicted_probability).
text = "This film is terrible!"
predict_sentiment(text, model, tokenizer, vocab, device)
(0, 0.523607611656189)
# Same check with a different adjective in the sentence.
text = "This film is great!"
predict_sentiment(text, model, tokenizer, vocab, device)
(1, 0.7643947005271912)
# A sentence containing a negation followed by a contrasting clause.
text = "This film is not terrible, it's great!"
predict_sentiment(text, model, tokenizer, vocab, device)
(1, 0.8956218957901001)
# The previous sentence with the two adjectives swapped.
text = "This film is not great, it's terrible!"
predict_sentiment(text, model, tokenizer, vocab, device)
(1, 0.6588938236236572)