Text Sentiment Classification: Using Recurrent Neural Networks

In [1]:
import sys
sys.path.insert(0, '..')

import d2l
from mxnet import gluon, init, nd
from mxnet.gluon import data as gdata, loss as gloss, nn, rnn, utils as gutils
from mxnet.contrib import text
import os
import tarfile

Text Sentiment Classification Data

Reading Data

In [2]:
data_dir = './'
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
fname = gutils.download(url, data_dir)
with tarfile.open(fname, 'r') as f:

Read the training and test data sets.

In [3]:
def read_imdb(folder='train'):
    data, labels = [], []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_dir, 'aclImdb', folder, label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                labels.append(1 if label == 'pos' else 0)
    return data, labels

train_data, test_data = read_imdb('train'), read_imdb('test')
print('# trainings:', len(train_data[0]), '\n# tests:', len(test_data[0]))
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('label:', y, 'review:', x[0:60])
# trainings: 25000 
# tests: 25000
label: 1 review: Normally the best way to annoy me in a film is to include so
label: 1 review: The Bible teaches us that the love of money is the root of a
label: 1 review: Being someone who lists Night of the Living Dead at number t

Tokenization and Vocabulary

In [4]:
def tokenize(sentences):
    return [line.split(' ') for line in sentences]

train_tokens = tokenize(train_data[0])
test_tokens = tokenize(test_data[0])

vocab = d2l.Vocab([tk for line in train_tokens for tk in line], min_freq=5)

Padding to the Same Length

In [5]:
max_len = 500

def pad(x):
    if len(x) > max_len:        
        return x[:max_len]
        return x + [vocab.unk] * (max_len - len(x))
train_features = nd.array([pad(vocab[line]) for line in train_tokens])
test_features = nd.array([pad(vocab[line]) for line in test_tokens])

Create Data Iterator

In [6]:
batch_size = 64
train_set = gdata.ArrayDataset(train_features, train_data[1])
test_set = gdata.ArrayDataset(test_features, test_data[1])
train_iter = gdata.DataLoader(train_set, batch_size, shuffle=True)
test_iter = gdata.DataLoader(test_set, batch_size)

Print the shape of the first mini-batch of data and the number of mini-batches in the training set.

In [7]:
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
'# batches:', len(train_iter)
X (64, 500) y (64,)
('# batches:', 391)

Use a Recurrent Neural Network Model

In [8]:
class BiRNN(nn.Block):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, **kwargs):
        super(BiRNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Set Bidirectional to True to get a bidirectional recurrent neural
        # network
        self.encoder = rnn.LSTM(num_hiddens, num_layers=num_layers,
                                bidirectional=True, input_size=embed_size)
        self.decoder = nn.Dense(2)

    def forward(self, inputs):
        # The shape of inputs is (batch size, number of words). Because LSTM
        # needs to use sequence as the first dimension, the input is
        # transformed and the word feature is then extracted. The output shape
        # is (number of words, batch size, word vector dimension).
        embeddings = self.embedding(inputs.T)
        # The shape of states is (number of words, batch size, 2 * number of
        # hidden units).
        states = self.encoder(embeddings)
        # Concatenate the hidden states of the initial time step and final
        # time step to use as the input of the fully connected layer. Its
        # shape is (batch size, 4 * number of hidden units)
        encoding = nd.concat(states[0], states[-1])
        outputs = self.decoder(encoding)
        return outputs

Create a bidirectional recurrent neural network with two hidden layers.

In [9]:
embed_size, num_hiddens, num_layers, ctx = 100, 100, 2, d2l.try_all_gpus()
net = BiRNN(len(vocab), embed_size, num_hiddens, num_layers)
net.initialize(init.Xavier(), ctx=ctx)

Load Pre-trained Word Vectors

In [10]:
glove_embedding = text.embedding.create(
    'glove', pretrained_file_name='glove.6B.100d.txt')
embeds = glove_embedding.get_vecs_by_tokens(vocab.idx_to_token)
(49339, 100)

Use these word vectors as feature vectors for each word in the reviews.

In [11]:
net.embedding.collect_params().setattr('grad_req', 'null')

Train and Evaluate the Model

In [12]:
lr, num_epochs = 0.01, 5
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
loss = gloss.SoftmaxCrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs)
training on [gpu(0), gpu(1)]
epoch 1, loss 0.5948, train acc 0.660, test acc 0.811, time 41.8 sec
epoch 2, loss 0.4026, train acc 0.822, test acc 0.836, time 41.6 sec
epoch 3, loss 0.3604, train acc 0.843, test acc 0.844, time 42.8 sec
epoch 4, loss 0.3320, train acc 0.859, test acc 0.842, time 42.4 sec
epoch 5, loss 0.3044, train acc 0.870, test acc 0.853, time 41.0 sec

Define the prediction function.

In [13]:
def predict_sentiment(net, vocab, sentence):
    sentence = nd.array(vocab[sentence.split()], ctx=d2l.try_gpu())
    label = nd.argmax(net(sentence.reshape((1, -1))), axis=1)
    return 'positive' if label.asscalar() == 1 else 'negative'

Then, use the trained model to classify the sentiments of two simple sentences.

In [14]:
predict_sentiment(net, vocab, 'this movie is so great')
In [15]:
predict_sentiment(net, vocab, 'this movie is so bad')