I recommend you take a look at these materials first.
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import re
from copy import deepcopy

flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

USE_CUDA = torch.cuda.is_available()
gpus = [0]
if USE_CUDA:
    torch.cuda.set_device(gpus[0])

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor
def getBatch(batch_size, train_data):
    # shuffle the data and yield mini-batches; the last (possibly smaller) batch is yielded after the loop
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch

    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch
def pad_to_batch(batch):
    x, y = zip(*batch)
    max_x = max([s.size(1) for s in x])
    x_p = []
    for i in range(len(batch)):
        if x[i].size(1) < max_x:
            # pad shorter sequences with <PAD> up to the longest sequence in the batch
            x_p.append(torch.cat([x[i], Variable(LongTensor([word2index['<PAD>']] * (max_x - x[i].size(1)))).view(1, -1)], 1))
        else:
            x_p.append(x[i])
    return torch.cat(x_p), torch.cat(y).view(-1)
def prepare_sequence(seq, to_index):
    # map each token to its index, falling back to <UNK> for out-of-vocabulary words
    idxs = list(map(lambda w: to_index[w] if to_index.get(w) is not None else to_index["<UNK>"], seq))
    return Variable(LongTensor(idxs))
The task is to classify a question into one of 6 question types (e.g., whether the question asks about a person, a location, numeric information, etc.); a sample parse of the raw file format is sketched right after the loading code below.
data = open('../dataset/train_5500.label.txt', 'r', encoding='latin-1').readlines()
data = [[d.split(':')[1][:-1], d.split(':')[0]] for d in data]
X, y = list(zip(*data))
X = list(X)
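To illustrate how a raw line is split, here is a quick check on a made-up line in the COARSE:fine question layout (it is not necessarily an actual line from the file):
sample = 'LOC:city What is the capital of France ?\n'
print(sample.split(':')[0])       # 'LOC'  -> the coarse class used as the target
print(sample.split(':')[1][:-1])  # 'city What is the capital of France ?'  -> note the fine label token stays attached to the question text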
Masking digits reduces the vocabulary (search space), e.g. my birthday is 12.22 ==> my birthday is ##.## (a quick check follows the loop below).
for i, x in enumerate(X):
    X[i] = re.sub('\d', '#', x).split()
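A quick check of the masking on the example sentence from above:
print(re.sub('\d', '#', 'my birthday is 12.22'))  # 'my birthday is ##.##'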
vocab = list(set(flatten(X)))
len(vocab)
9117
len(set(y)) # num of class
6
word2index = {'<PAD>': 0, '<UNK>': 1}

for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)

index2word = {v: k for k, v in word2index.items()}

target2index = {}

for cl in set(y):
    if target2index.get(cl) is None:
        target2index[cl] = len(target2index)

index2target = {v: k for k, v in target2index.items()}
X_p, y_p = [], []
for pair in zip(X, y):
    X_p.append(prepare_sequence(pair[0], word2index).view(1, -1))
    y_p.append(Variable(LongTensor([target2index[pair[1]]])).view(1, -1))

data_p = list(zip(X_p, y_p))
random.shuffle(data_p)
train_data = data_p[:int(len(data_p) * 0.9)]
test_data = data_p[int(len(data_p) * 0.9):]
You can download the pretrained word vectors from https://github.com/mmihaltz/word2vec-GoogleNews-vectors.
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('../dataset/GoogleNews-vectors-negative300.bin', binary=True)
len(model.index2word)
3000000
pretrained = []
for key in word2index.keys():
    try:
        # look the word itself up in the word2vec model (not its index)
        pretrained.append(model[key])
    except KeyError:
        # words missing from the pretrained model get a random vector
        pretrained.append(np.random.randn(300))

pretrained_vectors = np.vstack(pretrained)
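A quick sanity check on the resulting matrix: it should hold one 300-dimensional row per vocabulary entry, in word2index order.
print(pretrained_vectors.shape)  # expected: (len(word2index), 300)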
class CNNClassifier(nn.Module):

    def __init__(self, vocab_size, embedding_dim, output_size, kernel_dim=100, kernel_sizes=(3, 4, 5), dropout=0.5):
        super(CNNClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # one Conv2d per kernel size; each kernel spans the full embedding dimension, kernel_size = (K, D)
        self.convs = nn.ModuleList([nn.Conv2d(1, kernel_dim, (K, embedding_dim)) for K in kernel_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * kernel_dim, output_size)

    def init_weights(self, pretrained_word_vectors, is_static=False):
        self.embedding.weight = nn.Parameter(torch.from_numpy(pretrained_word_vectors).float())
        if is_static:
            self.embedding.weight.requires_grad = False

    def forward(self, inputs, is_training=False):
        inputs = self.embedding(inputs).unsqueeze(1)  # (B,1,T,D)
        inputs = [F.relu(conv(inputs)).squeeze(3) for conv in self.convs]  # [(N,Co,W), ...]*len(Ks)
        inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs]  # [(N,Co), ...]*len(Ks)

        concated = torch.cat(inputs, 1)

        if is_training:
            concated = self.dropout(concated)  # (N,len(Ks)*Co)

        out = self.fc(concated)
        return F.log_softmax(out, 1)
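A small shape check of the forward pass may help here; this is a CPU-only sketch, and the vocabulary size, batch size, and sequence length below are made up just to illustrate the (B, T) -> (B, num_classes) mapping.
toy_model = CNNClassifier(vocab_size=100, embedding_dim=300, output_size=6)
toy_input = Variable(torch.LongTensor(2, 7).random_(0, 100))  # B=2, T=7 (T must be >= the largest kernel size)
print(toy_model(toy_input).size())  # expected: torch.Size([2, 6])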
Training takes a while if you use only the CPU.
EPOCH = 5
BATCH_SIZE = 50
KERNEL_SIZES = [3,4,5]
KERNEL_DIM = 100
LR = 0.001
model = CNNClassifier(len(word2index), 300, len(target2index), KERNEL_DIM, KERNEL_SIZES)
model.init_weights(pretrained_vectors) # initialize embedding matrix using pretrained vectors
if USE_CUDA:
    model = model.cuda()

loss_function = nn.NLLLoss()  # the model already returns log-probabilities (log_softmax), so pair it with NLLLoss
optimizer = optim.Adam(model.parameters(), lr=LR)
for epoch in range(EPOCH):
    losses = []
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        inputs, targets = pad_to_batch(batch)

        model.zero_grad()
        preds = model(inputs, True)  # is_training=True enables dropout

        loss = loss_function(preds, targets)
        losses.append(loss.data.tolist()[0])
        loss.backward()
        #for param in model.parameters():
        #    param.grad.data.clamp_(-3, 3)
        optimizer.step()

        if i % 100 == 0:
            print("[%d/%d] mean_loss : %0.2f" % (epoch, EPOCH, np.mean(losses)))
            losses = []
[0/5] mean_loss : 2.13
[1/5] mean_loss : 0.12
[2/5] mean_loss : 0.08
[3/5] mean_loss : 0.02
[4/5] mean_loss : 0.05
accuracy = 0
for test in test_data:
    pred = model(test[0]).max(1)[1]
    pred = pred.data.tolist()[0]
    target = test[1].data.tolist()[0][0]
    if pred == target:
        accuracy += 1

print(accuracy / len(test_data) * 100)
97.61904761904762
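Finally, a minimal inference sketch on a new question (the question string is made up, and the predicted label depends on the trained model, so treat the printed class as illustrative):
raw = 'Who wrote the novel Moby Dick ?'
tokens = re.sub('\d', '#', raw).split()      # same digit masking as at training time
inp = prepare_sequence(tokens, word2index).view(1, -1)
pred = model(inp).max(1)[1]
print(index2target[pred.data.tolist()[0]])   # e.g. 'HUM'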