#!/usr/bin/env python
# coding: utf-8

# # 7. Neural Machine Translation and Models with Attention

# I recommend you take a look at these materials first.

# * http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture9.pdf
# * http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture10.pdf
# * http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture11.pdf
# * https://arxiv.org/pdf/1409.0473.pdf
# * https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation-batched.ipynb
# * https://medium.com/huggingface/understanding-emotions-from-keras-to-pytorch-3ccb61d5a983
# * http://www.manythings.org/anki/

# In[4]:

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
from copy import deepcopy
import os
import re
import unicodedata

from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

flatten = lambda l: [item for sublist in l for item in sublist]

random.seed(1024)
get_ipython().run_line_magic('matplotlib', 'inline')

# In[6]:

USE_CUDA = torch.cuda.is_available()
gpus = [0]
if USE_CUDA:
    torch.cuda.set_device(gpus[0])

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

# In[3]:

def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch

    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

# ### Padding
# borrowed image from https://medium.com/huggingface/understanding-emotions-from-keras-to-pytorch-3ccb61d5a983
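# Before the batching helper below, a toy sketch of the idea (the indices are made up; 0 is assumed
# to be the PAD index, matching the vocabularies built later): shorter sequences are extended with
# PAD ids so that the whole batch becomes one B x T_max LongTensor.

# In[ ]:

toy_seqs = [Variable(LongTensor([[5, 6, 7, 3]])), Variable(LongTensor([[8, 9, 3]]))]  # 1 x T each
toy_max = max(s.size(1) for s in toy_seqs)
toy_padded = torch.cat([torch.cat([s, Variable(LongTensor([[0] * (toy_max - s.size(1))]))], 1)
                        if s.size(1) < toy_max else s
                        for s in toy_seqs])
print(toy_padded)  # 2 x 4; the second row ends with a PAD (0)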
# In[4]:

# Pads a batch of (source, target) pairs for the sequence-to-sequence format
def pad_to_batch(batch, x_to_ix, y_to_ix):
    sorted_batch = sorted(batch, key=lambda b: b[0].size(1), reverse=True)  # sort by source length
    x, y = list(zip(*sorted_batch))
    max_x = max([s.size(1) for s in x])
    max_y = max([s.size(1) for s in y])
    x_p, y_p = [], []
    for i in range(len(batch)):
        if x[i].size(1) < max_x:
            x_p.append(torch.cat([x[i], Variable(LongTensor([x_to_ix['<PAD>']] * (max_x - x[i].size(1)))).view(1, -1)], 1))
        else:
            x_p.append(x[i])
        if y[i].size(1) < max_y:
            y_p.append(torch.cat([y[i], Variable(LongTensor([y_to_ix['<PAD>']] * (max_y - y[i].size(1)))).view(1, -1)], 1))
        else:
            y_p.append(y[i])

    input_var = torch.cat(x_p)
    target_var = torch.cat(y_p)
    input_len = [list(map(lambda s: s == 0, t.data)).count(False) for t in input_var]
    target_len = [list(map(lambda s: s == 0, t.data)).count(False) for t in target_var]

    return input_var, target_var, input_len, target_len

# In[5]:

def prepare_sequence(seq, to_index):
    idxs = list(map(lambda w: to_index[w] if to_index.get(w) is not None else to_index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

# ## Data load and Preprocessing

# Borrowed code from https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation-batched.ipynb

# In[6]:

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([,.!?])", r" \1 ", s)
    s = re.sub(r"[^a-zA-Z,.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s
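# A quick sanity check of the normalization helpers (the sentence is a made-up example):

# In[ ]:

print(unicode_to_ascii('Très fatigué!'))          # Tres fatigue!
print(normalize_string('Je suis très fatigué!'))  # je suis tres fatigue !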
# French -> English
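# Each line of eng-fra.txt is expected to hold one tab-separated sentence pair (the Anki export
# format linked above); the preprocessing loop below splits each line on '\t' and keeps only pairs
# whose normalized token lengths fall between MIN_LENGTH and MAX_LENGTH. A made-up example line:

# In[ ]:

example_line = "I'm cold.\tJ'ai froid.\n"  # hypothetical corpus line
so, ta = example_line[:-1].split('\t')
print(so, '||', ta)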
# In[7]:

corpus = open('../dataset/eng-fra.txt', 'r', encoding='utf-8').readlines()

# In[8]:

len(corpus)

# In[9]:

corpus = corpus[:30000]  # for practice

# In[10]:

MIN_LENGTH = 3
MAX_LENGTH = 25

# In[11]:

X_r, y_r = [], []  # raw pairs

for parallel in corpus:
    so, ta = parallel[:-1].split('\t')
    if so.strip() == "" or ta.strip() == "":
        continue

    normalized_so = normalize_string(so).split()
    normalized_ta = normalize_string(ta).split()

    if len(normalized_so) >= MIN_LENGTH and len(normalized_so) <= MAX_LENGTH \
       and len(normalized_ta) >= MIN_LENGTH and len(normalized_ta) <= MAX_LENGTH:
        X_r.append(normalized_so)
        y_r.append(normalized_ta)

print(len(X_r), len(y_r))
print(X_r[0], y_r[0])

# ### Build Vocab

# In[12]:

source_vocab = list(set(flatten(X_r)))
target_vocab = list(set(flatten(y_r)))
print(len(source_vocab), len(target_vocab))

# In[13]:

source2index = {'<PAD>': 0, '<UNK>': 1, '<s>': 2, '</s>': 3}
for vo in source_vocab:
    if source2index.get(vo) is None:
        source2index[vo] = len(source2index)
index2source = {v: k for k, v in source2index.items()}

target2index = {'<PAD>': 0, '<UNK>': 1, '<s>': 2, '</s>': 3}
for vo in target_vocab:
    if target2index.get(vo) is None:
        target2index[vo] = len(target2index)
index2target = {v: k for k, v in target2index.items()}

# ### Prepare train data

# In[14]:

X_p, y_p = [], []

for so, ta in zip(X_r, y_r):
    X_p.append(prepare_sequence(so + ['</s>'], source2index).view(1, -1))
    y_p.append(prepare_sequence(ta + ['</s>'], target2index).view(1, -1))

train_data = list(zip(X_p, y_p))

# ## Modeling
# borrowed image from http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture10.pdf

# If you're not familiar with pack_padded_sequence and pad_packed_sequence, check this post.
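# As a quick illustration (toy shapes, made-up sizes): packing lets the GRU skip padded time steps,
# and padding the packed output back gives the usual B x T x D tensor that the decoder attends over.

# In[ ]:

toy_embedded = Variable(torch.zeros(2, 5, 10))  # B=2, T=5, E=10 padded embeddings
toy_lengths = [5, 3]                            # real lengths, sorted in decreasing order
toy_gru = nn.GRU(10, 7, batch_first=True)
toy_packed = pack_padded_sequence(toy_embedded, toy_lengths, batch_first=True)
toy_outputs, toy_hidden = toy_gru(toy_packed)
toy_outputs, toy_out_lengths = torch.nn.utils.rnn.pad_packed_sequence(toy_outputs, batch_first=True)
print(toy_outputs.size(), toy_out_lengths)      # (2, 5, 7) and the original lengths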
# In[15]:

class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, n_layers=1, bidirec=False):
        super(Encoder, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, embedding_size)

        if bidirec:
            self.n_direction = 2
            self.gru = nn.GRU(embedding_size, hidden_size, n_layers, batch_first=True, bidirectional=True)
        else:
            self.n_direction = 1
            self.gru = nn.GRU(embedding_size, hidden_size, n_layers, batch_first=True)

    def init_hidden(self, inputs):
        hidden = Variable(torch.zeros(self.n_layers * self.n_direction, inputs.size(0), self.hidden_size))
        return hidden.cuda() if USE_CUDA else hidden

    def init_weight(self):
        self.embedding.weight = nn.init.xavier_uniform(self.embedding.weight)
        self.gru.weight_hh_l0 = nn.init.xavier_uniform(self.gru.weight_hh_l0)
        self.gru.weight_ih_l0 = nn.init.xavier_uniform(self.gru.weight_ih_l0)

    def forward(self, inputs, input_lengths):
        """
        inputs : B, T (LongTensor)
        input_lengths : real lengths of input batch (list)
        """
        hidden = self.init_hidden(inputs)

        embedded = self.embedding(inputs)
        packed = pack_padded_sequence(embedded, input_lengths, batch_first=True)
        outputs, hidden = self.gru(packed, hidden)
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)  # unpack (back to padded)

        if self.n_layers > 1:
            if self.n_direction == 2:
                hidden = hidden[-2:]
            else:
                hidden = hidden[-1]

        return outputs, torch.cat([h for h in hidden], 1).unsqueeze(1)

# ### Attention Mechanism ( https://arxiv.org/pdf/1409.0473.pdf )

# I used the general-type score function $h_t^\top W_a \bar{h}_s$.
# borrowed image from http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture10.pdf
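# The Attention method of the Decoder below implements this general score: a linear map $W_a$ over
# the encoder outputs, a batched dot product with the current decoder hidden state, and a softmax
# over the source positions. A stand-alone sketch with toy shapes (all names and sizes are made up):

# In[ ]:

B, T, D = 2, 4, 6
toy_h_t = Variable(torch.randn(B, D, 1))  # current decoder hidden state, reshaped to B,D,1
toy_h_s = Variable(torch.randn(B, T, D))  # encoder outputs
toy_W_a = nn.Linear(D, D)
toy_energies = toy_W_a(toy_h_s.contiguous().view(B * T, -1)).view(B, T, -1)  # B,T,D
toy_alpha = F.softmax(toy_energies.bmm(toy_h_t).squeeze(2), 1)               # B,T attention weights
toy_context = toy_alpha.unsqueeze(1).bmm(toy_h_s)                            # B,1,D context vector
print(toy_alpha.size(), toy_context.size())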
# In[37]:

class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, n_layers=1, dropout_p=0.1):
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # Define the layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embedding_size + hidden_size, hidden_size, n_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size * 2, input_size)
        self.attn = nn.Linear(self.hidden_size, self.hidden_size)  # Attention

    def init_hidden(self, inputs):
        hidden = Variable(torch.zeros(self.n_layers, inputs.size(0), self.hidden_size))
        return hidden.cuda() if USE_CUDA else hidden

    def init_weight(self):
        self.embedding.weight = nn.init.xavier_uniform(self.embedding.weight)
        self.gru.weight_hh_l0 = nn.init.xavier_uniform(self.gru.weight_hh_l0)
        self.gru.weight_ih_l0 = nn.init.xavier_uniform(self.gru.weight_ih_l0)
        self.linear.weight = nn.init.xavier_uniform(self.linear.weight)
        self.attn.weight = nn.init.xavier_uniform(self.attn.weight)
        # self.attn.bias.data.fill_(0)

    def Attention(self, hidden, encoder_outputs, encoder_maskings):
        """
        hidden : 1,B,D
        encoder_outputs : B,T,D
        encoder_maskings : B,T # ByteTensor
        """
        hidden = hidden[0].unsqueeze(2)  # (1,B,D) -> (B,D,1)

        batch_size = encoder_outputs.size(0)  # B
        max_len = encoder_outputs.size(1)     # T
        energies = self.attn(encoder_outputs.contiguous().view(batch_size * max_len, -1))  # B*T,D -> B*T,D
        energies = energies.view(batch_size, max_len, -1)  # B,T,D
        attn_energies = energies.bmm(hidden).squeeze(2)    # B,T,D * B,D,1 --> B,T

        # if isinstance(encoder_maskings, torch.autograd.variable.Variable):
        #     attn_energies = attn_energies.masked_fill(encoder_maskings, float('-inf'))  # PAD masking

        alpha = F.softmax(attn_energies, 1)   # B,T
        alpha = alpha.unsqueeze(1)            # B,1,T
        context = alpha.bmm(encoder_outputs)  # B,1,T * B,T,D => B,1,D

        return context, alpha

    def forward(self, inputs, context, max_length, encoder_outputs, encoder_maskings=None, is_training=False):
        """
        inputs : B,1 (LongTensor, START SYMBOL <s>)
        context : B,1,D (FloatTensor, last encoder hidden state)
        max_length : int, max length to decode # for batch
        encoder_outputs : B,T,D
        encoder_maskings : B,T # ByteTensor
        is_training : bool, apply dropout only during training
        """
        # Get the embedding of the current input word
        embedded = self.embedding(inputs)
        hidden = self.init_hidden(inputs)
        if is_training:
            embedded = self.dropout(embedded)

        decode = []
        # Apply GRU to the output so far
        for i in range(max_length):
            _, hidden = self.gru(torch.cat((embedded, context), 2), hidden)  # h_t = f(h_{t-1}, y_{t-1}, c)
            concated = torch.cat((hidden, context.transpose(0, 1)), 2)       # y_t = g(h_t, y_{t-1}, c)
            score = self.linear(concated.squeeze(0))
            softmaxed = F.log_softmax(score, 1)
            decode.append(softmaxed)
            decoded = softmaxed.max(1)[1]
            embedded = self.embedding(decoded).unsqueeze(1)  # y_{t-1}
            if is_training:
                embedded = self.dropout(embedded)

            # compute next context vector using attention
            context, alpha = self.Attention(hidden, encoder_outputs, encoder_maskings)

        # column-wise concat, reshape!!
        scores = torch.cat(decode, 1)
        return scores.view(inputs.size(0) * max_length, -1)

    def decode(self, context, encoder_outputs):
        start_decode = Variable(LongTensor([[target2index['<s>']] * 1])).transpose(0, 1)
        embedded = self.embedding(start_decode)
        hidden = self.init_hidden(start_decode)

        decodes = []
        attentions = []
        decoded = embedded
        while decoded.data.tolist()[0] != target2index['</s>']:  # until </s>
            _, hidden = self.gru(torch.cat((embedded, context), 2), hidden)  # h_t = f(h_{t-1}, y_{t-1}, c)
            concated = torch.cat((hidden, context.transpose(0, 1)), 2)       # y_t = g(h_t, y_{t-1}, c)
            score = self.linear(concated.squeeze(0))
            softmaxed = F.log_softmax(score, 1)
            decodes.append(softmaxed)
            decoded = softmaxed.max(1)[1]
            embedded = self.embedding(decoded).unsqueeze(1)  # y_{t-1}
            context, alpha = self.Attention(hidden, encoder_outputs, None)
            attentions.append(alpha.squeeze(1))

        return torch.cat(decodes).max(1)[1], torch.cat(attentions)

# ## Train

# It takes a while if you train on CPU only.

# In[70]:

EPOCH = 50
BATCH_SIZE = 64
EMBEDDING_SIZE = 300
HIDDEN_SIZE = 512
LR = 0.001
DECODER_LEARNING_RATIO = 5.0
RESCHEDULED = False

# In[71]:

encoder = Encoder(len(source2index), EMBEDDING_SIZE, HIDDEN_SIZE, 3, True)
decoder = Decoder(len(target2index), EMBEDDING_SIZE, HIDDEN_SIZE * 2)
encoder.init_weight()
decoder.init_weight()

if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()

loss_function = nn.CrossEntropyLoss(ignore_index=0)
enc_optimizer = optim.Adam(encoder.parameters(), lr=LR)
dec_optimizer = optim.Adam(decoder.parameters(), lr=LR * DECODER_LEARNING_RATIO)

# In[72]:

for epoch in range(EPOCH):
    losses = []
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        inputs, targets, input_lengths, target_lengths = pad_to_batch(batch, source2index, target2index)

        input_masks = torch.cat([Variable(ByteTensor(tuple(map(lambda s: s == 0, t.data)))) for t in inputs]).view(inputs.size(0), -1)
        start_decode = Variable(LongTensor([[target2index['<s>']] * targets.size(0)])).transpose(0, 1)

        encoder.zero_grad()
        decoder.zero_grad()
        output, hidden_c = encoder(inputs, input_lengths)

        preds = decoder(start_decode, hidden_c, targets.size(1), output, input_masks, True)

        loss = loss_function(preds, targets.view(-1))
        losses.append(loss.data.tolist()[0])
        loss.backward()
        torch.nn.utils.clip_grad_norm(encoder.parameters(), 50.0)  # gradient clipping
        torch.nn.utils.clip_grad_norm(decoder.parameters(), 50.0)  # gradient clipping
        enc_optimizer.step()
        dec_optimizer.step()

        if i % 200 == 0:
            print("[%02d/%d] [%03d/%d] mean_loss : %0.2f" % (epoch, EPOCH, i, len(train_data) // BATCH_SIZE, np.mean(losses)))
            losses = []

    # You can use http://pytorch.org/docs/master/optim.html#how-to-adjust-learning-rate
    if RESCHEDULED == False and epoch == EPOCH // 2:
        LR *= 0.01
        enc_optimizer = optim.Adam(encoder.parameters(), lr=LR)
        dec_optimizer = optim.Adam(decoder.parameters(), lr=LR * DECODER_LEARNING_RATIO)
        RESCHEDULED = True

# ## Test

# ### Visualize Attention

# In[20]:

# borrowed code from https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation-batched.ipynb
def show_attention(input_words, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_words, rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    # show_plot_visdom()
    plt.show()
    plt.close()
# In[99]:

test = random.choice(train_data)
input_ = test[0]
truth = test[1]

output, hidden = encoder(input_, [input_.size(1)])
pred, attn = decoder.decode(hidden, output)

input_ = [index2source[i] for i in input_.data.tolist()[0]]
pred = [index2target[i] for i in pred.data.tolist()]

print('Source : ', ' '.join([i for i in input_ if i not in ['</s>']]))
print('Truth : ', ' '.join([index2target[i] for i in truth.data.tolist()[0] if i not in [2, 3]]))
print('Prediction : ', ' '.join([i for i in pred if i not in ['</s>']]))

if USE_CUDA:
    attn = attn.cpu()

show_attention(input_, pred, attn.data)

# # TODO

# * BLEU (see the sketch at the end of this notebook)
# * Beam Search
# * Sampled Softmax

# ## Further topics

# * Convolutional Sequence to Sequence Learning
# * Attention Is All You Need
# * Unsupervised Machine Translation Using Monolingual Corpora Only

# ## Suggested Reading

# * SMT Chapter 13: Neural Machine Translation

# In[ ]:
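# As a starting point for the BLEU item in the TODO list above, here is a minimal sketch using
# NLTK's corpus-level BLEU. It greedy-decodes a handful of pairs with the decode method above; for
# a real evaluation you would substitute a held-out test set for the training slice used here.

# In[ ]:

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

references, hypotheses = [], []
for so, ta in train_data[:100]:  # substitute a proper held-out set here
    enc_out, enc_hidden = encoder(so, [so.size(1)])
    pred, _ = decoder.decode(enc_hidden, enc_out)
    ref = [index2target[i] for i in ta.data.tolist()[0] if i not in [0, 2, 3]]   # drop <PAD>, <s>, </s>
    hyp = [index2target[i] for i in pred.data.tolist() if i not in [0, 2, 3]]
    references.append([ref])  # corpus_bleu expects a list of reference lists per sentence
    hypotheses.append(hyp)

print(corpus_bleu(references, hypotheses, smoothing_function=SmoothingFunction().method1))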