#!/usr/bin/env python
# coding: utf-8

# # CS 20 : TensorFlow for Deep Learning Research
# ## Lecture 11 : Recurrent Neural Networks
# A simple example of many-to-one classification (word sentiment classification) using a stacked bi-directional Long Short-Term Memory network with dropout.
# 
# ### Many to One Classification by Stacked Bi-directional LSTM with Dropout
# - Creating the **data pipeline** with `tf.data`
# - Preprocessing variable-length word sequences with a `padding technique` via a `user-defined function (pad_seq)`
# - Using `tf.nn.embedding_lookup` to get the vector of each token (e.g. word, character)
# - Creating the model as a **class**
# - Applying **dropout** to the model with `tf.contrib.rnn.DropoutWrapper`
# - Applying **stacking** and **dynamic rnn** to the model with `tf.contrib.rnn.stack_bidirectional_dynamic_rnn`
# - Reference
#     - https://github.com/golbin/TensorFlow-Tutorials/blob/master/10%20-%20RNN/02%20-%20Autocomplete.py
#     - https://github.com/aisolab/TF_code_examples_for_Deep_learning/blob/master/Tutorial%20of%20implementing%20Sequence%20classification%20with%20RNN%20series.ipynb
#     - https://pozalabs.github.io/blstm/

# ### Setup

# In[1]:


import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import string
get_ipython().run_line_magic('matplotlib', 'inline')

slim = tf.contrib.slim
print(tf.__version__)


# ### Prepare example data

# In[2]:


words = ['good', 'bad', 'amazing', 'so good', 'bull shit', 'awesome']
y = [[1.,0.], [0.,1.], [1.,0.], [1., 0.],[0.,1.], [1.,0.]]


# In[3]:


# Character quantization: lowercase letters, a space, and '*' as the padding token
char_space = string.ascii_lowercase + ' ' + '*'
char_space


# In[4]:


char_dic = {char : idx for idx, char in enumerate(char_space)}
print(char_dic)
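
# As a quick illustration (a minimal sketch, not part of the model itself): each character
# index corresponds to one row of an identity matrix, i.e. a one-hot vector. This is what
# tf.nn.embedding_lookup on tf.eye(len(dic)) will do inside the model class below.
identity = np.eye(len(char_dic), dtype = np.float32)
print(identity[char_dic['c']]) # one-hot vector for 'c': 1. at index 2, 0. elsewhere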


# ### Create pad_seq function

# In[5]:


def pad_seq(sequences, max_len, dic):
    seq_len, seq_indices = [], []
    for seq in sequences:
        seq_len.append(len(seq))
        seq_idx = [dic.get(char) for char in seq]
        seq_idx += (max_len - len(seq_idx)) * [dic.get('*')] # pad with the index of the meaningless token '*' (27)
        seq_indices.append(seq_idx)
    return seq_len, seq_indices
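
# A quick sanity check of pad_seq on a single short word (illustrative only):
# 'hi' -> length 2, indices [7, 8], padded with 27 ('*') up to max_len.
print(pad_seq(sequences = ['hi'], max_len = 5, dic = char_dic)) # ([2], [[7, 8, 27, 27, 27]])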


# ### Apply pad_seq function to data

# In[6]:


max_length = 10
X_length, X_indices = pad_seq(sequences = words, max_len = max_length, dic = char_dic)


# In[7]:


print(X_length)
print(np.shape(X_indices))


# ### Define CharStackedBiLSTM class

# In[8]:


class CharStackedBiLSTM:
    def __init__(self, X_length, X_indices, y, n_of_classes, hidden_dims, dic):
        
        # data pipeline
        with tf.variable_scope('input_layer'):
            self._X_length = X_length
            self._X_indices = X_indices
            self._y = y
            
            one_hot = tf.eye(len(dic), dtype = tf.float32)
            self._one_hot = tf.get_variable(name='one_hot_embedding', initializer = one_hot,
                                            trainable = False) # not trainable because the embedding vectors are fixed one-hot vectors
            self._X_batch = tf.nn.embedding_lookup(params = self._one_hot, ids = self._X_indices)
            self._keep_prob = tf.placeholder(dtype = tf.float32)
        
        # Stacked Bi-directional LSTM with Drop out
        with tf.variable_scope('stacked_bi-directional_lstm'):
            
            # forward cells (one per layer)
            lstm_fw_cells = []
            for hidden_dim in hidden_dims:
                lstm_fw_cell = tf.contrib.rnn.BasicLSTMCell(num_units = hidden_dim, activation = tf.nn.tanh)
                lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(cell = lstm_fw_cell,
                                                             output_keep_prob = self._keep_prob)
                lstm_fw_cells.append(lstm_fw_cell)
            
            # backward cells (one per layer)
            lstm_bw_cells = []
            for hidden_dim in hidden_dims:
                lstm_bw_cell = tf.contrib.rnn.BasicLSTMCell(num_units = hidden_dim, activation = tf.nn.tanh)
                lstm_bw_cell = tf.contrib.rnn.DropoutWrapper(cell = lstm_bw_cell,
                                                             output_keep_prob = self._keep_prob)
                lstm_bw_cells.append(lstm_bw_cell)
            
            _, output_state_fw, output_state_bw = \
            tf.contrib.rnn.stack_bidirectional_dynamic_rnn(cells_fw = lstm_fw_cells, cells_bw = lstm_bw_cells,
                                                           inputs = self._X_batch,
                                                           sequence_length = self._X_length,
                                                           dtype = tf.float32)

            # each element of output_state_fw / output_state_bw is an LSTMStateTuple; take the
            # hidden state (h) of the last layer in each direction and concatenate them
            final_state = tf.concat([output_state_fw[-1].h, output_state_bw[-1].h], axis = 1)

        with tf.variable_scope('output_layer'):
            self._score = slim.fully_connected(inputs = final_state, num_outputs = n_of_classes,
                                               activation_fn = None)
            
        with tf.variable_scope('loss'):
            self.ce_loss = tf.losses.softmax_cross_entropy(onehot_labels = self._y, logits = self._score)
            
        with tf.variable_scope('prediction'):
            self._prediction = tf.argmax(input = self._score, axis = -1, output_type = tf.int32)
    
    def predict(self, sess, X_length, X_indices, keep_prob = 1.):
        feed_prediction = {self._X_length : X_length, self._X_indices : X_indices, self._keep_prob : keep_prob}
        return sess.run(self._prediction, feed_dict = feed_prediction)
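
# Note (illustrative): with hidden_dims = [16, 16], the last-layer hidden state in each
# direction has 16 units, so final_state (forward and backward concatenated) has shape
# [batch_size, 32], which is what the fully connected output layer consumes.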


# ### Create a model of CharStackedBiLSTM

# In[9]:


# hyper-parameters
lr = .003
epochs = 10
batch_size = 2
total_step = int(np.shape(X_indices)[0] / batch_size)
print(total_step)


# In[10]:


## create data pipeline with tf.data
tr_dataset = tf.data.Dataset.from_tensor_slices((X_length, X_indices, y))
tr_dataset = tr_dataset.shuffle(buffer_size = 20)
tr_dataset = tr_dataset.batch(batch_size = batch_size)
tr_iterator = tr_dataset.make_initializable_iterator()
print(tr_dataset)


# In[11]:


X_length_mb, X_indices_mb, y_mb = tr_iterator.get_next()


# In[12]:


char_stacked_bi_lstm = CharStackedBiLSTM(X_length = X_length_mb, X_indices = X_indices_mb, 
                                         y = y_mb, n_of_classes = 2, hidden_dims = [16,16], dic = char_dic)


# ### Create training op and train model

# In[13]:


## create training op
opt = tf.train.AdamOptimizer(learning_rate = lr)
training_op = opt.minimize(loss = char_stacked_bi_lstm.ce_loss)


# In[14]:


sess = tf.Session()
sess.run(tf.global_variables_initializer())

tr_loss_hist = []

for epoch in range(epochs):
    avg_tr_loss = 0
    tr_step = 0
    
    sess.run(tr_iterator.initializer)
    try:
        while True:
            _, tr_loss = sess.run(fetches = [training_op, char_stacked_bi_lstm.ce_loss],
                                  feed_dict = {char_stacked_bi_lstm._keep_prob : .5})
            avg_tr_loss += tr_loss
            tr_step += 1
            
    except tf.errors.OutOfRangeError:
        pass
    
    avg_tr_loss /= tr_step
    tr_loss_hist.append(avg_tr_loss)
    
    print('epoch : {:3}, tr_loss : {:.3f}'.format(epoch + 1, avg_tr_loss))


# In[15]:


plt.plot(tr_loss_hist, label = 'train')
plt.legend()


# In[16]:


yhat = char_stacked_bi_lstm.predict(sess = sess, X_length = X_length, X_indices = X_indices)


# In[17]:


print('training acc: {:.2%}'.format(np.mean(yhat == np.argmax(y, axis = -1))))
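
# For readability, map the predicted class indices back to sentiment labels
# ('positive'/'negative' are just shorthand for the [1., 0.] and [0., 1.] rows of y above).
label_names = ['positive', 'negative']
for word, pred in zip(words, yhat):
    print('{:>10} -> {}'.format(word, label_names[pred]))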