#!/usr/bin/env python
# coding: utf-8

# # CS 20 : TensorFlow for Deep Learning Research
# ## Lecture 05 : Variable sharing and managing experiments
# ### Word2vec (skip-gram) for simple example
#
# - Creating the **input pipeline** with `tf.data`
# - Creating the model as **Class**
#
# Ref
# - https://github.com/golbin/TensorFlow-Tutorials/blob/master/04%20-%20Neural%20Network%20Basic/03%20-%20Word2Vec.py
# - https://github.com/aisolab/TF_code_examples_for_Deep_learning/blob/master/Tutorial%20of%20implementing%20Word2Vec.ipynb

# ### Setup

# In[1]:


from __future__ import absolute_import, division, print_function
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
get_ipython().run_line_magic('matplotlib', 'inline')
print(tf.__version__)


# ### Data-preprocessing

# In[2]:


# Arbitrary sentences used to learn the word vectors
sentences = ["나 고양이 좋다",
             "나 강아지 좋다",
             "나 동물 좋다",
             "강아지 고양이 동물",
             "여자친구 고양이 강아지 좋다",
             "고양이 생선 우유 좋다",
             "강아지 생선 싫다 우유 좋다",
             "강아지 고양이 눈 좋다",
             "나 여자친구 좋다",
             "여자친구 나 싫다",
             "여자친구 나 영화 책 음악 좋다",
             "나 게임 만화 애니 좋다",
             "고양이 강아지 싫다",
             "강아지 고양이 좋다"]


# In[3]:


# Join all sentences, split on whitespace, and build a sorted list of the unique words.
word_sequence = ' '.join(sentences).split()
word_list = list(set(word_sequence))
word_list.sort()
print(word_sequence)
print(word_list)


# In[4]:


# Analyzing integer indices is much easier than analyzing raw strings,
# so build a dictionary that maps each word to its index in word_list,
# which lets us look words up by index later.
word_dic = {w: i for i, w in enumerate(word_list)}
print(word_dic)


# ### Define preprocessor function for skip-gram of Word2Vec

# In[5]:


def preprocessor(sequences, word_dic, window_size):
    # Generate (center word, context word) index pairs for the skip-gram model.
    context = []
    for idx in range(window_size, len(sequences) - window_size):
        center_word = word_dic.get(sequences[idx])
        context_words = [word_dic.get(sequences[idx + _])
                         for _ in range(-window_size, window_size + 1) if _ != 0]
        for token in context_words:
            context.append([center_word, token])
    return context


# In[6]:


batch = preprocessor(sequences = word_sequence, word_dic = word_dic, window_size = 2)


# In[7]:


center_words = np.array(batch)[:, 0]
target_words = np.array(batch)[:, [1]]


# ### Define Word2vec class

# In[8]:


class Word2vec:
    def __init__(self, center_words, target_words, vocab_size, embedding_dim = 2, num_sampled = 10):
        self._center_words = center_words
        self._target_words = target_words

        with tf.variable_scope('embeddings'):
            self._embeddings = tf.get_variable(name = 'lookup_table', shape = [vocab_size, embedding_dim],
                                               dtype = tf.float32,
                                               initializer = tf.truncated_normal_initializer())
            self._selected_embed = tf.nn.embedding_lookup(params = self._embeddings, ids = self._center_words)

        with tf.variable_scope('nce'):
            nce_weights = tf.get_variable('weights', shape = [vocab_size, embedding_dim], dtype = tf.float32,
                                          initializer = tf.truncated_normal_initializer())
            nce_biases = tf.get_variable('biases', initializer = tf.zeros(shape = [vocab_size]))
            self.nce_loss = tf.reduce_mean(tf.nn.nce_loss(weights = nce_weights,
                                                          biases = nce_biases,
                                                          labels = self._target_words,
                                                          inputs = self._selected_embed,
                                                          num_sampled = num_sampled,
                                                          num_classes = vocab_size))

    def get_wordvector(self, sess, word_dic, word):
        idx = word_dic.get(word)
        feed_get_wordvector = {self._center_words : [idx]}
        return sess.run(self._selected_embed, feed_dict = feed_get_wordvector)


# ### Create a model of Word2vec

# In[9]:


# hyper-parameters
epochs = 200
batch_size = 8
learning_rate = .001
total_step = int(len(batch) / batch_size)
print(total_step)
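# As a quick sanity check before feeding the pairs into `tf.data`, the sketch below decodes a few
# (center, target) index pairs back to words. `idx_to_word` is a helper name introduced here for
# illustration; it simply inverts `word_dic` built above.

# In[ ]:


# minimal sketch, assuming `batch` and `word_dic` are defined as above
idx_to_word = {i: w for w, i in word_dic.items()}  # illustrative inverse mapping: index -> word
for center_idx, target_idx in batch[:5]:
    print('center : {:10} target : {:10}'.format(idx_to_word[center_idx], idx_to_word[target_idx]))
print('number of (center, target) pairs : {}'.format(len(batch)))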
# In[10]:


## create input pipeline with tf.data
dataset = tf.data.Dataset.from_tensor_slices((center_words, target_words))
dataset = dataset.shuffle(buffer_size = 32)
dataset = dataset.batch(batch_size = batch_size)
iterator = dataset.make_initializable_iterator()
x_data, y_data = iterator.get_next()


# In[11]:


sgram = Word2vec(center_words = x_data, target_words = y_data, vocab_size = len(word_dic))


# ### Create training op and train model

# In[12]:


# create training op
opt = tf.train.AdamOptimizer(learning_rate = learning_rate)
# equal to 'var_list = None'
training_op = opt.minimize(loss = sgram.nce_loss)


# In[13]:


sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config = sess_config)
sess.run(tf.global_variables_initializer())

tr_loss_hist = []
for epoch in range(epochs):
    sess.run(iterator.initializer)
    avg_tr_loss = 0
    total_step = 0

    try:
        while True:
            _, tr_loss = sess.run(fetches = [training_op, sgram.nce_loss])
            avg_tr_loss += tr_loss
            total_step += 1
    except tf.errors.OutOfRangeError:
        pass

    avg_tr_loss /= total_step
    tr_loss_hist.append(avg_tr_loss)

    if epoch % 20 == 0:
        print('epoch : {:3}, tr_loss : {:.2f}'.format(epoch, avg_tr_loss))


# In[14]:


plt.plot(tr_loss_hist)


# In[15]:


for word in word_list:
    tmp = sgram.get_wordvector(sess = sess, word_dic = word_dic, word = word)
    x, y = tmp[0][0], tmp[0][1]
    plt.scatter(x, y)
    plt.annotate(word, xy=(x, y), xytext=(5, 2),
                 textcoords='offset points', ha='right', va='bottom')
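# The scatter plot places related words near each other in the 2-D embedding space. As a rough
# numerical check of the same idea, the sketch below ranks words by cosine similarity to a query
# word, reusing only `get_wordvector` defined above; `query` and `cosine_sim` are names introduced
# here for illustration.

# In[ ]:


def cosine_sim(u, v):
    # cosine similarity between two 1-D vectors
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-8)

query = '고양이'
query_vec = sgram.get_wordvector(sess = sess, word_dic = word_dic, word = query)[0]
sims = [(word, cosine_sim(query_vec, sgram.get_wordvector(sess = sess, word_dic = word_dic, word = word)[0]))
        for word in word_list if word != query]
for word, sim in sorted(sims, key = lambda x: x[1], reverse = True)[:5]:
    print('{:10} : {:.3f}'.format(word, sim))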