#!/usr/bin/env python
# coding: utf-8

# ## Using wrappers for Gensim models for working with Keras

# This tutorial is about using gensim models as a part of your Keras models.
#
# The wrappers available (as of now) are:
# * Word2Vec (uses the function ```get_keras_embedding``` defined in ```gensim.models.keyedvectors```)

# ### Word2Vec

# #### Integration with Keras: 20NewsGroups Task

# To see how Gensim's Word2Vec model can be integrated with Keras for a supervised (classification) task, we consider the [20NewsGroups](http://qwone.com/~jason/20Newsgroups/) task. Here we use a smaller version of the dataset, restricting it to three of the newsgroup categories.
#
# First, we import the necessary modules.

# In[163]:

import os
import sys
import keras
import numpy as np

from gensim.models import word2vec

from keras.models import Model
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Input, Dense, Flatten
from keras.layers import Conv1D, MaxPooling1D

from sklearn.datasets import fetch_20newsgroups


# We first load the training data.
# Then we format our text samples and labels into tensors that can be fed into a neural network. To do this, we rely on the Keras utilities `keras.preprocessing.text.Tokenizer`, `keras.preprocessing.sequence.pad_sequences` and `keras.utils.np_utils.to_categorical`.

# In[164]:

dataset = fetch_20newsgroups(subset='train', categories=['alt.atheism', 'comp.graphics', 'sci.space'])

MAX_SEQUENCE_LENGTH = 1000

# Vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset.data)
sequences = tokenizer.texts_to_sequences(dataset.data)

x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
y_train = to_categorical(np.asarray(dataset.target))


# Now we train a Word2Vec model on the documents we have.
# From the word2vec model we construct the embedding layer to be used in our actual Keras model.
#
# The Keras tokenizer maintains an internal vocabulary (a token-to-index mapping), which may differ from the vocabulary gensim builds while training the word2vec model. To align the two vocabularies, we pass the Keras tokenizer's vocabulary to the `get_keras_embedding` function.

# In[165]:

keras_w2v = word2vec.Word2Vec([text_to_word_sequence(doc) for doc in dataset.data], min_count=0)
embedding_layer = keras_w2v.wv.get_keras_embedding(word_index=tokenizer.word_index, train_embeddings=True)


# Finally, we create a small 1D convnet to solve our classification problem.

# In[166]:

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(y_train.shape[1], activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

model.fit(x_train, y_train, epochs=3, validation_split=0.1)


# We see that the model reaches a reasonable accuracy, considering the small size of the dataset.
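# As a quick sanity check (a minimal sketch, not part of the original walkthrough), we can evaluate the trained model on the held-out 20NewsGroups test split. The test documents must go through the same tokenizer and padding; words the tokenizer did not see during `fit_on_texts` are silently dropped by `texts_to_sequences`.

# In[ ]:

test_set = fetch_20newsgroups(subset='test', categories=['alt.atheism', 'comp.graphics', 'sci.space'])

# Reuse the tokenizer fitted on the training documents so word indices match the embedding layer
test_sequences = tokenizer.texts_to_sequences(test_set.data)
x_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
y_test = to_categorical(np.asarray(test_set.target))

test_loss, test_acc = model.evaluate(x_test, y_test)
print('Test accuracy: %.3f' % test_acc)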
# Alternatively, we can use embeddings pretrained on a different, larger corpus (GloVe) to see whether performance improves.

# In[167]:

import gensim.downloader as api

glove_embeddings = api.load("glove-wiki-gigaword-100")


# In[168]:

glove_embedding_layer = glove_embeddings.get_keras_embedding(word_index=tokenizer.word_index, train_embeddings=True)

embedded_sequences = glove_embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(y_train.shape[1], activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

model.fit(x_train, y_train, epochs=3, validation_split=0.1)


# We see that the pretrained embeddings result in faster convergence.
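# To illustrate how the fitted pipeline would be used at prediction time (again a minimal sketch; the example sentence below is made up), we can push a new piece of text through the same tokenizer, padding and model, and map the predicted class index back to a category name via `dataset.target_names`.

# In[ ]:

new_doc = "the space shuttle landed safely after orbiting the earth"  # hypothetical example text

# The new document must be preprocessed exactly like the training data
new_sequence = tokenizer.texts_to_sequences([new_doc])
new_input = pad_sequences(new_sequence, maxlen=MAX_SEQUENCE_LENGTH)

probabilities = model.predict(new_input)
print(dataset.target_names[np.argmax(probabilities[0])])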