#!/usr/bin/env python
# coding: utf-8

# # doc2vec
#
# This is experimental code developed by Tomas Mikolov and [published on the word2vec Google group](https://groups.google.com/d/msg/word2vec-toolkit/Q49FIrNOQRo/J6KG8mUj45sJ).
#
# The input format for `doc2vec` is still one big text document, but every line should be one document prepended with a unique id, for example:
#
# ```
# _*0 This is sentence 1
# _*1 This is sentence 2
# ```
#
# ### Requirements
#
# This notebook requires [`nltk`](http://www.nltk.org/) (including the `punkt` tokenizer models, available via `nltk.download('punkt')`).
#
# Download some data: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz.
#
# You can use `make test-data` from the root of the repo.
#
# ## Preprocess
#
# Merge the data into one big file with one document (and its id) per line, applying basic preprocessing: word tokenization and lowercasing.

# In[1]:

import os
import nltk

# In[2]:

directories = ['train/pos', 'train/neg', 'test/pos', 'test/neg', 'train/unsup']

# In[3]:

input_file = open('../data/alldata.txt', 'w')

# In[4]:

id_ = 0
for directory in directories:
    rootdir = os.path.join('../data/aclImdb', directory)
    for subdir, dirs, files in os.walk(rootdir):
        for file_ in files:
            with open(os.path.join(subdir, file_), "r") as f:
                doc_id = "_*%i" % id_
                id_ += 1

                text = f.read()
                # Each document must stay on a single line in the merged file
                text = text.replace("\n", " ")
                tokens = nltk.word_tokenize(text)
                doc = " ".join(tokens).lower()
                # Drop non-ASCII characters; decode back so we write str, not bytes
                doc = doc.encode("ascii", "ignore").decode("ascii")
                input_file.write("%s %s\n" % (doc_id, doc))

# In[5]:

input_file.close()

# ## doc2vec

# In[ ]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[2]:

import word2vec

# In[ ]:

word2vec.doc2vec('../data/alldata.txt', '../data/doc2vec-vectors.bin', cbow=0, size=100,
                 window=10, negative=5, hs=0, sample='1e-4', threads=12, iter_=20,
                 min_count=1, binary=True, verbose=True)

# ## Prediction
#
# It is possible to load the vectors with the same word-vectors class used for a regular word2vec binary file.

# In[ ]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[2]:

import word2vec

# In[3]:

model = word2vec.load('../data/doc2vec-vectors.bin')

# In[4]:

model.vectors.shape

# The document vectors are identified by the ids we assigned in the preprocessing section; for example, document 1 has the vector:

# In[6]:

model['_*1']

# We can ask for the words or documents most similar to document `1`:

# In[10]:

indexes, metrics = model.similar('_*1')

# In[11]:

model.generate_response(indexes, metrics).tolist()

# Now we just need to match the ids back to the documents created in the preprocessing step, as sketched below.
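# The mapping from ids back to files can be rebuilt by walking the directories in the
# same order as the preprocessing step. This is a minimal sketch, assuming the contents
# of `../data/aclImdb` did not change between the two walks (so `os.walk` yields files
# in the same order); `paths` is a hypothetical helper list introduced here, not part
# of the original notebook.

# In[ ]:

# Rebuild the id -> file mapping: paths[n] is the file that received id "_*n"
paths = []
for directory in directories:
    rootdir = os.path.join('../data/aclImdb', directory)
    for subdir, dirs, files in os.walk(rootdir):
        for file_ in files:
            paths.append(os.path.join(subdir, file_))

# Print the files behind the nearest neighbours of document 1, skipping plain
# word tokens (the vocabulary mixes document ids and regular words)
for token, metric in model.generate_response(indexes, metrics).tolist():
    if token.startswith('_*'):
        print(metric, paths[int(token[2:])])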