#!/usr/bin/env python
# coding: utf-8

# # doc2vec
#
# This is experimental code developed by Tomas Mikolov and [published on the word2vec Google group](https://groups.google.com/d/msg/word2vec-toolkit/Q49FIrNOQRo/J6KG8mUj45sJ).
#
# The input format for `doc2vec` is still one big text document, but every line should be one document prepended with a unique id, for example:
#
# ```
# _*0 This is sentence 1
# _*1 This is sentence 2
# ```
#
# ### Requirements
#
# This notebook requires [`nltk`](http://www.nltk.org/) (including the `punkt` tokenizer models, available via `nltk.download('punkt')`).
#
# Download some data: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz.
#
# You can use `make test-data` from the root of the repo.
#
# ## Preprocess
#
# Merge the data into one big file with one document (and its id) per line, applying basic preprocessing: word tokenization and lowercasing.

# In[1]:

import os
import nltk

# In[2]:

directories = ['train/pos', 'train/neg', 'test/pos', 'test/neg', 'train/unsup']

# In[3]:

input_file = open('../data/alldata.txt', 'w')

# In[4]:

id_ = 0
for directory in directories:
    rootdir = os.path.join('../data/aclImdb', directory)
    for subdir, dirs, files in os.walk(rootdir):
        for file_ in files:
            with open(os.path.join(subdir, file_), "r") as f:
                doc_id = "_*%i" % id_
                id_ += 1

                text = f.read()
                # Each document must stay on a single line in the merged file
                text = text.replace("\n", " ")
                tokens = nltk.word_tokenize(text)
                doc = " ".join(tokens).lower()
                # Drop non-ASCII characters; decode back so we write str, not bytes
                doc = doc.encode("ascii", "ignore").decode("ascii")
                input_file.write("%s %s\n" % (doc_id, doc))

# In[5]:

input_file.close()

# ## doc2vec

# In[ ]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[2]:

import word2vec

# In[ ]:

word2vec.doc2vec('../data/alldata.txt', '../data/doc2vec-vectors.bin', cbow=0, size=100,
                 window=10, negative=5, hs=0, sample='1e-4', threads=12, iter_=20,
                 min_count=1, binary=True, verbose=True)

# ## Prediction
#
# It is possible to load the vectors with the same word-vectors class used for a regular word2vec binary file.

# In[ ]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[2]:

import word2vec

# In[3]:

model = word2vec.load('../data/doc2vec-vectors.bin')

# In[4]:

model.vectors.shape

# The document vectors are identified by the ids we assigned in the preprocessing section; for example, document 1 has the vector:

# In[6]:

model['_*1']

# We can ask for the words or documents most similar to document `1`:

# In[10]:

indexes, metrics = model.similar('_*1')

# In[11]:

model.generate_response(indexes, metrics).tolist()

# Now we just need to match the ids back to the documents created in the preprocessing step, as sketched below.
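# The mapping from ids back to files can be rebuilt by walking the directories in the
# same order as the preprocessing step. This is a minimal sketch, assuming the contents
# of `../data/aclImdb` did not change between the two walks (so `os.walk` yields files
# in the same order); `paths` is a hypothetical helper list introduced here, not part
# of the original notebook.

# In[ ]:

# Rebuild the id -> file mapping: paths[n] is the file that received id "_*n"
paths = []
for directory in directories:
    rootdir = os.path.join('../data/aclImdb', directory)
    for subdir, dirs, files in os.walk(rootdir):
        for file_ in files:
            paths.append(os.path.join(subdir, file_))

# Print the files behind the nearest neighbours of document 1, skipping plain
# word tokens (the vocabulary mixes document ids and regular words)
for token, metric in model.generate_response(indexes, metrics).tolist():
    if token.startswith('_*'):
        print(metric, paths[int(token[2:])])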