#!/usr/bin/env python # coding: utf-8 # # Similarity Queries using Nmslib Tutorial # This tutorial is about using the ([Non-Metric Space Library (NMSLIB)](https://github.com/nmslib/nmslib "Link to nmslib repo")) library for similarity queries with a Word2Vec model built with gensim. # # ## Why use Nmslib? # The current implementation for finding k nearest neighbors in a vector space in gensim has linear complexity via brute force in the number of indexed documents, although with extremely low constant factors. The retrieved results are exact, which is an overkill in many applications: approximate results retrieved in sub-linear time may be enough. Nmslib can find approximate nearest neighbors much faster. # Compared to annoy, nmslib has more parameteres to control the build and query time and accuracy. Nmslib can achieve faster and more accurate nearest neighbors search than annoy. This figure shows a comparison between annoy and nmslib indexer with differents parameters. This shows nmslib is better than annoy. # ![nmslib.png](nmslib.png) # # ## Prerequisites # Additional libraries needed for this tutorial: # - nmslib # - annoy # - psutil # - matplotlib # # ## Outline # 1. Download Text8 Corpus # 2. Build Word2Vec Model # 3. Construct NmslibIndex with model & make a similarity query # 4. Verify & Evaluate performance # 5. Evaluate relationship of parameters to initialization/query time and accuracy, compared with annoy # 6. Work with Google's word2vec C formats # In[5]: # pip install watermark get_ipython().run_line_magic('reload_ext', 'watermark') get_ipython().run_line_magic('watermark', '-v -m -p gensim,numpy,scipy,psutil,matplotlib') # ### 1. Download Text8 Corpus # In[2]: import os.path if not os.path.isfile('text8'): get_ipython().system('wget -c http://mattmahoney.net/dc/text8.zip') get_ipython().system('unzip text8.zip') # #### Import & Set up Logging # I'm not going to set up logging due to the verbose input displaying in notebooks, but if you want that, uncomment the lines in the cell below. # In[6]: LOGS = False if LOGS: import logging logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # ### 2. Build Word2Vec Model # In[57]: from gensim.models import Word2Vec, KeyedVectors from gensim.models.word2vec import Text8Corpus # Using params from Word2Vec_FastText_Comparison params = { 'alpha': 0.05, 'size': 100, 'window': 5, 'iter': 5, 'min_count': 5, 'sample': 1e-4, 'sg': 1, 'hs': 0, 'negative': 5 } model = Word2Vec(Text8Corpus('text8'), **params) print(model) # See the [Word2Vec tutorial](word2vec.ipynb) for how to initialize and save this model. # #### Comparing the traditional implementation, Annoy and Nmslib approximation # In[14]: # Set up the model and vector that we are using in the comparison from gensim.similarities.index import AnnoyIndexer from gensim.similarities.nmslib import NmslibIndexer model.init_sims() annoy_index = AnnoyIndexer(model, 300) nmslib_index = NmslibIndexer(model, {'M': 100, 'indexThreadQty': 1, 'efConstruction': 100}, {'efSearch': 10}) # In[15]: # Dry run to make sure both indices are fully in RAM vector = model.wv.syn0norm[0] print(model.most_similar([vector], topn=5, indexer=annoy_index)) print(model.most_similar([vector], topn=5, indexer=nmslib_index)) print(model.most_similar([vector], topn=5)) # In[7]: import time import numpy as np # In[8]: def avg_query_time(annoy_index=None, queries=1000): """ Average query time of a most_similar method over 1000 random queries, uses annoy if given an indexer """ total_time = 0 for _ in range(queries): rand_vec = model.wv.syn0norm[np.random.randint(0, len(model.wv.vocab))] start_time = time.clock() model.most_similar([rand_vec], topn=5, indexer=annoy_index) total_time += time.clock() - start_time return total_time / queries # In[18]: queries = 10000 gensim_time = avg_query_time(queries=queries) annoy_time = avg_query_time(annoy_index, queries=queries) nmslib_time = avg_query_time(nmslib_index, queries=queries) print("Gensim (s/query):\t{0:.5f}".format(gensim_time)) print("Annoy (s/query):\t{0:.5f}".format(annoy_time)) print("Nmslib (s/query):\t{0:.5f}".format(nmslib_time)) speed_improvement_gensim = gensim_time / nmslib_time speed_improvement_annoy = annoy_time / nmslib_time print ("\nNmslib is {0:.2f} times faster on average on this particular run".format(speed_improvement_gensim)) print ("\nNmslib is {0:.2f} times faster on average than annoy on this particular run".format(speed_improvement_annoy)) # ## 3. Construct Nmslib Index with model & make a similarity query # # ### Creating an indexer # An instance of `NmslibIndexer` needs to be created in order to use Nmslib in gensim. The `NmslibIndexer` class is located in `gensim.similarities.nmslib` # # `NmslibIndexer()` takes three parameters: # # **`model`**: A `Word2Vec` or `Doc2Vec` model # # **`index_params`**: Parameters for building nmslib indexer. `index_params` effects the build time and the index size. The example is `{'M': 100, 'indexThreadQty': 1, 'efConstruction': 100}`. Increasing the value of `M` and `efConstruction` improves the accuracy of search. However this also leads to longer indexing times. `indexThreadQty` is the number of thread. # # **`query_time_params`**: Parameters for querying on nmslib indexer. `query_time_params` effects the query time and the search accuracy. The example is `{'efSearch': 100}`. A larger `efSearch` will give more accurate results, but larger query time. # # More information can be found [here](https://github.com/nmslib/nmslib/blob/master/manual/methods.md). The relationship between parameters, build/query time, and accuracy will be investigated later in the tutorial. # # Now that we are ready to make a query, lets find the top 5 most similar words to "science" in the Text8 corpus. To make a similarity query we call `Word2Vec.most_similar` like we would traditionally, but with an added parameter, `indexer`. The only supported indexerers in gensim as of now are Annoy and Nmslib. # In[19]: # Building nmslib indexer nmslib_index = NmslibIndexer(model, {'M': 100, 'indexThreadQty': 1, 'efConstruction': 100}, {'efSearch': 10}) # Derive the vector for the word "science" in our model vector = model["science"] # The instance of AnnoyIndexer we just created is passed approximate_neighbors = model.most_similar([vector], topn=11, indexer=nmslib_index) # Neatly print the approximate_neighbors and their corresponding cosine similarity values print("Approximate Neighbors") for neighbor in approximate_neighbors: print(neighbor) normal_neighbors = model.most_similar([vector], topn=11) print("\nNormal (not nmslib-indexed) Neighbors") for neighbor in normal_neighbors: print(neighbor) # #### Analyzing the results # The closer the cosine similarity of a vector is to 1, the more similar that word is to our query, which was the vector for "science". In this case the results are almostly same. # ### 4. Verify & Evaluate performance # #### Persisting Indexes # You can save and load your indexes from/to disk to prevent having to construct them each time. This will create two files on disk, _fname_ and _fname.d_. Both files are needed to correctly restore all attributes. # In[23]: import os fname = '/tmp/mymodel.index' # Persist index to disk nmslib_index.save(fname) # Load index back if os.path.exists(fname): nmslib_index2 = NmslibIndexer.load(fname) nmslib_index2.model = model # In[24]: # Results should be identical to above vector = model["science"] approximate_neighbors2 = model.most_similar([vector], topn=11, indexer=nmslib_index2) for neighbor in approximate_neighbors2: print(neighbor) assert approximate_neighbors == approximate_neighbors2 # Be sure to use the same model at load that was used originally, otherwise you will get unexpected behaviors. # #### Save memory by memory-mapping indices saved to disk # # Nmslib library has a useful feature that indices can be memory-mapped from disk. It saves memory when the same index is used by several processes. # # Below are two snippets of code. First one has a separate index for each process. The second snipped shares the index between two processes via memory-mapping. The second example uses less total RAM as it is shared. # In[26]: # Remove verbosity from code below (if logging active) if LOGS: logging.disable(logging.CRITICAL) # In[27]: from multiprocessing import Process import psutil # #### Bad Example: Two processes load the Word2vec model from disk and create there own Nmslib indices from that model. # In[28]: get_ipython().run_cell_magic('time', '', '\nmodel.save(\'/tmp/mymodel.pkl\')\n\ndef f(process_id):\n print(\'Process Id: {}\'.format(os.getpid()))\n process = psutil.Process(os.getpid())\n new_model = Word2Vec.load(\'/tmp/mymodel.pkl\')\n vector = new_model["science"]\n nmslib_index = NmslibIndexer(new_model, {\'M\': 100, \'indexThreadQty\': 1, \'efConstruction\': 100}, {\'efSearch\': 10})\n approximate_neighbors = new_model.most_similar([vector], topn=5, indexer=nmslib_index)\n print(\'\\nMemory used by process {}: {}\\n---\'.format(os.getpid(), process.memory_info()))\n\n# Creating and running two parallel process to share the same index file.\np1 = Process(target=f, args=(\'1\',))\np1.start()\np1.join()\np2 = Process(target=f, args=(\'2\',))\np2.start()\np2.join()\n') # #### Good example. Two processes load both the Word2vec model and index from disk and memory-map the index # # In[29]: get_ipython().run_cell_magic('time', '', '\nmodel.save(\'/tmp/mymodel.pkl\')\n\ndef f(process_id):\n print(\'Process Id: {}\'.format(os.getpid()))\n process = psutil.Process(os.getpid())\n new_model = Word2Vec.load(\'/tmp/mymodel.pkl\')\n vector = new_model["science"]\n nmslib_index = NmslibIndexer.load(\'/tmp/mymodel.index\')\n nmslib_index.model = new_model\n approximate_neighbors = new_model.most_similar([vector], topn=5, indexer=nmslib_index)\n print(\'\\nMemory used by process {}: {}\\n---\'.format(os.getpid(), process.memory_info()))\n\n# Creating and running two parallel process to share the same index file.\np1 = Process(target=f, args=(\'1\',))\np1.start()\np1.join()\np2 = Process(target=f, args=(\'2\',))\np2.start()\np2.join()\n') # ### 5. Evaluate relationship of parameters to initialization/query time and accuracy, compared with annoy # # In[9]: import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') # #### Build dataset of Initialization times and accuracy measures # In[14]: exact_results = [element[0] for element in model.most_similar([model.wv.syn0norm[0]], topn=100)] # In[48]: # For calculating query time queries = 1000 # In[49]: def create_evaluation_graph(x_values, y_values_init, y_values_accuracy, y_values_query, param_name): plt.figure(1, figsize=(12, 6)) plt.subplot(231) plt.plot(x_values, y_values_init) plt.title("{} vs initalization time".format(param_name)) plt.ylabel("Initialization time (s)") plt.xlabel(param_name) plt.subplot(232) plt.plot(x_values, y_values_accuracy) plt.title("{} vs accuracy".format(param_name)) plt.ylabel("% accuracy") plt.xlabel(param_name) plt.tight_layout() plt.subplot(233) plt.plot(y_values_init, y_values_accuracy) plt.title("Initialization time vs accuracy") plt.ylabel("% accuracy") plt.xlabel("Initialization time (s)") plt.tight_layout() plt.subplot(234) plt.plot(x_values, y_values_query) plt.title("{} vs query time".format(param_name)) plt.ylabel("query time") plt.xlabel(param_name) plt.tight_layout() plt.subplot(235) plt.plot(y_values_query, y_values_accuracy) plt.title("query time vs accuracy") plt.ylabel("% accuracy") plt.xlabel("query time (s)") plt.tight_layout() plt.show() # In[50]: def evaluate_nmslib_performance(parameter, is_parameter_query, parameter_start, parameter_end, parameter_step): nmslib_x_values = [] nmslib_y_values_init = [] nmslib_y_values_accuracy = [] nmslib_y_values_query = [] index_params = {'M': 100, 'indexThreadQty': 10, 'efConstruction': 100, 'post': 0} query_params = {'efSearch': 100} for x in range(parameter_start, parameter_end, parameter_step): nmslib_x_values.append(x) start_time = time.time() if is_parameter_query: query_params[parameter] = x else: index_params[parameter] = x nmslib_index = NmslibIndexer(model , index_params , query_params) nmslib_y_values_init.append(time.time() - start_time) approximate_results = model.most_similar([model.wv.syn0norm[0]], topn=100, indexer=nmslib_index) top_words = [result[0] for result in approximate_results] nmslib_y_values_accuracy.append(len(set(top_words).intersection(exact_results))) nmslib_y_values_query.append(avg_query_time(nmslib_index, queries=queries)) create_evaluation_graph(nmslib_x_values, nmslib_y_values_init, nmslib_y_values_accuracy, nmslib_y_values_query, parameter) # In[51]: # Evaluate nmslib indexer, changing the parameter M evaluate_nmslib_performance("M", False, 50, 401, 50) # In[52]: # Evaluate nmslib indexer, changing the parameter efConstruction evaluate_nmslib_performance("efConstruction", False, 50, 1001, 100) # In[53]: # Evaluate nmslib indexer, changing the parameter efSearch evaluate_nmslib_performance("efSearch", True, 50, 401, 100) # In[54]: # Evaluate annoy indexer, changing the parameter num_tree annoy_x_values = [] annoy_y_values_init = [] annoy_y_values_accuracy = [] annoy_y_values_query = [] for x in range(100, 401, 50): annoy_x_values.append(x) start_time = time.time() annoy_index = AnnoyIndexer(model, x) annoy_y_values_init.append(time.time() - start_time) approximate_results = model.most_similar([model.wv.syn0norm[0]], topn=100, indexer=annoy_index) top_words = [result[0] for result in approximate_results] annoy_y_values_accuracy.append(len(set(top_words).intersection(exact_results))) annoy_y_values_query.append(avg_query_time(annoy_index, queries=queries)) create_evaluation_graph(annoy_x_values, annoy_y_values_init, annoy_y_values_accuracy, annoy_y_values_query, "num_tree") # In[55]: # nmslib indexer changing the parameter M, efConstruction, efSearch nmslib_y_values_init = [] nmslib_y_values_accuracy = [] nmslib_y_values_query = [] for M in [100, 200]: for efConstruction in [100, 200]: for efSearch in [100, 200]: start_time = time.time() nmslib_index = NmslibIndexer(model, {'M': M, 'indexThreadQty': 10, 'efConstruction': efConstruction, 'post': 0}, {'efSearch': efSearch}) nmslib_y_values_init.append(time.time() - start_time) approximate_results = model.most_similar([model.wv.syn0norm[0]], topn=100, indexer=nmslib_index) top_words = [result[0] for result in approximate_results] nmslib_y_values_accuracy.append(len(set(top_words).intersection(exact_results))) nmslib_y_values_query.append(avg_query_time(nmslib_index, queries=queries)) # In[56]: # Make a comparison between annoy and nmslib indexer plt.figure(1, figsize=(12, 6)) plt.subplot(121) plt.scatter(nmslib_y_values_init, nmslib_y_values_accuracy, label="nmslib", color='r', marker='o') plt.scatter(annoy_y_values_init, annoy_y_values_accuracy, label="annoy", color='b', marker='x') plt.legend() plt.title("Initialization time vs accuracy. Upper left is better.") plt.ylabel("% accuracy") plt.xlabel("Initialization time (s)") plt.subplot(122) plt.scatter(nmslib_y_values_query, nmslib_y_values_accuracy, label="nmslib", color='r', marker='o') plt.scatter(annoy_y_values_query, annoy_y_values_accuracy, label="annoy", color='b', marker='x') plt.legend() plt.title("Query time vs accuracy. Upper left is better.") plt.ylabel("% accuracy") plt.xlabel("Query time (s)") plt.xlim(min(nmslib_y_values_query+annoy_y_values_query), max(nmslib_y_values_query+annoy_y_values_query)) plt.tight_layout() plt.show() # ### 6. Work with Google word2vec files # # Our model can be exported to a word2vec C format. There is a binary and a plain text word2vec format. Both can be read with a variety of other software, or imported back into gensim as a `KeyedVectors` object. # In[74]: # To export our model as text model.wv.save_word2vec_format('/tmp/vectors.txt', binary=False) # In[37]: from smart_open import open # View the first 3 lines of the exported file # The first line has the total number of entries and the vector dimension count. # The next lines have a key (a string) followed by its vector. with open('/tmp/vectors.txt') as myfile: for i in range(3): print(myfile.readline().strip()) # In[38]: # To import a word2vec text model wv = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # In[39]: # To export our model as binary model.wv.save_word2vec_format('/tmp/vectors.bin', binary=True) # In[40]: # To import a word2vec binary model wv = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # In[41]: # To create and save Nmslib Index from a loaded `KeyedVectors` object nmslib_index = NmslibIndexer(wv, {'M': 100, 'indexThreadQty': 1, 'efConstruction': 100}, {'efSearch': 100}) nmslib_index.save('/tmp/mymodel.index') # In[44]: # Load and test the saved word vectors and saved nmslib index wv = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) nmslib_index = NmslibIndexer.load('/tmp/mymodel.index') nmslib_index.model = wv vector = wv["cat"] approximate_neighbors = wv.most_similar([vector], topn=11, indexer=nmslib_index) # Neatly print the approximate_neighbors and their corresponding cosine similarity values print("Approximate Neighbors") for neighbor in approximate_neighbors: print(neighbor) normal_neighbors = wv.most_similar([vector], topn=11) print("\nNormal (not Nmslib-indexed) Neighbors") for neighbor in normal_neighbors: print(neighbor) # ### Recap # In this notebook we used the Nmslib module to build an indexed approximation of our word embeddings. To do so, we did the following steps: # 1. Download Text8 Corpus # 2. Build Word2Vec Model # 3. Construct NmslibIndex with model & make a similarity query # 4. Verify & Evaluate performance # 5. Evaluate relationship of parameters to initialization/query time and accuracy, compared with annoy # 6. Work with Google's word2vec C formats