#!/usr/bin/env python
# coding: utf-8

# # Benchmark: Implement Levenshtein term similarity matrix and fast SCM between corpora ([RaRe-Technologies/gensim PR #2016][#2016])
#
# [#2016]: https://github.com/RaRe-Technologies/gensim/pull/2016 (Implement Levenshtein term similarity matrix and fast SCM between corpora - Pull Request #2016)

# In[1]:


get_ipython().system('git rev-parse HEAD')


# In[2]:


from copy import deepcopy
from datetime import timedelta
from itertools import product
import logging
from math import floor, ceil, log10
import pickle
from random import sample, seed, shuffle
from time import time

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

from gensim.corpora import Dictionary
import gensim.downloader as api
from gensim.similarities.index import AnnoyIndexer
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import UniformTermSimilarityIndex
from gensim.similarities import LevenshteinSimilarityIndex
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.utils import simple_preprocess


def tqdm(iterable, total=None, desc=None):
    """Iterate over `iterable` with a notebook progress bar and one log record per element.

    Parameters
    ----------
    iterable : iterable
        The elements to yield. Must support ``len()`` when `total` is None.
    total : int, optional
        The expected number of elements. Defaults to ``len(iterable)``.
    desc : str, optional
        A label that is shown on the progress bar and written to every log record.

    Yields
    ------
    object
        The elements of `iterable`, in their original order.

    """
    if total is None:
        total = len(iterable)
    # The label is forwarded to the progress bar as well, so that the bars of
    # the individual notebook cells can be told apart.
    for num_done, element in enumerate(tqdm_notebook(iterable, total=total, desc=desc)):
        # `logger` is a module-level name that is looked up at call time.
        logger.info("%s: %d / %d", desc, num_done, total)
        yield element


RANDOM_SEED = 12345

logger = logging.getLogger()
fhandler = logging.FileHandler(filename='matrix_speed.log', mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.INFO)

pd.set_option('display.max_rows', None, 'display.max_seq_items', None)


# In[3]:


def benchmark_results(benchmark, configurations, results_filename):
    """Run a benchmark callable over all configurations and return a list of results.

    Parameters
    ----------
    benchmark : callable tuple -> dict
        A benchmark callable that accepts a configuration and returns results.
    configurations : iterable of tuple
        An iterable of configurations that are used for calling the benchmark function.
    results_filename : str
        A filename of a file that will be used to persistently store the results using
        pickle. If the file exists, then the function will load the stored results
        instead of calling the benchmark callable.

    Returns
    -------
    list
        The return values of the individual invocations of the benchmark callable.
        The configurations are shuffled before they are run, so the results are
        not in the order of `configurations`.

    """
    try:
        # NOTE: pickle must only be used with files that this notebook wrote itself.
        with open(results_filename, "rb") as file:
            results = pickle.load(file)
    except IOError:
        # No stored results: run every configuration in a random order and
        # store the results for later invocations.
        configurations = list(configurations)
        shuffle(configurations)
        results = list(tqdm(
            (benchmark(configuration) for configuration in configurations),
            total=len(configurations), desc="benchmark"))
        with open(results_filename, "wb") as file:
            pickle.dump(results, file)
    return results


# ## Implement Levenshtein term similarity matrix
#
# In Gensim PR [#1827][], we added a base implementation of the soft cosine measure (SCM). The base implementation would create term similarity matrices using a single complex procedure. In the Gensim PR [#2016][], we split the procedure into:
#
# - **TermSimilarityIndex** builder classes that produce the $k$ most similar terms for a given term $t$ that are distinct from $t$ along with the term similarities, and
# - the **SparseTermSimilarityMatrix** director class that constructs term similarity matrices and consumes term similarities produced by **TermSimilarityIndex** instances.
#
# One of the benefits of this separation is that we can easily measure the speed at which a **TermSimilarityIndex** builder class produces term similarities and compare this speed with the speed at which the **SparseTermSimilarityMatrix** director class consumes term similarities. This allows us to see which of the classes is a bottleneck that slows down the construction of term similarity matrices.
#
# In this notebook, we measure all the currently available builder and director classes. For the measurements, we use the [Google News word embeddings][word2vec-google-news-300] distributed with the C implementation of Word2Vec. From the word embeddings, we will derive a dictionary of 2.01M terms.
#
# [word2vec-google-news-300]: https://github.com/mmihaltz/word2vec-GoogleNews-vectors (word2vec-GoogleNews-vectors)
# [#1827]: https://github.com/RaRe-Technologies/gensim/pull/1827 (Implement Soft Cosine Measure - Pull Request #1827)
# [#2016]: https://github.com/RaRe-Technologies/gensim/pull/2016 (Implement Levenshtein term similarity matrix and fast SCM between corpora - Pull Request #2016)

# In[4]:


full_model = api.load("word2vec-google-news-300")

# Reuse the stored dictionary when there is one; otherwise build it from the
# vocabulary of the embeddings (one single-term document per term) and store it.
try:
    full_dictionary = Dictionary.load("matrix_speed.dictionary")
except IOError:
    full_dictionary = Dictionary(
        [[term] for term in full_model.vocab.keys()]
    )
    full_dictionary.save("matrix_speed.dictionary")


# ### Director class benchmark
# #### SparseTermSimilarityMatrix
# First, we measure the speed at which the **SparseTermSimilarityMatrix** director class consumes term similarities.
# In[5]:


def benchmark(configuration):
    """Time the construction of a term similarity matrix from a uniform index.

    Parameters
    ----------
    configuration : tuple
        A ``(dictionary, nonzero_limit, symmetric, positive_definite, repetition)`` tuple.

    Returns
    -------
    dict
        The configuration values together with the number of nonzero matrix
        elements and the construction duration in seconds.

    """
    dictionary, nonzero_limit, symmetric, positive_definite, repetition = configuration
    # The index is built outside the timed region; only the matrix is measured.
    index = UniformTermSimilarityIndex(dictionary)

    started = time()
    matrix = SparseTermSimilarityMatrix(
        index,
        dictionary,
        nonzero_limit=nonzero_limit,
        symmetric=symmetric,
        positive_definite=positive_definite,
        dtype=np.float16,
    ).matrix
    duration = time() - started

    return {
        "dictionary_size": len(dictionary),
        "nonzero_limit": nonzero_limit,
        "matrix_nonzero": matrix.nnz,
        "repetition": repetition,
        "symmetric": symmetric,
        "positive_definite": positive_definite,
        "duration": duration,
    }


# In[6]:


# Powers of ten from 10^3 up to, but not including, the first power of ten
# that reaches the size of the full dictionary.
exponent_limit = int(ceil(log10(len(full_dictionary))))
dictionary_sizes = [10**k for k in range(3, exponent_limit)]
seed(RANDOM_SEED)
dictionaries = []
for size in tqdm(dictionary_sizes, desc="dictionaries"):
    dictionary = Dictionary([sample(list(full_dictionary.values()), size)])
    dictionaries.append(dictionary)
dictionaries.append(full_dictionary)
nonzero_limits = [1, 10, 100]
symmetry = (True, False)
positive_definiteness = (True, False)
repetitions = range(10)

configurations = product(
    dictionaries, nonzero_limits, symmetry, positive_definiteness, repetitions)
results = benchmark_results(benchmark, configurations, "matrix_speed.director_results")


# The following tables show how long it takes to construct a term similarity matrix (the **duration** column), how many nonzero elements there are in the matrix (the **matrix_nonzero** column) and the mean term similarity consumption speed (the **consumption_speed** column) as we vary the dictionary size (the **dictionary_size** column), the maximum number of nonzero elements outside the diagonal in every column of the matrix (the **nonzero_limit** column), the matrix symmetry constraint (the **symmetric** column), and the matrix positive definiteness constraint (the **positive_definite** column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.
#
# We can see that the symmetry and positive definiteness constraints severely limit the number of nonzero elements in the resulting matrix. This in turn increases the consumption speed, since we end up throwing away most of the elements that we consume. The effects of the dictionary size on the mean term similarity consumption speed are minor to none.

# In[7]:


df = pd.DataFrame(results)
df["consumption_speed"] = df["dictionary_size"] * df["nonzero_limit"] / df["duration"]
df = df.groupby(["dictionary_size", "nonzero_limit", "symmetric", "positive_definite"])


def display(df):
    """Format the director benchmark columns of `df` for reading.

    The ``duration``, ``matrix_nonzero`` and ``consumption_speed`` columns are
    overwritten in place with timedeltas, integers and formatted strings.

    Parameters
    ----------
    df : pandas.DataFrame
        An aggregated table of benchmark results.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the three columns replaced.

    """
    df["duration"] = [timedelta(seconds=seconds) for seconds in df["duration"]]
    df["matrix_nonzero"] = [int(count) for count in df["matrix_nonzero"]]
    df["consumption_speed"] = [
        "%.02f Kword pairs / s" % (pairs_per_second / 1000)
        for pairs_per_second in df["consumption_speed"]
    ]
    return df


# In[8]:


display(df.mean()).loc[
    [10000, len(full_dictionary)], :, :
].loc[
    :, ["duration", "matrix_nonzero", "consumption_speed"]
]


# In[9]:


display(df.apply(lambda x: (x - x.mean()).std())).loc[
    [10000, len(full_dictionary)], :, :
].loc[
    :, ["duration", "matrix_nonzero", "consumption_speed"]
]


# ### Builder class benchmark
# #### UniformTermSimilarityIndex
# First, we measure the speed at which the **UniformTermSimilarityIndex** builder class produces term similarities. **UniformTermSimilarityIndex** is a dummy class that just generates a sequence of constants. It produces many more term similarities per second than the **SparseTermSimilarityMatrix** is capable of consuming and its results will serve as an upper limit.
# In[10]:


def benchmark(configuration):
    """Time the construction of a uniform index and the retrieval of its similarities.

    Parameters
    ----------
    configuration : tuple
        A ``(dictionary, nonzero_limit, repetition)`` tuple.

    Returns
    -------
    dict
        The configuration values together with the constructor and production
        durations in seconds.

    """
    dictionary, nonzero_limit, repetition = configuration

    construction_started = time()
    index = UniformTermSimilarityIndex(dictionary)
    constructor_duration = time() - construction_started

    production_started = time()
    for term in dictionary.values():
        # Pairing with range() stops the consumption once `nonzero_limit`
        # similarities have been taken for the term.
        for _similarity, _position in zip(
                index.most_similar(term, topn=nonzero_limit), range(nonzero_limit)):
            pass
    production_duration = time() - production_started

    return {
        "dictionary_size": len(dictionary),
        "nonzero_limit": nonzero_limit,
        "repetition": repetition,
        "constructor_duration": constructor_duration,
        "production_duration": production_duration,
    }


# In[11]:


nonzero_limits = [1, 10, 100, 1000]

configurations = product(dictionaries, nonzero_limits, repetitions)
results = benchmark_results(
    benchmark, configurations, "matrix_speed.builder_results.uniform")


# The following tables show how long it takes to retrieve the most similar terms for all terms in a dictionary (the **production_duration** column) and the mean term similarity production speed (the **production_speed** column) as we vary the dictionary size (the **dictionary_size** column), and the maximum number of most similar terms that will be retrieved (the **nonzero_limit** column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.
#
# The **production_speed** is proportional to **nonzero_limit**.
# In[12]:


df = pd.DataFrame(results)
df["processing_speed"] = df["dictionary_size"] ** 2 / df["production_duration"]
df["production_speed"] = df["dictionary_size"] * df["nonzero_limit"] / df["production_duration"]
df = df.groupby(["dictionary_size", "nonzero_limit"])


def display(df):
    """Format the builder benchmark columns of `df` for reading.

    The two duration columns are overwritten in place with timedeltas and the
    two speed columns with formatted strings.

    Parameters
    ----------
    df : pandas.DataFrame
        An aggregated table of benchmark results.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the four columns replaced.

    """
    for column in ("constructor_duration", "production_duration"):
        df[column] = [timedelta(seconds=seconds) for seconds in df[column]]
    for column in ("processing_speed", "production_speed"):
        df[column] = [
            "%.02f Kword pairs / s" % (pairs_per_second / 1000)
            for pairs_per_second in df[column]
        ]
    return df


# In[13]:


display(df.mean()).loc[
    [1000, len(full_dictionary)], :, :
].loc[
    :, ["production_duration", "production_speed"]
]


# In[14]:


display(df.apply(lambda x: (x - x.mean()).std())).loc[
    [1000, len(full_dictionary)], :, :
].loc[
    :, ["production_duration", "production_speed"]
]


# #### LevenshteinSimilarityIndex
# Next, we measure the speed at which the **LevenshteinSimilarityIndex** builder class produces term similarities. **LevenshteinSimilarityIndex** is currently just a naïve implementation that produces far fewer term similarities per second than the **SparseTermSimilarityMatrix** class is capable of consuming.
# In[15]:


def benchmark(configuration):
    """Time the construction of a Levenshtein index and the retrieval of its similarities.

    Parameters
    ----------
    configuration : tuple
        A ``(dictionary, nonzero_limit, query_terms, repetition)`` tuple, where
        `query_terms` are the terms whose most similar terms are retrieved.

    Returns
    -------
    dict
        The configuration values, the mean length of the query terms, and the
        constructor and production durations in seconds.

    """
    dictionary, nonzero_limit, query_terms, repetition = configuration

    start_time = time()
    index = LevenshteinSimilarityIndex(dictionary)
    end_time = time()
    constructor_duration = end_time - start_time

    start_time = time()
    for term in query_terms:
        # Pairing with range() stops the consumption once `nonzero_limit`
        # similarities have been taken for the term.
        for _j, _k in zip(index.most_similar(term, topn=nonzero_limit), range(nonzero_limit)):
            pass
    end_time = time()
    production_duration = end_time - start_time

    return {
        "dictionary_size": len(dictionary),
        "mean_query_term_length": np.mean([len(term) for term in query_terms]),
        "nonzero_limit": nonzero_limit,
        "repetition": repetition,
        "constructor_duration": constructor_duration,
        "production_duration": production_duration,
    }


# In[16]:


nonzero_limits = [1, 10, 100]
seed(RANDOM_SEED)
# The query terms come from the smallest dictionary. min() with key=len
# compares sizes only, so it never has to order the Dictionary objects.
min_dictionary = min(dictionaries, key=len)
query_terms = sample(list(min_dictionary.values()), 10)

configurations = product(dictionaries, nonzero_limits, [query_terms], repetitions)
results = benchmark_results(benchmark, configurations, "matrix_speed.builder_results.levenshtein")


# The following tables show how long it takes to retrieve the most similar terms for ten randomly sampled terms from a dictionary (the **production_duration** column), the mean term similarity production speed (the **production_speed** column) and the mean term similarity processing speed (the **processing_speed** column) as we vary the dictionary size (the **dictionary_size** column), and the maximum number of most similar terms that will be retrieved (the **nonzero_limit** column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.
#
# The **production_speed** is proportional to **nonzero_limit / dictionary_size**. The **processing_speed** is constant.
# In[17]:


df = pd.DataFrame(results)
df["processing_speed"] = df["dictionary_size"] * len(query_terms) / df["production_duration"]
df["production_speed"] = df["nonzero_limit"] * len(query_terms) / df["production_duration"]
df = df.groupby(["dictionary_size", "nonzero_limit"])


def display(df):
    """Format the Levenshtein benchmark columns of `df` for reading.

    The two duration columns are overwritten in place with timedeltas. The
    processing speed is formatted in thousands of word pairs per second and
    the production speed in word pairs per second.

    Parameters
    ----------
    df : pandas.DataFrame
        An aggregated table of benchmark results.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the four columns replaced.

    """
    for column in ("constructor_duration", "production_duration"):
        df[column] = [timedelta(seconds=seconds) for seconds in df[column]]
    df["processing_speed"] = [
        "%.02f Kword pairs / s" % (pairs_per_second / 1000)
        for pairs_per_second in df["processing_speed"]
    ]
    df["production_speed"] = [
        "%.02f word pairs / s" % pairs_per_second
        for pairs_per_second in df["production_speed"]
    ]
    return df


# In[18]:


display(df.mean()).loc[
    [1000, 1000000, len(full_dictionary)], :
].loc[
    :, ["production_duration", "production_speed", "processing_speed"]
]


# In[19]:


display(df.apply(lambda x: (x - x.mean()).std())).loc[
    [1000, 1000000, len(full_dictionary)], :
].loc[
    :, ["production_duration", "production_speed", "processing_speed"]
]


# #### WordEmbeddingSimilarityIndex
# Lastly, we measure the speed at which the **WordEmbeddingSimilarityIndex** builder class constructs an instance and produces term similarities. Gensim currently supports slow and precise nearest neighbor search, and also approximate nearest neighbor search using [ANNOY][]. We evaluate both options.
#
# [ANNOY]: https://github.com/spotify/annoy (Approximate Nearest Neighbors in C++/Python optimized for memory usage and loading/saving to disk)

# In[20]:


def benchmark(configuration):
    """Time the construction of a word embedding index and the retrieval of its similarities.

    Parameters
    ----------
    configuration : tuple
        A ``((model, dictionary), nonzero_limit, annoy_n_trees, query_terms, repetition)``
        tuple. An ANNOY indexer is used only when `annoy_n_trees` is positive.

    Returns
    -------
    dict
        The configuration values, the mean length of the query terms, and the
        constructor and production durations in seconds.

    """
    (model, dictionary), nonzero_limit, annoy_n_trees, query_terms, repetition = configuration
    use_annoy = annoy_n_trees > 0
    # Called before the timer starts, so it is not part of the constructor duration.
    model.init_sims()

    construction_started = time()
    # The constructor duration covers the ANNOY indexer (when used) and the index.
    kwargs = {"indexer": AnnoyIndexer(model, annoy_n_trees)} if use_annoy else {}
    index = WordEmbeddingSimilarityIndex(model, kwargs=kwargs)
    constructor_duration = time() - construction_started

    production_started = time()
    for term in query_terms:
        # Pairing with range() stops the consumption once `nonzero_limit`
        # similarities have been taken for the term.
        for _similarity, _position in zip(
                index.most_similar(term, topn=nonzero_limit), range(nonzero_limit)):
            pass
    production_duration = time() - production_started

    return {
        "dictionary_size": len(dictionary),
        "mean_query_term_length": np.mean([len(term) for term in query_terms]),
        "nonzero_limit": nonzero_limit,
        "use_annoy": use_annoy,
        "annoy_n_trees": annoy_n_trees,
        "repetition": repetition,
        "constructor_duration": constructor_duration,
        "production_duration": production_duration,
    }


# In[21]:


# Build one model per dictionary: the full model for the full dictionary, and
# otherwise a model restricted to the terms of the dictionary.
models = []
for dictionary in tqdm(dictionaries, desc="models"):
    if dictionary == full_dictionary:
        model = full_model
    else:
        model = full_model.__class__(full_model.vector_size)
        model.vocab = {
            word: deepcopy(full_model.vocab[word])
            for word in dictionary.values()
        }
        model.index2entity = []
        vector_indices = []
        # Keep the retained words in the order of the full model and renumber them.
        for index, word in enumerate(full_model.index2entity):
            if word not in model.vocab.keys():
                continue
            model.index2entity.append(word)
            model.vocab[word].index = len(vector_indices)
            vector_indices.append(index)
        model.vectors = full_model.vectors[vector_indices]
    models.append(model)
annoy_n_trees = [0] + [10**k for k in range(3)]
seed(RANDOM_SEED)
query_terms = sample(list(min_dictionary.values()), 1000)

configurations = product(
    zip(models, dictionaries), nonzero_limits, annoy_n_trees, [query_terms], repetitions)
results = benchmark_results(
    benchmark, configurations, "matrix_speed.builder_results.wordembeddings")


# The following tables show how long it takes to construct an ANNOY index and the builder class instance (the **constructor_duration** column), how long it takes to retrieve the most similar terms for 1,000 randomly sampled terms from a dictionary (the **production_duration** column), the mean term similarity production speed (the **production_speed** column) and the mean term similarity processing speed (the **processing_speed** column) as we vary the dictionary size (the **dictionary_size** column), the maximum number of most similar terms that will be retrieved (the **nonzero_limit** column), and the number of constructed ANNOY trees (the **annoy_n_trees** column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.
#
# If we do not use ANNOY (**annoy_n_trees**${}=0$), then **production_speed** is proportional to **nonzero_limit / dictionary_size**.
# If we do use ANNOY (**annoy_n_trees**${}>0$), then **production_speed** is proportional to **nonzero_limit / (annoy_n_trees)**${}^{1/2}$.
# In[22]:


df = pd.DataFrame(results)
df["processing_speed"] = df["dictionary_size"] * len(query_terms) / df["production_duration"]
df["production_speed"] = df["nonzero_limit"] * len(query_terms) / df["production_duration"]
df = df.groupby(["dictionary_size", "nonzero_limit", "annoy_n_trees"])


def display(df):
    """Format the word embedding benchmark columns of `df` for reading.

    The two duration columns are overwritten in place with timedeltas and the
    two speed columns with formatted strings.

    Parameters
    ----------
    df : pandas.DataFrame
        An aggregated table of benchmark results.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the four columns replaced.

    """
    for column in ("constructor_duration", "production_duration"):
        df[column] = [timedelta(seconds=seconds) for seconds in df[column]]
    for column in ("processing_speed", "production_speed"):
        df[column] = [
            "%.02f Kword pairs / s" % (pairs_per_second / 1000)
            for pairs_per_second in df[column]
        ]
    return df


# In[23]:


display(df.mean()).loc[
    [1000000, len(full_dictionary)], [1, 100], [0, 1, 100]
].loc[
    :, ["constructor_duration", "production_duration", "production_speed", "processing_speed"]
]


# In[24]:


display(df.apply(lambda x: (x - x.mean()).std())).loc[
    [1000000, len(full_dictionary)], [1, 100], [0, 1, 100]
].loc[
    :, ["constructor_duration", "production_duration", "production_speed", "processing_speed"]
]


# ## Implement fast SCM between corpora
#
# In Gensim PR [#1827][], we added a base implementation of the soft cosine measure (SCM). The base implementation would compute SCM between single documents using the **softcossim** function. In the Gensim PR [#2016][], we introduced the **SparseTermSimilarityMatrix.inner_product** method, which computes SCM not only between single documents, but also between a document and a corpus, and between two corpora.
#
# For the measurements, we use the [Google News word embeddings][word2vec-google-news-300] distributed with the C implementation of Word2Vec. From the word embeddings, we will derive a dictionary of 2.01M terms. As a corpus, we will use a random sample of 100K articles from the 4.92M English [Wikipedia articles][enwiki].
#
# [word2vec-google-news-300]: https://github.com/mmihaltz/word2vec-GoogleNews-vectors (word2vec-GoogleNews-vectors)
# [enwiki]: https://github.com/RaRe-Technologies/gensim-data/releases/tag/wiki-english-20171001 (wiki-english-20171001)
# [#1827]: https://github.com/RaRe-Technologies/gensim/pull/1827 (Implement Soft Cosine Measure - Pull Request #1827)
# [#2016]: https://github.com/RaRe-Technologies/gensim/pull/2016 (Implement Levenshtein term similarity matrix and fast SCM between corpora - Pull Request #2016)

# In[25]:


full_model = api.load("word2vec-google-news-300")

# Reuse the stored corpus when there is one; otherwise sample 10^5 articles,
# preprocess their joined section texts, and store the result.
try:
    with open("matrix_speed.corpus", "rb") as file:
        full_corpus = pickle.load(file)
except IOError:
    original_corpus = list(
        tqdm(api.load("wiki-english-20171001"), desc="original_corpus", total=4924894))
    seed(RANDOM_SEED)
    full_corpus = [
        simple_preprocess(u'\n'.join(article["section_texts"]))
        for article in tqdm(
            sample(original_corpus, 10**5), desc="full_corpus", total=10**5)
    ]
    # Drop the reference to the original corpus before pickling the sample.
    del original_corpus
    with open("matrix_speed.corpus", "wb") as file:
        pickle.dump(full_corpus, file)

# Reuse the stored dictionary when there is one; otherwise build it from the
# vocabulary of the embeddings (one single-term document per term) and store it.
try:
    full_dictionary = Dictionary.load("matrix_speed.dictionary")
except IOError:
    full_dictionary = Dictionary(
        [[term] for term in full_model.vocab.keys()]
    )
    full_dictionary.save("matrix_speed.dictionary")


# ### SCM between two documents
# First, we measure the speed at which the **inner_product** method produces term similarities between single documents.
# In[26]:


def benchmark(configuration):
    """Time `inner_product` between every pair of document vectors in a corpus.

    Parameters
    ----------
    configuration : tuple
        A ``((matrix, dictionary, nonzero_limit), corpus, normalized, repetition)``
        tuple, where `corpus` is a list of tokenized documents.

    Returns
    -------
    dict
        The configuration values, corpus statistics, and the duration in seconds.
        ``corpus_size`` is the number of input documents and ``corpus_actual_size``
        is the number of documents left after the empty vectors are dropped.

    """
    (matrix, dictionary, nonzero_limit), corpus, normalized, repetition = configuration
    corpus_size = len(corpus)
    corpus = [dictionary.doc2bow(doc) for doc in corpus]
    # Drop the documents whose vectors are empty.
    corpus = [vec for vec in corpus if len(vec) > 0]

    start_time = time()
    for vec1 in corpus:
        for vec2 in corpus:
            matrix.inner_product(vec1, vec2, normalized=normalized)
    end_time = time()
    duration = end_time - start_time

    return {
        "dictionary_size": matrix.matrix.shape[0],
        "matrix_nonzero": matrix.matrix.nnz,
        "nonzero_limit": nonzero_limit,
        "normalized": normalized,
        "corpus_size": corpus_size,
        "corpus_actual_size": len(corpus),
        "corpus_nonzero": sum(len(vec) for vec in corpus),
        # `corpus` holds the document vectors here, so this is the mean vector length.
        "mean_document_length": np.mean([len(doc) for doc in corpus]),
        "repetition": repetition,
        "duration": duration,
    }


# In[27]:


seed(RANDOM_SEED)
dictionary_sizes = [1000, 100000]
dictionaries = []
for size in tqdm(dictionary_sizes, desc="dictionaries"):
    dictionary = Dictionary([sample(list(full_dictionary.values()), size)])
    dictionaries.append(dictionary)
# min() with key=len compares sizes only, so it never has to order the
# Dictionary objects themselves.
min_dictionary = min(dictionaries, key=len)
corpus_sizes = [100, 1000]
corpora = []
for size in tqdm(corpus_sizes, desc="corpora"):
    corpus = sample(full_corpus, size)
    corpora.append(corpus)
# Build one model per dictionary: the full model for the full dictionary, and
# otherwise a model restricted to the terms of the dictionary.
models = []
for dictionary in tqdm(dictionaries, desc="models"):
    if dictionary == full_dictionary:
        models.append(full_model)
        continue
    model = full_model.__class__(full_model.vector_size)
    model.vocab = {word: deepcopy(full_model.vocab[word]) for word in dictionary.values()}
    model.index2entity = []
    vector_indices = []
    # Keep the retained words in the order of the full model and renumber them.
    for index, word in enumerate(full_model.index2entity):
        if word in model.vocab.keys():
            model.index2entity.append(word)
            model.vocab[word].index = len(vector_indices)
            vector_indices.append(index)
    model.vectors = full_model.vectors[vector_indices]
    models.append(model)
nonzero_limits = [1, 10, 100]
matrices = []
for (model, dictionary), nonzero_limit in tqdm(
        list(product(zip(models, dictionaries), nonzero_limits)), desc="matrices"):
    annoy = AnnoyIndexer(model, 1)
    index = WordEmbeddingSimilarityIndex(model, kwargs={"indexer": annoy})
    matrix = SparseTermSimilarityMatrix(index, dictionary, nonzero_limit=nonzero_limit)
    matrices.append((matrix, dictionary, nonzero_limit))
    del annoy
normalization = (True, False)
repetitions = range(10)


# In[28]:


configurations = product(matrices, corpora, normalization, repetitions)
results = benchmark_results(benchmark, configurations, "matrix_speed.inner-product_results.doc_doc")


# The following tables show how long it takes to compute the **inner_product** method between all document vectors in a corpus (the **duration** column), how many nonzero elements there are in a corpus matrix (the **corpus_nonzero** column), how many nonzero elements there are in a term similarity matrix (the **matrix_nonzero** column) and the mean document similarity production speed (the **speed** column) as we vary the dictionary size (the **dictionary_size** column), the size of the corpus (the **corpus_size** column), the maximum number of nonzero elements in a single column of the matrix (the **nonzero_limit** column), and whether the inner product is normalized (the **normalized** column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.
#
# The **speed** is proportional to the square of the number of unique terms shared by the two document vectors. In our scenario as well as the standard IR scenario, this means **speed** is constant. Computing a normalized inner product (**normalized**${}={}$True) results in a constant speed decrease.
# In[29]:


df = pd.DataFrame(results)
df["speed"] = df.corpus_actual_size**2 / df.duration
del df["corpus_actual_size"]
df = df.groupby(["dictionary_size", "corpus_size", "nonzero_limit", "normalized"])


def display(df):
    """Format the ``duration`` and ``speed`` columns of `df` for reading.

    Parameters
    ----------
    df : pandas.DataFrame
        An aggregated table of benchmark results.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the two columns overwritten in place.

    """
    df["duration"] = [timedelta(0, duration) for duration in df["duration"]]
    df["speed"] = ["%.02f Kdoc pairs / s" % (speed / 1000) for speed in df["speed"]]
    return df


# In[30]:


display(df.mean()).loc[
    [1000, 100000], :, [1, 100], :].loc[
    :, ["duration", "corpus_nonzero", "matrix_nonzero", "speed"]]


# In[31]:


display(df.apply(lambda x: (x - x.mean()).std())).loc[
    [1000, 100000], :, [1, 100], :].loc[
    :, ["duration", "corpus_nonzero", "matrix_nonzero", "speed"]]


# ### SCM between a document and a corpus
# Next, we measure the speed at which the **inner_product** method produces term similarities between documents and a corpus.

# In[32]:


def benchmark(configuration):
    """Time `inner_product` between every document vector and the whole corpus.

    Parameters
    ----------
    configuration : tuple
        A ``((matrix, dictionary, nonzero_limit), corpus, normalized, repetition)``
        tuple, where `corpus` is a list of tokenized documents.

    Returns
    -------
    dict
        The configuration values, corpus statistics, and the duration in seconds.
        ``corpus_size`` is the number of input documents and ``corpus_actual_size``
        is the number of documents left after the empty vectors are dropped.

    """
    (matrix, dictionary, nonzero_limit), corpus, normalized, repetition = configuration
    corpus_size = len(corpus)
    corpus = [dictionary.doc2bow(doc) for doc in corpus]
    # Drop the documents whose vectors are empty, as the document-document and
    # the corpus-corpus benchmarks do. Filtering the token lists instead would
    # keep empty vectors and inflate `corpus_actual_size`, from which the speed
    # is derived.
    corpus = [vec for vec in corpus if len(vec) > 0]

    start_time = time()
    for vec in corpus:
        matrix.inner_product(vec, corpus, normalized=normalized)
    end_time = time()
    duration = end_time - start_time

    return {
        "dictionary_size": matrix.matrix.shape[0],
        "matrix_nonzero": matrix.matrix.nnz,
        "nonzero_limit": nonzero_limit,
        "normalized": normalized,
        "corpus_size": corpus_size,
        "corpus_actual_size": len(corpus),
        "corpus_nonzero": sum(len(vec) for vec in corpus),
        # `corpus` holds the document vectors here, so this is the mean vector length.
        "mean_document_length": np.mean([len(doc) for doc in corpus]),
        "repetition": repetition,
        "duration": duration,
    }


# In[33]:


configurations = product(matrices, corpora, normalization, repetitions)
results = benchmark_results(benchmark, configurations, "matrix_speed.inner-product_results.doc_corpus")


# The **speed** is inversely proportional to **matrix_nonzero**. Computing a normalized inner product (**normalized**${}={}$True) results in a constant speed decrease.
# In[34]:


df = pd.DataFrame(results)
df["speed"] = df.corpus_actual_size**2 / df.duration
del df["corpus_actual_size"]
df = df.groupby(["dictionary_size", "corpus_size", "nonzero_limit", "normalized"])


def display(df):
    """Format the ``duration`` and ``speed`` columns of `df` for reading.

    Parameters
    ----------
    df : pandas.DataFrame
        An aggregated table of benchmark results.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the two columns overwritten in place.

    """
    df["duration"] = [timedelta(0, duration) for duration in df["duration"]]
    df["speed"] = ["%.02f Kdoc pairs / s" % (speed / 1000) for speed in df["speed"]]
    return df


# In[35]:


display(df.mean()).loc[
    [1000, 100000], :, [1, 100], :].loc[
    :, ["duration", "corpus_nonzero", "matrix_nonzero", "speed"]]


# In[36]:


display(df.apply(lambda x: (x - x.mean()).std())).loc[
    [1000, 100000], :, [1, 100], :].loc[
    :, ["duration", "corpus_nonzero", "matrix_nonzero", "speed"]]


# ### SCM between two corpora
# Lastly, we measure the speed at which the **inner_product** method produces term similarities between entire corpora.

# In[37]:


def benchmark(configuration):
    """Time a single `inner_product` call between a corpus and itself.

    Parameters
    ----------
    configuration : tuple
        A ``((matrix, dictionary, nonzero_limit), corpus, normalized, repetition)``
        tuple, where `corpus` is a list of tokenized documents.

    Returns
    -------
    dict
        The configuration values, corpus statistics, and the duration in seconds.
        ``corpus_size`` is the number of input documents and ``corpus_actual_size``
        is the number of documents left after the empty vectors are dropped.

    """
    (matrix, dictionary, nonzero_limit), corpus, normalized, repetition = configuration
    corpus_size = len(corpus)
    corpus = [dictionary.doc2bow(doc) for doc in corpus]
    # Drop the documents whose vectors are empty.
    corpus = [vec for vec in corpus if len(vec) > 0]

    start_time = time()
    matrix.inner_product(corpus, corpus, normalized=normalized)
    end_time = time()
    duration = end_time - start_time

    return {
        "dictionary_size": matrix.matrix.shape[0],
        "matrix_nonzero": matrix.matrix.nnz,
        "nonzero_limit": nonzero_limit,
        "normalized": normalized,
        "corpus_size": corpus_size,
        "corpus_actual_size": len(corpus),
        "corpus_nonzero": sum(len(vec) for vec in corpus),
        # `corpus` holds the document vectors here, so this is the mean vector length.
        "mean_document_length": np.mean([len(doc) for doc in corpus]),
        "repetition": repetition,
        "duration": duration,
    }


# In[38]:


nonzero_limits = [1000]
dense_matrices = []
for (model, dictionary), nonzero_limit in tqdm(
        list(product(zip(models, dictionaries), nonzero_limits)), desc="matrices"):
    annoy = AnnoyIndexer(model, 1)
    index = WordEmbeddingSimilarityIndex(model, kwargs={"indexer": annoy})
    matrix = SparseTermSimilarityMatrix(index, dictionary, nonzero_limit=nonzero_limit)
    # The dense matrices go to their own list, so that `matrices` is not
    # modified and re-running this cell does not add duplicates to it.
    dense_matrices.append((matrix, dictionary, nonzero_limit))
    del annoy


# In[39]:


configurations = product(matrices + dense_matrices, corpora + [full_corpus], normalization, repetitions)
results = benchmark_results(benchmark, configurations, "matrix_speed.inner-product_results.corpus_corpus")


# In[40]:


df = pd.DataFrame(results)
df["speed"] = df.corpus_actual_size**2 / df.duration
del df["corpus_actual_size"]
df = df.groupby(["dictionary_size", "corpus_size", "nonzero_limit", "normalized"])


def display(df):
    """Format the ``duration`` and ``speed`` columns of `df` for reading.

    Parameters
    ----------
    df : pandas.DataFrame
        An aggregated table of benchmark results.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the two columns overwritten in place.

    """
    df["duration"] = [timedelta(0, duration) for duration in df["duration"]]
    df["speed"] = ["%.02f Kdoc pairs / s" % (speed / 1000) for speed in df["speed"]]
    return df


# In[41]:


display(df.mean()).loc[
    [1000, 100000], :, [1, 10, 100, 1000], :].loc[
    :, ["duration", "corpus_nonzero", "matrix_nonzero", "speed"]]


# In[42]:


display(df.apply(lambda x: (x - x.mean()).std())).loc[
    [1000, 100000], :, [1, 100], :].loc[
    :, ["duration", "corpus_nonzero", "matrix_nonzero", "speed"]]