#!/usr/bin/env python
# coding: utf-8

# # Wikipedia training
#
# In this tutorial we will:
# - Learn how to train the NMF topic model on the English Wikipedia corpus
# - Compare it with the LDA model
# - Evaluate the results

# In[1]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

import itertools
import json
import logging
import numpy as np
import pandas as pd
import scipy.sparse
import smart_open
import time

from tqdm import tqdm, tqdm_notebook

import gensim.downloader as api
from gensim import matutils
from gensim.corpora import MmCorpus, Dictionary
from gensim.models import LdaModel, CoherenceModel
from gensim.models.nmf import Nmf
from gensim.parsing.preprocessing import preprocess_string

tqdm.pandas()

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


# # Preprocessing

# ### Load wikipedia dump
# Let's use `gensim.downloader.api` for that

# In[2]:


data = api.load("wiki-english-20171001")
article = next(iter(data))

for section_title, section_text in zip(
    article['section_titles'],
    article['section_texts']
):
    print("Section title: %s" % section_title)
    print("Section text: %s" % section_text[:100])


# Preprocess and save articles

# In[3]:


def save_preprocessed_articles(filename, articles):
    """Preprocess `articles` and write them to `filename`, one JSON list per line.

    Each article is a mapping with 'section_titles' and 'section_texts'
    sequences. Every title is joined with its text, all sections are joined
    into one string, and the string is tokenized with `preprocess_string`.
    """
    # `smart_open` is imported as a module, so the callable is `smart_open.open`;
    # calling the module object itself raises TypeError.
    with smart_open.open(filename, 'w+', encoding="utf8") as writer:
        for article in tqdm_notebook(articles):
            article_text = " ".join(
                " ".join(section)
                for section
                in zip(
                    article['section_titles'],
                    article['section_texts']
                )
            )
            article_text = preprocess_string(article_text)

            writer.write(json.dumps(article_text) + '\n')


def get_preprocessed_articles(filename):
    """Lazily yield the token lists stored in `filename` (one JSON document per line)."""
    with smart_open.open(filename, 'r', encoding="utf8") as reader:
        for line in tqdm_notebook(reader):
            yield json.loads(line)


# In[4]:


SAVE_ARTICLES = False

if SAVE_ARTICLES:
    save_preprocessed_articles('wiki_articles.jsonlines', data)


# ### Create and save dictionary

# In[5]:


SAVE_DICTIONARY = False

if SAVE_DICTIONARY:
    dictionary = Dictionary(get_preprocessed_articles('wiki_articles.jsonlines'))
    dictionary.save('wiki.dict')


# ### Load and filter dictionary

# In[6]:


dictionary = Dictionary.load('wiki.dict')
dictionary.filter_extremes()
dictionary.compactify()


# ### MmCorpus wrapper
# In this way we'll:
#
# - Make sure that documents are shuffled
# - Be able to train-test split corpus without rewriting it

# In[7]:


class RandomCorpus(MmCorpus):
    """MmCorpus view that iterates over a seeded random subset of the documents.

    The document ids are permuted with a `RandomState` seeded by `random_seed`.
    The first `testsize` ids of that permutation form the test set and the
    remaining ids form the train set, so two instances built with the same
    seed and `testsize` give disjoint train and test splits of the same file.
    """

    def __init__(self, random_seed=42, testset=False, testsize=1000, *args,
                 **kwargs):
        """Open the corpus and select the train or the test document ids.

        random_seed -- seed of the permutation of document ids.
        testset -- if True keep the test ids, otherwise keep the train ids.
        testsize -- number of ids reserved for the test set.
        *args, **kwargs -- forwarded to `MmCorpus.__init__` (e.g. `fname`).
        """
        super().__init__(*args, **kwargs)

        random_state = np.random.RandomState(random_seed)

        self.indices = random_state.permutation(range(self.num_docs))

        if testset:
            self.indices = self.indices[:testsize]
        else:
            self.indices = self.indices[testsize:]

    def __iter__(self):
        """Yield the selected documents in permuted order."""
        for doc_id in self.indices:
            yield self[doc_id]

    def __len__(self):
        """Return the number of selected documents, not the size of the file."""
        return len(self.indices)


# ### Create and save corpus

# In[8]:


SAVE_CORPUS = False

if SAVE_CORPUS:
    corpus = (
        dictionary.doc2bow(article)
        for article
        in get_preprocessed_articles('wiki_articles.jsonlines')
    )

    RandomCorpus.serialize('wiki.mm', corpus)


# ### Load train and test corpus
# Using `RandomCorpus` wrapper

# In[9]:


train_corpus = RandomCorpus(
    random_seed=42, testset=False, testsize=2000, fname='wiki.mm'
)
test_corpus = RandomCorpus(
    random_seed=42, testset=True, testsize=2000, fname='wiki.mm'
)


# ## Metrics

# In[10]:


def get_execution_time(func):
    """Call `func()` and return `(elapsed_seconds, result)`."""
    start = time.time()

    result = func()

    return (time.time() - start), result


def get_tm_metrics(model, test_corpus):
    """Evaluate a trained topic model on `test_corpus`.

    Returns a dict with the keys 'perplexity', 'coherence' (u_mass),
    'topics' (output of `model.show_topics()`) and 'l2_norm'.

    Side effect: `model.normalize` is set to True while coherence and topics
    are computed and is set to False afterwards.
    """
    # W is the transposed topic matrix, H holds the topic weights of every
    # test document; their product reconstructs the test corpus.
    W = model.get_topics().T
    H = np.zeros((model.num_topics, len(test_corpus)))
    for bow_id, bow in enumerate(test_corpus):
        for topic_id, topic_weight in model.get_document_topics(bow):
            H[topic_id, bow_id] = topic_weight

    pred_factors = W.dot(H)
    # Rescale every document column so that it sums to one.
    pred_factors /= pred_factors.sum(axis=0)

    dense_corpus = matutils.corpus2dense(test_corpus, pred_factors.shape[0])

    perplexity = get_tm_perplexity(pred_factors, dense_corpus)

    l2_norm = get_tm_l2_norm(pred_factors, dense_corpus)

    model.normalize = True
    try:
        coherence = CoherenceModel(
            model=model,
            corpus=test_corpus,
            coherence='u_mass'
        ).get_coherence()

        topics = model.show_topics()
    finally:
        # Reset the flag even if the coherence computation fails, so the
        # model is not left in normalized mode.
        model.normalize = False

    return dict(
        perplexity=perplexity,
        coherence=coherence,
        topics=topics,
        l2_norm=l2_norm,
    )


def get_tm_perplexity(pred_factors, dense_corpus):
    """Return exp of the negative count-weighted mean log of `pred_factors`.

    Entries of `pred_factors` that are not positive contribute a log of zero.
    """
    # With `where=` and no `out=`, NumPy leaves the masked-out entries
    # uninitialized; a zero-filled output buffer makes them well defined.
    log_factors = np.log(
        pred_factors,
        out=np.zeros_like(pred_factors, dtype=float),
        where=pred_factors > 0,
    )

    return np.exp(-(log_factors * dense_corpus).sum() / dense_corpus.sum())


def get_tm_l2_norm(pred_factors, dense_corpus):
    """Return the norm of (column-normalized `dense_corpus` - `pred_factors`)."""
    return np.linalg.norm(dense_corpus / dense_corpus.sum(axis=0) - pred_factors)


# Define dataframe in which we'll store metrics

# In[11]:


def append_metrics_row(metrics, row):
    """Return `metrics` with the dict `row` added as one new row.

    `DataFrame.append` was removed in pandas 2.0, so the row is wrapped in a
    one-row frame and concatenated instead.
    """
    new_row = pd.DataFrame([row])

    if metrics.empty:
        return new_row

    return pd.concat([metrics, new_row], ignore_index=True)


tm_metrics = pd.DataFrame()


# ### Define common params for models

# In[12]:


params = dict(
    corpus=train_corpus,
    chunksize=2000,
    num_topics=50,
    id2word=dictionary,
    passes=1,
    eval_every=10,
    minimum_probability=0,
    random_state=42,
)


# ## Training

# ### Train NMF and save it
# Normalization is turned off to compute metrics correctly

# In[13]:


row = dict()
row['model'] = 'nmf'
row['train_time'], nmf = get_execution_time(
    lambda: Nmf(
        use_r=False,
        normalize=False,
        **params
    )
)
nmf.save('nmf.model')


# ### Load NMF and store metrics

# In[14]:


nmf = Nmf.load('nmf.model')
row.update(get_tm_metrics(nmf, test_corpus))
tm_metrics = append_metrics_row(tm_metrics, row)

nmf.show_topics(50)


# ### Train NMF with residuals and save it
# Residuals add regularization to the model, which increases quality but slows down training

# In[15]:


row = dict()
row['model'] = 'nmf_with_r'
row['train_time'], nmf_with_r = get_execution_time(
    lambda: Nmf(
        use_r=True,
        lambda_=200,
        normalize=False,
        **params
    )
)
nmf_with_r.save('nmf_with_r.model')


# ### Load NMF with residuals and store metrics

# In[16]:


nmf_with_r = Nmf.load('nmf_with_r.model')
row.update(get_tm_metrics(nmf_with_r, test_corpus))
tm_metrics = append_metrics_row(tm_metrics, row)

nmf_with_r.show_topics(50)


# ### Train LDA and save it
# LDA is a commonly used model for topic modeling

# In[17]:


row = dict()
row['model'] = 'lda'
row['train_time'], lda = get_execution_time(
    lambda: LdaModel(**params)
)
lda.save('lda.model')


# ### Load LDA and store metrics

# In[18]:


lda = LdaModel.load('lda.model')
row.update(get_tm_metrics(lda, test_corpus))
tm_metrics = append_metrics_row(tm_metrics, row)

lda.show_topics(50)


# ## Results

# In[19]:


tm_metrics


# #### RAM Usage:
# - nmf: 100-150Mb
# - nmf_with_r: 3-9Gb
# - lda: 100Mb

# In[20]:


for row_idx, row in tm_metrics.iterrows():
    print('='*20)
    print(row['model'])
    print('='*20)
    print()
    for topic_idx, tokens in row['topics']:
        print('Topic: {}'.format(topic_idx))
        print(tokens)
        print()
    print()


# As we can see, NMF can be significantly faster than LDA without sacrificing the quality of topics too much (or not sacrificing it at all)
#
# Moreover, NMF can be very flexible on RAM usage thanks to the sparsity option, which keeps only a small number of elements in the inner matrices.

# In[ ]: