#!/usr/bin/env python
# coding: utf-8

# # Wikipedia training
# 
# In this tutorial we will:
#  - Learn how to train the NMF topic model on English Wikipedia corpus
#  - Compare it with LDA model
#  - Evaluate results

# In[1]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

import itertools
import json
import logging
import numpy as np
import pandas as pd
import scipy.sparse
import smart_open
import time
from tqdm import tqdm, tqdm_notebook

import gensim.downloader as api
from gensim import matutils
from gensim.corpora import MmCorpus, Dictionary
from gensim.models import LdaModel, CoherenceModel
from gensim.models.nmf import Nmf
from gensim.parsing.preprocessing import preprocess_string

tqdm.pandas()

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


# # Preprocessing

# ### Load wikipedia dump
# Let's use `gensim.downloader.api` for that

# In[2]:


data = api.load("wiki-english-20171001")
article = next(iter(data))

for section_title, section_text in zip(
    article['section_titles'],
    article['section_texts']
):
    print("Section title: %s" % section_title)
    print("Section text: %s" % section_text[:100])


# Preprocess and save articles

# In[3]:


def save_preprocessed_articles(filename, articles):
    with smart_open(filename, 'w+', encoding="utf8") as writer:
        for article in tqdm_notebook(articles):
            article_text = " ".join(
                " ".join(section)
                for section
                in zip(
                    article['section_titles'],
                    article['section_texts']
                )
            )
            article_text = preprocess_string(article_text)

            writer.write(json.dumps(article_text) + '\n')


def get_preprocessed_articles(filename):
    with smart_open(filename, 'r', encoding="utf8") as reader:
        for line in tqdm_notebook(reader):
            yield json.loads(
                line
            )


# In[4]:


SAVE_ARTICLES = False

if SAVE_ARTICLES:
    save_preprocessed_articles('wiki_articles.jsonlines', data)


# ### Create and save dictionary

# In[5]:


SAVE_DICTIONARY = False

if SAVE_DICTIONARY:
    dictionary = Dictionary(get_preprocessed_articles('wiki_articles.jsonlines'))
    dictionary.save('wiki.dict')


# ### Load and filter dictionary

# In[6]:


dictionary = Dictionary.load('wiki.dict')
dictionary.filter_extremes()
dictionary.compactify()


# ### MmCorpus wrapper
# In this way we'll:
# 
# - Make sure that documents are shuffled
# - Be able to train-test split corpus without rewriting it

# In[7]:


class RandomCorpus(MmCorpus):
    def __init__(self, random_seed=42, testset=False, testsize=1000, *args,
                 **kwargs):
        super().__init__(*args, **kwargs)

        random_state = np.random.RandomState(random_seed)
        self.indices = random_state.permutation(range(self.num_docs))
        if testset:
            self.indices = self.indices[:testsize]
        else:
            self.indices = self.indices[testsize:]

    def __iter__(self):
        for doc_id in self.indices:
            yield self[doc_id]
            
    def __len__(self):
        return len(self.indices)


# ### Create and save corpus

# In[8]:


SAVE_CORPUS = False

if SAVE_CORPUS:
    corpus = (
        dictionary.doc2bow(article)
        for article
        in get_preprocessed_articles('wiki_articles.jsonlines')
    )
    
    RandomCorpus.serialize('wiki.mm', corpus)


# ### Load train and test corpus
# Using `RandomCorpus` wrapper

# In[9]:


train_corpus = RandomCorpus(
    random_seed=42, testset=False, testsize=2000, fname='wiki.mm'
)
test_corpus = RandomCorpus(
    random_seed=42, testset=True, testsize=2000, fname='wiki.mm'
)


# ## Metrics

# In[10]:


def get_execution_time(func):
    start = time.time()

    result = func()

    return (time.time() - start), result


def get_tm_metrics(model, test_corpus):
    W = model.get_topics().T
    H = np.zeros((model.num_topics, len(test_corpus)))
    for bow_id, bow in enumerate(test_corpus):
        for topic_id, word_count in model.get_document_topics(bow):
            H[topic_id, bow_id] = word_count

    pred_factors = W.dot(H)
    pred_factors /= pred_factors.sum(axis=0)
    
    dense_corpus = matutils.corpus2dense(test_corpus, pred_factors.shape[0])

    perplexity = get_tm_perplexity(pred_factors, dense_corpus)

    l2_norm = get_tm_l2_norm(pred_factors, dense_corpus)

    model.normalize = True

    coherence = CoherenceModel(
        model=model,
        corpus=test_corpus,
        coherence='u_mass'
    ).get_coherence()

    topics = model.show_topics()

    model.normalize = False

    return dict(
        perplexity=perplexity,
        coherence=coherence,
        topics=topics,
        l2_norm=l2_norm,
    )


def get_tm_perplexity(pred_factors, dense_corpus):
    return np.exp(-(np.log(pred_factors, where=pred_factors > 0) * dense_corpus).sum() / dense_corpus.sum())


def get_tm_l2_norm(pred_factors, dense_corpus):
    return np.linalg.norm(dense_corpus / dense_corpus.sum(axis=0) - pred_factors)


# Define dataframe in which we'll store metrics

# In[11]:


tm_metrics = pd.DataFrame()


# ### Define common params for models

# In[12]:


params = dict(
    corpus=train_corpus,
    chunksize=2000,
    num_topics=50,
    id2word=dictionary,
    passes=1,
    eval_every=10,
    minimum_probability=0,
    random_state=42,
)


# ## Training

# ### Train NMF and save it
# Normalization is turned off to compute metrics correctly

# In[13]:


row = dict()
row['model'] = 'nmf'
row['train_time'], nmf = get_execution_time(
    lambda: Nmf(
        use_r=False,
        normalize=False,
        **params
    )
)
nmf.save('nmf.model')


# ### Load NMF and store metrics

# In[14]:


nmf = Nmf.load('nmf.model')
row.update(get_tm_metrics(nmf, test_corpus))
tm_metrics = tm_metrics.append(pd.Series(row), ignore_index=True)

nmf.show_topics(50)


# ### Train NMF with residuals and save it
# Residuals add regularization to the model thus increasing quality, but slows down training

# In[15]:


row = dict()
row['model'] = 'nmf_with_r'
row['train_time'], nmf_with_r = get_execution_time(
    lambda: Nmf(
        use_r=True,
        lambda_=200,
        normalize=False,
        **params
    )
)
nmf_with_r.save('nmf_with_r.model')


# ### Load NMF with residuals and store metrics

# In[16]:


nmf_with_r = Nmf.load('nmf_with_r.model')
row.update(get_tm_metrics(nmf_with_r, test_corpus))
tm_metrics = tm_metrics.append(pd.Series(row), ignore_index=True)

nmf_with_r.show_topics(50)


# ### Train LDA and save it
# That's a common model to do Topic Modeling

# In[17]:


row = dict()
row['model'] = 'lda'
row['train_time'], lda = get_execution_time(
    lambda: LdaModel(**params)
)
lda.save('lda.model')


# ### Load LDA and store metrics

# In[18]:


lda = LdaModel.load('lda.model')
row.update(get_tm_metrics(lda, test_corpus))
tm_metrics = tm_metrics.append(pd.Series(row), ignore_index=True)

lda.show_topics(50)


# ## Results

# In[19]:


tm_metrics


# #### RAM Usage:
# - nmf: 100-150Mb
# - nmf_with_r: 3-9Gb
# - lda: 100Mb

# In[20]:


for row_idx, row in tm_metrics.iterrows():
    print('='*20)
    print(row['model'])
    print('='*20)
    print()
    for topic_idx, tokens in row['topics']:
        print('Topic: {}'.format(topic_idx))
        print(tokens)
        print()
    print()


# As we can see, NMF can be significantly faster than LDA without sacrificing quality of topics too much (or not sacrificing at all)
# 
# Moreover, NMF can be very flexible on RAM usage due to sparsity option, which leaves only small amount of elements in inner matrices.

# In[ ]: