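In this tutorial we will load the English Wikipedia dump, preprocess it, train NMF and LDA topic models on it, and compare the models by training time, perplexity, coherence and L2 norm.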
%load_ext autoreload
%autoreload 2
import itertools
import json
import logging
import numpy as np
import pandas as pd
import scipy.sparse
import smart_open
import time
from tqdm import tqdm, tqdm_notebook
import gensim.downloader as api
from gensim import matutils
from gensim.corpora import MmCorpus, Dictionary
from gensim.models import LdaModel, CoherenceModel
from gensim.models.nmf import Nmf
from gensim.parsing.preprocessing import preprocess_string
# Enable tqdm's pandas integration.
tqdm.pandas()

# Timestamped INFO-level logging, so that library progress messages are shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
Let's use `gensim.downloader` (imported above as `api`) to load the English Wikipedia dump.
# Load the English Wikipedia dataset through gensim's downloader.
data = api.load("wiki-english-20171001")

# Peek at the first article: print every section title followed by the
# first 100 characters of that section's text.
article = next(iter(data))
for section_title, section_text in zip(article['section_titles'], article['section_texts']):
    print("Section title: %s" % section_title)
    print("Section text: %s" % section_text[:100])
Section title: Introduction Section text: '''Anarchism''' is a political philosophy that advocates self-governed societies based on volun Section title: Etymology and terminology Section text: The word ''anarchism'' is composed from the word ''anarchy'' and the suffix ''-ism'', themselves d Section title: History Section text: ===Origins=== Woodcut from a Diggers document by William Everard The earliest anarchist themes ca Section title: Anarchist schools of thought Section text: Portrait of philosopher Pierre-Joseph Proudhon (1809–1865) by Gustave Courbet. Proudhon was the pri Section title: Internal issues and debates Section text: consistent with anarchist values is a controversial subject among anarchists. Anarchism is a philo Section title: Topics of interest Section text: Intersecting and overlapping between various schools of thought, certain topics of interest and inte Section title: Criticisms Section text: Criticisms of anarchism include moral criticisms and pragmatic criticisms. Anarchism is often evalu Section title: See also Section text: * Anarchism by country Section title: References Section text: Section title: Further reading Section text: * Barclay, Harold, ''People Without Government: An Anthropology of Anarchy'' (2nd ed.), Left Bank Bo Section title: External links Section text: * *
Preprocess and save articles
def save_preprocessed_articles(filename, articles):
    """Preprocess articles and write them to `filename`, one JSON array per line.

    Parameters
    ----------
    filename : str
        Destination path, opened for writing as UTF-8 text.
    articles : iterable of dict
        Each article must provide parallel 'section_titles' and
        'section_texts' sequences of strings.
    """
    # `smart_open` is imported as a module, so the opener is `smart_open.open`;
    # calling the module object itself raises TypeError.
    with smart_open.open(filename, 'w+', encoding="utf8") as writer:
        for article in tqdm_notebook(articles):
            # Flatten the article into "title text title text ..." so that
            # it is preprocessed as a single string.
            article_text = " ".join(
                " ".join(section)
                for section in zip(
                    article['section_titles'],
                    article['section_texts'],
                )
            )
            tokens = preprocess_string(article_text)
            writer.write(json.dumps(tokens) + '\n')
def get_preprocessed_articles(filename):
    """Lazily yield the decoded JSON value stored on each line of `filename`.

    Reads the JSON-lines format written by `save_preprocessed_articles`.

    Parameters
    ----------
    filename : str
        Path of the JSON-lines file, opened for reading as UTF-8 text.

    Yields
    ------
    object
        The result of `json.loads` for one line (one article per line).
    """
    # `smart_open` is imported as a module, so the opener is `smart_open.open`;
    # calling the module object itself raises TypeError.
    with smart_open.open(filename, 'r', encoding="utf8") as reader:
        for line in tqdm_notebook(reader):
            yield json.loads(line)
# One-off preparation steps are guarded by flags so that re-running the
# notebook reuses the files already written to disk.
SAVE_ARTICLES = False
if SAVE_ARTICLES:
    save_preprocessed_articles('wiki_articles.jsonlines', data)

SAVE_DICTIONARY = False
if SAVE_DICTIONARY:
    Dictionary(get_preprocessed_articles('wiki_articles.jsonlines')).save('wiki.dict')

# Always work from the persisted dictionary, then prune it with the
# default `filter_extremes` thresholds and close the resulting id gaps.
dictionary = Dictionary.load('wiki.dict')
dictionary.filter_extremes()
dictionary.compactify()
2019-01-15 19:31:03,151 : INFO : loading Dictionary object from wiki.dict 2019-01-15 19:31:04,024 : INFO : loaded wiki.dict 2019-01-15 19:31:06,292 : INFO : discarding 1910258 tokens: [('abdelrahim', 49), ('abstention', 120), ('anarcha', 101), ('anarchica', 40), ('anarchosyndicalist', 20), ('antimilitar', 68), ('arbet', 194), ('archo', 100), ('arkhē', 5), ('autonomedia', 118)]... 2019-01-15 19:31:06,293 : INFO : keeping 100000 tokens which were in no less than 5 and no more than 2462447 (=50.0%) documents 2019-01-15 19:31:06,645 : INFO : resulting dictionary: Dictionary(100000 unique tokens: ['abandon', 'abil', 'abl', 'abolit', 'abstent']...)
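Wrapping the corpus in this way, we'll shuffle the documents reproducibly (with a fixed random seed) and get a train/test split of the corpus.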
class RandomCorpus(MmCorpus):
    """MmCorpus that iterates over a seeded random train or test subset.

    Document ids are shuffled with `random_seed`; the first `testsize`
    shuffled ids form the test subset and the remaining ids the train
    subset. Extra positional and keyword arguments are forwarded to
    `MmCorpus.__init__`.
    """

    def __init__(self, random_seed=42, testset=False, testsize=1000, *args,
                 **kwargs):
        super().__init__(*args, **kwargs)

        shuffled_ids = np.random.RandomState(random_seed).permutation(
            range(self.num_docs)
        )
        # Same seed => same permutation, so the train and test instances
        # built from the same file are disjoint.
        self.indices = shuffled_ids[:testsize] if testset else shuffled_ids[testsize:]

    def __iter__(self):
        """Yield the selected documents in shuffled order."""
        yield from (self[doc_id] for doc_id in self.indices)

    def __len__(self):
        """Return the number of documents in the selected subset."""
        return len(self.indices)
# One-off step: stream the preprocessed articles through the dictionary
# and write the bag-of-words corpus to disk.
SAVE_CORPUS = False
if SAVE_CORPUS:
    corpus = map(
        dictionary.doc2bow,
        get_preprocessed_articles('wiki_articles.jsonlines'),
    )
    RandomCorpus.serialize('wiki.mm', corpus)
Load the train and test splits of the corpus using the `RandomCorpus` wrapper
# Both views share the same seed and test size, so they partition the
# documents of 'wiki.mm' into a 2000-document test set and the rest.
train_corpus = RandomCorpus(
    fname='wiki.mm', random_seed=42, testsize=2000, testset=False
)
test_corpus = RandomCorpus(
    fname='wiki.mm', random_seed=42, testsize=2000, testset=True
)
2019-01-15 19:31:07,323 : INFO : loaded corpus index from wiki.mm.index 2019-01-15 19:31:07,324 : INFO : initializing cython corpus reader from wiki.mm 2019-01-15 19:31:07,325 : INFO : accepted corpus with 4924894 documents, 100000 features, 683375728 non-zero entries 2019-01-15 19:31:08,544 : INFO : loaded corpus index from wiki.mm.index 2019-01-15 19:31:08,544 : INFO : initializing cython corpus reader from wiki.mm 2019-01-15 19:31:08,545 : INFO : accepted corpus with 4924894 documents, 100000 features, 683375728 non-zero entries
def get_execution_time(func):
    """Call `func()` and measure how long the call takes.

    Parameters
    ----------
    func : callable
        Zero-argument callable to execute.

    Returns
    -------
    (float, object)
        Elapsed time in seconds and the value returned by `func`.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # (or turn negative) when the system clock is adjusted mid-run.
    start = time.perf_counter()
    result = func()
    return (time.perf_counter() - start), result
def get_tm_metrics(model, test_corpus):
    """Evaluate a trained topic model on a held-out corpus.

    Parameters
    ----------
    model : topic model
        Must provide `get_topics`, `get_document_topics`, `show_topics`
        and `num_topics`.
    test_corpus : sized iterable of bag-of-words documents
        Held-out documents; iterated several times.

    Returns
    -------
    dict
        Keys 'perplexity', 'coherence', 'topics' and 'l2_norm'.
    """
    # Topic-term matrix, transposed so that W.dot(H) is defined.
    W = model.get_topics().T

    # H has one column of topic weights per test document.
    H = np.zeros((model.num_topics, len(test_corpus)))
    for bow_id, bow in enumerate(test_corpus):
        for topic_id, topic_weight in model.get_document_topics(bow):
            H[topic_id, bow_id] = topic_weight

    # Reconstruct the documents and rescale every column to sum to one.
    pred_factors = W.dot(H)
    pred_factors /= pred_factors.sum(axis=0)

    dense_corpus = matutils.corpus2dense(test_corpus, pred_factors.shape[0])

    perplexity = get_tm_perplexity(pred_factors, dense_corpus)
    l2_norm = get_tm_l2_norm(pred_factors, dense_corpus)

    # Coherence and the displayed topics are computed with normalization
    # switched on. Restore the previous setting afterwards, even if an
    # exception is raised, instead of unconditionally forcing False.
    # Models without the attribute end up with False, as before.
    previous_normalize = getattr(model, 'normalize', False)
    model.normalize = True
    try:
        coherence = CoherenceModel(
            model=model,
            corpus=test_corpus,
            coherence='u_mass'
        ).get_coherence()
        topics = model.show_topics()
    finally:
        model.normalize = previous_normalize

    return dict(
        perplexity=perplexity,
        coherence=coherence,
        topics=topics,
        l2_norm=l2_norm,
    )
def get_tm_perplexity(pred_factors, dense_corpus):
    """Return the perplexity of the predicted term distributions.

    Computes ``exp(-sum(log(pred_factors) * dense_corpus) / sum(dense_corpus))``.
    Cells whose predicted value is not positive contribute zero to the sum.

    Parameters
    ----------
    pred_factors : numpy.ndarray
        Predicted values, same shape as `dense_corpus`.
    dense_corpus : numpy.ndarray
        Observed term counts.

    Returns
    -------
    float
        The perplexity.
    """
    # A ufunc called with `where=` but without `out=` leaves the masked
    # cells uninitialized; start from zeros so they are well defined.
    log_pred = np.zeros(np.shape(pred_factors), dtype=float)
    np.log(pred_factors, out=log_pred, where=pred_factors > 0)
    return np.exp(-(log_pred * dense_corpus).sum() / dense_corpus.sum())
def get_tm_l2_norm(pred_factors, dense_corpus):
    """Return the norm of the gap between observed and predicted distributions.

    Every column of `dense_corpus` is rescaled to sum to one and compared
    with the matching column of `pred_factors`.

    Parameters
    ----------
    pred_factors : numpy.ndarray
        Predicted values, same shape as `dense_corpus`.
    dense_corpus : numpy.ndarray
        Observed term counts, one column per document.

    Returns
    -------
    float
        ``numpy.linalg.norm`` of the difference matrix.
    """
    doc_totals = dense_corpus.sum(axis=0)
    # An empty document has a zero column total; keep its column at zero
    # instead of computing 0/0, which would turn the whole norm into NaN.
    true_factors = np.zeros(np.shape(dense_corpus), dtype=float)
    np.divide(dense_corpus, doc_totals, out=true_factors, where=doc_totals != 0)
    return np.linalg.norm(true_factors - pred_factors)
Define dataframe in which we'll store metrics
# Collects one row of metrics per trained model.
tm_metrics = pd.DataFrame()

# Keyword arguments shared by every model trained below.
params = {
    'corpus': train_corpus,
    'chunksize': 2000,
    'num_topics': 50,
    'id2word': dictionary,
    'passes': 1,
    'eval_every': 10,
    'minimum_probability': 0,
    'random_state': 42,
}
Normalization is turned off to compute metrics correctly
# Train plain NMF (no residuals), timing the training, then persist it.
row = {'model': 'nmf'}
row['train_time'], nmf = get_execution_time(
    lambda: Nmf(use_r=False, normalize=False, **params)
)
nmf.save('nmf.model')
2019-01-15 19:33:21,875 : INFO : Loss (no outliers): 2186.768444126956 Loss (with outliers): 2186.768444126956 2019-01-15 19:34:49,514 : INFO : Loss (no outliers): 2298.434152045061 Loss (with outliers): 2298.434152045061 ==Truncated== 2019-01-15 20:44:23,913 : INFO : Loss (no outliers): 1322.9664709183141 Loss (with outliers): 1322.9664709183141 2019-01-15 20:44:23,928 : INFO : saving Nmf object under nmf.model, separately None 2019-01-15 20:44:24,625 : INFO : saved nmf.model
# Reload the persisted model and evaluate it on the held-out corpus.
nmf = Nmf.load('nmf.model')
row.update(get_tm_metrics(nmf, test_corpus))
# DataFrame.append was removed in pandas 2.0; concatenating a one-row
# frame is the supported way to add the row.
tm_metrics = pd.concat([tm_metrics, pd.DataFrame([row])], ignore_index=True)
nmf.show_topics(50)
2019-01-15 20:44:24,872 : INFO : loading Nmf object from nmf.model 2019-01-15 20:44:25,150 : INFO : loading id2word recursively from nmf.model.id2word.* with mmap=None 2019-01-15 20:44:25,151 : INFO : loaded nmf.model 2019-01-15 20:44:54,148 : INFO : CorpusAccumulator accumulated stats from 1000 documents 2019-01-15 20:44:54,336 : INFO : CorpusAccumulator accumulated stats from 2000 documents
[(0, '0.075*"parti" + 0.071*"elect" + 0.042*"democrat" + 0.029*"republican" + 0.022*"vote" + 0.018*"conserv" + 0.017*"liber" + 0.014*"candid" + 0.013*"seat" + 0.013*"labour"'), (1, '0.039*"book" + 0.038*"centuri" + 0.032*"histori" + 0.032*"languag" + 0.032*"publish" + 0.024*"english" + 0.023*"world" + 0.022*"law" + 0.022*"govern" + 0.021*"nation"'), (2, '0.050*"war" + 0.036*"forc" + 0.026*"armi" + 0.023*"battl" + 0.021*"attack" + 0.019*"militari" + 0.018*"german" + 0.016*"british" + 0.015*"command" + 0.014*"kill"'), (3, '0.119*"race" + 0.106*"car" + 0.073*"engin" + 0.035*"model" + 0.030*"driver" + 0.029*"vehicl" + 0.029*"ford" + 0.028*"lap" + 0.023*"electr" + 0.020*"power"'), (4, '0.102*"leagu" + 0.092*"club" + 0.049*"footbal" + 0.047*"cup" + 0.029*"plai" + 0.028*"season" + 0.028*"divis" + 0.028*"goal" + 0.022*"team" + 0.021*"unit"'), (5, '0.055*"award" + 0.041*"best" + 0.008*"nomin" + 0.008*"year" + 0.006*"actress" + 0.006*"actor" + 0.005*"perform" + 0.005*"artist" + 0.005*"won" + 0.005*"outstand"'), (6, '0.115*"citi" + 0.014*"airport" + 0.013*"area" + 0.011*"popul" + 0.010*"san" + 0.009*"region" + 0.008*"center" + 0.007*"municip" + 0.007*"intern" + 0.007*"ukrainian"'), (7, '0.316*"act" + 0.046*"amend" + 0.020*"order" + 0.018*"ireland" + 0.016*"law" + 0.015*"regul" + 0.013*"court" + 0.011*"scotland" + 0.011*"road" + 0.009*"public"'), (8, '0.102*"align" + 0.084*"left" + 0.022*"right" + 0.012*"text" + 0.011*"style" + 0.007*"center" + 0.004*"bar" + 0.003*"till" + 0.003*"bgcolor" + 0.003*"color"'), (9, '0.092*"team" + 0.027*"race" + 0.025*"ret" + 0.014*"championship" + 0.007*"nation" + 0.006*"time" + 0.006*"sport" + 0.005*"stage" + 0.005*"coach" + 0.005*"finish"'), (10, '0.135*"compani" + 0.089*"ship" + 0.035*"product" + 0.028*"oper" + 0.024*"navi" + 0.022*"corpor" + 0.021*"oil" + 0.021*"launch" + 0.021*"bank" + 0.021*"built"'), (11, '0.053*"new" + 0.019*"york" + 0.004*"zealand" + 0.003*"jersei" + 0.003*"american" + 0.002*"time" + 0.002*"australia" + 0.002*"radio" + 
0.002*"press" + 0.002*"washington"'), (12, '0.036*"world" + 0.034*"championship" + 0.032*"final" + 0.029*"match" + 0.026*"win" + 0.026*"round" + 0.019*"open" + 0.018*"won" + 0.015*"defeat" + 0.015*"cup"'), (13, '0.019*"album" + 0.017*"record" + 0.014*"band" + 0.008*"releas" + 0.005*"tour" + 0.005*"guitar" + 0.005*"vocal" + 0.004*"rock" + 0.004*"track" + 0.004*"music"'), (14, '0.100*"church" + 0.017*"cathol" + 0.014*"christian" + 0.012*"centuri" + 0.012*"saint" + 0.011*"bishop" + 0.011*"built" + 0.009*"list" + 0.009*"build" + 0.008*"roman"'), (15, '0.088*"presid" + 0.072*"minist" + 0.046*"prime" + 0.015*"govern" + 0.014*"gener" + 0.011*"met" + 0.011*"governor" + 0.010*"foreign" + 0.010*"visit" + 0.009*"council"'), (16, '0.182*"speci" + 0.112*"famili" + 0.101*"nov" + 0.092*"valid" + 0.066*"genu" + 0.045*"format" + 0.040*"member" + 0.037*"gen" + 0.036*"bird" + 0.034*"type"'), (17, '0.029*"season" + 0.013*"yard" + 0.013*"game" + 0.011*"plai" + 0.008*"team" + 0.007*"score" + 0.007*"win" + 0.007*"record" + 0.006*"run" + 0.006*"coach"'), (18, '0.214*"counti" + 0.064*"township" + 0.017*"area" + 0.016*"statist" + 0.007*"ohio" + 0.006*"metropolitan" + 0.006*"combin" + 0.005*"pennsylvania" + 0.005*"texa" + 0.005*"washington"'), (19, '0.017*"area" + 0.016*"river" + 0.015*"water" + 0.006*"larg" + 0.006*"region" + 0.006*"lake" + 0.006*"power" + 0.006*"high" + 0.005*"bar" + 0.005*"form"'), (20, '0.031*"us" + 0.025*"gener" + 0.024*"model" + 0.022*"data" + 0.021*"design" + 0.020*"time" + 0.019*"function" + 0.019*"number" + 0.018*"process" + 0.017*"exampl"'), (21, '0.202*"order" + 0.098*"group" + 0.098*"regul" + 0.076*"amend" + 0.041*"road" + 0.034*"traffic" + 0.033*"temporari" + 0.032*"prohibit" + 0.027*"trunk" + 0.021*"junction"'), (22, '0.096*"film" + 0.010*"product" + 0.010*"director" + 0.010*"festiv" + 0.009*"star" + 0.009*"produc" + 0.009*"movi" + 0.008*"direct" + 0.007*"releas" + 0.007*"actor"'), (23, '0.163*"music" + 0.046*"viola" + 0.045*"radio" + 0.042*"piano" + 
0.029*"perform" + 0.028*"station" + 0.027*"orchestra" + 0.026*"compos" + 0.025*"song" + 0.015*"rock"'), (24, '0.052*"mount" + 0.051*"lemmon" + 0.051*"peak" + 0.051*"kitt" + 0.051*"spacewatch" + 0.026*"survei" + 0.015*"octob" + 0.012*"septemb" + 0.009*"css" + 0.009*"catalina"'), (25, '0.075*"air" + 0.035*"forc" + 0.030*"squadron" + 0.029*"aircraft" + 0.028*"oper" + 0.023*"unit" + 0.018*"flight" + 0.017*"airport" + 0.017*"wing" + 0.017*"base"'), (26, '0.105*"hous" + 0.038*"term" + 0.020*"march" + 0.019*"build" + 0.019*"member" + 0.017*"serv" + 0.014*"congress" + 0.014*"hall" + 0.012*"januari" + 0.010*"window"'), (27, '0.129*"district" + 0.019*"pennsylvania" + 0.016*"grade" + 0.012*"fund" + 0.012*"educ" + 0.012*"basic" + 0.011*"level" + 0.010*"oblast" + 0.010*"rural" + 0.009*"tax"'), (28, '0.042*"year" + 0.012*"dai" + 0.007*"time" + 0.005*"ag" + 0.004*"month" + 0.003*"includ" + 0.003*"follow" + 0.003*"later" + 0.003*"old" + 0.003*"student"'), (29, '0.113*"station" + 0.109*"line" + 0.076*"road" + 0.072*"railwai" + 0.048*"rout" + 0.035*"oper" + 0.034*"train" + 0.023*"street" + 0.020*"cross" + 0.020*"railroad"'), (30, '0.036*"park" + 0.029*"town" + 0.025*"north" + 0.020*"south" + 0.018*"west" + 0.017*"east" + 0.017*"street" + 0.015*"nation" + 0.014*"build" + 0.013*"river"'), (31, '0.066*"women" + 0.044*"men" + 0.030*"nation" + 0.024*"right" + 0.014*"athlet" + 0.013*"intern" + 0.013*"rank" + 0.013*"countri" + 0.012*"advanc" + 0.011*"event"'), (32, '0.127*"linear" + 0.126*"socorro" + 0.029*"septemb" + 0.026*"neat" + 0.023*"palomar" + 0.021*"octob" + 0.016*"kitt" + 0.016*"peak" + 0.015*"spacewatch" + 0.015*"anderson"'), (33, '0.152*"univers" + 0.055*"colleg" + 0.019*"institut" + 0.018*"student" + 0.018*"scienc" + 0.015*"professor" + 0.012*"research" + 0.011*"campu" + 0.011*"educ" + 0.011*"technolog"'), (34, '0.072*"state" + 0.032*"unit" + 0.005*"court" + 0.005*"law" + 0.004*"feder" + 0.004*"american" + 0.003*"nation" + 0.003*"govern" + 0.003*"kingdom" + 0.003*"senat"'), 
(35, '0.074*"game" + 0.017*"player" + 0.007*"plai" + 0.006*"releas" + 0.005*"develop" + 0.005*"video" + 0.005*"charact" + 0.004*"playstat" + 0.004*"version" + 0.004*"world"'), (36, '0.141*"south" + 0.098*"american" + 0.081*"india" + 0.059*"commun" + 0.053*"west" + 0.053*"director" + 0.053*"africa" + 0.049*"usa" + 0.049*"indian" + 0.041*"servic"'), (37, '0.111*"servic" + 0.025*"commun" + 0.021*"offic" + 0.012*"polic" + 0.011*"educ" + 0.011*"public" + 0.010*"chief" + 0.009*"late" + 0.009*"manag" + 0.008*"mr"'), (38, '0.112*"royal" + 0.085*"john" + 0.083*"william" + 0.054*"lieuten" + 0.044*"georg" + 0.041*"offic" + 0.041*"jame" + 0.038*"sergeant" + 0.037*"major" + 0.035*"charl"'), (39, '0.051*"song" + 0.043*"releas" + 0.042*"singl" + 0.027*"chart" + 0.025*"album" + 0.017*"number" + 0.014*"video" + 0.013*"version" + 0.012*"love" + 0.011*"featur"'), (40, '0.031*"time" + 0.028*"later" + 0.026*"appear" + 0.025*"man" + 0.024*"kill" + 0.020*"charact" + 0.019*"work" + 0.018*"father" + 0.018*"death" + 0.018*"famili"'), (41, '0.126*"seri" + 0.064*"episod" + 0.026*"season" + 0.021*"televis" + 0.015*"comic" + 0.013*"charact" + 0.012*"dvd" + 0.012*"anim" + 0.012*"star" + 0.011*"appear"'), (42, '0.143*"born" + 0.073*"american" + 0.027*"footbal" + 0.024*"player" + 0.024*"william" + 0.023*"singer" + 0.019*"actor" + 0.017*"politician" + 0.015*"actress" + 0.013*"english"'), (43, '0.044*"march" + 0.042*"septemb" + 0.036*"octob" + 0.033*"januari" + 0.032*"april" + 0.031*"august" + 0.031*"juli" + 0.029*"novemb" + 0.029*"june" + 0.028*"decemb"'), (44, '0.149*"island" + 0.013*"south" + 0.013*"australia" + 0.009*"sea" + 0.008*"north" + 0.008*"bai" + 0.008*"western" + 0.008*"airport" + 0.007*"coast" + 0.006*"pacif"'), (45, '0.028*"studi" + 0.026*"research" + 0.023*"health" + 0.019*"human" + 0.019*"term" + 0.019*"develop" + 0.018*"includ" + 0.018*"peopl" + 0.017*"report" + 0.017*"cell"'), (46, '0.112*"school" + 0.028*"high" + 0.016*"student" + 0.012*"educ" + 0.009*"grade" + 0.008*"primari" + 
0.007*"public" + 0.006*"colleg" + 0.006*"elementari" + 0.006*"pennsylvania"'), (47, '0.137*"royal" + 0.121*"capt" + 0.103*"armi" + 0.090*"maj" + 0.089*"corp" + 0.075*"col" + 0.074*"temp" + 0.048*"servic" + 0.040*"engin" + 0.033*"reg"'), (48, '0.183*"art" + 0.117*"museum" + 0.071*"paint" + 0.062*"work" + 0.046*"artist" + 0.043*"galleri" + 0.040*"exhibit" + 0.034*"collect" + 0.027*"histori" + 0.022*"jpg"'), (49, '0.068*"regiment" + 0.062*"divis" + 0.049*"battalion" + 0.045*"infantri" + 0.036*"brigad" + 0.024*"armi" + 0.023*"artilleri" + 0.019*"compani" + 0.018*"gener" + 0.018*"colonel"')]
Residuals add regularization to the model, thus increasing quality, but they slow down training
# Train NMF with the residuals term enabled, timing the training, then
# persist it.
row = {'model': 'nmf_with_r'}
row['train_time'], nmf_with_r = get_execution_time(
    lambda: Nmf(use_r=True, lambda_=200, normalize=False, **params)
)
nmf_with_r.save('nmf_with_r.model')
2019-01-15 20:54:05,363 : INFO : Loss (no outliers): 2179.9524465227146 Loss (with outliers): 2102.354108449905 2019-01-15 20:57:12,821 : INFO : Loss (no outliers): 2268.3200929871823 Loss (with outliers): 2110.928651253909 ==Truncated== 2019-01-16 04:05:46,589 : INFO : Loss (no outliers): 1321.521323758918 Loss (with outliers): 1282.9364495345592 2019-01-16 04:05:46,599 : INFO : saving Nmf object under nmf_with_r.model, separately None 2019-01-16 04:05:46,601 : INFO : storing scipy.sparse array '_r' under nmf_with_r.model._r.npy 2019-01-16 04:05:47,781 : INFO : saved nmf_with_r.model
# Reload the persisted model and evaluate it on the held-out corpus.
nmf_with_r = Nmf.load('nmf_with_r.model')
row.update(get_tm_metrics(nmf_with_r, test_corpus))
# DataFrame.append was removed in pandas 2.0; concatenating a one-row
# frame is the supported way to add the row.
tm_metrics = pd.concat([tm_metrics, pd.DataFrame([row])], ignore_index=True)
nmf_with_r.show_topics(50)
2019-01-16 04:05:48,017 : INFO : loading Nmf object from nmf_with_r.model 2019-01-16 04:05:48,272 : INFO : loading id2word recursively from nmf_with_r.model.id2word.* with mmap=None 2019-01-16 04:05:48,273 : INFO : loading _r from nmf_with_r.model._r.npy with mmap=None 2019-01-16 04:05:48,304 : INFO : loaded nmf_with_r.model 2019-01-16 04:06:27,119 : INFO : CorpusAccumulator accumulated stats from 1000 documents 2019-01-16 04:06:27,253 : INFO : CorpusAccumulator accumulated stats from 2000 documents
[(0, '0.062*"parti" + 0.061*"elect" + 0.031*"democrat" + 0.020*"republican" + 0.020*"vote" + 0.013*"liber" + 0.012*"candid" + 0.012*"conserv" + 0.011*"seat" + 0.010*"member"'), (1, '0.052*"book" + 0.040*"centuri" + 0.039*"publish" + 0.031*"languag" + 0.027*"histori" + 0.025*"work" + 0.023*"english" + 0.022*"king" + 0.019*"polit" + 0.019*"author"'), (2, '0.031*"armi" + 0.028*"divis" + 0.025*"regiment" + 0.022*"forc" + 0.020*"battalion" + 0.019*"infantri" + 0.019*"command" + 0.017*"brigad" + 0.016*"gener" + 0.012*"corp"'), (3, '0.110*"race" + 0.059*"car" + 0.033*"engin" + 0.025*"lap" + 0.023*"driver" + 0.021*"ret" + 0.020*"ford" + 0.015*"finish" + 0.015*"motorsport" + 0.015*"chevrolet"'), (4, '0.130*"club" + 0.068*"cup" + 0.046*"footbal" + 0.044*"goal" + 0.032*"leagu" + 0.031*"unit" + 0.031*"plai" + 0.030*"match" + 0.026*"score" + 0.021*"player"'), (5, '0.041*"award" + 0.030*"best" + 0.006*"nomin" + 0.005*"actress" + 0.005*"year" + 0.004*"actor" + 0.004*"won" + 0.004*"perform" + 0.003*"outstand" + 0.003*"artist"'), (6, '0.087*"citi" + 0.013*"town" + 0.009*"popul" + 0.008*"area" + 0.007*"san" + 0.006*"center" + 0.006*"airport" + 0.006*"unit" + 0.006*"locat" + 0.005*"municip"'), (7, '0.171*"act" + 0.021*"amend" + 0.018*"order" + 0.010*"ireland" + 0.009*"law" + 0.007*"court" + 0.007*"regul" + 0.006*"road" + 0.006*"scotland" + 0.006*"nation"'), (8, '0.064*"leagu" + 0.014*"divis" + 0.012*"left" + 0.011*"align" + 0.009*"basebal" + 0.008*"footbal" + 0.007*"run" + 0.007*"major" + 0.005*"home" + 0.005*"hit"'), (9, '0.086*"team" + 0.013*"championship" + 0.007*"nation" + 0.007*"race" + 0.007*"coach" + 0.005*"time" + 0.004*"sport" + 0.004*"ret" + 0.004*"player" + 0.004*"match"'), (10, '0.100*"episod" + 0.055*"compani" + 0.021*"product" + 0.011*"produc" + 0.011*"televis" + 0.009*"role" + 0.009*"busi" + 0.008*"market" + 0.008*"corpor" + 0.007*"bank"'), (11, '0.050*"new" + 0.017*"york" + 0.003*"zealand" + 0.003*"jersei" + 0.002*"time" + 0.002*"radio" + 0.002*"broadcast" + 
0.002*"station" + 0.002*"washington" + 0.002*"australia"'), (12, '0.035*"final" + 0.033*"world" + 0.030*"round" + 0.030*"championship" + 0.025*"win" + 0.025*"match" + 0.021*"open" + 0.017*"won" + 0.016*"tournament" + 0.015*"event"'), (13, '0.020*"record" + 0.019*"band" + 0.015*"album" + 0.007*"releas" + 0.007*"guitar" + 0.006*"tour" + 0.005*"rock" + 0.005*"vocal" + 0.004*"plai" + 0.004*"live"'), (14, '0.096*"church" + 0.015*"cathol" + 0.012*"christian" + 0.010*"saint" + 0.010*"bishop" + 0.009*"centuri" + 0.008*"build" + 0.007*"parish" + 0.007*"built" + 0.007*"roman"'), (15, '0.084*"presid" + 0.055*"minist" + 0.037*"prime" + 0.014*"govern" + 0.012*"gener" + 0.010*"governor" + 0.010*"nation" + 0.008*"council" + 0.008*"secretari" + 0.008*"visit"'), (16, '0.089*"yard" + 0.035*"pass" + 0.035*"touchdown" + 0.028*"field" + 0.025*"run" + 0.023*"win" + 0.022*"score" + 0.021*"quarter" + 0.017*"record" + 0.016*"second"'), (17, '0.042*"season" + 0.006*"plai" + 0.004*"coach" + 0.004*"final" + 0.004*"second" + 0.004*"win" + 0.004*"record" + 0.003*"career" + 0.003*"finish" + 0.003*"point"'), (18, '0.174*"counti" + 0.034*"township" + 0.014*"area" + 0.013*"statist" + 0.004*"texa" + 0.004*"ohio" + 0.004*"virginia" + 0.004*"washington" + 0.003*"metropolitan" + 0.003*"pennsylvania"'), (19, '0.012*"water" + 0.010*"area" + 0.010*"speci" + 0.007*"larg" + 0.006*"order" + 0.006*"region" + 0.006*"includ" + 0.005*"black" + 0.005*"famili" + 0.005*"popul"'), (20, '0.020*"us" + 0.015*"gener" + 0.014*"design" + 0.014*"model" + 0.012*"develop" + 0.012*"time" + 0.012*"data" + 0.011*"number" + 0.011*"function" + 0.011*"process"'), (21, '0.165*"group" + 0.023*"left" + 0.022*"align" + 0.021*"member" + 0.017*"text" + 0.015*"bar" + 0.011*"order" + 0.011*"point" + 0.010*"till" + 0.009*"stage"'), (22, '0.095*"film" + 0.009*"director" + 0.008*"star" + 0.008*"movi" + 0.008*"product" + 0.008*"festiv" + 0.008*"releas" + 0.008*"produc" + 0.007*"direct" + 0.006*"featur"'), (23, '0.107*"music" + 0.024*"perform" 
+ 0.019*"piano" + 0.018*"song" + 0.017*"compos" + 0.017*"orchestra" + 0.017*"viola" + 0.012*"plai" + 0.011*"radio" + 0.011*"danc"'), (24, '0.023*"septemb" + 0.023*"march" + 0.020*"octob" + 0.020*"juli" + 0.019*"june" + 0.019*"april" + 0.019*"august" + 0.018*"januari" + 0.018*"novemb" + 0.017*"decemb"'), (25, '0.078*"air" + 0.041*"forc" + 0.031*"aircraft" + 0.027*"squadron" + 0.026*"oper" + 0.021*"unit" + 0.016*"base" + 0.016*"wing" + 0.016*"flight" + 0.015*"fighter"'), (26, '0.101*"hous" + 0.023*"build" + 0.021*"term" + 0.015*"member" + 0.014*"serv" + 0.014*"march" + 0.014*"left" + 0.012*"congress" + 0.011*"hall" + 0.010*"street"'), (27, '0.123*"district" + 0.024*"pennsylvania" + 0.019*"grade" + 0.016*"educ" + 0.015*"fund" + 0.014*"basic" + 0.013*"level" + 0.011*"student" + 0.011*"receiv" + 0.010*"tax"'), (28, '0.048*"year" + 0.007*"dai" + 0.005*"time" + 0.005*"ag" + 0.003*"month" + 0.003*"old" + 0.003*"student" + 0.003*"includ" + 0.003*"later" + 0.002*"million"'), (29, '0.090*"line" + 0.083*"station" + 0.054*"road" + 0.053*"railwai" + 0.036*"rout" + 0.030*"train" + 0.027*"oper" + 0.020*"street" + 0.016*"servic" + 0.016*"open"'), (30, '0.031*"park" + 0.030*"south" + 0.030*"north" + 0.023*"west" + 0.020*"river" + 0.020*"east" + 0.015*"area" + 0.014*"town" + 0.013*"lake" + 0.013*"nation"'), (31, '0.071*"women" + 0.041*"men" + 0.027*"nation" + 0.023*"right" + 0.012*"countri" + 0.012*"intern" + 0.012*"athlet" + 0.011*"advanc" + 0.011*"rank" + 0.010*"law"'), (32, '0.104*"linear" + 0.104*"socorro" + 0.025*"septemb" + 0.020*"neat" + 0.018*"palomar" + 0.018*"octob" + 0.013*"decemb" + 0.013*"august" + 0.012*"anderson" + 0.012*"mesa"'), (33, '0.089*"univers" + 0.011*"scienc" + 0.009*"institut" + 0.008*"research" + 0.008*"professor" + 0.006*"student" + 0.005*"technolog" + 0.005*"faculti" + 0.005*"studi" + 0.005*"engin"'), (34, '0.064*"state" + 0.024*"unit" + 0.005*"court" + 0.005*"law" + 0.004*"feder" + 0.003*"nation" + 0.003*"govern" + 0.002*"senat" + 0.002*"california" + 
0.002*"constitut"'), (35, '0.085*"colleg" + 0.019*"univers" + 0.014*"student" + 0.008*"campu" + 0.007*"institut" + 0.006*"educ" + 0.005*"hall" + 0.005*"program" + 0.005*"commun" + 0.005*"state"'), (36, '0.118*"class" + 0.079*"director" + 0.053*"rifl" + 0.050*"south" + 0.048*"×mm" + 0.046*"action" + 0.045*"san" + 0.044*"actor" + 0.041*"angel" + 0.037*"lo"'), (37, '0.092*"servic" + 0.025*"offic" + 0.023*"commun" + 0.013*"john" + 0.012*"chief" + 0.011*"polic" + 0.011*"public" + 0.011*"british" + 0.010*"late" + 0.010*"director"'), (38, '0.156*"royal" + 0.072*"william" + 0.068*"john" + 0.058*"corp" + 0.051*"lieuten" + 0.046*"capt" + 0.041*"engin" + 0.041*"armi" + 0.039*"georg" + 0.039*"temp"'), (39, '0.042*"song" + 0.039*"album" + 0.034*"releas" + 0.029*"singl" + 0.024*"chart" + 0.013*"number" + 0.011*"video" + 0.010*"love" + 0.010*"featur" + 0.010*"track"'), (40, '0.028*"time" + 0.025*"later" + 0.023*"kill" + 0.019*"appear" + 0.018*"man" + 0.016*"death" + 0.016*"father" + 0.015*"return" + 0.015*"son" + 0.014*"charact"'), (41, '0.110*"seri" + 0.016*"charact" + 0.016*"episod" + 0.015*"comic" + 0.013*"televis" + 0.012*"anim" + 0.011*"appear" + 0.009*"stori" + 0.009*"origin" + 0.009*"featur"'), (42, '0.091*"born" + 0.070*"american" + 0.022*"player" + 0.021*"footbal" + 0.020*"william" + 0.016*"actor" + 0.014*"politician" + 0.014*"singer" + 0.013*"john" + 0.012*"actress"'), (43, '0.072*"game" + 0.017*"player" + 0.011*"plai" + 0.004*"releas" + 0.004*"point" + 0.004*"develop" + 0.004*"score" + 0.003*"video" + 0.003*"time" + 0.003*"card"'), (44, '0.110*"island" + 0.007*"australia" + 0.007*"ship" + 0.007*"south" + 0.007*"sea" + 0.006*"bai" + 0.005*"coast" + 0.004*"pacif" + 0.004*"western" + 0.004*"british"'), (45, '0.029*"health" + 0.028*"studi" + 0.027*"research" + 0.022*"peopl" + 0.020*"human" + 0.019*"medic" + 0.019*"cell" + 0.018*"report" + 0.018*"ag" + 0.017*"includ"'), (46, '0.113*"school" + 0.025*"high" + 0.014*"student" + 0.011*"educ" + 0.007*"grade" + 0.006*"public" + 
0.005*"elementari" + 0.005*"primari" + 0.004*"pennsylvania" + 0.004*"teacher"'), (47, '0.050*"war" + 0.021*"german" + 0.017*"american" + 0.016*"british" + 0.016*"world" + 0.012*"french" + 0.010*"battl" + 0.010*"germani" + 0.009*"ship" + 0.009*"soviet"'), (48, '0.174*"art" + 0.099*"museum" + 0.058*"paint" + 0.057*"work" + 0.044*"artist" + 0.041*"galleri" + 0.038*"exhibit" + 0.031*"collect" + 0.023*"histori" + 0.021*"design"'), (49, '0.067*"peak" + 0.066*"kitt" + 0.066*"mount" + 0.066*"spacewatch" + 0.065*"lemmon" + 0.033*"survei" + 0.026*"octob" + 0.024*"septemb" + 0.015*"novemb" + 0.012*"march"')]
LDA is a commonly used model for topic modeling
# Train LDA with the shared parameters, timing the training, then persist it.
row = {'model': 'lda'}
row['train_time'], lda = get_execution_time(lambda: LdaModel(**params))
lda.save('lda.model')
2019-01-16 04:06:27,576 : INFO : using symmetric alpha at 0.02 2019-01-16 04:06:27,576 : INFO : using symmetric eta at 0.02 2019-01-16 04:06:27,589 : INFO : using serial LDA version on this node 2019-01-16 04:06:28,185 : INFO : running online (single-pass) LDA training, 50 topics, 1 passes over the supplied corpus of 4922894 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000 2019-01-16 04:06:28,910 : INFO : PROGRESS: pass 0, at document #2000/4922894 ==Truncated== 2019-01-16 06:24:26,456 : INFO : topic diff=0.003897, rho=0.020154 2019-01-16 06:24:26,465 : INFO : saving LdaState object under lda.model.state, separately None 2019-01-16 06:24:26,680 : INFO : saved lda.model.state 2019-01-16 06:24:26,732 : INFO : saving LdaModel object under lda.model, separately ['expElogbeta', 'sstats'] 2019-01-16 06:24:26,732 : INFO : storing np array 'expElogbeta' to lda.model.expElogbeta.npy 2019-01-16 06:24:26,812 : INFO : not storing attribute dispatcher 2019-01-16 06:24:26,814 : INFO : not storing attribute id2word 2019-01-16 06:24:26,815 : INFO : not storing attribute state 2019-01-16 06:24:26,828 : INFO : saved lda.model
# Reload the persisted model and evaluate it on the held-out corpus.
lda = LdaModel.load('lda.model')
row.update(get_tm_metrics(lda, test_corpus))
# DataFrame.append was removed in pandas 2.0; concatenating a one-row
# frame is the supported way to add the row.
tm_metrics = pd.concat([tm_metrics, pd.DataFrame([row])], ignore_index=True)
lda.show_topics(50)
2019-01-16 06:24:27,064 : INFO : loading LdaModel object from lda.model 2019-01-16 06:24:27,070 : INFO : loading expElogbeta from lda.model.expElogbeta.npy with mmap=None 2019-01-16 06:24:27,077 : INFO : setting ignored attribute dispatcher to None 2019-01-16 06:24:27,078 : INFO : setting ignored attribute id2word to None 2019-01-16 06:24:27,078 : INFO : setting ignored attribute state to None 2019-01-16 06:24:27,079 : INFO : loaded lda.model 2019-01-16 06:24:27,079 : INFO : loading LdaState object from lda.model.state 2019-01-16 06:24:27,173 : INFO : loaded lda.model.state 2019-01-16 06:24:41,257 : INFO : CorpusAccumulator accumulated stats from 1000 documents 2019-01-16 06:24:41,452 : INFO : CorpusAccumulator accumulated stats from 2000 documents
[(0, '0.033*"war" + 0.028*"armi" + 0.021*"forc" + 0.020*"command" + 0.015*"militari" + 0.015*"battl" + 0.013*"gener" + 0.012*"offic" + 0.011*"divis" + 0.011*"regiment"'), (1, '0.038*"album" + 0.028*"song" + 0.026*"releas" + 0.026*"record" + 0.021*"band" + 0.016*"singl" + 0.015*"music" + 0.014*"chart" + 0.013*"track" + 0.010*"guitar"'), (2, '0.062*"german" + 0.039*"germani" + 0.025*"van" + 0.023*"von" + 0.020*"der" + 0.019*"dutch" + 0.019*"berlin" + 0.015*"swedish" + 0.014*"netherland" + 0.014*"sweden"'), (3, '0.032*"john" + 0.027*"william" + 0.019*"british" + 0.015*"georg" + 0.015*"london" + 0.014*"thoma" + 0.014*"sir" + 0.014*"jame" + 0.013*"royal" + 0.013*"henri"'), (4, '0.137*"school" + 0.040*"colleg" + 0.039*"student" + 0.033*"univers" + 0.030*"high" + 0.028*"educ" + 0.016*"year" + 0.011*"graduat" + 0.010*"state" + 0.009*"campu"'), (5, '0.030*"game" + 0.009*"develop" + 0.009*"player" + 0.008*"releas" + 0.008*"us" + 0.008*"softwar" + 0.008*"version" + 0.008*"user" + 0.007*"data" + 0.007*"includ"'), (6, '0.061*"music" + 0.030*"perform" + 0.019*"theatr" + 0.018*"compos" + 0.016*"plai" + 0.016*"festiv" + 0.015*"danc" + 0.014*"orchestra" + 0.012*"opera" + 0.011*"piano"'), (7, '0.013*"number" + 0.011*"function" + 0.010*"model" + 0.009*"valu" + 0.008*"set" + 0.008*"exampl" + 0.007*"gener" + 0.007*"theori" + 0.007*"point" + 0.006*"method"'), (8, '0.048*"india" + 0.037*"indian" + 0.020*"http" + 0.016*"www" + 0.015*"pakistan" + 0.015*"iran" + 0.013*"sri" + 0.012*"khan" + 0.012*"islam" + 0.012*"tamil"'), (9, '0.067*"film" + 0.025*"award" + 0.022*"seri" + 0.021*"episod" + 0.021*"best" + 0.015*"star" + 0.012*"role" + 0.012*"actor" + 0.011*"televis" + 0.011*"produc"'), (10, '0.020*"engin" + 0.013*"power" + 0.011*"product" + 0.011*"design" + 0.010*"model" + 0.009*"produc" + 0.008*"us" + 0.008*"electr" + 0.008*"type" + 0.007*"vehicl"'), (11, '0.024*"law" + 0.021*"court" + 0.016*"state" + 0.016*"act" + 0.011*"polic" + 0.010*"case" + 0.009*"offic" + 0.009*"report" + 
0.009*"right" + 0.007*"legal"'), (12, '0.056*"elect" + 0.041*"parti" + 0.023*"member" + 0.020*"vote" + 0.020*"presid" + 0.017*"democrat" + 0.017*"minist" + 0.013*"council" + 0.013*"repres" + 0.012*"polit"'), (13, '0.057*"state" + 0.035*"new" + 0.029*"american" + 0.024*"unit" + 0.024*"york" + 0.020*"counti" + 0.015*"citi" + 0.014*"california" + 0.012*"washington" + 0.010*"texa"'), (14, '0.027*"univers" + 0.015*"research" + 0.014*"institut" + 0.012*"nation" + 0.012*"scienc" + 0.012*"work" + 0.012*"intern" + 0.011*"award" + 0.011*"develop" + 0.010*"organ"'), (15, '0.034*"england" + 0.024*"unit" + 0.021*"london" + 0.019*"cricket" + 0.019*"town" + 0.016*"citi" + 0.015*"scotland" + 0.013*"manchest" + 0.013*"west" + 0.012*"scottish"'), (16, '0.031*"church" + 0.017*"famili" + 0.017*"di" + 0.016*"son" + 0.015*"marri" + 0.014*"year" + 0.013*"father" + 0.013*"life" + 0.013*"born" + 0.012*"daughter"'), (17, '0.060*"race" + 0.020*"car" + 0.017*"team" + 0.012*"finish" + 0.012*"tour" + 0.012*"driver" + 0.011*"ford" + 0.011*"time" + 0.011*"championship" + 0.011*"year"'), (18, '0.010*"water" + 0.007*"light" + 0.007*"energi" + 0.007*"high" + 0.006*"surfac" + 0.006*"earth" + 0.006*"time" + 0.005*"effect" + 0.005*"temperatur" + 0.005*"materi"'), (19, '0.022*"radio" + 0.020*"new" + 0.019*"broadcast" + 0.018*"station" + 0.014*"televis" + 0.013*"channel" + 0.013*"dai" + 0.011*"program" + 0.011*"host" + 0.011*"air"'), (20, '0.035*"win" + 0.018*"contest" + 0.017*"wrestl" + 0.017*"fight" + 0.016*"match" + 0.016*"titl" + 0.015*"championship" + 0.014*"team" + 0.012*"world" + 0.011*"defeat"'), (21, '0.011*"languag" + 0.007*"word" + 0.007*"form" + 0.006*"peopl" + 0.006*"differ" + 0.006*"cultur" + 0.006*"us" + 0.006*"mean" + 0.005*"tradit" + 0.005*"term"'), (22, '0.051*"popul" + 0.033*"ag" + 0.030*"citi" + 0.029*"town" + 0.027*"famili" + 0.026*"censu" + 0.023*"household" + 0.023*"commun" + 0.021*"peopl" + 0.021*"counti"'), (23, '0.016*"medic" + 0.014*"health" + 0.014*"hospit" + 0.013*"cell" + 
0.011*"diseas" + 0.010*"patient" + 0.009*"ret" + 0.009*"caus" + 0.008*"human" + 0.008*"treatment"'), (24, '0.037*"ship" + 0.017*"navi" + 0.015*"sea" + 0.012*"island" + 0.012*"boat" + 0.011*"port" + 0.010*"naval" + 0.010*"coast" + 0.010*"gun" + 0.009*"fleet"'), (25, '0.044*"round" + 0.044*"final" + 0.025*"tournament" + 0.023*"group" + 0.020*"point" + 0.020*"winner" + 0.018*"open" + 0.015*"place" + 0.013*"qualifi" + 0.012*"won"'), (26, '0.032*"world" + 0.030*"women" + 0.028*"championship" + 0.026*"olymp" + 0.023*"men" + 0.022*"event" + 0.022*"medal" + 0.018*"athlet" + 0.017*"gold" + 0.017*"nation"'), (27, '0.056*"born" + 0.034*"russian" + 0.026*"american" + 0.020*"russia" + 0.020*"soviet" + 0.017*"polish" + 0.015*"jewish" + 0.014*"poland" + 0.014*"republ" + 0.013*"moscow"'), (28, '0.029*"build" + 0.025*"hous" + 0.014*"built" + 0.012*"locat" + 0.012*"street" + 0.012*"site" + 0.011*"histor" + 0.009*"park" + 0.009*"citi" + 0.009*"place"'), (29, '0.039*"leagu" + 0.036*"club" + 0.035*"plai" + 0.031*"team" + 0.026*"footbal" + 0.026*"season" + 0.023*"cup" + 0.018*"goal" + 0.016*"player" + 0.016*"match"'), (30, '0.053*"french" + 0.041*"franc" + 0.027*"italian" + 0.025*"pari" + 0.022*"saint" + 0.020*"itali" + 0.018*"jean" + 0.014*"de" + 0.011*"loui" + 0.011*"le"'), (31, '0.067*"australia" + 0.058*"australian" + 0.051*"new" + 0.040*"china" + 0.033*"zealand" + 0.032*"south" + 0.027*"chines" + 0.021*"sydnei" + 0.015*"melbourn" + 0.013*"queensland"'), (32, '0.026*"speci" + 0.011*"famili" + 0.009*"plant" + 0.008*"white" + 0.008*"bird" + 0.007*"genu" + 0.007*"red" + 0.007*"forest" + 0.007*"fish" + 0.006*"tree"'), (33, '0.033*"compani" + 0.013*"million" + 0.012*"busi" + 0.012*"market" + 0.011*"product" + 0.010*"bank" + 0.010*"year" + 0.009*"industri" + 0.008*"oper" + 0.008*"new"'), (34, '0.085*"island" + 0.073*"canada" + 0.065*"canadian" + 0.026*"toronto" + 0.025*"ontario" + 0.017*"korean" + 0.017*"korea" + 0.016*"quebec" + 0.016*"montreal" + 0.016*"british"'), (35, '0.034*"kong" + 
0.034*"japanes" + 0.033*"hong" + 0.023*"lee" + 0.021*"singapor" + 0.019*"chines" + 0.018*"kim" + 0.015*"japan" + 0.014*"indonesia" + 0.014*"thailand"'), (36, '0.054*"art" + 0.034*"museum" + 0.030*"jpg" + 0.027*"file" + 0.024*"work" + 0.022*"paint" + 0.020*"artist" + 0.019*"design" + 0.017*"imag" + 0.017*"exhibit"'), (37, '0.008*"time" + 0.007*"man" + 0.005*"later" + 0.005*"appear" + 0.005*"charact" + 0.005*"kill" + 0.004*"like" + 0.004*"friend" + 0.004*"return" + 0.004*"end"'), (38, '0.014*"govern" + 0.012*"state" + 0.012*"nation" + 0.010*"war" + 0.009*"polit" + 0.008*"countri" + 0.008*"peopl" + 0.007*"group" + 0.007*"unit" + 0.007*"support"'), (39, '0.050*"air" + 0.026*"aircraft" + 0.026*"oper" + 0.025*"airport" + 0.017*"forc" + 0.017*"flight" + 0.015*"squadron" + 0.014*"unit" + 0.012*"base" + 0.011*"wing"'), (40, '0.052*"bar" + 0.038*"africa" + 0.033*"text" + 0.033*"african" + 0.031*"till" + 0.029*"color" + 0.026*"south" + 0.023*"black" + 0.013*"tropic" + 0.013*"storm"'), (41, '0.039*"book" + 0.033*"publish" + 0.021*"work" + 0.015*"new" + 0.013*"press" + 0.013*"univers" + 0.013*"edit" + 0.011*"stori" + 0.011*"novel" + 0.011*"author"'), (42, '0.026*"king" + 0.019*"centuri" + 0.010*"princ" + 0.009*"empir" + 0.009*"kingdom" + 0.009*"emperor" + 0.009*"greek" + 0.008*"roman" + 0.007*"ancient" + 0.006*"year"'), (43, '0.033*"san" + 0.022*"spanish" + 0.017*"mexico" + 0.016*"del" + 0.013*"spain" + 0.012*"santa" + 0.011*"brazil" + 0.011*"juan" + 0.010*"josé" + 0.009*"francisco"'), (44, '0.029*"game" + 0.027*"season" + 0.023*"team" + 0.015*"plai" + 0.014*"coach" + 0.014*"player" + 0.011*"footbal" + 0.010*"year" + 0.010*"leagu" + 0.009*"record"'), (45, '0.015*"john" + 0.011*"david" + 0.010*"michael" + 0.008*"paul" + 0.008*"smith" + 0.007*"robert" + 0.007*"jame" + 0.006*"peter" + 0.006*"jack" + 0.006*"jone"'), (46, '0.133*"class" + 0.062*"align" + 0.060*"left" + 0.056*"wikit" + 0.046*"style" + 0.043*"center" + 0.035*"right" + 0.032*"philippin" + 0.032*"list" + 0.026*"text"'), 
(47, '0.025*"river" + 0.024*"station" + 0.021*"line" + 0.020*"road" + 0.017*"railwai" + 0.015*"rout" + 0.013*"lake" + 0.012*"park" + 0.011*"bridg" + 0.011*"area"'), (48, '0.072*"octob" + 0.070*"septemb" + 0.069*"march" + 0.062*"decemb" + 0.062*"januari" + 0.062*"novemb" + 0.061*"juli" + 0.061*"august" + 0.060*"april" + 0.058*"june"'), (49, '0.093*"district" + 0.066*"villag" + 0.047*"region" + 0.039*"east" + 0.039*"west" + 0.038*"north" + 0.036*"counti" + 0.033*"south" + 0.032*"municip" + 0.029*"provinc"')]
# Evaluate the metrics DataFrame as the cell's last expression so the notebook renders it as a table.
tm_metrics
| | coherence | l2_norm | model | perplexity | topics | train_time |
---|---|---|---|---|---|---|
0 | -2.814135 | 7.265412 | nmf | 975.740399 | [(24, 0.131*"mount" + 0.129*"lemmon" + 0.129*"... | 4394.560518 |
1 | -2.436650 | 7.268837 | nmf_with_r | 985.570926 | [(49, 0.112*"peak" + 0.111*"kitt" + 0.111*"mou... | 26451.927848 |
2 | -2.514469 | 7.371544 | lda | 4727.075546 | [(35, 0.034*"kong" + 0.034*"japanes" + 0.033*"... | 8278.891060 |
# Print the stored topics of every evaluated model, one banner-headed section per model.
banner = '=' * 20
for _, metrics_row in tm_metrics.iterrows():
    # Section header: the model's name framed by two banner lines, then a blank line.
    print(banner)
    print(metrics_row['model'])
    print(banner, end='\n\n')

    # Each entry of the 'topics' cell unpacks into a topic id and its term string.
    for topic_id, topic_terms in metrics_row['topics']:
        print('Topic: {}'.format(topic_id))
        print(topic_terms, end='\n\n')

    # Extra blank line separating this model's section from the next one.
    print()
==================== nmf ==================== Topic: 24 0.131*"mount" + 0.129*"lemmon" + 0.129*"peak" + 0.127*"kitt" + 0.127*"spacewatch" + 0.065*"survei" + 0.037*"octob" + 0.031*"septemb" + 0.023*"css" + 0.023*"catalina" Topic: 32 0.196*"linear" + 0.195*"socorro" + 0.045*"septemb" + 0.039*"neat" + 0.035*"palomar" + 0.032*"octob" + 0.024*"kitt" + 0.024*"peak" + 0.024*"spacewatch" + 0.023*"anderson" Topic: 8 0.331*"align" + 0.270*"left" + 0.071*"right" + 0.040*"text" + 0.035*"style" + 0.022*"center" + 0.013*"bar" + 0.009*"till" + 0.008*"bgcolor" + 0.008*"color" Topic: 27 0.186*"district" + 0.027*"pennsylvania" + 0.022*"grade" + 0.017*"fund" + 0.017*"educ" + 0.017*"basic" + 0.016*"level" + 0.014*"oblast" + 0.014*"rural" + 0.013*"tax" Topic: 48 0.103*"art" + 0.066*"museum" + 0.040*"paint" + 0.035*"work" + 0.026*"artist" + 0.024*"galleri" + 0.022*"exhibit" + 0.019*"collect" + 0.015*"histori" + 0.013*"jpg" Topic: 11 0.122*"new" + 0.043*"york" + 0.009*"zealand" + 0.007*"jersei" + 0.006*"american" + 0.006*"time" + 0.006*"australia" + 0.005*"radio" + 0.005*"press" + 0.005*"washington" Topic: 20 0.008*"us" + 0.006*"gener" + 0.006*"model" + 0.006*"data" + 0.006*"design" + 0.005*"time" + 0.005*"function" + 0.005*"number" + 0.005*"process" + 0.005*"exampl" Topic: 28 0.074*"year" + 0.022*"dai" + 0.012*"time" + 0.008*"ag" + 0.006*"month" + 0.006*"includ" + 0.006*"follow" + 0.005*"later" + 0.005*"old" + 0.005*"student" Topic: 38 0.033*"royal" + 0.025*"john" + 0.025*"william" + 0.016*"lieuten" + 0.013*"georg" + 0.012*"offic" + 0.012*"jame" + 0.011*"sergeant" + 0.011*"major" + 0.010*"charl" Topic: 19 0.012*"area" + 0.011*"river" + 0.010*"water" + 0.004*"larg" + 0.004*"region" + 0.004*"lake" + 0.004*"power" + 0.004*"high" + 0.004*"bar" + 0.004*"form" ==================== nmf_with_r ==================== Topic: 49 0.112*"peak" + 0.111*"kitt" + 0.111*"mount" + 0.111*"spacewatch" + 0.109*"lemmon" + 0.055*"survei" + 0.044*"octob" + 0.041*"septemb" + 0.026*"novemb" + 0.021*"march" Topic: 
32 0.194*"linear" + 0.193*"socorro" + 0.047*"septemb" + 0.038*"neat" + 0.034*"palomar" + 0.034*"octob" + 0.025*"decemb" + 0.024*"august" + 0.023*"anderson" + 0.023*"mesa" Topic: 48 0.112*"art" + 0.063*"museum" + 0.037*"paint" + 0.036*"work" + 0.028*"artist" + 0.026*"galleri" + 0.025*"exhibit" + 0.020*"collect" + 0.015*"histori" + 0.014*"design" Topic: 4 0.093*"club" + 0.049*"cup" + 0.033*"footbal" + 0.031*"goal" + 0.022*"leagu" + 0.022*"unit" + 0.022*"plai" + 0.022*"match" + 0.018*"score" + 0.015*"player" Topic: 27 0.159*"district" + 0.031*"pennsylvania" + 0.025*"grade" + 0.021*"educ" + 0.019*"fund" + 0.018*"basic" + 0.017*"level" + 0.015*"student" + 0.014*"receiv" + 0.014*"tax" Topic: 17 0.095*"season" + 0.014*"plai" + 0.010*"coach" + 0.009*"final" + 0.009*"second" + 0.008*"win" + 0.008*"record" + 0.008*"career" + 0.008*"finish" + 0.007*"point" Topic: 40 0.009*"time" + 0.008*"later" + 0.007*"kill" + 0.006*"appear" + 0.005*"man" + 0.005*"death" + 0.005*"father" + 0.005*"return" + 0.005*"son" + 0.004*"charact" Topic: 20 0.008*"us" + 0.006*"gener" + 0.005*"design" + 0.005*"model" + 0.005*"develop" + 0.005*"time" + 0.004*"data" + 0.004*"number" + 0.004*"function" + 0.004*"process" Topic: 19 0.009*"water" + 0.008*"area" + 0.008*"speci" + 0.005*"larg" + 0.004*"order" + 0.004*"region" + 0.004*"includ" + 0.004*"black" + 0.004*"famili" + 0.004*"popul" Topic: 38 0.044*"royal" + 0.020*"william" + 0.019*"john" + 0.016*"corp" + 0.014*"lieuten" + 0.013*"capt" + 0.012*"engin" + 0.011*"armi" + 0.011*"georg" + 0.011*"temp" ==================== lda ==================== Topic: 35 0.034*"kong" + 0.034*"japanes" + 0.033*"hong" + 0.023*"lee" + 0.021*"singapor" + 0.019*"chines" + 0.018*"kim" + 0.015*"japan" + 0.014*"indonesia" + 0.014*"thailand" Topic: 23 0.016*"medic" + 0.014*"health" + 0.014*"hospit" + 0.013*"cell" + 0.011*"diseas" + 0.010*"patient" + 0.009*"ret" + 0.009*"caus" + 0.008*"human" + 0.008*"treatment" Topic: 47 0.025*"river" + 0.024*"station" + 0.021*"line" + 0.020*"road" 
+ 0.017*"railwai" + 0.015*"rout" + 0.013*"lake" + 0.012*"park" + 0.011*"bridg" + 0.011*"area" Topic: 14 0.027*"univers" + 0.015*"research" + 0.014*"institut" + 0.012*"nation" + 0.012*"scienc" + 0.012*"work" + 0.012*"intern" + 0.011*"award" + 0.011*"develop" + 0.010*"organ" Topic: 39 0.050*"air" + 0.026*"aircraft" + 0.026*"oper" + 0.025*"airport" + 0.017*"forc" + 0.017*"flight" + 0.015*"squadron" + 0.014*"unit" + 0.012*"base" + 0.011*"wing" Topic: 17 0.060*"race" + 0.020*"car" + 0.017*"team" + 0.012*"finish" + 0.012*"tour" + 0.012*"driver" + 0.011*"ford" + 0.011*"time" + 0.011*"championship" + 0.011*"year" Topic: 4 0.137*"school" + 0.040*"colleg" + 0.039*"student" + 0.033*"univers" + 0.030*"high" + 0.028*"educ" + 0.016*"year" + 0.011*"graduat" + 0.010*"state" + 0.009*"campu" Topic: 8 0.048*"india" + 0.037*"indian" + 0.020*"http" + 0.016*"www" + 0.015*"pakistan" + 0.015*"iran" + 0.013*"sri" + 0.012*"khan" + 0.012*"islam" + 0.012*"tamil" Topic: 2 0.062*"german" + 0.039*"germani" + 0.025*"van" + 0.023*"von" + 0.020*"der" + 0.019*"dutch" + 0.019*"berlin" + 0.015*"swedish" + 0.014*"netherland" + 0.014*"sweden" Topic: 11 0.024*"law" + 0.021*"court" + 0.016*"state" + 0.016*"act" + 0.011*"polic" + 0.010*"case" + 0.009*"offic" + 0.009*"report" + 0.009*"right" + 0.007*"legal"
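As we can see, NMF can be significantly faster than LDA (in the table above, plain `nmf` trained in roughly half the time of `lda`, although the `nmf_with_r` variant was the slowest of the three) without sacrificing much topic quality, or any at all.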
Moreover, NMF can be very flexible in its RAM usage thanks to the sparsity option, which keeps only a small number of elements in its inner matrices.