from lda2vec import preprocess, Corpus
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# seaborn is optional: nothing in the visible code calls it (presumably it is
# imported only for its plot styling), so the notebook runs without it.
try:
    import seaborn
except ImportError:
    # Catch only a failed import. A bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit.
    pass
You must be using a very recent version of pyLDAvis to use the lda2vec outputs.
As of this writing, anything past Jan 6 2016 or this commit 14e7b5f60d8360eb84969ff08a1b77b365a5878e
should work.
You can do this quickly by installing it directly from master like so:
# pip install git+https://github.com/bmabey/pyLDAvis.git@master#egg=pyLDAvis
import pyLDAvis
# Switch on pyLDAvis' notebook integration (per the pyLDAvis docs this enables
# automatic display of visualizations in the IPython/Jupyter notebook).
pyLDAvis.enable_notebook()
After running the lda2vec_run.py
script in the examples/twenty_newsgroups/lda2vec
directory, a topics.pyldavis.npz file
will be created that contains the topic-to-word probabilities and frequencies. What's left is to visualize and label each topic from its prevalent words.
# Load every array stored in topics.pyldavis.npz into a plain dict.
# Passing the path (instead of a handle opened in text mode 'r') lets NumPy
# open the archive in binary mode; the `with` block closes it afterwards.
with np.load('topics.pyldavis.npz') as npz:
    # NpzFile reads its members lazily, so materialise all of them while the
    # archive is still open. `.items()` works on Python 2 and 3, whereas
    # `.iteritems()` is Python-2-only and absent from current NumPy.
    dat = {k: v for (k, v) in npz.items()}
# Convert the vocabulary array to a plain Python list; it is indexed word by
# word and passed to pyLDAvis further down.
# NOTE(review): if 'vocab' was saved as an object array, newer NumPy needs
# allow_pickle=True in np.load -- confirm against the file that was written.
dat['vocab'] = dat['vocab'].tolist()
# Disabled alternative: normalise term frequencies so they sum to 1.
# dat['term_frequency'] = dat['term_frequency'] * 1.0 / dat['term_frequency'].sum()
# Print the `top_n` highest-weighted words of every topic and keep them in
# `topic_to_topwords` (topic index -> list of words) for manual labelling.
top_n = 10
topic_to_topwords = {}
for j, topic_to_word in enumerate(dat['topic_term_dists']):
    # argsort sorts ascending; reverse it so the largest weights come first,
    # then keep the first `top_n` vocabulary indices.
    top = np.argsort(topic_to_word)[::-1][:top_n]
    # Trim surrounding whitespace and cap each word at 35 characters.
    top_words = [dat['vocab'][i].strip()[:35] for i in top]
    msg = 'Topic %i ' % j + ' '.join(top_words)
    # A parenthesised single-argument print produces the same output on
    # Python 2 and Python 3; the bare `print msg` statement is Python-2-only.
    print(msg)
    topic_to_topwords[j] = top_words
Topic 0 x11r5 xv window xterm server motif font xlib // sunos Topic 1 jesus son father matthew sin mary g'd disciples christ sins Topic 2 s1 nsa s2 clipper chip administration q escrow private sector serial number encryption technology Topic 3 leafs games playoffs hockey game players pens yankees bike phillies Topic 4 van - 0 pp en 1 njd standings 02 6 Topic 5 out_of_vocabulary out_of_vocabulary anonymity hiv homicide adl ripem bullock encryption technology eff Topic 6 hiv magi prof erzurum venus van 2.5 million ankara satellite launched Topic 7 nsa escrow clipper chip encryption government phones warrant vat decrypt wiretap Topic 8 mac controller shipping disk printer mb ethernet enable os/2 port Topic 9 leafs cooper weaver karabagh myers agdam phillies flyers playoffs fired Topic 10 obfuscated = ciphertext jesus gentiles matthew judas { x int Topic 11 jesus ra bobby faith god homosexuality bible sin msg islam Topic 12 jesus sin scripture matthew christ islam god sins prophet faith Topic 13 mac i thanks monitor apple upgrade card connect using windows Topic 14 i quadra monitor my apple duo hard drive mac mouse thanks Topic 15 { shipping } + mac mb os/2 $ 3.5 manuals Topic 16 playoffs morris yankees leafs // pitching players } team wins Topic 17 :> taxes guns flame .. clinton kids jobs hey drugs Topic 18 revolver tires pitching saturn ball trigger car ice team engine Topic 19 stephanopoulos leafs mamma karabagh mr. koresh apartment fired myers sumgait
import warnings

# Install a global "ignore" filter: no warnings are reported from here on.
warnings.filterwarnings('ignore')

# Collect the pyLDAvis inputs in the positional order prepare() receives them.
topic_term = dat['topic_term_dists']
doc_topic = dat['doc_topic_dists']
# Multiplying by 1.0 presumably coerces integer counts to floats -- confirm
# the stored dtypes if this matters.
doc_lengths_f = dat['doc_lengths'] * 1.0
vocab_list = dat['vocab']
term_freq_f = dat['term_frequency'] * 1.0

# mds='tsne' asks pyLDAvis to lay out the topics with its t-SNE option.
prepared_data = pyLDAvis.prepare(
    topic_term,
    doc_topic,
    doc_lengths_f,
    vocab_list,
    term_freq_f,
    mds='tsne',
)
# Keep this as the final expression so the notebook renders the result.
pyLDAvis.display(prepared_data)