"""Fit a 20-topic LDA model on the 20-newsgroups corpus and visualize it
with pyLDAvis under three different inter-topic distance projections
(PCoA, metric MDS, t-SNE)."""
import pyLDAvis
import pyLDAvis.sklearn
from pyLDAvis._prepare import js_MMDS, js_PCoA, js_TSNE
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Render pyLDAvis figures inline; only has an effect inside a Jupyter notebook.
pyLDAvis.enable_notebook()

# Strip headers/footers/quotes so topics reflect message bodies, not boilerplate.
newsgroup = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
docs_raw = newsgroup.data
print(len(docs_raw))  # 11314 documents in the training split

# NOTE(review): LDA is conventionally fit on raw term counts (CountVectorizer);
# TF-IDF weighting is kept here to preserve the original pipeline's behavior.
tfidf_vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    stop_words='english',
    token_pattern=r'\b[a-zA-Z]{2,}\b',  # purely alphabetic tokens, length >= 2
    max_df=0.5,   # drop terms appearing in more than half the documents
    min_df=10,    # drop terms appearing in fewer than 10 documents
)
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
print(dtm_tfidf.shape)  # previously observed: (11314, 9597)

# `n_topics` was renamed `n_components` in scikit-learn 0.19 and removed in 0.21.
lda = LatentDirichletAllocation(n_components=20, learning_method='batch',
                                random_state=0)
lda.fit(dtm_tfidf)

# Same fitted model prepared three ways; only the 2-D inter-topic map differs.
# In a notebook each `prepare(...)` result displays as an interactive figure.
pyLDAvis.sklearn.prepare(lda, dtm_tfidf, tfidf_vectorizer, sort_topics=False, mds=js_PCoA)
pyLDAvis.sklearn.prepare(lda, dtm_tfidf, tfidf_vectorizer, sort_topics=False, mds=js_MMDS)
pyLDAvis.sklearn.prepare(lda, dtm_tfidf, tfidf_vectorizer, sort_topics=False, mds=js_TSNE)