"""Fit a 20-topic LDA model on the 20-newsgroups corpus and visualize it
with pyLDAvis under three different inter-topic distance projections
(PCoA, metric MDS, t-SNE)."""
import pyLDAvis
import pyLDAvis.sklearn
from pyLDAvis._prepare import js_MMDS, js_PCoA, js_TSNE
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Render pyLDAvis figures inline; only has an effect inside a Jupyter notebook.
pyLDAvis.enable_notebook()

# Strip headers/footers/quotes so topics reflect message bodies, not boilerplate.
newsgroup = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
docs_raw = newsgroup.data
print(len(docs_raw))  # 11314 documents in the training split

# NOTE(review): LDA is conventionally fit on raw term counts (CountVectorizer);
# TF-IDF weighting is kept here to preserve the original pipeline's behavior.
tfidf_vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    stop_words='english',
    token_pattern=r'\b[a-zA-Z]{2,}\b',  # purely alphabetic tokens, length >= 2
    max_df=0.5,   # drop terms appearing in more than half the documents
    min_df=10,    # drop terms appearing in fewer than 10 documents
)
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
print(dtm_tfidf.shape)  # previously observed: (11314, 9597)

# `n_topics` was renamed `n_components` in scikit-learn 0.19 and removed in 0.21.
lda = LatentDirichletAllocation(n_components=20, learning_method='batch',
                                random_state=0)
lda.fit(dtm_tfidf)

# Same fitted model prepared three ways; only the 2-D inter-topic map differs.
# In a notebook each `prepare(...)` result displays as an interactive figure.
pyLDAvis.sklearn.prepare(lda, dtm_tfidf, tfidf_vectorizer, sort_topics=False, mds=js_PCoA)
pyLDAvis.sklearn.prepare(lda, dtm_tfidf, tfidf_vectorizer, sort_topics=False, mds=js_MMDS)
pyLDAvis.sklearn.prepare(lda, dtm_tfidf, tfidf_vectorizer, sort_topics=False, mds=js_TSNE)