# Author: Olivier Grisel <olivier.grisel@ensta.org>
# Lars Buitinck
# Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause
from time import time
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation, MiniBatchNMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20
batch_size = 128
init = "nndsvda"
def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()
# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.
print("Loading dataset...")
t0 = time()
data, _ = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
    return_X_y=True,
)
data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))
print(len(data_samples))
data_samples[1999]
Loading dataset...
done in 3.208s.
2000
"\n\n\nNeither did he!\n\n\nOverall? How do you figure?\n\n\nSo far my radio hasn't exploded from not being tuned to 660...\n"
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
type(tfidf)
tfidf.shape
print(tfidf)
Extracting tf-idf features for NMF...
done in 1.101s.
scipy.sparse._csr.csr_matrix
(2000, 1000)
  (0, 708)	0.12621877625178227
  (0, 410)	0.11650651629173196
  (0, 493)	0.1631127602376565
  :	:
  (1999, 738)	0.5707845186348437
  (1999, 366)	0.56500361648845
  (1999, 356)	0.44578463121221495
  (1999, 286)	0.3952872489933768
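# The coordinate/value dump above is hard to read. A minimal add-on sketch
# (not part of the original example) that maps the column indices of the first
# document back to their terms and tf-idf weights:
doc0 = tfidf[0]
terms = tfidf_vectorizer.get_feature_names_out()
for col, weight in zip(doc0.indices, doc0.data):
    print(f"{terms[col]}: {weight:.3f}")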
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()
# Fit the NMF model
print(
    "Fitting the NMF model (Frobenius norm) with tf-idf features, "
    "n_samples=%d and n_features=%d..." % (n_samples, n_features)
)
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    init=init,
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=1,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf, tfidf_feature_names, n_top_words, "Topics in NMF model (Frobenius norm)"
)
# Fit the NMF model with the generalized Kullback-Leibler divergence
print(
    "\n" * 2,
    "Fitting the NMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    init=init,
    beta_loss="kullback-leibler",
    solver="mu",
    max_iter=1000,
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in NMF model (generalized Kullback-Leibler divergence)",
)
# Fit the MiniBatchNMF model
print(
    "\n" * 2,
    "Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf "
    "features, n_samples=%d and n_features=%d, batch_size=%d..."
    % (n_samples, n_features, batch_size),
)
t0 = time()
mbnmf = MiniBatchNMF(
    n_components=n_components,
    random_state=1,
    batch_size=batch_size,
    init=init,
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    mbnmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in MiniBatchNMF model (Frobenius norm)",
)
# Fit the MiniBatchNMF model with the generalized Kullback-Leibler divergence
print(
    "\n" * 2,
    "Fitting the MiniBatchNMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d, "
    "batch_size=%d..." % (n_samples, n_features, batch_size),
)
t0 = time()
mbnmf = MiniBatchNMF(
    n_components=n_components,
    random_state=1,
    batch_size=batch_size,
    init=init,
    beta_loss="kullback-leibler",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    mbnmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in MiniBatchNMF model (generalized Kullback-Leibler divergence)",
)
print(
    "\n" * 2,
    "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))
tf_feature_names = tf_vectorizer.get_feature_names_out()
plot_top_words(lda, tf_feature_names, n_top_words, "Topics in LDA model")
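# As a small optional follow-up (not part of the original example), the fitted
# LDA model can also report a per-document topic distribution via `transform`;
# taking the argmax gives the dominant topic of each post:
doc_topic_distr = lda.transform(tf)        # shape: (n_samples, n_components)
dominant_topics = doc_topic_distr.argmax(axis=1)
print(dominant_topics[:10])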
As before, let's consider a specific set of categories (see the 20 newsgroups dataset description: https://scikit-learn.org/stable/datasets/real_world.html#the-20-newsgroups-text-dataset).
The 20 newsgroups dataset comprises around 18000 newsgroup posts on 20 topics, split into two subsets: one for training (or development) and the other for testing (or performance evaluation). The split between the train and test sets is based upon messages posted before and after a specific date.
This module contains two loaders. The first one, sklearn.datasets.fetch_20newsgroups, returns a list of the raw texts that can be fed to text feature extractors such as CountVectorizer with custom parameters so as to extract feature vectors. The second one, sklearn.datasets.fetch_20newsgroups_vectorized, returns ready-to-use features, i.e., it is not necessary to use a feature extractor.
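As a quick sketch of the difference between the two loaders (illustrative only; the subset choice and the printed attributes below are just examples):

from sklearn.datasets import fetch_20newsgroups, fetch_20newsgroups_vectorized

# Raw texts: pass them through a vectorizer of your choice.
raw = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))
print(type(raw.data[0]))   # each entry is a plain string

# Ready-made features: a sparse document-term matrix, no vectorizer needed.
vect = fetch_20newsgroups_vectorized(subset="train", remove=("headers", "footers", "quotes"))
print(vect.data.shape)     # (n_documents, n_features)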
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn.datasets import fetch_20newsgroups
def load_dataset(sset, cats):
    if cats == []:
        newsgroups_dset = fetch_20newsgroups(subset=sset,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    else:
        newsgroups_dset = fetch_20newsgroups(subset=sset, categories=cats,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    return newsgroups_dset
categories = ["comp.windows.x", "misc.forsale", "rec.autos", "rec.motorcycles", "rec.sport.baseball"]
categories += ["rec.sport.hockey", "sci.crypt", "sci.med", "sci.space", "talk.politics.mideast"]
newsgroups_all = load_dataset('all', categories)
print(len(newsgroups_all.data))
9850
newsgroups_all.keys()
dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])
Convert word forms to stems to get concise representations for the documents:
import nltk
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")

def stem(text):
    return stemmer.stem(text)
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS as stopwords
#print(stopwords)

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text, min_len=4):
        if token not in stopwords:  #and len(token) > 3:
            result.append(stem(token))
    return result
Check how each document is represented. For example, look into the very first one:
doc_sample = newsgroups_all.data[0]
print('Original document: ')
print(doc_sample)
print('\n\nTokenized document: ')
words = []
for token in gensim.utils.tokenize(doc_sample):
    words.append(token)
print(words)
print('\n\nPreprocessed document: ')
print(preprocess(doc_sample))
Original document:
Hi Xperts! How can I move the cursor with the keyboard (i.e. cursor keys), if no mouse is available? Any hints welcome. Thanks.

Tokenized document:
['Hi', 'Xperts', 'How', 'can', 'I', 'move', 'the', 'cursor', 'with', 'the', 'keyboard', 'i', 'e', 'cursor', 'keys', 'if', 'no', 'mouse', 'is', 'available', 'Any', 'hints', 'welcome', 'Thanks']

Preprocessed document:
['xpert', 'cursor', 'keyboard', 'cursor', 'key', 'mous', 'avail', 'hint', 'welcom', 'thank']
What do the first 10 documents look like?
for i in range(0, 10):
    print(str(i) + "\t" + ", ".join(preprocess(newsgroups_all.data[i])[:10]))
0	xpert, cursor, keyboard, cursor, key, mous, avail, hint, welcom, thank
1	obtain, copi, open, look, widget, obtain, need, order, copi, thank
2	right, signal, strong, live, west, philadelphia, perfect, sport, fan, dream
3	canadian, thing, coach, boston, bruin, colorado, rocki, summari, post, gather
4	heck, feel, like, time, includ, cafeteria, work, half, time, headach
5	damn, right, late, climb, meet, morn, bother, right, foot, asleep
6	olympus, stylus, pocket, camera, smallest, class, includ, time, date, stamp
7	includ, follow, chmos, clock, generat, driver, processor, chmos, eras, prom
8	chang, intel, discov, xclient, xload, longer, work, bomb, messag, error
9	termin, like, power, server, run, window, manag, special, client, program
Now let's represent each document as a dictionary of relevant words. Each word (value in the dictionary) has a unique identifier (key):
processed_docs = []
for i in range(0, len(newsgroups_all.data)):
    processed_docs.append(preprocess(newsgroups_all.data[i]))
print(len(processed_docs))

dictionary = gensim.corpora.Dictionary(processed_docs)
print(len(dictionary))

index = 0
for key, value in dictionary.iteritems():
    print(key, value)
    index += 1
    if index > 9:
        break
9850
39350
0 avail
1 cursor
2 hint
3 key
4 keyboard
5 mous
6 thank
7 welcom
8 xpert
9 copi
Put some constraints on the dictionary of terms: for instance, keep up to 100,000 words that appear in at least 10 documents (no_below) and in no more than 50% of the documents (no_above). This should help you extract the most useful terms, while still keeping a reasonable number of them.
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)
print(len(dictionary))
5868
Let's see how a particular document is represented in this dictionary: for example, look into the very first post, or into the 100th:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]
#bow_corpus[99]
[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]
Let's decode what each index (key) in this dictionary points to:
#bow_doc = bow_corpus[99]
bow_doc = bow_corpus[0]
for i in range(len(bow_doc)):
    print(f"Key {bow_doc[i][0]} =\"{dictionary[bow_doc[i][0]]}\": occurrences={bow_doc[i][1]}")
Key 0 ="avail": occurrences=1
Key 1 ="cursor": occurrences=2
Key 2 ="hint": occurrences=1
Key 3 ="key": occurrences=1
Key 4 ="keyboard": occurrences=1
Key 5 ="mous": occurrences=1
Key 6 ="thank": occurrences=1
Key 7 ="welcom": occurrences=1
Key 8 ="xpert": occurrences=1
# Create the dictionary
id2word = dictionary
# Create the corpus with word frequencies
corpus = bow_corpus
# Build the LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=1000,
                                            passes=10,
                                            alpha='symmetric',
                                            iterations=100,
                                            per_word_topics=True)

for index, topic in lda_model.print_topics(-1):
    print(f"Topic: {index} \nWords: {topic}")
Topic: 0
Words: 0.021*"encrypt" + 0.018*"secur" + 0.018*"chip" + 0.016*"govern" + 0.013*"clipper" + 0.012*"public" + 0.010*"privaci" + 0.010*"key" + 0.010*"phone" + 0.009*"algorithm"
Topic: 1
Words: 0.017*"appear" + 0.014*"copi" + 0.013*"cover" + 0.013*"star" + 0.013*"book" + 0.011*"penalti" + 0.010*"black" + 0.009*"comic" + 0.008*"blue" + 0.008*"green"
Topic: 2
Words: 0.031*"window" + 0.015*"server" + 0.012*"program" + 0.012*"file" + 0.012*"applic" + 0.012*"display" + 0.011*"widget" + 0.010*"version" + 0.010*"motif" + 0.010*"support"
Topic: 3
Words: 0.015*"space" + 0.007*"launch" + 0.007*"year" + 0.007*"medic" + 0.006*"patient" + 0.006*"orbit" + 0.006*"research" + 0.006*"diseas" + 0.005*"develop" + 0.005*"nasa"
Topic: 4
Words: 0.018*"armenian" + 0.011*"peopl" + 0.008*"kill" + 0.008*"said" + 0.007*"turkish" + 0.006*"muslim" + 0.006*"jew" + 0.006*"govern" + 0.005*"state" + 0.005*"greek"
Topic: 5
Words: 0.024*"price" + 0.021*"sale" + 0.020*"offer" + 0.017*"drive" + 0.017*"sell" + 0.016*"includ" + 0.013*"ship" + 0.013*"interest" + 0.011*"ask" + 0.010*"condit"
Topic: 6
Words: 0.018*"mail" + 0.016*"list" + 0.015*"file" + 0.015*"inform" + 0.013*"send" + 0.012*"post" + 0.012*"avail" + 0.010*"request" + 0.010*"program" + 0.009*"includ"
Topic: 7
Words: 0.019*"like" + 0.016*"know" + 0.011*"time" + 0.011*"look" + 0.010*"think" + 0.008*"want" + 0.008*"thing" + 0.008*"good" + 0.007*"go" + 0.007*"bike"
Topic: 8
Words: 0.033*"game" + 0.022*"team" + 0.017*"play" + 0.015*"year" + 0.013*"player" + 0.011*"season" + 0.008*"hockey" + 0.008*"score" + 0.007*"leagu" + 0.007*"goal"
Topic: 9
Words: 0.013*"peopl" + 0.012*"think" + 0.011*"like" + 0.009*"time" + 0.009*"right" + 0.009*"israel" + 0.009*"know" + 0.006*"reason" + 0.006*"point" + 0.006*"thing"
What is the most representative topic in each document?
def analyse_topics(ldamodel, corpus, texts):
    main_topic = {}
    percentage = {}
    keywords = {}
    text_snippets = {}
    # Get main topic in each document
    for i, topic_list in enumerate(ldamodel[corpus]):
        #print("\n")
        #print(topic_list)
        #print("\n")
        #for i in range(0, len(topic_list)):
        #    print(topic_list[i])
        topic = topic_list[0] if ldamodel.per_word_topics else topic_list
        #print(topic)
        topic = sorted(topic, key=lambda x: (x[1]), reverse=True)
        # Get the main topic, contribution (%) and keywords for each document
        for j, (topic_num, prop_topic) in enumerate(topic):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp[:5]])
                main_topic[i] = int(topic_num)
                percentage[i] = round(prop_topic, 4)
                keywords[i] = topic_keywords
                text_snippets[i] = texts[i][:8]
            else:
                break
    return main_topic, percentage, keywords, text_snippets
main_topic, percentage, keywords, text_snippets = analyse_topics(
    lda_model, bow_corpus, processed_docs)

indexes = []
rows = []
for i in range(0, 10):
    indexes.append(i)

rows.append(['ID', 'Main Topic', 'Contribution (%)', 'Keywords', 'Snippet'])
for idx in indexes:
    rows.append([str(idx), f"{main_topic.get(idx)}",
                 f"{percentage.get(idx):.4f}",
                 f"{keywords.get(idx)}\n",
                 f"{text_snippets.get(idx)}"])

columns = zip(*rows)
column_widths = [max(len(item) for item in col) for col in columns]

for row in rows:
    print(''.join(' {:{width}} '.format(row[i], width=column_widths[i])
                  for i in range(0, len(row))))
 ID   Main Topic   Contribution (%)   Keywords                                  Snippet
 0    2            0.8268             window, server, program, file, applic     ['xpert', 'cursor', 'keyboard', 'cursor', 'key', 'mous', 'avail', 'hint']
 1    6            0.4741             mail, list, file, inform, send            ['obtain', 'copi', 'open', 'look', 'widget', 'obtain', 'need', 'order']
 2    7            0.4230             like, know, time, look, think             ['right', 'signal', 'strong', 'live', 'west', 'philadelphia', 'perfect', 'sport']
 3    8            0.4159             game, team, play, year, player            ['canadian', 'thing', 'coach', 'boston', 'bruin', 'colorado', 'rocki', 'summari']
 4    9            0.9039             peopl, think, like, time, right           ['heck', 'feel', 'like', 'time', 'includ', 'cafeteria', 'work', 'half']
 5    7            0.6291             like, know, time, look, think             ['damn', 'right', 'late', 'climb', 'meet', 'morn', 'bother', 'right']
 6    3            0.3485             space, launch, year, medic, patient       ['olympus', 'stylus', 'pocket', 'camera', 'smallest', 'class', 'includ', 'time']
 7    5            0.3799             price, sale, offer, drive, sell           ['includ', 'follow', 'chmos', 'clock', 'generat', 'driver', 'processor', 'chmos']
 8    2            0.7943             window, server, program, file, applic     ['chang', 'intel', 'discov', 'xclient', 'xload', 'longer', 'work', 'bomb']
 9    2            0.6383             window, server, program, file, applic     ['termin', 'like', 'power', 'server', 'run', 'window', 'manag', 'special']
!pip install pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis
Note: for newer versions of gensim, use the following code:
#import pyLDAvis.gensim_models
#pyLDAvis.enable_notebook()
#vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)
#vis
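If the interactive visualization does not render inline, it can also be written to a standalone HTML file (an optional extra step; the file name below is only an example):

pyLDAvis.save_html(vis, "newsgroups_lda.html")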