#!/usr/bin/env python
# coding: utf-8
#
# # This notebook has two parts: the first is the topic-extraction example from scikit-learn; the second is adapted from Kochmar (2022), Chapter 10.
# # The first code: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py
# In[ ]:
# Author: Olivier Grisel
# Lars Buitinck
# Chyi-Kwei Yau
# License: BSD 3 clause
from time import time
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation, MiniBatchNMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20
batch_size = 128
init = "nndsvda"
def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]
        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()
# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.
print("Loading dataset...")
# In[ ]:
t0 = time()
data, _ = fetch_20newsgroups(
shuffle=True,
random_state=1,
remove=("headers", "footers", "quotes"),
return_X_y=True,
)
data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))
print(len(data_samples))
data_samples[1999]
# In[ ]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
type(tfidf)
tfidf.shape
print(tfidf)
# In[ ]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()
# Fit the NMF model
print(
    "Fitting the NMF model (Frobenius norm) with tf-idf features, "
    "n_samples=%d and n_features=%d..." % (n_samples, n_features)
)
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    init=init,
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=1,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf, tfidf_feature_names, n_top_words, "Topics in NMF model (Frobenius norm)"
)
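# The document-topic weights are not stored on the fitted model, only the
# topic-term matrix (`nmf.components_`). As a minimal, illustrative check
# (not part of the original example), `transform` projects the tf-idf matrix
# onto the learned topics:
# In[ ]:
doc_topic = nmf.transform(tfidf)      # shape: (n_samples, n_components)
print(doc_topic.shape)
print(doc_topic[0].argmax())          # index of the strongest topic for the first post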
# Fit the NMF model
print(
    "\n" * 2,
    "Fitting the NMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    init=init,
    beta_loss="kullback-leibler",
    solver="mu",
    max_iter=1000,
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in NMF model (generalized Kullback-Leibler divergence)",
)
# Fit the MiniBatchNMF model
print(
    "\n" * 2,
    "Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf "
    "features, n_samples=%d and n_features=%d, batch_size=%d..."
    % (n_samples, n_features, batch_size),
)
t0 = time()
mbnmf = MiniBatchNMF(
    n_components=n_components,
    random_state=1,
    batch_size=batch_size,
    init=init,
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    mbnmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in MiniBatchNMF model (Frobenius norm)",
)
# Fit the MiniBatchNMF model
print(
    "\n" * 2,
    "Fitting the MiniBatchNMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d, "
    "batch_size=%d..." % (n_samples, n_features, batch_size),
)
t0 = time()
mbnmf = MiniBatchNMF(
    n_components=n_components,
    random_state=1,
    batch_size=batch_size,
    init=init,
    beta_loss="kullback-leibler",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    mbnmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in MiniBatchNMF model (generalized Kullback-Leibler divergence)",
)
print(
    "\n" * 2,
    "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))
tf_feature_names = tf_vectorizer.get_feature_names_out()
plot_top_words(lda, tf_feature_names, n_top_words, "Topics in LDA model")
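# Both fitted pipelines can be reused on unseen text: vectorize with the same
# fitted vectorizer, then call `transform` on the model. A minimal sketch with
# an illustrative sentence (not part of the original example):
# In[ ]:
new_docs = ["The rocket launch was delayed because of bad weather."]
new_tf = tf_vectorizer.transform(new_docs)
print(lda.transform(new_tf))          # topic distribution for the new document, rows sum to 1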
# # The second code: Kochmar (2022), Chapter 10 (LDA for Topic Modeling): https://github.com/ekochmar/Getting-Started-with-NLP/blob/master/Chapter10.ipynb
# ## Load Newsgroups data
#
# As before, let's consider a specific set of categories:
# https://scikit-learn.org/stable/datasets/real_world.html#the-20-newsgroups-text-dataset
#
# The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics split into two subsets: one for training (or development) and the other one for testing (or for performance evaluation). The split between the train and test set is based upon messages posted before and after a specific date.
#
# This module contains two loaders. The first one, sklearn.datasets.fetch_20newsgroups, returns a list of the raw texts that can be fed to text feature extractors such as CountVectorizer with custom parameters so as to extract feature vectors. The second one, sklearn.datasets.fetch_20newsgroups_vectorized, returns ready-to-use features, i.e., it is not necessary to use a feature extractor.
#
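# As a quick, illustrative contrast between the two loaders (not used in the
# rest of this chapter): the vectorized loader returns a ready-made sparse
# document-term matrix instead of raw strings.
# In[ ]:
from sklearn.datasets import fetch_20newsgroups_vectorized
vectorized = fetch_20newsgroups_vectorized(subset='train', remove=('headers', 'footers', 'quotes'))
print(type(vectorized.data), vectorized.data.shape)    # sparse matrix, no vectorizer needed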
# In[ ]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# In[ ]:
from sklearn.datasets import fetch_20newsgroups
def load_dataset(sset, cats):
    if cats == []:
        newsgroups_dset = fetch_20newsgroups(subset=sset,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    else:
        newsgroups_dset = fetch_20newsgroups(subset=sset, categories=cats,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    return newsgroups_dset
# In[ ]:
categories = ["comp.windows.x", "misc.forsale", "rec.autos", "rec.motorcycles", "rec.sport.baseball"]
categories += ["rec.sport.hockey", "sci.crypt", "sci.med", "sci.space", "talk.politics.mideast"]
newsgroups_all = load_dataset('all', categories)
print(len(newsgroups_all.data))
# In[ ]:
newsgroups_all.keys()
# ## Preprocess
#
# Convert word forms to stems to get concise representations for the documents:
# In[ ]:
import nltk
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")
def stem(text):
    return stemmer.stem(text)
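# A quick sanity check on the stemmer with a few illustrative words (not from
# the original notebook); the Snowball stemmer truncates each word to its stem:
# In[ ]:
print([stem(w) for w in ["computers", "riding", "motorcycles", "played"]])
# typically: ['comput', 'ride', 'motorcycl', 'play']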
# In[ ]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS as stopwords
#print(stopwords)
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text, min_len=4):
        if token not in stopwords:  # and len(token) > 3:
            result.append(stem(token))
    return result
# Check how each document is represented. For example, look into the very first one:
# In[ ]:
doc_sample = newsgroups_all.data[0]
print('Original document: ')
print(doc_sample)
print('\n\nTokenized document: ')
words = []
for token in gensim.utils.tokenize(doc_sample):
    words.append(token)
print(words)
print('\n\nPreprocessed document: ')
print(preprocess(doc_sample))
# What do the first 10 documents look like?
# In[ ]:
for i in range(0, 10):
    print(str(i) + "\t" + ", ".join(preprocess(newsgroups_all.data[i])[:10]))
# Now let's represent each document as a dictionary of relevant words. Each word (*value* in the dictionary) has a unique identifier (*key*):
# In[ ]:
processed_docs = []
for i in range(0, len(newsgroups_all.data)):
    processed_docs.append(preprocess(newsgroups_all.data[i]))
print(len(processed_docs))
dictionary = gensim.corpora.Dictionary(processed_docs)
print(len(dictionary))
index = 0
for key, value in dictionary.items():
    print(key, value)
    index += 1
    if index > 9:
        break
# Put some constraints on the dictionary of terms: for instance, keep up to $100,000$ words (`keep_n`) that appear in at least $10$ documents (`no_below`) and in no more than $50\%$ of the documents (`no_above`). This should help you extract the most useful terms, while still keeping a reasonable number of them.
# In[ ]:
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)
print(len(dictionary))
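# After filtering, `token2id` maps every surviving (stemmed) token to its
# integer id, so individual tokens can be checked directly. The example tokens
# below are illustrative:
# In[ ]:
print(dictionary.token2id.get("car"))     # an integer id if the stem survived filtering, else None
print(dictionary.token2id.get("bike"))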
# Let's see how a particular document is represented in this dictionary: for example, look into the very first post, or into the 100th:
# In[ ]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]
#bow_corpus[99]
# Let's decode what each index (key) in this dictionary points to:
# In[ ]:
#bow_doc = bow_corpus[99]
bow_doc = bow_corpus[0]
for i in range(len(bow_doc)):
    print(f"Key {bow_doc[i][0]} = \"{dictionary[bow_doc[i][0]]}\": "
          f"occurrences={bow_doc[i][1]}")
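# The same dictionary can also encode unseen text: preprocess it first, then
# call `doc2bow`; tokens missing from the (filtered) dictionary are silently
# dropped. A minimal sketch with an illustrative sentence:
# In[ ]:
unseen = "The doctors recommended a new medicine for the patient."
print(dictionary.doc2bow(preprocess(unseen)))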
# ## Train an LDA model
# In[ ]:
# Create the dictionary
id2word = dictionary
# Create the corpus with word frequencies
corpus = bow_corpus
# Build the LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=1000,
                                            passes=10,
                                            alpha='symmetric',
                                            iterations=100,
                                            per_word_topics=True)
for index, topic in lda_model.print_topics(-1):
    print(f"Topic: {index} \nWords: {topic}")
# ## Interpret the results
#
# What is the most representative topic in each document?
# In[ ]:
def analyse_topics(ldamodel, corpus, texts):
    main_topic = {}
    percentage = {}
    keywords = {}
    text_snippets = {}
    # Get the main topic in each document
    for i, topic_list in enumerate(ldamodel[corpus]):
        topic = topic_list[0] if ldamodel.per_word_topics else topic_list
        topic = sorted(topic, key=lambda x: (x[1]), reverse=True)
        # Get the dominant topic, its contribution (%) and keywords for each document
        for j, (topic_num, prop_topic) in enumerate(topic):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp[:5]])
                main_topic[i] = int(topic_num)
                percentage[i] = round(prop_topic, 4)
                keywords[i] = topic_keywords
                text_snippets[i] = texts[i][:8]
            else:
                break
    return main_topic, percentage, keywords, text_snippets
main_topic, percentage, keywords, text_snippets = analyse_topics(
    lda_model, bow_corpus, processed_docs)
indexes = []
rows = []
for i in range(0, 10):
    indexes.append(i)
rows.append(['ID', 'Main Topic', 'Contribution (%)', 'Keywords', 'Snippet'])
for idx in indexes:
    rows.append([str(idx), f"{main_topic.get(idx)}",
                 f"{percentage.get(idx):.4f}",
                 f"{keywords.get(idx)}\n",
                 f"{text_snippets.get(idx)}"])
columns = zip(*rows)
column_widths = [max(len(item) for item in col) for col in columns]
for row in rows:
    print(''.join(' {:{width}} '.format(row[i], width=column_widths[i])
                  for i in range(0, len(row))))
# ## Explore words and topics with pyLDAvis
# In[ ]:
get_ipython().system('pip install pyLDAvis')
# In[ ]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis
# Note: newer versions of `pyLDAvis` renamed the gensim module to `gensim_models`; in that case, use the following code:
# In[ ]:
#import pyLDAvis.gensim_models
#pyLDAvis.enable_notebook()
#vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)
#vis
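# If you are running this as a plain script rather than a notebook, the same
# visualization can be written to a standalone HTML file instead (illustrative
# file name):
# In[ ]:
#pyLDAvis.save_html(vis, 'lda_topics.html')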