#!/usr/bin/env python
# coding: utf-8

# Open In Colab

# # This notebook has two parts: the first comes from scikit-learn and the second from the book by Kochmar (2022).
# 
# The first part: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

# In[ ]:

# Author: Olivier Grisel
#         Lars Buitinck
#         Chyi-Kwei Yau
# License: BSD 3 clause

from time import time

import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation, MiniBatchNMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20
batch_size = 128
init = "nndsvda"


def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies; common English words, words occurring in
# only one document, and words occurring in at least 95% of the documents
# are removed.

print("Loading dataset...")

# In[ ]:

t0 = time()
data, _ = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
    return_X_y=True,
)
data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))

print(len(data_samples))
data_samples[1999]

# In[ ]:

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

type(tfidf)
tfidf.shape
print(tfidf)

# In[ ]:

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print(
    "Fitting the NMF model (Frobenius norm) with tf-idf features, "
    "n_samples=%d and n_features=%d..." % (n_samples, n_features)
)
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    init=init,
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=1,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf, tfidf_feature_names, n_top_words, "Topics in NMF model (Frobenius norm)"
)
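# The bar plots above only show the topic-word weights (the `components_` matrix H). As a small aside that is **not** part of the original scikit-learn example, the cell below sketches how the fitted model can also be used to inspect per-document topic weights: `nmf.transform` returns the document-topic matrix W, and `doc_topics` / `dominant_topic` are names introduced here just for illustration.

# In[ ]:

# Extra illustration (not in the original example): per-document topic weights
# from the NMF model fitted above. Assumes `nmf` and `tfidf` are still in memory.
import numpy as np

doc_topics = nmf.transform(tfidf)               # shape: (n_samples, n_components)
dominant_topic = np.argmax(doc_topics, axis=1)  # strongest topic for each post
print(doc_topics.shape)
print("Dominant topic of the first 5 posts:", dominant_topic[:5])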
# Fit the NMF model
print(
    "\n" * 2,
    "Fitting the NMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    init=init,
    beta_loss="kullback-leibler",
    solver="mu",
    max_iter=1000,
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in NMF model (generalized Kullback-Leibler divergence)",
)

# Fit the MiniBatchNMF model
print(
    "\n" * 2,
    "Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf "
    "features, n_samples=%d and n_features=%d, batch_size=%d..."
    % (n_samples, n_features, batch_size),
)
t0 = time()
mbnmf = MiniBatchNMF(
    n_components=n_components,
    random_state=1,
    batch_size=batch_size,
    init=init,
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    mbnmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in MiniBatchNMF model (Frobenius norm)",
)

# Fit the MiniBatchNMF model
print(
    "\n" * 2,
    "Fitting the MiniBatchNMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d, "
    "batch_size=%d..." % (n_samples, n_features, batch_size),
)
t0 = time()
mbnmf = MiniBatchNMF(
    n_components=n_components,
    random_state=1,
    batch_size=batch_size,
    init=init,
    beta_loss="kullback-leibler",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    mbnmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in MiniBatchNMF model (generalized Kullback-Leibler divergence)",
)

print(
    "\n" * 2,
    "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

tf_feature_names = tf_vectorizer.get_feature_names_out()
plot_top_words(lda, tf_feature_names, n_top_words, "Topics in LDA model")


# # Chapter 10: LDA for Topic Modeling
# 
# Source: https://github.com/ekochmar/Getting-Started-with-NLP/blob/master/Chapter10.ipynb

# ## Load Newsgroups data
# 
# As before, let's consider a specific set of categories:
# https://scikit-learn.org/stable/datasets/real_world.html#the-20-newsgroups-text-dataset
# 
# The 20 newsgroups dataset comprises around 18,000 newsgroup posts on 20 topics, split into two subsets: one for training (or development) and one for testing (or performance evaluation). The split between the train and test sets is based upon messages posted before and after a specific date.
# 
# The module contains two loaders. The first one, sklearn.datasets.fetch_20newsgroups, returns a list of the raw texts that can be fed to text feature extractors such as CountVectorizer with custom parameters so as to extract feature vectors. The second one, sklearn.datasets.fetch_20newsgroups_vectorized, returns ready-to-use features, i.e., it is not necessary to use a feature extractor.
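# The rest of this notebook uses only the first loader. Purely for illustration (this cell is not in the original chapter), the sketch below shows what the second loader returns: a sparse document-term matrix, so no vectorizer is needed. It is not used anywhere else in this notebook.

# In[ ]:

# Optional illustration (not in the original chapter): fetch_20newsgroups_vectorized
# returns ready-made sparse feature vectors instead of raw text.
from sklearn.datasets import fetch_20newsgroups_vectorized

vectorized = fetch_20newsgroups_vectorized(subset="train")
print(vectorized.data.shape)   # documents x features (sparse matrix)
print(vectorized.target[:10])  # integer newsgroup labels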
# In[ ]:

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# In[ ]:

from sklearn.datasets import fetch_20newsgroups

def load_dataset(sset, cats):
    if cats == []:
        newsgroups_dset = fetch_20newsgroups(subset=sset,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    else:
        newsgroups_dset = fetch_20newsgroups(subset=sset, categories=cats,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    return newsgroups_dset

# In[ ]:

categories = ["comp.windows.x", "misc.forsale", "rec.autos", "rec.motorcycles", "rec.sport.baseball"]
categories += ["rec.sport.hockey", "sci.crypt", "sci.med", "sci.space", "talk.politics.mideast"]

newsgroups_all = load_dataset('all', categories)
print(len(newsgroups_all.data))

# In[ ]:

newsgroups_all.keys()

# ## Preprocess
# 
# Convert word forms to stems to get concise representations for the documents:

# In[ ]:

import nltk
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")

def stem(text):
    return stemmer.stem(text)

# In[ ]:

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS as stopwords
#print(stopwords)

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text, min_len=4):
        if token not in stopwords:  #and len(token) > 3:
            result.append(stem(token))
    return result

# Check how each document is represented. For example, look into the very first one:

# In[ ]:

doc_sample = newsgroups_all.data[0]

print('Original document: ')
print(doc_sample)

print('\n\nTokenized document: ')
words = []
for token in gensim.utils.tokenize(doc_sample):
    words.append(token)
print(words)

print('\n\nPreprocessed document: ')
print(preprocess(doc_sample))

# What do the first 10 documents look like after preprocessing?

# In[ ]:

for i in range(0, 10):
    print(str(i) + "\t" + ", ".join(preprocess(newsgroups_all.data[i])[:10]))

# Now let's build a dictionary of the relevant words across all documents. Each word (*value* in the dictionary) has a unique identifier (*key*):

# In[ ]:

processed_docs = []
for i in range(0, len(newsgroups_all.data)):
    processed_docs.append(preprocess(newsgroups_all.data[i]))

print(len(processed_docs))

dictionary = gensim.corpora.Dictionary(processed_docs)
print(len(dictionary))

index = 0
for key, value in dictionary.iteritems():
    print(key, value)
    index += 1
    if index > 9:
        break
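# Before filtering the dictionary, it can be handy to query the mapping directly. The quick check below is not in the original chapter: `token2id` maps a (stemmed) word to its id, and indexing the dictionary maps an id back to the word. The token "space" is only an example and may not survive preprocessing in every run.

# In[ ]:

# Extra check (not in the original chapter): look up one mapping in both directions.
example_token = "space"
if example_token in dictionary.token2id:
    token_id = dictionary.token2id[example_token]
    print(example_token, "->", token_id, "->", dictionary[token_id])
else:
    print(f"'{example_token}' is not in the dictionary")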
# Put some constraints on the dictionary of terms: keep at most $100,000$ words (`keep_n`), and only those that occur in no fewer than $10$ documents (`no_below`) and in no more than $50\%$ of the documents (`no_above`). This should help you extract the most useful terms, while still keeping a reasonable number of them.

# In[ ]:

dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)
print(len(dictionary))

# Let's see how a particular document is represented with this dictionary: for example, look into the very first post, or into the 100th:

# In[ ]:

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]
#bow_corpus[99]

# Let's decode what each index (key) in this dictionary points to:

# In[ ]:

#bow_doc = bow_corpus[99]
bow_doc = bow_corpus[0]

for i in range(len(bow_doc)):
    print(f"Key {bow_doc[i][0]} =\"{dictionary[bow_doc[i][0]]}\": occurrences={bow_doc[i][1]}")

# ## Train an LDA model

# In[ ]:

# Create the dictionary
id2word = dictionary

# Create the corpus with word frequencies
corpus = bow_corpus

# Build the LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=1000,
                                            passes=10,
                                            alpha='symmetric',
                                            iterations=100,
                                            per_word_topics=True)

for index, topic in lda_model.print_topics(-1):
    print(f"Topic: {index} \nWords: {topic}")

# ## Interpret the results
# 
# What is the most representative topic in each document?

# In[ ]:

def analyse_topics(ldamodel, corpus, texts):
    main_topic = {}
    percentage = {}
    keywords = {}
    text_snippets = {}
    # Get the main topic in each document
    for i, topic_list in enumerate(ldamodel[corpus]):
        #print(topic_list)
        topic = topic_list[0] if ldamodel.per_word_topics else topic_list
        #print(topic)
        topic = sorted(topic, key=lambda x: (x[1]), reverse=True)
        # Get the main topic, contribution (%) and keywords for each document
        for j, (topic_num, prop_topic) in enumerate(topic):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp[:5]])
                main_topic[i] = int(topic_num)
                percentage[i] = round(prop_topic, 4)
                keywords[i] = topic_keywords
                text_snippets[i] = texts[i][:8]
            else:
                break
    return main_topic, percentage, keywords, text_snippets


main_topic, percentage, keywords, text_snippets = analyse_topics(
    lda_model, bow_corpus, processed_docs)

indexes = []
rows = []
for i in range(0, 10):
    indexes.append(i)
rows.append(['ID', 'Main Topic', 'Contribution (%)', 'Keywords', 'Snippet'])
for idx in indexes:
    rows.append([str(idx), f"{main_topic.get(idx)}",
                 f"{percentage.get(idx):.4f}",
                 f"{keywords.get(idx)}\n",
                 f"{text_snippets.get(idx)}"])

columns = zip(*rows)
column_widths = [max(len(item) for item in col) for col in columns]

for row in rows:
    print(''.join(' {:{width}} '.format(row[i], width=column_widths[i])
                  for i in range(0, len(row))))

# ## Explore words and topics with pyLDAvis

# In[ ]:

get_ipython().system('pip install pyLDAvis')

# In[ ]:

import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

# Note: for newer versions of `gensim`, use the following code instead:

# In[ ]:

#import pyLDAvis.gensim_models
#pyLDAvis.enable_notebook()
#vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)
#vis

# In[ ]:
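# As an optional final step that is not part of the original chapter, the trained model can also be applied to a single unseen document: preprocess it, convert it to a bag of words with the same dictionary, and ask the model for its topic distribution via `get_document_topics`. The example sentence below is invented purely for illustration.

# In[ ]:

# Optional extra (not in the original chapter): score an unseen document with the
# trained LDA model, reusing preprocess() and dictionary from the cells above.
new_doc = "The pitcher threw a fastball and the catcher dropped it in the ninth inning"
new_bow = dictionary.doc2bow(preprocess(new_doc))
for topic_id, prob in sorted(lda_model.get_document_topics(new_bow),
                             key=lambda x: -x[1]):
    print(f"Topic {topic_id}: {prob:.3f}")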