#!/usr/bin/env python
# coding: utf-8

# # Chapter 10: LDA for Topic Modeling

# ## Load Newsgroups data
#
# As before, let's consider a specific set of categories:

# In[1]:


from sklearn.datasets import fetch_20newsgroups

def load_dataset(sset, cats):
    if cats == []:
        newsgroups_dset = fetch_20newsgroups(subset=sset,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    else:
        newsgroups_dset = fetch_20newsgroups(subset=sset, categories=cats,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    return newsgroups_dset


# In[2]:


categories = ["comp.windows.x", "misc.forsale", "rec.autos",
              "rec.motorcycles", "rec.sport.baseball"]
categories += ["rec.sport.hockey", "sci.crypt", "sci.med",
               "sci.space", "talk.politics.mideast"]

newsgroups_all = load_dataset('all', categories)
print(len(newsgroups_all.data))


# ## Preprocess
#
# Convert word forms to stems to get concise representations for the documents:

# In[3]:


import nltk
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")

def stem(text):
    return stemmer.stem(text)


# In[4]:


import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS as stopwords
#print(stopwords)

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text, min_len=4):
        if token not in stopwords:  #and len(token) > 3:
            result.append(stem(token))
    return result


# Check how each document is represented. For example, look into the very first one:

# In[5]:


doc_sample = newsgroups_all.data[0]

print('Original document: ')
print(doc_sample)

print('\n\nTokenized document: ')
words = []
for token in gensim.utils.tokenize(doc_sample):
    words.append(token)
print(words)

print('\n\nPreprocessed document: ')
print(preprocess(doc_sample))


# What do the first 10 documents look like?

# In[6]:


for i in range(0, 10):
    print(str(i) + "\t" + ", ".join(preprocess(newsgroups_all.data[i])[:10]))


# Now let's represent each document as a dictionary of relevant words. Each word (*value* in the dictionary) has a unique identifier (*key*):

# In[7]:


processed_docs = []
for i in range(0, len(newsgroups_all.data)):
    processed_docs.append(preprocess(newsgroups_all.data[i]))

print(len(processed_docs))

dictionary = gensim.corpora.Dictionary(processed_docs)
print(len(dictionary))

index = 0
for key, value in dictionary.iteritems():
    print(key, value)
    index += 1
    if index > 9:
        break


# Put some constraints on the dictionary of terms: for instance, keep up to $100,000$ words that occur in at least $10$ documents (`no_below`) and in no more than $50\%$ of the documents (`no_above`). This should help you extract the most useful terms, while still keeping a reasonable number of them.
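# Optionally, before applying the filter, you can peek at the raw document frequencies to see what these thresholds will remove. The cell below is a minimal illustrative sketch (not part of the main pipeline); it relies on the dictionary's `dfs` (documents per token) and `num_docs` attributes:

# In[ ]:


# Optional sketch: inspect document frequencies before filtering.
# `dictionary.dfs` maps each token id to the number of documents containing that token.
doc_freqs = sorted(dictionary.dfs.items(), key=lambda x: x[1], reverse=True)
total_docs = dictionary.num_docs

print("Most widespread stems (candidates for removal by no_above=0.5):")
for token_id, doc_freq in doc_freqs[:10]:
    print(f"{dictionary[token_id]}\t{doc_freq} docs ({doc_freq / total_docs:.1%})")

rare = sum(1 for doc_freq in dictionary.dfs.values() if doc_freq < 10)
print(f"\nStems in fewer than 10 documents (removed by no_below=10): {rare}")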
# In[8]:


dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)
print(len(dictionary))


# Let's see how a particular document is represented in this dictionary: for example, look into the very first post, or into the 100th:

# In[9]:


bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]
#bow_corpus[99]


# Let's decode what each index (key) in this dictionary points to:

# In[10]:


#bow_doc = bow_corpus[99]
bow_doc = bow_corpus[0]

for i in range(len(bow_doc)):
    print(f"Key {bow_doc[i][0]} =\"{dictionary[bow_doc[i][0]]}\": occurrences={bow_doc[i][1]}")


# ## Train an LDA model

# In[11]:


# Use the dictionary mapping ids to words
id2word = dictionary

# Use the bag-of-words corpus with word frequencies
corpus = bow_corpus

# Build the LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=1000,
                                            passes=10,
                                            alpha='symmetric',
                                            iterations=100,
                                            per_word_topics=True)

for index, topic in lda_model.print_topics(-1):
    print(f"Topic: {index} \nWords: {topic}")


# ## Interpret the results
#
# What is the most representative topic in each document?

# In[12]:


def analyse_topics(ldamodel, corpus, texts):
    main_topic = {}
    percentage = {}
    keywords = {}
    text_snippets = {}
    # Get the main topic in each document
    for i, topic_list in enumerate(ldamodel[corpus]):
        #print(topic_list)
        topic = topic_list[0] if ldamodel.per_word_topics else topic_list
        #print(topic)
        topic = sorted(topic, key=lambda x: (x[1]), reverse=True)
        # Get the main topic, contribution (%) and keywords for each document
        for j, (topic_num, prop_topic) in enumerate(topic):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp[:5]])
                main_topic[i] = int(topic_num)
                percentage[i] = round(prop_topic, 4)
                keywords[i] = topic_keywords
                text_snippets[i] = texts[i][:8]
            else:
                break
    return main_topic, percentage, keywords, text_snippets


main_topic, percentage, keywords, text_snippets = analyse_topics(
    lda_model, bow_corpus, processed_docs)

indexes = []
rows = []
for i in range(0, 10):
    indexes.append(i)

rows.append(['ID', 'Main Topic', 'Contribution (%)', 'Keywords', 'Snippet'])
for idx in indexes:
    rows.append([str(idx), f"{main_topic.get(idx)}",
                 f"{percentage.get(idx):.4f}",
                 f"{keywords.get(idx)}\n",
                 f"{text_snippets.get(idx)}"])

columns = zip(*rows)
column_widths = [max(len(item) for item in col) for col in columns]

for row in rows:
    print(''.join(' {:{width}} '.format(row[i], width=column_widths[i])
                  for i in range(0, len(row))))


# ## Explore words and topics with pyLDAvis

# In[13]:


import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis


# Note: for newer versions of `gensim`, use the following code:

# In[ ]:


#import pyLDAvis.gensim_models
#pyLDAvis.enable_notebook()
#vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)
#vis
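# As an additional, optional check (a minimal sketch rather than part of the original walkthrough), you can score the trained model with `gensim`'s `CoherenceModel` and use it to infer the topic mixture of a previously unseen post. The example text below is made up purely for illustration:

# In[ ]:


from gensim.models import CoherenceModel

# Topic coherence ("c_v") computed over the preprocessed documents; higher is better.
coherence_model = CoherenceModel(model=lda_model, texts=processed_docs,
                                 dictionary=dictionary, coherence='c_v')
print(f"Coherence score: {coherence_model.get_coherence():.4f}")

# Infer the topic distribution for an unseen document (hypothetical example text).
new_post = "The pitcher threw a fastball and the batter hit a home run in the ninth inning."
new_bow = dictionary.doc2bow(preprocess(new_post))
for topic_num, prob in sorted(lda_model.get_document_topics(new_bow),
                              key=lambda x: x[1], reverse=True):
    print(f"Topic {topic_num}: {prob:.4f}")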