As before, let's consider a specific set of categories:
from sklearn.datasets import fetch_20newsgroups

def load_dataset(sset, cats):
    if cats == []:
        newsgroups_dset = fetch_20newsgroups(subset=sset,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    else:
        newsgroups_dset = fetch_20newsgroups(subset=sset, categories=cats,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    return newsgroups_dset
categories = ["comp.windows.x", "misc.forsale", "rec.autos", "rec.motorcycles", "rec.sport.baseball"]
categories += ["rec.sport.hockey", "sci.crypt", "sci.med", "sci.space", "talk.politics.mideast"]
newsgroups_all = load_dataset('all', categories)
print(len(newsgroups_all.data))
9850
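As a sanity check, you can also look at which categories were loaded and how the posts are distributed across them. Here is a minimal sketch using the target and target_names attributes of the data returned by fetch_20newsgroups:
from collections import Counter

# Each post has a numeric label in newsgroups_all.target;
# newsgroups_all.target_names maps these labels back to category names
for label, count in sorted(Counter(newsgroups_all.target).items()):
    print(f"{newsgroups_all.target_names[label]:25} {count}")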
Convert word forms to stems to get concise representations for the documents:
import nltk
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")

def stem(text):
    return stemmer.stem(text)

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS as stopwords
#print(stopwords)

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text, min_len=4):
        if token not in stopwords: #and len(token) > 3:
            result.append(stem(token))
    return result
Check how each document is represented. For example, look into the very first one:
doc_sample = newsgroups_all.data[0]
print('Original document: ')
print(doc_sample)
print('\n\nTokenized document: ')
words = []
for token in gensim.utils.tokenize(doc_sample):
    words.append(token)
print(words)
print('\n\nPreprocessed document: ')
print(preprocess(doc_sample))
Original document:
Hi Xperts! How can I move the cursor with the keyboard (i.e. cursor keys), if no mouse is available? Any hints welcome. Thanks.

Tokenized document:
['Hi', 'Xperts', 'How', 'can', 'I', 'move', 'the', 'cursor', 'with', 'the', 'keyboard', 'i', 'e', 'cursor', 'keys', 'if', 'no', 'mouse', 'is', 'available', 'Any', 'hints', 'welcome', 'Thanks']

Preprocessed document:
['xpert', 'cursor', 'keyboard', 'cursor', 'key', 'mous', 'avail', 'hint', 'welcom', 'thank']
What do the first 10 documents look like?
for i in range(0, 10):
    print(str(i) + "\t" + ", ".join(preprocess(newsgroups_all.data[i])[:10]))
0	xpert, cursor, keyboard, cursor, key, mous, avail, hint, welcom, thank
1	obtain, copi, open, look, widget, obtain, need, order, copi, thank
2	right, signal, strong, live, west, philadelphia, perfect, sport, fan, dream
3	canadian, thing, coach, boston, bruin, colorado, rocki, summari, post, gather
4	heck, feel, like, time, includ, cafeteria, work, half, time, headach
5	damn, right, late, climb, meet, morn, bother, right, foot, asleep
6	olympus, stylus, pocket, camera, smallest, class, includ, time, date, stamp
7	includ, follow, chmos, clock, generat, driver, processor, chmos, eras, prom
8	chang, intel, discov, xclient, xload, longer, work, bomb, messag, error
9	termin, like, power, server, run, window, manag, special, client, program
Now let's build a dictionary of the relevant words across all documents. Each word (a value in the dictionary) gets a unique identifier (its key):
processed_docs = []
for i in range(0, len(newsgroups_all.data)):
    processed_docs.append(preprocess(newsgroups_all.data[i]))
print(len(processed_docs))

dictionary = gensim.corpora.Dictionary(processed_docs)
print(len(dictionary))

index = 0
for key, value in dictionary.iteritems():  # in newer gensim, use dictionary.items()
    print(key, value)
    index += 1
    if index > 9:
        break
9850
39350
0 avail
1 cursor
2 hint
3 key
4 keyboard
5 mous
6 thank
7 welcom
8 xpert
9 copi
Put some constraints on the dictionary of terms: for instance, keep at most $100,000$ words (keep_n), dropping words that occur in fewer than $10$ documents (no_below) or in more than $50\%$ of the documents (no_above). This should help you extract the most useful terms, while still keeping a reasonable number of them.
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)
print(len(dictionary))
5868
Let's see how a particular document is represented in this dictionary: for example, look into the very first post, or into the 100th:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]
#bow_corpus[99]
[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]
Let's decode what each index (key) in this dictionary points to:
#bow_doc = bow_corpus[99]
bow_doc = bow_corpus[0]
for key, count in bow_doc:
    print(f"Key {key} =\"{dictionary[key]}\": occurrences={count}")
Key 0 ="avail": occurrences=1
Key 1 ="cursor": occurrences=2
Key 2 ="hint": occurrences=1
Key 3 ="key": occurrences=1
Key 4 ="keyboard": occurrences=1
Key 5 ="mous": occurrences=1
Key 6 ="thank": occurrences=1
Key 7 ="welcom": occurrences=1
Key 8 ="xpert": occurrences=1
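Now let's train the LDA model on this bag-of-words corpus. In the call below, num_topics sets the number of topics to extract, chunksize and passes control how the model iterates over the corpus, and per_word_topics=True additionally makes the model compute the most likely topics for each word: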
# Create the dictionary
id2word = dictionary

# Create the corpus with word frequencies
corpus = bow_corpus

# Build the LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=1000,
                                            passes=10,
                                            alpha='symmetric',
                                            iterations=100,
                                            per_word_topics=True)
for index, topic in lda_model.print_topics(-1):
    print(f"Topic: {index} \nWords: {topic}")
Topic: 0
Words: 0.021*"encrypt" + 0.018*"secur" + 0.018*"chip" + 0.016*"govern" + 0.013*"clipper" + 0.012*"public" + 0.010*"privaci" + 0.010*"key" + 0.010*"phone" + 0.009*"algorithm"
Topic: 1
Words: 0.017*"appear" + 0.014*"copi" + 0.013*"cover" + 0.013*"star" + 0.013*"book" + 0.011*"penalti" + 0.010*"black" + 0.009*"comic" + 0.008*"blue" + 0.008*"green"
Topic: 2
Words: 0.031*"window" + 0.015*"server" + 0.012*"program" + 0.012*"file" + 0.012*"applic" + 0.012*"display" + 0.011*"widget" + 0.010*"version" + 0.010*"motif" + 0.010*"support"
Topic: 3
Words: 0.015*"space" + 0.007*"launch" + 0.007*"year" + 0.007*"medic" + 0.006*"patient" + 0.006*"orbit" + 0.006*"research" + 0.006*"diseas" + 0.005*"develop" + 0.005*"nasa"
Topic: 4
Words: 0.018*"armenian" + 0.011*"peopl" + 0.008*"kill" + 0.008*"said" + 0.007*"turkish" + 0.006*"muslim" + 0.006*"jew" + 0.006*"govern" + 0.005*"state" + 0.005*"greek"
Topic: 5
Words: 0.024*"price" + 0.021*"sale" + 0.020*"offer" + 0.017*"drive" + 0.017*"sell" + 0.016*"includ" + 0.013*"ship" + 0.013*"interest" + 0.011*"ask" + 0.010*"condit"
Topic: 6
Words: 0.018*"mail" + 0.016*"list" + 0.015*"file" + 0.015*"inform" + 0.013*"send" + 0.012*"post" + 0.012*"avail" + 0.010*"request" + 0.010*"program" + 0.009*"includ"
Topic: 7
Words: 0.019*"like" + 0.016*"know" + 0.011*"time" + 0.011*"look" + 0.010*"think" + 0.008*"want" + 0.008*"thing" + 0.008*"good" + 0.007*"go" + 0.007*"bike"
Topic: 8
Words: 0.033*"game" + 0.022*"team" + 0.017*"play" + 0.015*"year" + 0.013*"player" + 0.011*"season" + 0.008*"hockey" + 0.008*"score" + 0.007*"leagu" + 0.007*"goal"
Topic: 9
Words: 0.013*"peopl" + 0.012*"think" + 0.011*"like" + 0.009*"time" + 0.009*"right" + 0.009*"israel" + 0.009*"know" + 0.006*"reason" + 0.006*"point" + 0.006*"thing"
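Each document is modeled as a mixture of these topics. Before extracting the single dominant topic per post, you can inspect the full topic distribution of one document; here is a minimal sketch using gensim's get_document_topics (the exact probabilities depend on the random seed and gensim version):
# Topic mixture for the very first post (the cursor/keyboard question);
# topics below the default probability threshold are not listed
for topic_id, prob in lda_model.get_document_topics(bow_corpus[0]):
    print(f"Topic {topic_id}: {prob:.4f}")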
What is the most representative topic in each document?
def analyse_topics(ldamodel, corpus, texts):
    main_topic = {}
    percentage = {}
    keywords = {}
    text_snippets = {}
    # Get the main topic in each document
    for i, topic_list in enumerate(ldamodel[corpus]):
        # With per_word_topics=True, the model returns a tuple whose first
        # element is the document's topic distribution
        topic = topic_list[0] if ldamodel.per_word_topics else topic_list
        topic = sorted(topic, key=lambda x: (x[1]), reverse=True)
        # Get the main topic, contribution (%) and keywords for each document
        for j, (topic_num, prop_topic) in enumerate(topic):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp[:5]])
                main_topic[i] = int(topic_num)
                percentage[i] = round(prop_topic, 4)
                keywords[i] = topic_keywords
                text_snippets[i] = texts[i][:8]
            else:
                break
    return main_topic, percentage, keywords, text_snippets
main_topic, percentage, keywords, text_snippets = analyse_topics(
    lda_model, bow_corpus, processed_docs)

indexes = []
rows = []
for i in range(0, 10):
    indexes.append(i)

rows.append(['ID', 'Main Topic', 'Contribution (%)', 'Keywords', 'Snippet'])
for idx in indexes:
    rows.append([str(idx), f"{main_topic.get(idx)}",
                 f"{percentage.get(idx):.4f}",
                 f"{keywords.get(idx)}\n",
                 f"{text_snippets.get(idx)}"])

columns = zip(*rows)
column_widths = [max(len(item) for item in col) for col in columns]
for row in rows:
    print(''.join(' {:{width}} '.format(row[i], width=column_widths[i])
                  for i in range(0, len(row))))
ID  Main Topic  Contribution (%)  Keywords                                 Snippet
0   2           0.8268            window, server, program, file, applic   ['xpert', 'cursor', 'keyboard', 'cursor', 'key', 'mous', 'avail', 'hint']
1   6           0.4745            mail, list, file, inform, send          ['obtain', 'copi', 'open', 'look', 'widget', 'obtain', 'need', 'order']
2   7           0.4229            like, know, time, look, think           ['right', 'signal', 'strong', 'live', 'west', 'philadelphia', 'perfect', 'sport']
3   8           0.4159            game, team, play, year, player          ['canadian', 'thing', 'coach', 'boston', 'bruin', 'colorado', 'rocki', 'summari']
4   9           0.9039            peopl, think, like, time, right         ['heck', 'feel', 'like', 'time', 'includ', 'cafeteria', 'work', 'half']
5   7           0.6291            like, know, time, look, think           ['damn', 'right', 'late', 'climb', 'meet', 'morn', 'bother', 'right']
6   3           0.3483            space, launch, year, medic, patient     ['olympus', 'stylus', 'pocket', 'camera', 'smallest', 'class', 'includ', 'time']
7   5           0.3798            price, sale, offer, drive, sell         ['includ', 'follow', 'chmos', 'clock', 'generat', 'driver', 'processor', 'chmos']
8   2           0.7943            window, server, program, file, applic   ['chang', 'intel', 'discov', 'xclient', 'xload', 'longer', 'work', 'bomb']
9   2           0.6384            window, server, program, file, applic   ['termin', 'like', 'power', 'server', 'run', 'window', 'manag', 'special']
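Finally, let's explore the topics interactively with pyLDAvis, which plots the topics as circles on a two-dimensional plane and shows the most relevant terms for each: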
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis
Note: for newer versions of gensim, use the following code:
#import pyLDAvis.gensim_models
#pyLDAvis.enable_notebook()
#vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)
#vis
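If you are running the code outside a notebook, you can save the interactive visualization to a standalone HTML file instead; a minimal sketch (the filename lda_vis.html is just an example):
# Save the interactive visualization to a standalone HTML file
pyLDAvis.save_html(vis, 'lda_vis.html')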