"""
We use following lines because we are running on Google Colab
If you are running notebook on a local computer, you don't need this cell
"""
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/sklearn/topic_modelling/book_titles')
!pip install pyldavis
import nltk
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
# Number of latent topics LDA should extract.
N_TOPICS = 10
# Number of top-weighted terms to display per topic.
MAX_TERMS = 5
def text_clean(raw_str, stop_words=None):
    """Lowercase, tokenize and filter a raw title string.

    Tokens are dropped when they are at most 2 characters long, appear in
    the stop-word set, or contain any digit (edition years, ISBN fragments).
    The surviving tokens are joined back into one space-separated string.

    Parameters
    ----------
    raw_str : str
        The raw document (book title) to clean.
    stop_words : collection of str, optional
        Words to remove.  Defaults to the module-level ``stopwords`` set.
        NOTE: that global rebinds (shadows) the ``nltk.corpus.stopwords``
        module, so it must have been built before the first call.

    Returns
    -------
    str
        The cleaned, space-separated tokens.
    """
    if stop_words is None:
        # Late-bound lookup: resolves to the module-level set built below.
        stop_words = stopwords
    raw_str = raw_str.lower()
    tokens = nltk.tokenize.word_tokenize(raw_str)
    # Drop very short tokens (articles, punctuation fragments, ...).
    tokens = [token for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stop_words]
    # Drop any token containing a digit.
    tokens = [token for token in tokens if not any(c.isdigit() for c in token)]
    return ' '.join(tokens)
# One-time NLTK resource downloads (tokenizer models + stop-word lists).
nltk.download('punkt')
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the nltk.corpus module to a
# plain set, extended with domain noise words common in book titles.
stopwords = set(stopwords.words('english')).union({
    'introduction', 'edition', 'series', 'application',
    'approach', 'card', 'access', 'package', 'plus', 'etext',
    'brief', 'vol', 'fundamental', 'guide', 'essential', 'printed',
    'third', 'second', 'fourth'})
tfidf = TfidfVectorizer()
# Keyword arguments: scikit-learn deprecated (and later removed) positional
# estimator hyper-parameters, and keywords are self-documenting.
lda = LatentDirichletAllocation(n_components=N_TOPICS, max_iter=100,
                                learning_offset=50)
# Explicit encoding avoids platform-dependent default-codec failures.
with open('all_book_titles.txt', encoding='utf-8') as f:
    documents = [text_clean(line.rstrip()) for line in f]
tfidf_repr = tfidf.fit_transform(documents)
# NOTE(review): LDA is formulated over raw term counts; fitting it on TF-IDF
# weights runs but is statistically unorthodox — consider CountVectorizer.
lda.fit(tfidf_repr)
# Vocabulary lookup hoisted out of the loop.  get_feature_names() was
# removed in scikit-learn 1.2; prefer get_feature_names_out() when present.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
for topic_idx, term_vals in enumerate(lda.components_):
    message = "Topic #%d: " % topic_idx
    # argsort()[:-MAX_TERMS - 1:-1] -> indices of the MAX_TERMS largest
    # topic-term weights, in descending order.
    message += " ".join([feature_names[i]
                         for i in term_vals.argsort()[:-MAX_TERMS - 1:-1]])
    print(message)
import pyLDAvis
# pyLDAvis renamed its scikit-learn adapter module in 3.4.0
# (pyLDAvis.sklearn -> pyLDAvis.lda_model); support both versions.
try:
    import pyLDAvis.lda_model as _pyldavis_skl  # pyLDAvis >= 3.4
except ImportError:
    import pyLDAvis.sklearn as _pyldavis_skl    # pyLDAvis < 3.4
# Build the interactive topic-model visualisation and write it to disk.
pyLDAvis.save_html(_pyldavis_skl.prepare(lda, tfidf_repr, tfidf), 'lda.html')