In [1]:
"""
We use following lines because we are running on Google Colab
If you are running notebook on a local computer, you don't need this cell
"""
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/sklearn/topic_modelling/book_titles')
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
In [2]:
!pip install pyldavis
Requirement already satisfied: pyldavis in /usr/local/lib/python3.6/dist-packages (2.1.2)
Requirement already satisfied: pandas>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from pyldavis) (0.24.2)
Requirement already satisfied: funcy in /usr/local/lib/python3.6/dist-packages (from pyldavis) (1.13)
Requirement already satisfied: numexpr in /usr/local/lib/python3.6/dist-packages (from pyldavis) (2.7.0)
Requirement already satisfied: scipy>=0.18.0 in /usr/local/lib/python3.6/dist-packages (from pyldavis) (1.3.1)
Requirement already satisfied: wheel>=0.23.0 in /usr/local/lib/python3.6/dist-packages (from pyldavis) (0.33.6)
Requirement already satisfied: pytest in /usr/local/lib/python3.6/dist-packages (from pyldavis) (3.6.4)
Requirement already satisfied: numpy>=1.9.2 in /usr/local/lib/python3.6/dist-packages (from pyldavis) (1.16.4)
Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from pyldavis) (0.16.0)
Requirement already satisfied: joblib>=0.8.4 in /usr/local/lib/python3.6/dist-packages (from pyldavis) (0.13.2)
Requirement already satisfied: jinja2>=2.7.2 in /usr/local/lib/python3.6/dist-packages (from pyldavis) (2.10.1)
Requirement already satisfied: pytz>=2011k in /usr/local/lib/python3.6/dist-packages (from pandas>=0.17.0->pyldavis) (2018.9)
Requirement already satisfied: python-dateutil>=2.5.0 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.17.0->pyldavis) (2.5.3)
Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from pytest->pyldavis) (41.2.0)
Requirement already satisfied: py>=1.5.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyldavis) (1.8.0)
Requirement already satisfied: pluggy<0.8,>=0.5 in /usr/local/lib/python3.6/dist-packages (from pytest->pyldavis) (0.7.1)
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyldavis) (1.12.0)
Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyldavis) (19.1.0)
Requirement already satisfied: atomicwrites>=1.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyldavis) (1.3.0)
Requirement already satisfied: more-itertools>=4.0.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyldavis) (7.2.0)
Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from jinja2>=2.7.2->pyldavis) (1.1.1)
In [0]:
import nltk
from nltk.corpus import stopwords

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
In [0]:
N_TOPICS = 10
MAX_TERMS = 5
In [0]:
def text_clean(raw_str):
  raw_str = raw_str.lower()
  tokens = nltk.tokenize.word_tokenize(raw_str)
  tokens = [token for token in tokens if len(token)>2]
  tokens = [token for token in tokens if token not in stopwords]
  tokens = [token for token in tokens if not any(c.isdigit() for c in token)]
  return ' '.join(tokens)
In [6]:
nltk.download('punkt')
nltk.download('stopwords')

stopwords = set(stopwords.words('english')).union({
  'introduction', 'edition', 'series', 'application',
  'approach', 'card', 'access', 'package', 'plus', 'etext',
  'brief', 'vol', 'fundamental', 'guide', 'essential', 'printed',
  'third', 'second', 'fourth'})
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [0]:
tfidf = TfidfVectorizer()
lda = LatentDirichletAllocation(N_TOPICS, max_iter=100, learning_offset=50)
In [8]:
with open('all_book_titles.txt') as f:
  documents = [text_clean(line.rstrip()) for line in f]
tfidf_repr = tfidf.fit_transform(documents)
lda.fit(tfidf_repr)
Out[8]:
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=50,
                          max_doc_update_iter=100, max_iter=100,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)
In [9]:
for topic_idx, term_vals in enumerate(lda.components_):
  message = "Topic #%d: " % topic_idx
  message += " ".join([tfidf.get_feature_names()[i]
                       for i in term_vals.argsort()[:-MAX_TERMS - 1:-1]])
  print(message)
Topic #0: film hinduism music judaism islam
Topic #1: human anatomy physiology sexuality analysis
Topic #2: sociology theory finance game understanding
Topic #3: computer java health data structures
Topic #4: mechanics physics calculus quantum microbiology
Topic #5: marketing art history accounting literature
Topic #6: algorithms life networks learning nutrition
Topic #7: psychology statistics science political anthropology
Topic #8: chemistry biology economics business organic
Topic #9: criminology earth language religions science
In [0]:
import pyLDAvis
import pyLDAvis.sklearn
In [11]:
pyLDAvis.save_html(pyLDAvis.sklearn.prepare(lda, tfidf_repr, tfidf), 'lda.html')
/usr/local/lib/python3.6/dist-packages/pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  return pd.concat([default_term_info] + list(topic_dfs))