 
#!/usr/bin/env python
# coding: utf-8

# __Note__: This is best viewed on [NBViewer](http://nbviewer.ipython.org/github/tdhopper/notes-on-dirichlet-processes/blob/master/2015-10-07-econtalk-topics.ipynb). It is part of a series on [Dirichlet Processes and Nonparametric Bayes](https://github.com/tdhopper/notes-on-dirichlet-processes).

# # Nonparametric Latent Dirichlet Allocation
# 
# ## Analysis of the topics of [Econtalk](http://www.econtalk.org/)
# 
# In 2003, David Blei, Andrew Ng, and Michael Jordan presented a groundbreaking statistical model called "[Latent Dirichlet Allocation](https://www.cs.princeton.edu/~blei/papers/BleiNgJordan2003.pdf)" (LDA).
# 
# LDA provides a method for summarizing the topics discussed in a collection of documents. LDA defines topics to be discrete probability distributions over words. For an introduction to LDA, see [Edwin Chen's post](http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/).
# 
# The original LDA model requires the number of topics to be specified as a known parameter of the model. In 2005, Yee Whye Teh and others published [a "nonparametric" version of this model](http://www.cs.berkeley.edu/~jordan/papers/hdp.pdf) that doesn't require the number of topics to be specified in advance. Instead, it uses a prior distribution over the topics called a hierarchical Dirichlet process (HDP). [I wrote an introduction to this HDP-LDA model](https://github.com/tdhopper/notes-on-dirichlet-processes/blob/master/2015-08-03-nonparametric-latent-dirichlet-allocation.ipynb) earlier this year.
# 
# For the last six months, I have been developing a Python-based Gibbs sampler for the HDP-LDA model. It is part of a larger library of "robust, validated Bayesian nonparametric models for discovering structure in data" known as [Data Microscopes](http://datamicroscopes.github.io).
# 
# This notebook demonstrates the functionality of that implementation.
# 
# The Data Microscopes library is available on [anaconda.org](https://anaconda.org/datamicroscopes/) for Linux and OS X. `microscopes-lda` can be installed with:
# 
#     $ conda install -c datamicroscopes -c distributions microscopes-lda

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import pyLDAvis
import json
import sys
import cPickle

from microscopes.common.rng import rng
from microscopes.lda.definition import model_definition
from microscopes.lda.model import initialize
from microscopes.lda import utils
from microscopes.lda import model, runner

from numpy import genfromtxt
from numpy import linalg
from numpy import array

# `dtm.csv` contains a document-term matrix representation of the words used in Econtalk transcripts. The columns of the matrix correspond to the words in `vocab.txt`. The rows of the matrix correspond to the show URLs in `urls.txt`.
# 
# Our LDA implementation takes its input data as a list of lists of hashable objects (typically words). We can use a utility function to convert the document-term matrix to this list of tokenized documents; a sketch of what the conversion produces follows the sanity checks below.

# In[2]:

vocab = genfromtxt('./econtalk-data/vocab.txt', delimiter=",", dtype='str').tolist()
dtm = genfromtxt('./econtalk-data/dtm.csv', delimiter=",", dtype=int)
docs = utils.docs_from_document_term_matrix(dtm, vocab=vocab)
urls = [s.strip() for s in open('./econtalk-data/urls.txt').readlines()]

# Two quick sanity checks: every column of the matrix should correspond to a vocabulary word, and every row to a show URL.

# In[3]:

dtm.shape[1] == len(vocab)

# In[4]:

dtm.shape[0] == len(urls)
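# To make that input format concrete, here is a minimal sketch of what the conversion produces: each row of the document-term matrix is expanded into a flat list of tokens, with every word repeated according to its count. This is an illustration of the expected format only, not the library's actual implementation; `toy_vocab` and `toy_dtm` are made-up examples.

# In[ ]:

def docs_from_dtm_sketch(dtm, vocab):
    """Illustrative only: expand each document-term matrix row into a token list."""
    docs = []
    for row in dtm:
        doc = []
        for word, count in zip(vocab, row):
            doc.extend([word] * count)  # repeat each word `count` times
        docs.append(doc)
    return docs

toy_vocab = ["market", "liberty", "trade"]
toy_dtm = [[2, 1, 0],
           [0, 0, 3]]
print docs_from_dtm_sketch(toy_dtm, toy_vocab)
# [['market', 'market', 'liberty'], ['trade', 'trade', 'trade']]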
# Here's a utility method to get the title of a webpage; we'll use it later.

# In[5]:

def get_title(url):
    """Scrape the title of a webpage."""
    import lxml.html
    t = lxml.html.parse(url)
    return t.find(".//title").text.split("|")[0].strip()

# Let's set up our model. First we create a model definition describing the basic structure of our data. Next we initialize an MCMC state object using the model definition, documents, random number generator, and hyperparameters.

# In[8]:

N, V = len(docs), len(vocab)
defn = model_definition(N, V)
prng = rng(12345)
state = initialize(defn, docs, prng,
                   vocab_hp=1,
                   dish_hps={"alpha": .6, "gamma": 2})
r = runner.runner(defn, docs, state)

# When we first create a state object, the words are randomly assigned to topics. Thus, our perplexity (model score) is quite high. Once we start running the MCMC, the score will drop quickly.

# In[9]:

print "randomly initialized model:"
print " number of documents", defn.n
print " vocabulary size", defn.v
print " perplexity:", state.perplexity(), "num topics:", state.ntopics()

# Run one iteration of the MCMC to make sure everything is working.

# In[17]:

get_ipython().run_cell_magic('time', '', 'r.run(prng, 1)\n')

# Now let's run 1000 generations of the MCMC, pickling the state after each batch of 500 iterations so the results aren't lost.
# 
# Unfortunately, MCMC is slow going.

# In[15]:

get_ipython().run_cell_magic('time', '', 'r.run(prng, 500)\n')

# In[16]:

with open('./econtalk-data/2015-10-07-state.pkl', 'w') as f:
    cPickle.dump(state, f)

# In[17]:

get_ipython().run_cell_magic('time', '', 'r.run(prng, 500)\n')

# In[18]:

with open('./econtalk-data/2015-10-07-state.pkl', 'w') as f:
    cPickle.dump(state, f)

# Now that we've run the MCMC, the perplexity has dropped significantly.

# In[19]:

print "after 1000 iterations:"
print " perplexity:", state.perplexity(), "num topics:", state.ntopics()

# [pyLDAvis](https://github.com/bmabey/pyLDAvis) projects the topics into two dimensions using techniques described by [Carson Sievert](http://stat-graphics.org/movies/ldavis.html).

# In[21]:

vis = pyLDAvis.prepare(**state.pyldavis_data())
pyLDAvis.display(vis)

# We can extract the term relevance (shown on the right-hand side of the visualization) directly from our state object. Here are the 10 most relevant words for each topic:

# In[23]:

relevance = state.term_relevance_by_topic()
for i, topic in enumerate(relevance):
    print "topic", i, ":",
    for term, _ in topic[:10]:
        print term,
    print

# We could assign titles to each of these topics. For example, _Topic 5_ appears to be about the _foundations of classical liberalism_. _Topic 6_ is obviously _Bitcoin and software_. _Topic 0_ is the _financial system and monetary policy_. _Topic 4_ seems to be _generic words used in most episodes_; unfortunately, the prevalence of "don" is an artifact of my preprocessing, which splits up the contraction "don't".

# We can also get the topic distributions for each document.

# In[24]:

topic_distributions = state.topic_distribution_by_document()

# Topic 5 appears to be about the theory of classical liberalism. Let's find the 20 episodes with the highest proportion of words drawn from that topic.

# In[25]:

austrian_topic = 5
foundations_episodes = sorted([(dist[austrian_topic], url)
                               for url, dist in zip(urls, topic_distributions)],
                              reverse=True)
for url in [url for _, url in foundations_episodes][:20]:
    print get_title(url), url
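# As a quick sanity check on these per-document distributions: each row of `topic_distributions` is a probability distribution over topics, so it should sum to (approximately) one, and we can count how many episodes are dominated by each topic. This is a small illustrative snippet, not part of the Data Microscopes API.

# In[ ]:

from collections import Counter

print sum(topic_distributions[0])  # each row is a probability distribution, so this should be ~1.0

# label each episode with its single most probable topic, then tally the labels
dominant_topics = [max(enumerate(dist), key=lambda pair: pair[1])[0]
                   for dist in topic_distributions]
print Counter(dominant_topics).most_common()  # (topic index, episode count) pairs, most common first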
# We could also find the episodes that have notable discussion of both politics AND the financial system.

# In[29]:

topic_a = 0
topic_b = 1
joint_episodes = [url for url, dist in zip(urls, topic_distributions)
                  if dist[topic_a] > 0.18 and dist[topic_b] > 0.18]
for url in joint_episodes:
    print get_title(url), url

# We can look at the topic distributions as projections of the documents into a much lower-dimensional space (here, 16 dimensions, one per topic).
# 
# We can try to find shows that are similar by comparing the topic distributions of the documents.

# In[30]:

def find_similar(url, max_distance=0.2):
    """Find the episodes whose topic distributions are closest (in Euclidean distance) to that of the input url."""
    source_dist = array(topic_distributions[urls.index(url)])
    for td, other_url in zip(topic_distributions, urls):
        if linalg.norm(source_dist - array(td)) < max_distance:
            print get_title(other_url), other_url

# Which Econtalk episodes are most similar, in content, to "Mike Munger on the Division of Labor"?

# In[31]:

find_similar('http://www.econtalk.org/archives/2007/04/mike_munger_on.html')

# How about episodes similar to "Kling on Freddie and Fannie and the Recent History of the U.S. Housing Market"?

# In[32]:

find_similar('http://www.econtalk.org/archives/2008/09/kling_on_freddi.html')

# The model also gives us distributions over words for each topic.

# In[33]:

word_dists = state.word_distribution_by_topic()

# We can use these to find the topics a word is most likely to occur in.

# In[34]:

def bars(x, scale_factor=10000):
    return int(x * scale_factor) * "="

def topics_related_to_word(word, n=10):
    for wd, rel in zip(word_dists, relevance):
        score = wd[word]
        rel_words = ' '.join([w for w, _ in rel][:n])
        if bars(score):
            print bars(score), rel_words

# Which topics are most likely to contain the word "munger" (as in [Mike Munger](http://www.michaelmunger.com/))? The number of equal signs indicates the probability that the topic generates the word. If a topic isn't shown, it is extremely unlikely to generate the word.

# In[35]:

topics_related_to_word('munger')

# Where does Munger come up? In discussions of the moral foundations of classical liberalism and of microeconomics!
# 
# How about the word "lovely"? Russ Roberts uses it often when talking about the _Theory of Moral Sentiments_. It looks like it also comes up when talking about schools.

# In[36]:

topics_related_to_word('lovely')
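# One closing aside: `find_similar` above compares topic distributions with the Euclidean norm, which works but isn't tailored to probability distributions. A common alternative for comparing discrete distributions is the Hellinger distance. The sketch below is a hypothetical variant for illustration only; nothing in this notebook's analysis depends on it, and `max_distance` would need re-tuning for the new metric.

# In[ ]:

from numpy import sqrt

def hellinger(p, q):
    """Hellinger distance between two discrete probability distributions (0 = identical, 1 = disjoint supports)."""
    return linalg.norm(sqrt(array(p)) - sqrt(array(q))) / sqrt(2)

def find_similar_hellinger(url, max_distance=0.2):
    """Hypothetical variant of find_similar using Hellinger distance instead of the Euclidean norm."""
    source_dist = topic_distributions[urls.index(url)]
    for td, other_url in zip(topic_distributions, urls):
        if hellinger(source_dist, td) < max_distance:
            print get_title(other_url), other_url

# If you have feedback on this implementation of HDP-LDA, you can reach me on [Twitter](http://twitter.com/tdhopper) or open an [issue on Github](https://github.com/datamicroscopes/lda/issues).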