 
#!/usr/bin/env python
# coding: utf-8

# __Note__: This is best viewed on [NBViewer](http://nbviewer.ipython.org/github/tdhopper/notes-on-dirichlet-processes/blob/master/2015-10-07-econtalk-topics.ipynb). It is part of a series on [Dirichlet Processes and Nonparametric Bayes](https://github.com/tdhopper/notes-on-dirichlet-processes).

# # Nonparametric Latent Dirichlet Allocation
# 
# ## Analysis of the topics of [Econtalk](http://www.econtalk.org/)
# 
# In 2003, David Blei, Andrew Ng, and Michael Jordan presented a groundbreaking statistical model called "[Latent Dirichlet Allocation](https://www.cs.princeton.edu/~blei/papers/BleiNgJordan2003.pdf)" (LDA).
# 
# LDA provides a method for summarizing the topics discussed in a collection of documents. LDA defines topics to be discrete probability distributions over words. For an introduction to LDA, see [Edwin Chen's post](http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/).
# 
# The original LDA model requires the number of topics to be specified as a known parameter of the model. In 2005, Yee Whye Teh and others published [a "nonparametric" version of this model](http://www.cs.berkeley.edu/~jordan/papers/hdp.pdf) that doesn't require the number of topics to be specified in advance. Instead, it uses a prior distribution over the topics called a hierarchical Dirichlet process (HDP). [I wrote an introduction to this HDP-LDA model](https://github.com/tdhopper/notes-on-dirichlet-processes/blob/master/2015-08-03-nonparametric-latent-dirichlet-allocation.ipynb) earlier this year.
# 
# For the last six months, I have been developing a Python-based Gibbs sampler for the HDP-LDA model. It is part of a larger library of "robust, validated Bayesian nonparametric models for discovering structure in data" known as [Data Microscopes](http://datamicroscopes.github.io).
# 
# This notebook demonstrates the functionality of that implementation.
# 
# The Data Microscopes library is available on [anaconda.org](https://anaconda.org/datamicroscopes/) for Linux and OS X. `microscopes-lda` can be installed with:
# 
#     $ conda install -c datamicroscopes -c distributions microscopes-lda

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import pyLDAvis
import json
import sys
import cPickle

from microscopes.common.rng import rng
from microscopes.lda.definition import model_definition
from microscopes.lda.model import initialize
from microscopes.lda import utils
from microscopes.lda import model, runner

from numpy import genfromtxt
from numpy import linalg
from numpy import array

# `dtm.csv` contains a document-term matrix representation of the words used in Econtalk transcripts. The columns of the matrix correspond to the words in `vocab.txt`. The rows of the matrix correspond to the show URLs in `urls.txt`.
# 
# Our LDA implementation takes its input data as a list of lists of hashable objects (typically words). We can use a utility function to convert the document-term matrix to this list of tokenized documents; a sketch of what the conversion produces follows the sanity checks below.

# In[2]:

vocab = genfromtxt('./econtalk-data/vocab.txt', delimiter=",", dtype='str').tolist()
dtm = genfromtxt('./econtalk-data/dtm.csv', delimiter=",", dtype=int)
docs = utils.docs_from_document_term_matrix(dtm, vocab=vocab)
urls = [s.strip() for s in open('./econtalk-data/urls.txt').readlines()]

# Two quick sanity checks: every column of the matrix should correspond to a vocabulary word, and every row to a show URL.

# In[3]:

dtm.shape[1] == len(vocab)

# In[4]:

dtm.shape[0] == len(urls)
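# To make that input format concrete, here is a minimal sketch of what the conversion produces: each row of the document-term matrix is expanded into a flat list of tokens, with every word repeated according to its count. This is an illustration of the expected format only, not the library's actual implementation; `toy_vocab` and `toy_dtm` are made-up examples.

# In[ ]:

def docs_from_dtm_sketch(dtm, vocab):
    """Illustrative only: expand each document-term matrix row into a token list."""
    docs = []
    for row in dtm:
        doc = []
        for word, count in zip(vocab, row):
            doc.extend([word] * count)  # repeat each word `count` times
        docs.append(doc)
    return docs

toy_vocab = ["market", "liberty", "trade"]
toy_dtm = [[2, 1, 0],
           [0, 0, 3]]
print docs_from_dtm_sketch(toy_dtm, toy_vocab)
# [['market', 'market', 'liberty'], ['trade', 'trade', 'trade']]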
# Here's a utility method to get the title of a webpage; we'll use it later.

# In[5]:

def get_title(url):
    """Scrape the title of a webpage."""
    import lxml.html
    t = lxml.html.parse(url)
    return t.find(".//title").text.split("|")[0].strip()

# Let's set up our model. First we create a model definition describing the basic structure of our data. Next we initialize an MCMC state object using the model definition, documents, random number generator, and hyperparameters.

# In[8]:

N, V = len(docs), len(vocab)
defn = model_definition(N, V)
prng = rng(12345)
state = initialize(defn, docs, prng,
                   vocab_hp=1,
                   dish_hps={"alpha": .6, "gamma": 2})
r = runner.runner(defn, docs, state)

# When we first create a state object, the words are randomly assigned to topics. Thus, our perplexity (model score) is quite high. Once we start running the MCMC, the score will drop quickly.

# In[9]:

print "randomly initialized model:"
print " number of documents", defn.n
print " vocabulary size", defn.v
print " perplexity:", state.perplexity(), "num topics:", state.ntopics()

# Run one iteration of the MCMC to make sure everything is working.

# In[17]:

get_ipython().run_cell_magic('time', '', 'r.run(prng, 1)\n')

# Now let's run 1000 generations of the MCMC, pickling the state after each batch of 500 iterations so the results aren't lost.
# 
# Unfortunately, MCMC is slow going.

# In[15]:

get_ipython().run_cell_magic('time', '', 'r.run(prng, 500)\n')

# In[16]:

with open('./econtalk-data/2015-10-07-state.pkl', 'w') as f:
    cPickle.dump(state, f)

# In[17]:

get_ipython().run_cell_magic('time', '', 'r.run(prng, 500)\n')

# In[18]:

with open('./econtalk-data/2015-10-07-state.pkl', 'w') as f:
    cPickle.dump(state, f)

# Now that we've run the MCMC, the perplexity has dropped significantly.

# In[19]:

print "after 1000 iterations:"
print " perplexity:", state.perplexity(), "num topics:", state.ntopics()

# [pyLDAvis](https://github.com/bmabey/pyLDAvis) projects the topics into two dimensions using techniques described by [Carson Sievert](http://stat-graphics.org/movies/ldavis.html).

# In[21]:

vis = pyLDAvis.prepare(**state.pyldavis_data())
pyLDAvis.display(vis)

# We can extract the term relevance (shown on the right-hand side of the visualization) directly from our state object. Here are the 10 most relevant words for each topic:

# In[23]:

relevance = state.term_relevance_by_topic()
for i, topic in enumerate(relevance):
    print "topic", i, ":",
    for term, _ in topic[:10]:
        print term,
    print

# We could assign titles to each of these topics. For example, _Topic 5_ appears to be about the _foundations of classical liberalism_. _Topic 6_ is obviously _Bitcoin and software_. _Topic 0_ is the _financial system and monetary policy_. _Topic 4_ seems to be _generic words used in most episodes_; unfortunately, the prevalence of "don" is an artifact of my preprocessing, which splits up the contraction "don't".

# We can also get the topic distributions for each document.

# In[24]:

topic_distributions = state.topic_distribution_by_document()

# Topic 5 appears to be about the theory of classical liberalism. Let's find the 20 episodes with the highest proportion of words drawn from that topic.

# In[25]:

austrian_topic = 5
foundations_episodes = sorted([(dist[austrian_topic], url)
                               for url, dist in zip(urls, topic_distributions)],
                              reverse=True)
for url in [url for _, url in foundations_episodes][:20]:
    print get_title(url), url
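# As a quick sanity check on these per-document distributions: each row of `topic_distributions` is a probability distribution over topics, so it should sum to (approximately) one, and we can count how many episodes are dominated by each topic. This is a small illustrative snippet, not part of the Data Microscopes API.

# In[ ]:

from collections import Counter

print sum(topic_distributions[0])  # each row is a probability distribution, so this should be ~1.0

# label each episode with its single most probable topic, then tally the labels
dominant_topics = [max(enumerate(dist), key=lambda pair: pair[1])[0]
                   for dist in topic_distributions]
print Counter(dominant_topics).most_common()  # (topic index, episode count) pairs, most common first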
# We could also find the episodes that have notable discussion of both politics AND the financial system.

# In[29]:

topic_a = 0
topic_b = 1
joint_episodes = [url for url, dist in zip(urls, topic_distributions)
                  if dist[topic_a] > 0.18 and dist[topic_b] > 0.18]
for url in joint_episodes:
    print get_title(url), url

# We can look at the topic distributions as projections of the documents into a much lower-dimensional space (here, 16 dimensions, one per topic).
# 
# We can try to find shows that are similar by comparing the topic distributions of the documents.

# In[30]:

def find_similar(url, max_distance=0.2):
    """Find the episodes whose topic distributions are closest (in Euclidean distance) to that of the input url."""
    source_dist = array(topic_distributions[urls.index(url)])
    for td, other_url in zip(topic_distributions, urls):
        if linalg.norm(source_dist - array(td)) < max_distance:
            print get_title(other_url), other_url

# Which Econtalk episodes are most similar, in content, to "Mike Munger on the Division of Labor"?

# In[31]:

find_similar('http://www.econtalk.org/archives/2007/04/mike_munger_on.html')

# How about episodes similar to "Kling on Freddie and Fannie and the Recent History of the U.S. Housing Market"?

# In[32]:

find_similar('http://www.econtalk.org/archives/2008/09/kling_on_freddi.html')

# The model also gives us distributions over words for each topic.

# In[33]:

word_dists = state.word_distribution_by_topic()

# We can use these to find the topics a word is most likely to occur in.

# In[34]:

def bars(x, scale_factor=10000):
    return int(x * scale_factor) * "="

def topics_related_to_word(word, n=10):
    for wd, rel in zip(word_dists, relevance):
        score = wd[word]
        rel_words = ' '.join([w for w, _ in rel][:n])
        if bars(score):
            print bars(score), rel_words

# Which topics are most likely to contain the word "munger" (as in [Mike Munger](http://www.michaelmunger.com/))? The number of equal signs indicates the probability that the topic generates the word. If a topic isn't shown, it is extremely unlikely to generate the word.

# In[35]:

topics_related_to_word('munger')

# Where does Munger come up? In discussions of the moral foundations of classical liberalism and of microeconomics!
# 
# How about the word "lovely"? Russ Roberts uses it often when talking about the _Theory of Moral Sentiments_. It looks like it also comes up when talking about schools.

# In[36]:

topics_related_to_word('lovely')
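# One closing aside: `find_similar` above compares topic distributions with the Euclidean norm, which works but isn't tailored to probability distributions. A common alternative for comparing discrete distributions is the Hellinger distance. The sketch below is a hypothetical variant for illustration only; nothing in this notebook's analysis depends on it, and `max_distance` would need re-tuning for the new metric.

# In[ ]:

from numpy import sqrt

def hellinger(p, q):
    """Hellinger distance between two discrete probability distributions (0 = identical, 1 = disjoint supports)."""
    return linalg.norm(sqrt(array(p)) - sqrt(array(q))) / sqrt(2)

def find_similar_hellinger(url, max_distance=0.2):
    """Hypothetical variant of find_similar using Hellinger distance instead of the Euclidean norm."""
    source_dist = topic_distributions[urls.index(url)]
    for td, other_url in zip(topic_distributions, urls):
        if hellinger(source_dist, td) < max_distance:
            print get_title(other_url), other_url

# If you have feedback on this implementation of HDP-LDA, you can reach me on [Twitter](http://twitter.com/tdhopper) or open an [issue on Github](https://github.com/datamicroscopes/lda/issues).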