#!/usr/bin/env python # coding: utf-8 # # Markov chain modeling # # In this tutorial, we implement a Markov chain model that allows for different orders. We demonstrate the model on human navigation data derived from Wikispeedia. # ## Markov chain model # # First, we implement the model # In[1]: # further implementation can be found at https://github.com/psinger/PathTools from __future__ import division import itertools from scipy.sparse import csr_matrix from scipy.special import gammaln from scipy.stats import chi2 from collections import defaultdict from sklearn.preprocessing import normalize import numpy as np class MarkovChain(): """ Markov Chain Model """ # dummy constant for our reset state RESET_STATE = "-1" def __init__(self, order=1, reset=True): """ Constructor for class MarkovChain Args: order: Order of Markov Chain model reset: Boolean for using additional reset states """ self.order = order self.reset = reset def fit(self,sequences): """ Function for fitting the Markov Chain model given data Args: sequences: Data of sequences, list of lists """ # first, we derive all basic states from given sequences states = set(itertools.chain.from_iterable(sequences)) self.state_count = len(states) if self.reset: self.state_count += 1 # dictionary of dictionaries for counting transitions between states transitions = defaultdict(lambda : defaultdict(float)) # remember all start and target states that we observe # this is just necessary for memory saving reasons start_states = set() target_states = set() # iterate through sequences for seq in sequences: i = 0 # Prepend and append correct amount of reset states if flag set if self.reset: seq = self.order*[self.RESET_STATE] + seq + [self.RESET_STATE] # iterate through elements of a sequence for j in xrange(self.order, len(seq)): # start state based on order elemA = tuple(seq[i:j]) # target state based on order elemB = tuple(seq[j-self.order+1:j+1]) i += 1 # increase transition count transitions[elemA][elemB] += 1 # remember start and target states start_states.add(elemA) target_states.add(elemB) # build vocabularies for mapping states to indices self.start_vocab = dict((v,k) for k,v in enumerate(start_states)) self.target_vocab = dict((v,k) for k,v in enumerate(target_states)) # transform transition dictionary of dictionaries to sparse csr_matrix i_indices = [] j_indices = [] values = [] for s,ts in transitions.iteritems(): for t,c in ts.iteritems(): i_indices.append(self.start_vocab[s]) j_indices.append(self.target_vocab[t]) values.append(c) shape = (len(start_states), len(target_states)) self.transitions = csr_matrix((values, (i_indices, j_indices)), shape=shape) #print "fit done" def loglikelihood(self): """ Returns the loglikelihood given fitted model Returns: loglikelihood """ # get mle by row-normalizing transition matrix self.mle = normalize(self.transitions,norm="l1",axis=1) # log mle mle_log = self.mle.copy() mle_log.data = np.log(mle_log.data) return self.transitions.multiply(mle_log).sum() @staticmethod def lrt(ln, pn, lt, pt): """ Performs a likelihood ratio test given a null and alternative model Args: ln: loglikelihood of null model pn: nr. of parameters of null model lt: loglikelihood of alternative model pt: nr. of parameters of alternative model Returns: lr: log-likelihood ratio p_val: p value """ # likelihood ratio lr = float(-2*(ln-lt)) # degrees of freedom dof = float(pt - pn) # chi2 test p_val = chi2.sf(lr,dof) return lr, p_val def aic(self): """ Determines the AIC given fitted model Returns: aic """ ll = self.loglikelihood() # parameter count of model para = self.state_count**self.order*(self.state_count-1) return 2*para - 2*ll def bic(self): """ Determines the BIC given fitted model Returns: bic """ ll = self.loglikelihood() # parameter count of model para = self.state_count**self.order*(self.state_count-1) return para*np.log(self.transitions.sum()) - 2*ll def evidence(self, prior=1.): """ Determines Bayesian evidence given fitted model Args: prior: Dirichlet prior parameter (symmetric) Returns: evidence """ i_dim, j_dim = self.transitions.shape # elegantly calculate evidence evidence = 0 evidence += i_dim * gammaln(self.state_count*prior) evidence -= gammaln(self.transitions.sum(axis=1)+self.state_count*prior).sum() evidence += gammaln(self.transitions.data+prior).sum() evidence -= len(self.transitions.data) * gammaln(prior) return evidence # ## Slides example # # For a first demonstration, we present the example from the slides consisting of blue and yellow states. # ### Data # In[2]: sequences = [["b","b","b","y","y","b","y","b","b","b","b"]] # ### Fitting # # We fit both a first and second order model and print the transition matrices. # In[3]: print "First-order model" mc1 = MarkovChain(reset=True, order=1) mc1.fit(sequences) print mc1.transitions.todense() print "---------" print "Second-order model" mc2 = MarkovChain(reset=True, order=2) mc2.fit(sequences) print mc2.transitions.todense() # ### Model selection # # Next, let us compare the two models. # In[4]: print "First-order model" print "Log-likelihood:", mc1.loglikelihood() print "Evidence:", mc1.evidence() print "AIC:", mc1.aic() print "BIC:", mc1.bic() print "---------" print "Second-order model" print "Log-likelihood:", mc2.loglikelihood() print "Evidence:", mc2.evidence() print "AIC:", mc2.aic() print "BIC:", mc2.bic() # The simpler, first-order model is to be preferred. # ## Synthetic example # # Now, let us generate synthetic data with given memory. We always start with the state "A" and add additional "B" states based on a given memory. Then, we repeat this process 1000 times. For example, for a simple Markovian first-order process, we would produce "ABABABAB...". For a third-order process we would produce "ABBBABBB...". Then, the model comparison criteria correctly determine corresponding Markov chain model as the most appropriate ones. # In[12]: get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt for o in xrange(1,5): sample = [(["A"]+["B"]*o)*1000] loglikelihoods = [] aics = [] bics = [] evidences = [] max_order = 5 fig = plt.figure() for i in xrange(1,max_order+1): mc = MarkovChain(reset=True, order=i) mc.fit(sample) loglikelihoods.append(mc.loglikelihood()) aics.append(mc.aic()) bics.append(mc.bic()) evidences.append(mc.evidence()) # let us just plot the evidences here, the other criteria confirm these results ax = fig.add_subplot(111) ax.plot(xrange(1,max_order+1), evidences) max_index = evidences.index(max(evidences)) ax.plot((xrange(1,max_order+1))[max_index], evidences[max_index], 'rD', clip_on = False) ax.set_title(sample[0][:o+1]) # As expected, the modeling selection techniques also indicate those orders as the most plausible after which the data has been generated. # ## Wikispeedia human Web navigation example # # We now focus on an example of human Web navigation. # ### Data # # We load our data into a list of lists where each element corresponds to a sequence. # In[6]: import csv sequences = [] # from https://snap.stanford.edu/data/wikispeedia.html for line in csv.reader((row for row in open("data/paths_finished.tsv") if not row.startswith('#')), delimiter='\t'): if len(line) == 0: continue # for simplicity, let us remove back clicks seq = line[3].split(";") # for simplicity, let us remove back clicks seq = [x for x in seq if x != "<"] sequences.append(seq) # In[7]: print sequences[:10] # In[8]: len(sequences) # ### Fitting # # We now fit the Markov chain model to the data. We vary the order from 1 to 5. Additionally, we determine all model comparison techniques and store them for each order at hand. # In[9]: loglikelihoods = [] aics = [] bics = [] evidences = [] lrts = [] max_order = 5 for i in xrange(1,max_order+1): mc = MarkovChain(reset=True, order=i) mc.fit(sequences) loglikelihoods.append(mc.loglikelihood()) aics.append(mc.aic()) bics.append(mc.bic()) evidences.append(mc.evidence()) for i, ln in enumerate(loglikelihoods): for j, lt in enumerate(loglikelihoods[i+1:]): lrt = mc.lrt(ln, mc.state_count**(i+1)*(mc.state_count-1), lt, mc.state_count**(i+1+j+1)*(mc.state_count-1)) lrts.append([str(i+1)+" vs. "+str(i+1+j+1),lrt[0],lrt[1]]) # ### Model Selection # # Next, we select the most appropriate model. # #### Likelihood ratio test # # Perform likelihood ratio test between all fits # In[10]: print "orders | ", "likelihood ratio | ", "p-value" for x in lrts: print " | ".join([str(k) for k in x]) # #### AIC, BIC and Evidence # # Now, we determine the AIC, BIC and Evidence for all fits. Additionally, we plot the simple log-likelihood of the fits. # In[11]: f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2,figsize=(8,6)) x = xrange(1,max_order+1) ax1.plot(x, loglikelihoods) max_index = loglikelihoods.index(max(loglikelihoods)) ax1.plot(x[max_index], loglikelihoods[max_index], 'rD', clip_on = False) ax1.set_ylabel("Log-likelihood") ax2.plot(x, aics) min_index = aics.index(min(aics)) ax2.plot(x[min_index], aics[min_index], 'rD', clip_on = False) ax2.set_ylabel("AIC") ax3.plot(x, bics) min_index = bics.index(min(bics)) ax3.plot(x[min_index], bics[min_index], 'rD', clip_on = False) ax3.set_ylabel("BIC") ax4.plot(x, evidences) max_index = evidences.index(max(evidences)) ax4.plot(x[max_index], evidences[max_index], 'rD', clip_on = False) ax4.set_ylabel("Evidence") plt.tight_layout() # #### Summary # # All modeling comparison techniques indicate that a first-order Markov chain model sufficiently models given data. While higher order models always produce better fits due to nested models, their increasing number of necessary parameters does not fully justify their increasing complexity. However, with less states and/or more data, this might be different.