#!/usr/bin/env python
# coding: utf-8

# ### Hessians to compute variances
#
# - We have our point estimates, but now we want the variances of those estimates.
# - We can approximate them (under some asymptotic conditions) using the _Hessian of the negative log-likelihood at the MLE_.
# - Think of the Hessian as a matrix representing the curvature at the minimum point. If we move slightly away from that minimum, how much does the likelihood change?
#     - high curvature (think of a tight peak) => low variance of the estimate.
#     - low curvature (think of a broad hill) => high variance of the estimate.
#
# To extract the variance estimates, we invert the Hessian at the MLE, $\hat{\theta}$, and take the diagonal:
#
# $$\text{Var}(\hat{\theta}_i) = \text{diag}(H(\hat{\theta})^{-1})_i$$
#
# The Hessian is the matrix of all second partial derivatives. `autograd` has this built in:

# In[1]:


from autograd import numpy as np
from autograd import elementwise_grad, value_and_grad, hessian
from scipy.optimize import minimize


# In[2]:


# simulated durations T and event indicators E
T = (np.random.exponential(size=1000) / 1.5) ** 2.3
E = np.random.binomial(1, 0.95, size=1000)


# In[3]:


# we've seen all of this before...
def cumulative_hazard(params, t):
    lambda_, rho_ = params
    return (t / lambda_) ** rho_

# the hazard is the derivative of the cumulative hazard w.r.t. time
hazard = elementwise_grad(cumulative_hazard, argnum=1)

def log_hazard(params, t):
    return np.log(hazard(params, t))

def log_likelihood(params, t, e):
    return np.sum(e * log_hazard(params, t)) - np.sum(cumulative_hazard(params, t))

def negative_log_likelihood(params, t, e):
    return -log_likelihood(params, t, e)

# with bounds given and method=None, scipy selects L-BFGS-B
results = minimize(
    value_and_grad(negative_log_likelihood),
    x0=np.array([1.0, 1.0]),
    method=None,
    args=(T, E),
    jac=True,
    bounds=((0.00001, None), (0.00001, None)))

print(results)


# In[4]:


estimates_ = results.x

# Note: this will produce the full matrix of second partial derivatives
hessian(negative_log_likelihood)(estimates_, T, E)


# In[5]:


H = hessian(negative_log_likelihood)(estimates_, T, E)
variance_ = np.diag(np.linalg.inv(H))
print(variance_)


# In[6]:


std_error = np.sqrt(variance_)
print(std_error)


# In[7]:


print("lambda_ %.3f (±%.2f)" % (estimates_[0], std_error[0]))
print("rho_ %.3f (±%.2f)" % (estimates_[1], std_error[1]))


# From here you can construct confidence intervals (CIs) and the like; a couple of quick sketches follow. Then it's on to Part 6, where we connect these abstract parameters to business logic.
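# Here is a minimal sketch of that CI construction (my addition, not part of the
# original walkthrough), using the usual normal approximation for MLEs: a
# two-sided 95% interval is the estimate plus or minus $z_{0.975} \approx 1.96$
# standard errors.

# In[8]:


from scipy.stats import norm

# z-value for a two-sided 95% interval (≈ 1.96)
z = norm.ppf(0.975)

lower_ = estimates_ - z * std_error
upper_ = estimates_ + z * std_error

for name, est, lo, hi in zip(("lambda_", "rho_"), estimates_, lower_, upper_):
    print("%s %.3f (95%% CI: %.3f to %.3f)" % (name, est, lo, hi))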
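# One optional sanity check before trusting these numbers (an aside of mine, not
# required by the method): at a genuine minimum of the negative log-likelihood
# the Hessian should be positive definite, i.e. all eigenvalues strictly
# positive. If any eigenvalue is near zero or negative, the inverted-Hessian
# variances are suspect.

# In[9]:


# eigvalsh is appropriate here since the Hessian is symmetric
eigenvalues = np.linalg.eigvalsh(H)
print(eigenvalues)
assert np.all(eigenvalues > 0), "Hessian is not positive definite at the optimum"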