#!/usr/bin/env python
# coding: utf-8

# # Tutorial: Policy Gradient
# 
# **Resources**:
# 1. [Rich Sutton's RL Textbook (2nd edition draft)](https://drive.google.com/file/d/1xeUDVGWGUUv1-ccUMAZHJLej2C7aAFWY/view)
# 2. [The original REINFORCE paper by Williams (1992)](http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf) 
# 3. [David Silver's Policy Gradient Slides](http://www0.cs.ucl.ac.uk/staff/D.Silver/web/Teaching_files/pg.pdf)
# 3. [Example Notebook using Policy Gradients with REINFORCEMENT to train an agent in a GridWorld environment (PyTorch)](https://github.com/spro/practical-pytorch/blob/master/reinforce-gridworld/reinforce-gridworld.ipynb)

# ## Change of Variables
# The change of variables formula:
# 
# $p_X(x) = p_Z(z) \big|\det\frac{\partial x}{\partial z}\big|^{-1}$
# 
# $p_Z(z) = p_X(x) \big|\det\frac{\partial x}{\partial z}\big|$
# 
# **Example**:
# 
# Let $p(x) = 2x, 0 \leq x \leq 1$, and $z = f(x) = \sqrt{x}$
# 
# $\sqrt{x} = z \Leftrightarrow x = z^2$
# 
# $\frac{\partial x}{\partial z} = 2z$
# 
# $p_X(\textbf{x}) = p_Z(z) \bigg| \det \bigg( \frac{\partial \textbf{x}}{\partial z}\bigg) \bigg|^{-1} $
# 
# $p_Z(z) = p_X(\textbf{x}) \bigg| \det \bigg( \frac{\partial \textbf{x}}{\partial z}\bigg) \bigg| $
# 
# $p_Z(z) = (2x)| 2z |$
# 
# $p_Z(z) = (2(z^2))2z$
# 
# $p_Z(z) = 4z^3$
# 
# Let's try to visualize this!

# In[1]:


import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib.ticker import NullFormatter


# We are going to plot 3 graphs: $p(x)$, $p(z)$, and $z = f(x) = \sqrt{x}$

# In[2]:


nullfmt = NullFormatter()         # no labels
plt.figure(1, figsize=(6, 6))

# definitions for the axes
left, width = 0.1, 0.65
bottom, height = 0.1, 0.65
bottom_h = left_h = left + width + 0.02

rect_scatter = [left, bottom, width, height]
rect_histx = [left, bottom_h, width, 0.2]
rect_histy = [left_h, bottom, 0.4, height]

axFunc = plt.axes(rect_scatter)
axPX = plt.axes(rect_histx)
axPZ = plt.axes(rect_histy)

# no labels
axPX.xaxis.set_major_formatter(nullfmt)
axPZ.yaxis.set_major_formatter(nullfmt)

# Title the axes
axPX.set_title(r"$p(x) = 2x$", fontsize=16)
axPX.set_ylabel(r"$p(x) = 2x$", fontsize=16)
axPZ.set_xlabel(r"$p(z) = 4z^3$", fontsize=16)
axFunc.set_xlabel(r"$x$", fontsize=16)
axFunc.set_ylabel(r"$z = \sqrt{x}$", fontsize=16)

# Plot function
x = np.linspace(0, 1, 50)
z = np.linspace(0, 1, 50)
axFunc.plot(x, np.sqrt(x))
axPX.plot(x, 2*x)
axPZ.plot(4*z**3, z)

# Plot a fill area
x_fill = np.linspace(0.45, 0.55, 3)
z_fill = np.sqrt(x_fill)
axPX.fill_between(x_fill, 2*x_fill, color='blue',linestyle='--')
axPZ.fill_betweenx(z_fill,4*z_fill**3, color='green',linestyle='--')
axFunc.fill_between(x_fill, z_fill, 1, color='blue', linestyle='--')
axFunc.fill_betweenx(z_fill, x_fill, 1, color='green', linestyle='--')
plt.show()


# The blue area in the $p(x)$ graph above has the same area as the green area in the $p(z)$ graph on the right. 

# ## Policy Gradient Demo: Training Classifier with PG vs Gradient Descent
# 
# As seen in lecture, we can use REINFORCE as an alternative way to train a classifier. Instead of using cross-entropy and gradient descent, we can directly try to optimize the classification accuracy using REINFORCE. The set up looks like this (as shown in lec 21):
# 
# * state x: an image
# * action a: a digit class
# * reward r(x,a): 1 if correct, 0 if wrong
# * policy pi(a|x): a distribution over categories
# 
# The difference between using REINFORCE versus Cross Entropy loss is that we need to sample the digit class, given the distribution over categories for a given image (state) at time $i$:
# 
# $\tilde{a_i} \sim \pi(a | x_i)$
# 
# And then apply the REINFORCE gradient estimator:
# 
# $ gradient = r(\tilde{a_i}, t_i) \frac{\partial}{\partial \theta} \log \pi_\theta (a_i | x_i)$
# 
# $ \theta \leftarrow \theta + \alpha * gradient$

# In[3]:


import autograd.numpy as np
import autograd.numpy.random as npr
from autograd.scipy.special import expit

from autograd import grad
from autograd.misc.optimizers import adam
from autograd import elementwise_grad

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# Generate some toy data
from sklearn.datasets import load_iris
iris = load_iris()

# print iris['DESCR']

color_wheel = {1: "#0392cf", 
               2: "#7bc043", 
               3: "#ee4035"}

sepal_len = iris['data'][:100,0]
sepal_wid = iris['data'][:100,1]
labels = iris['target'][:100]

# We will also center the data
# This is done to make numbers nice, so that we have no
# need for biases in our classification. (You might not
# be able to remove biases this way in general.)

sepal_len -= np.mean(sepal_len)
sepal_wid -= np.mean(sepal_wid)


# We can plot the subset of data that we've chosen:

# In[4]:


plt.scatter(sepal_len, 
            sepal_wid,
            c=labels,
            cmap=plt.cm.Paired)
plt.xlabel("sepal length")
plt.ylabel("sepal width")


# In[5]:


def plot_sep(w1, w2, color='green'):
    '''
    Plot decision boundary hypothesis 
      w1 * sepal_len + w2 * sepal_wid = 0
    in input space, highlighting the hyperplane
    '''
    plt.scatter(sepal_len, 
                sepal_wid,
                c=labels,
                cmap=plt.cm.Paired)
    plt.title("Separation in Input Space")
    plt.ylim([-1.5,1.5])
    plt.xlim([-1.5,2])
    plt.xlabel("sepal length")
    plt.ylabel("sepal width")
    if w2 != 0:
        m = -w1/w2
        t = 1 if w2 > 0 else -1
        plt.plot(
            [-1.5,2.0], 
            [-1.5*m, 2.0*m], 
            '-y', 
            color=color)
        plt.fill_between(
            [-1.5, 2.0],
            [m*-1.5, m*2.0],
            [t*1.5, t*1.5],
            alpha=0.2,
            color=color)
    if w2 == 0: # decision boundary is vertical
        t = 1 if w1 > 0 else -1
        plt.plot([0, 0],
                 [-1.5, 2.0],
                 '-y',
                color=color)
        plt.fill_between(
            [0, 2.0*t],
            [-1.5, -2.0],
            [1.5, 2],
            alpha=0.2,
            color=color)


# In[6]:


# A simple logistic classifier model
data = [(x,y) for x, y in zip(sepal_len, sepal_wid)]

def get_batch(data, labels, batch_size):
    choices = np.random.choice(len(data), batch_size, replace=False)
    return np.array(data)[choices], np.array(labels)[choices]


# In[7]:


get_batch(data,labels, 10)


# In[8]:


# Get a subset of the data and plot
data_batch, label_batch = get_batch(data,labels, 10)
plt.scatter(data_batch[:,0], 
            data_batch[:,1],
            c=label_batch,
            cmap=plt.cm.Paired)
plt.xlabel("sepal length")
plt.ylabel("sepal width")


# In[9]:


# Version 1: Using binary cross-entropy and SGD
# Reference: https://github.com/HIPS/autograd/blob/master/docs/tutorial.md 
def sigmoid(x):
    return 0.5 * (np.tanh(x / 2.) + 1)

def logistic_predictions(weights, inputs):
    # Outputs probability of a label being true according to logistic model.
    return sigmoid(np.dot(inputs, weights))

def bce_training_loss(weights, inputs, targets):
    # Training loss is the negative log-likelihood of the training labels.
    preds = logistic_predictions(weights, inputs)
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    return -np.mean(np.log(label_probabilities))


# In[10]:


# Define a function that returns gradients of training loss using Autograd.
training_gradient_fun = grad(bce_training_loss)

lr = 0.01
weights = np.array([0.0, 0.0])
batch_size = 1
data_batch, label_batch = get_batch(data, labels, batch_size)
ce_loss_list = []
iterations = []
num_it = 100

print("Initial loss:", bce_training_loss(weights, data_batch, label_batch))
ce_loss_list.append(bce_training_loss(weights, data_batch, label_batch))
iterations.append(0)
for i in range(num_it):
    data_batch, label_batch = get_batch(data, labels, batch_size)
    weights -= training_gradient_fun(weights, data_batch, label_batch) * lr
    if i % 10 == 0:
        print "Weights: {}, ce loss: {}".format(weights, bce_training_loss(weights, data_batch, label_batch))
        plt.figure()
        plot_sep(weights[0], weights[1], color='green')
        plt.title("Iteration {}".format(i))
        ce_loss_list.append(bce_training_loss(weights, data_batch, label_batch))
        iterations.append(i+1)
print("Trained loss:", bce_training_loss(weights, data_batch, label_batch))
print weights


# In[11]:


# Version 2: REINFORCE version of the training using classification accuracy
def logit(x):
    """Inverse of sigmoid: log(x / (1- x))
    """
    return np.log(x) - np.log(1-x)

# Fast Sigmoid: https://timvieira.github.io/blog/post/2016/07/04/fast-sigmoid-sampling/
def sample_sigmoid(x, noise):
    """ Assume x is batch_size * 1. 
    Return batch_size * 1 sample where the probability of ith row = 1 is given by x[i]
    
    Does u ~ uniform(0,1), out = x > logit(u)
    """
    z = logit(noise)
    return np.array(x > z, dtype=np.int64)

def log_sigmoid_predictions(weights, inputs, noise):
    prob = sigmoid(np.dot(inputs, weights))
    prediction = sample_sigmoid(prob, noise)
    return np.log(prediction*prob + (1-prediction)*(1-prob))

def funcval_log_sigmoid_predictions(weights, inputs, noise, func_val):
    log_sig_pred = log_sigmoid_predictions(weights, inputs, noise)
    return func_val * log_sig_pred

def classification_acc(weight, inputs, targets, noise):
    prob = sigmoid(np.dot(inputs, weights))
    prediction = sample_sigmoid(prob, noise)
    return np.array(prediction == targets, dtype=np.int64)

# Derivative of a sigmoid wrt input z: https://math.stackexchange.com/questions/2320905/obtaining-derivative-of-log-of-sigmoid-function
def grad_log_sigmoid_predictions(weights, inputs, noise):
    z = np.dot(inputs, weights)
    prob = sigmoid(z)
    prediction = sample_sigmoid(prob, noise)
    d_log_sigmoid_pred_dz = ((prediction)*(np.exp(-np.maximum(z,0))/(np.exp(z-np.maximum(z,0)) + np.exp(-np.maximum(z,0)))) + (1-prediction)*(np.exp(z-np.maximum(z,0))/(np.exp(z-np.maximum(z,0)) + np.exp(-np.maximum(z,0)))))
    return np.sum(np.expand_dims(d_log_sigmoid_pred_dz, axis=1)*(inputs), axis=0)

def REINFORCE(weight, inputs, targets, noise):
    prob = sigmoid(np.dot(inputs, weights))
    samples = sample_sigmoid(prob, noise)
    func_vals = classification_acc(weight, inputs, targets, noise)
    reinforce_grad = elementwise_grad(funcval_log_sigmoid_predictions)(weights, inputs, noise, func_vals)/inputs.shape[0]
    
    # Alternative: Pure python without using autograd. Implement own 
#     reinforce_grad = np.expand_dims(func_vals, axis=1) * grad_log_sigmoid_predictions(weights, inputs, noise)
    
    return reinforce_grad
    

# In[12]:


lr = 0.01
weights = np.array([0.0, 0.0])
batch_size = 1
data_batch, label_batch = get_batch(data, labels, batch_size)
out = logistic_predictions(weights, data_batch)

data_batch, label_batch = get_batch(data, labels, batch_size)
noise_u = np.random.uniform(0,1,size=label_batch.shape)
print("Initial Classification Error:", np.mean(classification_acc(weights, data_batch, label_batch, noise_u)))
print "weights: {}, classification error: {}, ce loss: {}".format(weights, np.mean(classification_acc(weights, data_batch, label_batch, noise_u)),bce_training_loss(weights, data_batch, label_batch))
rl_ce_loss_list = []
rl_ce_loss_list.append(bce_training_loss(weights, data_batch, label_batch))
plt.figure()
plot_sep(weights[0], weights[1], color='green')
plt.title("Iteration {}".format(i))
for i in range(num_it):
    data_batch, label_batch = get_batch(data, labels, batch_size)
    noise_u = np.random.uniform(0,1,size=label_batch.shape)
    reinforce_grad = REINFORCE(weights, data_batch, label_batch, noise_u)
    weights += reinforce_grad*lr
    if i % 10 == 0:
        print "weights: {}, classification error: {}, ce loss: {}".format(weights, np.mean(classification_acc(weights, data_batch, label_batch, noise_u)),bce_training_loss(weights, data_batch, label_batch))
        plt.figure()
        plot_sep(weights[0], weights[1], color='green')
        plt.title("Iteration {}".format(i))
        rl_ce_loss_list.append(bce_training_loss(weights, data_batch, label_batch))
print("Trained loss:", bce_training_loss(weights, data_batch, label_batch))
print weights


# In[13]:


# Plot CE with SGD vs REINFORCE
plt.figure()
plt.plot(iterations, ce_loss_list, label="SGD XEntropy")
plt.plot(iterations, rl_ce_loss_list, label="REINFORCE")
plt.ylabel("XEntropy")
plt.xlabel("Iterations")
plt.title("Cross Entropy over Iterations")
plt.legend()


# **Exercise**: Try with baseline: modify `classification_acc` to have baseline: `reward = [pred == target] + 10`

# In[ ]: