#!/usr/bin/env python
# coding: utf-8

# ## Lab 1: Linear regression, OLS and gradient descent 

# In[1]:


import numpy as np
import matplotlib.pyplot as plt


# 1) Generating synthetic data
# =================================
# We start by generating equispaced points on the interval [0,5]. In this case we 
# assume a simple parametric model between the targets and the features 
# of the form t = h_beta(x) and we suppose 
# that our targets are generated as t_i = h_beta(x_i) = beta0 + beta1*x_i for 
# some beta0, beta1 (which we fix to arbitrary unknown constants)

x = np.linspace(0, 5, num=10)

xtilde = np.hstack((np.ones((len(x),1)).reshape(-1,1), x.reshape(-1,1)))


print(xtilde)

beta0 =1
beta1 = 1


beta = np.array([beta0, beta1])

targets = np.dot(beta, xtilde.T)

print(targets) # ti's 

# In most situation our assumption on the model h_beta will not be able to 
# match perfectly the data set. To put ourselves in such a framework, we perturb 
# our samples with some random noise epsilon_i so that they do not perfectly lie on 
# a line anymore. 

t_noisy = targets + np.random.normal(0, 0.2, len(x))

plt.scatter(x, t_noisy, c='r')
plt.plot(x, targets)
plt.show()


# In[2]:


# 2) Learning the model
# =================================

# The supervised learning problem then consist in find an appropriate mapping h_beta 
# that predicts as well as possible the targets t_i from the features x_i's
# Since we assumed the simple parametric model h_beta(x) = beta0 + beta1*x, we are left with 
# finding the beta0 and beta1 such that the t_i are best predicted from the x_i

# there are several criteria we could use. Here we choose to use a least squares loss (OLS for ordinary least squares)
# of the form L(beta) = sum_i (t_i - (beta0 + beta1*x_i))^2 which penalize the squares of the 
# differences between the predictions from our model and the actual targets


# We have our criterion, now we need a way to find the associated optimal 
# beta0 and beta1, that is to say the values beta0 and beta1 that make this loss minimum
# for this again there are multiple options. The first one is to follow the 
# opposite direction to the gradient until we reach a minimum. Indeed, the gradient 
# of a function always points in the direction of steepest ascent. By taking - grad, we 
# are thus guaranteed to decrease the value of the function. The gradient is the vector concatenating 
# the derivatives with respect to each variable (beta0 and beta1), we thus have grad = [dL/dbeta0, dL/dbeta1]

# Once we have computed the derivatives of L with respect to beta0 and beta1, 
# we can then apply the updates

# beta0 <-- beta0 - eta * dL/dbeta0
# beta1 <-- beta1 - eta * dL/dbeta1

# for a sufficiently small eta also known as the learning rate. If eta is small enough, since -gradient 
# always points in the direction of steepest descent will end up reaching a point of zero derivative


# In[3]:


# setting the learning rate
eta = 0.1

# setting max num of gradient updates
maxIter = 100


# initial, starting point for (beta0, beta1)
beta_init = np.random.normal(0,1,2)
beta = beta_init

reweighting = np.true_divide(1, 2*len(x))


# gradient updates

for iter in np.arange(0,maxIter):

    
    grad_beta0 = 2*reweighting*np.sum((targets.reshape(-1,1) - np.dot(beta, xtilde.T).reshape(-1,1)))*(-1)
    grad_beta1_tmp = (targets.reshape(-1,1) - np.dot(beta, xtilde.T).reshape(-1,1))
    grad_beta1 = -2*reweighting*np.sum(np.multiply(grad_beta1_tmp,x.reshape(-1,1)))
    
    beta[0] = beta[0] - eta*grad_beta0
    beta[1] = beta[1] - eta*grad_beta1


# In[4]:


xprediction = np.linspace(0,5,50)
xpredictiontilde = np.hstack((np.ones((len(xprediction),1)).reshape(-1,1), xprediction.reshape(-1,1)))

targets_prediction = np.dot(beta, xpredictiontilde.T)


# plot of the recovered model on top of the true (unknown one)
plt.scatter(x, t_noisy, c='r')
plt.plot(x, targets)
plt.plot(xprediction, targets_prediction, c='g')
plt.show()


# In[33]:


grad_beta0