#!/usr/bin/env python
# coding: utf-8

# # Activity 4 - Linear Regression
#
# In this example, we see how a system can perform regression analysis.
# Regression is a statistical approach for learning, such that we can learn
# what the output Y should be when we know x. We use the linear regression
# equations (see link below for details) to identify the slope/gradient of the
# line and the y-intercept, and then express a line that generalises our data
# in the form Y = mx + c.
#
# Formulas: http://www.statisticshowto.com/how-to-find-a-linear-regression-equation/

import numpy as np


def generate_data(m=0.7, c=3.0, noise=20.0, n=100, x_max=50.0):
    """Generate `n` noisy samples along the line y = m*x + c.

    Parameters
    ----------
    m : float
        True slope of the underlying line (default 0.7).
    c : float
        True y-intercept of the underlying line (default 3.0).
    noise : float
        Width of the uniform noise band added to each y value; the noise is
        zero-mean, drawn from [-noise/2, +noise/2).
    n : int
        Number of samples to generate.
    x_max : float
        x values are drawn uniformly from [0, x_max).

    Returns
    -------
    numpy.ndarray of shape (n, 2)
        Column 0 holds x values, column 1 holds the noisy y values.
    """
    samples = []
    for _ in range(n):
        x = np.random.random() * x_max
        # Zero-mean uniform noise in [-noise/2, +noise/2).
        # NOTE: the original used "+ noise/2", which made the noise strictly
        # positive (mean = `noise`), biasing the fitted intercept upward by
        # ~20 instead of recovering c ≈ 3.
        y = (m * x + c) + (np.random.random() * noise - noise / 2)
        samples.append([x, y])
    return np.array(samples)


def fit_line(data):
    """Ordinary least-squares fit of a line to (x, y) samples.

    Uses the closed-form simple-linear-regression equations.

    Parameters
    ----------
    data : numpy.ndarray of shape (N, 2)
        Column 0 is x, column 1 is y.

    Returns
    -------
    (m, c) : tuple of float
        Slope and y-intercept of the best-fit line y = m*x + c.
    """
    N = data.shape[0]
    X = data[:, 0]
    Y = data[:, 1]
    XY = X * Y
    X2 = X ** 2
    # Shared denominator of both closed-form estimators.
    denom = N * np.sum(X2) - np.sum(X) ** 2
    c = ((np.sum(Y) * np.sum(X2)) - (np.sum(X) * np.sum(XY))) / denom
    m = ((N * np.sum(XY)) - (np.sum(X) * np.sum(Y))) / denom
    return m, c


def main():
    # Plotting is confined to script execution so that generate_data/fit_line
    # can be imported without a matplotlib backend.
    import matplotlib.pyplot as plt

    data = generate_data()

    # Raw scatter of the generated samples.
    plt.scatter(data[:, 0], data[:, 1], color='k')
    plt.show()

    m, c = fit_line(data)
    print(c, m)

    # Draw the fitted line across the x range [0, 50].
    x_1, x_2 = 0, 50
    y_1 = (m * x_1) + c
    y_2 = (m * x_2) + c
    plt.scatter(data[:, 0], data[:, 1], color='k')
    plt.plot([x_1, x_2], [y_1, y_2])
    plt.show()

    # Predict y for some previously-unseen x values and overlay them.
    some_new_data = np.array([13, 26, 35, 43])
    output_predictions = (m * some_new_data) + c
    plt.scatter(data[:, 0], data[:, 1], color='k', alpha=0.2)
    plt.scatter(some_new_data, output_predictions, color='red')
    plt.show()


if __name__ == "__main__":
    main()