In this example, we see how a system can perform regression analysis. Regression is a statistical approach to learning the relationship between an input x and an output Y, so that we can predict what Y should be for a given x. We use the linear regression equations (see the link below for more details) to identify the slope / gradient of the line and the y-intercept. We can then express a line that generalises our data in the form Y = mx + c.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
def generate_data(m=0.7, c=3, noise=20, n=100):
    """Generate `n` noisy samples of the line y = m*x + c.

    x values are drawn uniformly from [0, 50). The noise term is
    uniform on [-noise/2, +noise/2), i.e. zero-mean, so a least-squares
    fit of the returned data should recover approximately `m` and `c`.
    (The original added `np.random.random() * noise + noise/2`, which is
    strictly positive with mean `noise` — that bias shifted the fitted
    intercept from c=3 up to ~24.)

    Returns:
        np.ndarray of shape (n, 2): column 0 is x, column 1 is y.
    """
    x = np.random.random(n) * 50
    # zero-mean uniform noise on [-noise/2, +noise/2)
    y = m * x + c + (np.random.random(n) - 0.5) * noise
    return np.column_stack((x, y))
# Generate the synthetic dataset and eyeball it with a scatter plot.
data = generate_data()
xs, ys = data[:, 0], data[:, 1]
plt.scatter(xs, ys, color='k')
plt.show()
# Least-squares fit of a line Y = m*X + c via the closed-form
# normal equations. Each aggregate sum is computed once and reused
# (the original re-evaluated np.sum(...) — including the identical
# denominator — several times).
#### http://www.statisticshowto.com/how-to-find-a-linear-regression-equation/
N = data.shape[0]
X = data[:, 0]
Y = data[:, 1]
XY = X * Y
X2 = X ** 2
Y2 = Y ** 2  # not needed by the fit itself; kept for inspection

sum_x = np.sum(X)
sum_y = np.sum(Y)
sum_xy = np.sum(XY)
sum_x2 = np.sum(X2)

# Shared denominator: N * Σx² - (Σx)²
denom = N * sum_x2 - sum_x ** 2
c = (sum_y * sum_x2 - sum_x * sum_xy) / denom  # y-intercept
m = (N * sum_xy - sum_x * sum_y) / denom       # slope / gradient
print(c, m)
# Output: 23.971305135783272 0.69691578146255
# Overlay the fitted line Y = m*x + c on the data by evaluating it
# at the two ends of the observed x-range, [0, 50].
x_1, x_2 = 0, 50
Y1, Y2 = m * x_1 + c, m * x_2 + c
plt.scatter(data[:, 0], data[:, 1], color='k')
plt.plot([x_1, x_2], [Y1, Y2])
plt.show()
# Use the fitted model to predict Y for previously unseen x values,
# then plot the predictions (red) over the faded training data.
some_new_data = np.array([13, 26, 35, 43])
output_predictions = m * some_new_data + c
output_predictions  # notebook-style echo of the predicted values
plt.scatter(data[:, 0], data[:, 1], color='k', alpha=0.2)
plt.scatter(some_new_data, output_predictions, color='red')
plt.show()