#!/usr/bin/env python
# coding: utf-8

# # Regression in Python
#
# ***

# Numerical arrays.
import numpy as np

# Plotting.
import matplotlib.pyplot as plt

# Use the ggplot style sheet for every plot in this notebook.
plt.style.use('ggplot')

# Default figure size.
plt.rcParams['figure.figsize'] = (12, 8)

# Two data points to motivate fitting a line.
x = np.array([4.0, 16.0])
y = np.array([6.0, 12.0])
x, y

# Show the two points on their own, with some breathing room
# around them on both axes.
plt.plot(x, y, 'ro')
plt.xlim([-5.0, 25.0])
plt.ylim([-5.0, 25.0]);

# A dashed green line across the whole plot range...
t = np.linspace(-5.0, 25.0, 100)
plt.plot(t, 0.5 * t + 4.0, 'g--')

# ...and a solid blue segment of the same line between the two points.
t = np.linspace(4.0, 16.0, 10)
plt.plot(t, 0.5 * t + 4.0, 'b-')

# The points again, and the same axis limits as before.
plt.plot(x, y, 'ro')
plt.xlim([-5.0, 25.0])
plt.ylim([-5.0, 25.0]);

# Many different curves pass through the same two points:
t = np.linspace(-3.0, 20.0, 100)

# a straight line,
plt.plot(t, 0.5 * t + 4.0, 'g-')

# a parabola,
plt.plot(t, 10.0 * (t**2) - 199.5 * t + 644.0, 'b-')

# and a cubic.
plt.plot(t, (t**3) - 20.0625 * t**2 + 65.75 * t, 'y-')

# The two original points.
plt.plot(x, y, 'ro');
# #### Lines
#
# ***

# A fine grid of x values to work with.
x = np.linspace(0.0, 10.0, 1000)
x

# ***
#
# $$ y = 5 x + 2 $$

# Build y from x - numpy's arithmetic is element-wise.
y = 5.0 * x + 2.0

# Look at y.
y

# Plot x versus y.
plt.plot(x, y, 'k.')

# Fit a degree-one polynomial (a straight line) with numpy.
np.polyfit(x, y, 1)

# ***
#
# $$ y = 3 x - 1 + \epsilon $$

# The same idea, but with added Gaussian noise.
y = 3.0 * x - 1.0 + np.random.normal(0.0, 1.0, len(x))

# Look at y.
y

# Fit again - the estimates are close to, but not exactly, 3 and -1.
np.polyfit(x, y, 1)

# Unpack the fitted slope and intercept into named variables.
m, c = np.polyfit(x, y, 1)
m, c

# Points in black, fitted line in red.
plt.plot(x, y, 'k.')
plt.plot(x, m * x + c, 'r-')

# Note that we can easily calculate the best m and c ourselves.

# Centre both arrays on their means.
dx = x - x.mean()
dy = y - y.mean()

# Slope: dot product of the centred arrays over the squared norm of dx.
m = np.dot(dx, dy) / np.dot(dx, dx)

# Intercept: force the line through the point of means.
c = y.mean() - m * x.mean()

# Same values as np.polyfit gave above.
m, c

# ***
#
# $$ y = 2 x^2 + 5x + 1 + \epsilon $$

# A quadratic in x, again with noise.
y = 2.0 * x * x + 5.0 * x + 1.0 + np.random.normal(0.0, 1.0, len(x))

# Look at y.
y

# Blindly fitting a straight line still produces numbers...
m, c = np.polyfit(x, y, 1)
m, c

# ...but the plot shows the line is a poor model for a parabola.
plt.plot(x, y, 'k.')
plt.plot(x, m * x + c, 'r-')

# A degree-two fit captures the curve.
a, b, c = np.polyfit(x, y, 2)

# Plot the curve and the points.
plt.plot(x, y, 'k.')
plt.plot(x, a * x * x + b * x + c, 'r-')

# Note how the points below the line are bunched in a specific $x$ range.
# ***
#
# ## Multiple linear regression
#
# Let's try multiple linear regression using sklearn.
# [https://scikit-learn.org/stable/](https://scikit-learn.org/stable/)

# Import linear_model from sklearn.
import sklearn.linear_model as lm

# Create a linear regression model instance.
m = lm.LinearRegression()

# Let's use pandas to read a csv file and organise our data.
import pandas as pd

# Read the iris csv from online.
# NOTE(review): this needs network access at run time - consider caching
# a local copy of the file.
df = pd.read_csv('https://datahub.io/machine-learning/iris/r/iris.csv')

df

# $$ petalwidth = t (sepallength) + u (sepalwidth) + v (petallength) + c $$

# Let's pretend we want to do linear regression on these variables to predict petal width.
x = df[['sepallength', 'sepalwidth', 'petallength']]

# Here's petal width.
y = df['petalwidth']

# Ask our model to fit the data.
m.fit(x, y)

# Here's our intercept.
m.intercept_

# Here are our coefficients, in order.
m.coef_

# See how good our fit is (the R-squared score).
m.score(x, y)

# Calculating the score by hand.
# Coefficients (t, u, v) and intercept c from the fitted model.
t, u, v = m.coef_
c = m.intercept_
y_avg = y.mean()
# Residual sum of squares: squared error of the model's predictions...
rss = ((y - (t * x['sepallength'] + u * x['sepalwidth'] + v * x['petallength'] + c))**2).sum()
# ...and total sum of squares: squared error of always predicting the mean.
tss = ((y - y_avg)**2).sum()
# R-squared: the fraction of the variance the model explains.
1 - (rss / tss)

# ***
#
# ## Using statsmodels

# Using statsmodels.
import statsmodels.api as sm

# Tell statsmodels to include an intercept.
xwithc = sm.add_constant(x)
# Create an ordinary least squares model.
msm = sm.OLS(y, xwithc)
# Fit the data.
rsm = msm.fit()
# Print a summary.
print(rsm.summary())

# ## End