#!/usr/bin/env python
# coding: utf-8

# # Linear Regression Implementation

# The objective of this notebook is to become familiar with the problem of `Linear Regression`.

# ## 1.3.0 Background about the dataset
#
# TLDR: You have 25 independent variables (`x1, x2, x3, ..., x25`) of type `float` for each data point. You can use a linear combination of these 25 independent variables to predict the y (dependent variable) of each data point.

# In[1]:


import csv
import random
import numpy as np
import matplotlib.pyplot as plt


# In[2]:


train_data = list()
with open('train_q3.csv', 'r') as infile:
    input_rows = csv.DictReader(infile)
    for row in input_rows:
        # collect the 25 feature columns x1..x25 and the target y
        data_point = ([float(row[f'x{i}']) for i in range(1, 26)], float(row['y']))
        train_data.append(data_point)

# each point in x_train has 25 values - 1 for each feature
x_train = [x[0] for x in train_data]
# each point in y_train has 1 value - the target 'y' of the data point
y_train = [x[1] for x in train_data]

test_data = list()
with open('test_q3.csv', 'r') as infile:
    input_rows = csv.DictReader(infile)
    for row in input_rows:
        data_point = ([float(row[f'x{i}']) for i in range(1, 26)], float(row['y']))
        test_data.append(data_point)

x_test = [x[0] for x in test_data]
y_test = [x[1] for x in test_data]


# ### 1.3.1 Implement a Linear Regression model that minimizes the MSE **without using any libraries**. You may use NumPy to vectorize your code, but *do not use numpy.polyfit* or anything similar.
#
# 1.3.1.1 Explain how you plan to implement Linear Regression in 5-10 lines.
#
# 1.3.1.2 Implement Linear Regression using `x_train` and `y_train` as the train dataset.
#
# 1.3.1.3 Choose the best learning rate and print the learning rate for which you achieved the best MSE.
#
# 1.3.1.4 Make a [Parity Plot](https://en.wikipedia.org/wiki/Parity_plot) of your model's y-predictions on the test set with the actual values.
#
# `ANSWER 1.3.1.1`
#
# There will be a slope vector of dimension (25, 1) initialised with all zeros, and a constant bias value also initialised to zero.
# A gradient descent function will take both as input, predict the y values, then adjust the slope vector and bias according to the gradients and return the adjusted values.
# This repeats for the given number of epochs, and at the end we have a final slope vector and bias.
# We then predict our final values with the bias and slope vector obtained.
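# To make the plan above concrete, the updates being described can be written as follows (notation mine: $X \in \mathbb{R}^{n \times 25}$ is the data matrix, $m$ the slope vector, $b$ the bias, $\eta$ the learning rate):
#
# $$\hat{y} = Xm + b, \qquad \nabla_m = -\frac{2}{n} X^\top (y - \hat{y}), \qquad \nabla_b = -\frac{2}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)$$
#
# $$m \leftarrow m - \eta\,\nabla_m, \qquad b \leftarrow b - \eta\,\nabla_b$$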
# In[3]:


# 1.3.1.2
# implement Linear Regression
# x      : n x m data matrix
# m_temp : m x 1 slope vector
# b_temp : scalar bias
# y      : n x 1 target vector

def gradientDescent(m_temp, b_temp, x, y, lr):
    """Perform one gradient descent step on the MSE loss and return the updated slope vector and bias."""
    n, m = np.shape(x)
    y = np.reshape(y, (n, 1))
    y_pred = np.dot(x, m_temp) + b_temp
    residual = y - y_pred                                   # n x 1
    # gradient of the MSE w.r.t. the slope vector: -(2/n) * X^T (y - y_pred)
    mG = -(2 / n) * np.dot(np.transpose(x), residual)
    # gradient of the MSE w.r.t. the bias: -(2/n) * sum(y - y_pred)
    bG = -(2 / n) * np.sum(residual)
    m_new = m_temp - lr * mG
    b_new = b_temp - lr * bG
    return m_new, b_new


def mse(y_true, y_pred):
    """Mean squared error: average of the squared differences (each difference is squared before summing)."""
    sm = 0.0
    for k in range(len(y_true)):
        diff = y_true[k] - y_pred[k]
        sm += diff * diff
    return sm / len(y_true)


# In[11]:


epochs = 10000
learning_rate = 0.01

rows, cols = np.shape(x_train)
m = np.zeros((cols, 1))
b = 0

for i in range(epochs):
    m, b = gradientDescent(m, b, x_train, y_train, learning_rate)

# flatten the (n, 1) prediction column into a 1-D array
y_pred = (np.dot(x_test, m) + b).flatten()
print(mse(y_test, y_pred))


# In[12]:


# 1.3.1.4
# Get the predictions of x_test into `y_pred`
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(y_test, y_pred)
lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),
    np.max([ax.get_xlim(), ax.get_ylim()]),
]
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)
ax.set_title('Parity Plot of Custom Linear Regression')
ax.set_xlabel('Ground truth y-values')
ax.set_ylabel('Predicted y-values')
plt.show()


# In[13]:


# 1.3.1.3
# try different learning rates and choose the best one
epochs = 100
rows, cols = np.shape(x_train)

minimse = float('inf')
finalPred = []
lrli = []
mseli = []
finallr = 0

for lr in range(1, 100):
    l = lr / 100
    # re-initialise the weights for every learning rate so the runs are comparable
    m = np.zeros((cols, 1))
    b = 0
    for i in range(epochs):
        m, b = gradientDescent(m, b, x_train, y_train, l)
    y_pred = (np.dot(x_test, m) + b).flatten()
    mseval = mse(y_test, y_pred)
    mseli.append(mseval)
    lrli.append(l)
    if minimse > mseval:
        minimse = mseval
        finalPred = y_pred
        finallr = l

print("Best learning rate: " + str(finallr))
print("min mse : " + str(minimse))


# In[14]:


y_pred = finalPred


# In[15]:


# 1.3.1.4
# Get the predictions of x_test into `y_pred`
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(y_test, y_pred)
lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),
    np.max([ax.get_xlim(), ax.get_ylim()]),
]
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)
ax.set_title('Parity Plot of Custom Linear Regression (best learning rate)')
ax.set_xlabel('Ground truth y-values')
ax.set_ylabel('Predicted y-values')
plt.show()


# ### 1.3.2 Implement Ridge Regression
#
# 1.3.2.1 Explain Ridge regression briefly in 1-2 lines.
#
# 1.3.2.2 Implement Ridge regression and make a table of different RMSE scores you achieved with different values of alpha. What does the parameter `alpha` do?
#
# 1.3.2.3 How does it affect the results here? Explain in 5-10 lines in total. (You can use scikit-learn from this cell onwards)
#
# 1.3.2.4 Make a Parity Plot of Ridge Regression model's y-predictions on the test set with the actual values.
#
# `1.3.2.1 Answer`
#
# => Ridge regression is a method of estimating the coefficients of multiple-regression models in scenarios where the independent variables are highly correlated. It does so by adding an L2 penalty on the coefficient magnitudes to the least-squares objective.
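# For intuition about what `alpha` penalizes before switching to scikit-learn, here is a minimal NumPy sketch of the closed-form ridge solution. This is my own illustration, not the graded implementation below, and the helper name `ridge_closed_form` is hypothetical.

# In[ ]:


def ridge_closed_form(X, y, alpha):
    """Solve min ||y - Xw - b||^2 + alpha * ||w||^2 via the normal equations.
    A bias column of ones is appended; its weight is not penalized."""
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1)
    n, p = X.shape
    Xb = np.hstack([X, np.ones((n, 1))])      # append bias column
    penalty = alpha * np.eye(p + 1)
    penalty[-1, -1] = 0.0                     # do not penalize the bias term
    w = np.linalg.solve(Xb.T @ Xb + penalty, Xb.T @ y)
    return w[:-1], w[-1]                      # (coefficients, intercept)


# Example usage (assumes x_train / y_train from the cells above):
# coef, intercept = ridge_closed_form(x_train, y_train, alpha=0.5)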
# In[70]:


# 1.3.2.2
# you should not have imported sklearn before this point
import sklearn
from sklearn.linear_model import Ridge
import pandas as pd

# implement Ridge regression and make a table where you explore the effect of different values of `alpha`
li = [0.01, 0.05, 0.1, 0.5, 1, 1.5, 2, 2.5, 3, 4, 5]
d = {}
bestPred = []
bestalpha = 0
minimse = float('inf')

for i in li:
    ridgeReg = Ridge(alpha=i)
    ridgeReg.fit(x_train, y_train)
    y_pred_ridge = ridgeReg.predict(x_test)
    mse_ridge = np.mean((y_pred_ridge - y_test) ** 2)
    if minimse > mse_ridge:
        minimse = mse_ridge
        bestPred = y_pred_ridge
        bestalpha = i
    d[i] = np.sqrt(mse_ridge)   # report RMSE, as asked in 1.3.2.2

alphaTable = pd.DataFrame(d.items(), columns=['alpha', 'rmse'])
print(alphaTable)
print("Best alpha : ", bestalpha)


# `1.3.2.3 Answer`
#
# `alpha` controls the strength of the L2 penalty on the coefficients: larger values shrink the coefficients further towards zero, which lowers variance at the cost of some bias, while `alpha = 0` reduces to plain least squares. Its effect on this dataset can be read off the RMSE table above.

# In[64]:


# 1.3.2.4
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(y_test, bestPred)
lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),
    np.max([ax.get_xlim(), ax.get_ylim()]),
]
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)
ax.set_title('Parity Plot of Ridge Regression')
ax.set_xlabel('Ground truth y-values')
ax.set_ylabel('Predicted y-values')
plt.show()


# ### 1.3.3 Implement Lasso Regression
#
# 1.3.3.1 Explain Lasso regression briefly in 1-2 lines.
#
# 1.3.3.2 Implement Lasso regression and make a table of different RMSE scores you achieved with different values of alpha.
#
# 1.3.3.3 What does the parameter `alpha` do? How does it affect the results here? Explain in 5-10 lines in total.
#
# 1.3.3.4 Make a Parity Plot of Lasso Regression model's y-predictions on the test set with the actual values.
#
# `1.3.3.1 Answer`
#
# Lasso is short for Least Absolute Shrinkage and Selection Operator; it is used both for regularization and for model selection. A model that uses the L1 regularization technique is called lasso regression.

# In[73]:


# 1.3.3.2
# implement Lasso regression and make a table where you explore the effect of different values of `alpha`
import sklearn
from sklearn.linear_model import Lasso
import pandas as pd

li = [0.01, 0.05, 0.1, 0.3, 0.5, 0.8, 1, 1.5, 2, 2.5, 3, 4, 5]
d = {}
bestPred = []
bestalpha = 0
minimse = float('inf')

for i in li:
    lassoReg = Lasso(alpha=i)
    lassoReg.fit(x_train, y_train)
    y_pred_lasso = lassoReg.predict(x_test)
    mse_lasso = np.mean((y_pred_lasso - y_test) ** 2)
    if minimse > mse_lasso:
        minimse = mse_lasso
        bestPred = y_pred_lasso
        bestalpha = i
    d[i] = np.sqrt(mse_lasso)   # report RMSE, as asked in 1.3.3.2

alphaTable = pd.DataFrame(d.items(), columns=['alpha', 'rmse'])
print(alphaTable)
print("Best alpha : ", bestalpha)


# `1.3.3.3 Answer`
#
# Here, α (alpha) works similarly to ridge: it provides a trade-off between the RSS and the magnitude of the coefficients, and it can take various values. Briefly:
#
# * α = 0: same coefficients as simple linear regression
# * α = ∞: all coefficients are zero
# * 0 < α < ∞: coefficients lie between 0 and those of simple linear regression
#
# Unlike ridge, the L1 penalty can set coefficients exactly to zero, so larger α values effectively drop features from the model.

# In[74]:


# 1.3.3.4
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(y_test, bestPred)
lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),
    np.max([ax.get_xlim(), ax.get_ylim()]),
]
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)
ax.set_title('Parity Plot of Lasso Regression')
ax.set_xlabel('Ground truth y-values')
ax.set_ylabel('Predicted y-values')
plt.show()
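# As a quick, optional check of the selection behaviour described in 1.3.3.3, one can count how many Lasso coefficients are driven exactly to zero as `alpha` grows. This is a small sketch of mine, assuming `x_train` / `y_train` from the cells above; the alpha values shown are arbitrary.

# In[ ]:


from sklearn.linear_model import Lasso

for a in [0.01, 0.1, 1, 5]:
    model = Lasso(alpha=a).fit(x_train, y_train)
    n_zero = int(np.sum(model.coef_ == 0))
    print(f"alpha={a}: {n_zero} of {len(model.coef_)} coefficients are exactly zero")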