Programming Session Week 3

Full Solutions

In this session we continue working on regression and extend our toolbox with a set of classification methods.

Exercise 1

Exercise 1.a

The model below was generated using a degree 2 polynomial. Study the evolution of the MSE for degrees 1 to 5, generating your training and test sets as noisy samples from the true quadratic function. Use $K$-fold cross validation to recover the correct model complexity out of the possible maximum degrees.

In [12]:
import numpy as np
import matplotlib.pyplot as plt

num_bins = 5

points_per_bin = 10

dataset_size = num_bins*points_per_bin

bin_list_x = np.zeros((num_bins, points_per_bin))
bin_list_t = np.zeros((num_bins, points_per_bin))

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

for b in np.arange(num_bins): 

    x_true = np.linspace(0, 1, points_per_bin)
    t_true = 0.1 + 0.1*x_true + x_true**2
    x_sample = np.linspace(0, 1, points_per_bin)
    t_sample = t_true + np.random.normal(0, .1, len(x_sample))

    bin_list_x[b,:] = x_sample
    bin_list_t[b,:] = t_sample
    
    
training_data_x = np.zeros((num_bins, points_per_bin*(num_bins-1)))
training_data_t = np.zeros((num_bins, points_per_bin*(num_bins-1)))

for b in np.arange(num_bins):
    
    training_data_x_b = []
    training_data_t_b = []
    
    for b2 in np.arange(num_bins):
        
        if b2 != b:
            
            training_data_x_b = np.hstack((training_data_x_b, bin_list_x[b2, :]))
            training_data_t_b = np.hstack((training_data_t_b, bin_list_t[b2, :]))
    
    
    training_data_x[b, :] = training_data_x_b
    training_data_t[b, :] = training_data_t_b
In [34]:
   
maxDegree = 5

MSE = np.zeros((maxDegree,))
    
for degrees in np.arange(maxDegree)+1:
    
    poly = PolynomialFeatures(degrees)
     
    for b in np.arange(num_bins):
        
        
        feature_mat = poly.fit_transform(training_data_x[b,:].reshape(-1,1))
        reg = LinearRegression().fit(feature_mat, training_data_t[b,:])
        
        Xprediction = poly.fit_transform(bin_list_x[b,:].reshape(-1,1))
        
        MSE[degrees-1] += (1/dataset_size)*np.sum((reg.predict(Xprediction) - bin_list_t[b,:])**2)

plt.plot(np.arange(1, maxDegree+1), MSE)
plt.xlabel('degree')
plt.ylabel('cross-validated MSE')
plt.show()
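As a sanity check, the same degree selection can be written with scikit-learn's KFold helper. This is only a sketch (not part of the original solution): it reuses the noisy samples generated above, flattens them into a single dataset and shuffles before splitting into folds, so the exact MSE values may differ slightly from the manual fold construction.

In [ ]:
from sklearn.model_selection import KFold

# flatten the per-bin samples generated above into one dataset
x_all = bin_list_x.reshape(-1, 1)
t_all = bin_list_t.reshape(-1)

kf = KFold(n_splits=num_bins, shuffle=True, random_state=0)
cv_mse = np.zeros(maxDegree)

for degree in np.arange(maxDegree) + 1:

    poly = PolynomialFeatures(degree)

    for train_idx, val_idx in kf.split(x_all):

        Phi_train = poly.fit_transform(x_all[train_idx])
        Phi_val = poly.transform(x_all[val_idx])

        reg = LinearRegression().fit(Phi_train, t_all[train_idx])
        cv_mse[degree-1] += np.mean((reg.predict(Phi_val) - t_all[val_idx])**2)/num_bins

print('degree selected by cross validation:', np.argmin(cv_mse) + 1)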

Exercise 2

Exercise 2.a

Using the OLS loss, try to learn a classifier for the dataset given below.

In [51]:
import scipy.io
import matplotlib.pyplot as plt

from matplotlib.colors import ListedColormap
cm_bright = ListedColormap(['#FF0000', '#0000FF'])


data_class1 = scipy.io.loadmat('points_class1_Lab2_Ex1.mat')['points_class1_Lab2_Ex1']
data_class2 = scipy.io.loadmat('points_class2_Lab2_Ex1.mat')['points_class2_Lab2_Ex1']

plt.scatter(data_class1[:,0], data_class1[:,1], c='b')
plt.scatter(data_class2[:,0], data_class2[:,1], c='r')
plt.show()

target_class1 = np.ones((np.shape(data_class1)[0], ))
target_class2 = -np.ones((np.shape(data_class2)[0], ))

from sklearn.linear_model import LinearRegression

my_classification = LinearRegression()

target = np.vstack((target_class1.reshape(-1,1), target_class2.reshape(-1,1)))

data = np.vstack((data_class1, data_class2))

my_classification.fit(data, target)


# Generate a grid of points on which I want to compute the prediction

x1min = np.min(data[:,0])
x1max = np.max(data[:,0])
x2min = np.min(data[:,1])
x2max = np.max(data[:,1])

# generate set of equispaced points

x1 = np.linspace(x1min, x1max, 50)
x2 = np.linspace(x2min, x2max, 50)

# generate all 50*50 coordinates pairs (x1, x2) over the space
# xx1, xx2 are matrices containing the x1 (resp. x2) coordinates of 
# all the points on the 50 x 50 grid 

xx1, xx2 = np.meshgrid(x1, x2)


Xprediction = np.hstack((xx1.reshape(-1,1), xx2.reshape(-1,1)))


prediction_on_grid = my_classification.predict(Xprediction)
# returns a real value for each grid point

final_prediction_grid = 2*(prediction_on_grid>0)-1


plt.scatter(data_class1[:,0], data_class1[:,1], c='b')
plt.scatter(data_class2[:,0], data_class2[:,1], c='r')
plt.contourf(xx1, xx2, final_prediction_grid.reshape(np.shape(xx1)), levels=0, cmap = cm_bright, alpha=0.2)
plt.show()
In [46]:
print(final_prediction_grid)
[[-1]
 [-1]
 [-1]
 ...
 [-1]
 [-1]
 [-1]]
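The least squares fit above produces a linear score $f(\mathbf{x}) = \mathbf{w}^\top \mathbf{x} + w_0$ trained on the $\pm 1$ targets; the line final_prediction_grid = 2*(prediction_on_grid>0)-1 simply implements the decision rule $\hat{y}(\mathbf{x}) = \operatorname{sign}(f(\mathbf{x}))$, mapping positive scores to the $+1$ class and negative scores to the $-1$ class.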

Exercise 2.b

How could you extend your classifier to the dataset shown below?

In [53]:
import scipy.io
import matplotlib.pyplot as plt

from matplotlib.colors import ListedColormap
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

data_class1 = scipy.io.loadmat('points_class1_Lab2_Ex2.mat')['points_class1_Lab2_Ex2']
data_class2 = scipy.io.loadmat('points_class2_Lab2_Ex2.mat')['points_class2_Lab2_Ex2']

plt.scatter(data_class1[:,0], data_class1[:,1], c='b')
plt.scatter(data_class2[:,0], data_class2[:,1], c='r')
plt.show()

target_class1 = np.ones((np.shape(data_class1)[0], 1))
target_class2 = -np.ones((np.shape(data_class2)[0], 1))

target = np.vstack((target_class1, target_class2))

data = np.vstack((data_class1, data_class2))

from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(5)
data_polynomial = poly.fit_transform(data)

my_classification = LinearRegression()

my_classification.fit(data_polynomial, target)


x1min = np.min(data[:,0])
x1max = np.max(data[:,0])
x2min = np.min(data[:,1])
x2max = np.max(data[:,1])

x1 = np.linspace(x1min, x1max, 50)
x2 = np.linspace(x2min, x2max, 50)

# generate the grid 

xx1, xx2 = np.meshgrid(x1, x2)

Xprediction = np.hstack((xx1.reshape(-1,1), xx2.reshape(-1,1)))

Xprediction_polynomial = poly.fit_transform(Xprediction)

predictions_grid = my_classification.predict(Xprediction_polynomial)

plt.scatter(data_class1[:,0], data_class1[:,1], c='b')
plt.scatter(data_class2[:,0], data_class2[:,1], c='r')
plt.contourf(xx1, xx2, predictions_grid.reshape(np.shape(xx1)), levels=0, cmap = cm_bright, alpha=0.2)
plt.show()
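Equivalently, the degree-5 feature expansion and the least squares fit can be bundled into a single estimator. Below is a minimal sketch (not part of the original solution) using scikit-learn's Pipeline, reusing the data, target and Xprediction arrays built in the cell above.

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# bundle the degree-5 feature map and the least squares fit into one estimator
poly_ols = make_pipeline(PolynomialFeatures(5), LinearRegression())
poly_ols.fit(data, target)

# evaluate on the same grid as above and threshold the real-valued output
grid_scores = poly_ols.predict(Xprediction)
grid_labels = 2*(grid_scores > 0) - 1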

Exercise 2.c

We now want to use the OLS to learn a multi-class classifier for the dataset below. Start by coding the one-vs-one and one-vs-rest classifiers. Then use a single discriminant function with one-hot encoding of the classes.

In [83]:
import scipy.io
import matplotlib.pyplot as plt
import numpy as np

data_class1 = scipy.io.loadmat('points_class1_Lab2_Ex3.mat')['points_class1_Lab2_Ex3']
data_class2 = scipy.io.loadmat('points_class2_Lab2_Ex3.mat')['points_class2_Lab2_Ex3']
data_class3 = scipy.io.loadmat('points_class3_Lab2_Ex3.mat')['points_class3_Lab2_Ex3']
data_class4 = scipy.io.loadmat('points_class4_Lab2_Ex3.mat')['points_class4_Lab2_Ex3']
data_class5 = scipy.io.loadmat('points_class5_Lab2_Ex3.mat')['points_class5_Lab2_Ex3']



plt.scatter(data_class1[:,0], data_class1[:,1])
plt.scatter(data_class2[:,0], data_class2[:,1])
plt.scatter(data_class3[:,0], data_class3[:,1])
plt.scatter(data_class4[:,0], data_class4[:,1])
plt.scatter(data_class5[:,0], data_class5[:,1])

plt.show()


data = np.vstack((data_class1, data_class2))
data = np.vstack((data, data_class3))
data = np.vstack((data, data_class4))
data = np.vstack((data, data_class5))
num_points = np.shape(data)[0]

# generating polynomial features

from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(4)
data_poly = poly.fit_transform(data)




target = np.zeros((num_points, 1))

num0 = np.shape(data_class1)[0]
num1 = np.shape(data_class2)[0]
num2 = np.shape(data_class3)[0]
num3 = np.shape(data_class4)[0]
num4 = np.shape(data_class5)[0]


target[num0:num0+num1] = 1
target[num0+num1:num0+num1+num2] = 2
target[num0+num1+num2:num0+num1+num2+num3] = 3
target[num0+num1+num2+num3:] = 4

K = 5


from sklearn.linear_model import LinearRegression


x1min = np.min(data[:,0])
x1max = np.max(data[:,0])
x2min = np.min(data[:,1])
x2max = np.max(data[:,1])

# generate boundaries of the grid

x1 = np.linspace(x1min, x1max, 500)
x2 = np.linspace(x2min, x2max, 500)

xx1, xx2 = np.meshgrid(x1, x2)

data_prediction = np.vstack((xx1.flatten(), xx2.flatten())).T

data_prediction_poly = poly.fit_transform(data_prediction)


Prediction = np.zeros((len(xx1.flatten()), K-1))

for k in np.arange(K-1):
    
        indices_class_k = np.where(target == k)
        target_one_vs_rest = np.zeros((len(target), 1))
        target_one_vs_rest[indices_class_k] = 1
        
        reg = LinearRegression()
        reg.fit(data_poly, target_one_vs_rest)
        
        Prediction[:,k] = np.squeeze(reg.predict(data_prediction_poly)>1/2)

        
final_prediction = np.zeros((len(xx1.flatten()), ))

for i in np.arange(len(final_prediction)):
    
    
    if np.argwhere(Prediction[i,:]==1).size ==1:
        
        final_prediction[i] = np.argwhere(Prediction[i,:]==1)
        
    elif np.argwhere(Prediction[i,:]==1).size >1:
        
        final_prediction[i] = 5
        
    else:
    
        final_prediction[i] = 4
    
    
In [84]:
from matplotlib.colors import ListedColormap


plt.contourf(xx1, xx2, final_prediction.reshape(np.shape(xx1)), cmap=plt.cm.hsv, alpha=.2)
plt.scatter(data[:,0], data[:,1], c = target,cmap=plt.cm.hsv)

plt.show()
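The exercise also asks for a one-vs-one scheme, which the solution above does not show. Below is a minimal sketch (an addition, not part of the original solution) that reuses the per-class arrays, the degree-4 poly transform and the data_prediction_poly grid from the previous cells: one OLS classifier is trained per pair of classes, each pairwise classifier votes on every grid point, and the most voted class wins.

In [ ]:
from itertools import combinations

class_blocks = [data_class1, data_class2, data_class3, data_class4, data_class5]

votes = np.zeros((data_prediction_poly.shape[0], K))

for (i, j) in combinations(range(K), 2):

    # binary OLS problem restricted to classes i and j (targets 1 and 0)
    pair_data = np.vstack((class_blocks[i], class_blocks[j]))
    pair_target = np.hstack((np.ones(class_blocks[i].shape[0]),
                             np.zeros(class_blocks[j].shape[0])))

    reg_ij = LinearRegression().fit(poly.transform(pair_data), pair_target)

    # each pairwise classifier votes for class i or class j on every grid point
    picks_i = reg_ij.predict(data_prediction_poly) > 1/2
    votes[picks_i, i] += 1
    votes[~picks_i, j] += 1

ovo_prediction = np.argmax(votes, axis=1)

plt.contourf(xx1, xx2, ovo_prediction.reshape(np.shape(xx1)), cmap=plt.cm.hsv, alpha=.2)
plt.scatter(data[:,0], data[:,1], c=target, cmap=plt.cm.hsv)
plt.show()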

    
In [94]:
from sklearn.preprocessing import PolynomialFeatures

mypoly = PolynomialFeatures(6)

Polynomial_features = mypoly.fit_transform(data)

targetMatrix = np.zeros((len(target), K))


for i in np.arange(len(target)):
    
    targetMatrix[i,int(target[i])] = 1
       
        
# for regularization (ridge) we consider the inverse inv(X^T X + lambda*Id)

XTX = np.matmul(Polynomial_features.T, Polynomial_features)
identity = np.identity(np.shape(XTX)[0])
lbda = .1
mat_tmp = XTX + lbda*identity
        
invXTX = np.linalg.inv(mat_tmp) 
RHS = np.matmul(Polynomial_features.T, targetMatrix)

Beta = np.matmul(invXTX,RHS)


x1min = np.min(data[:,0])
x1max = np.max(data[:,0])
x2min = np.min(data[:,1])
x2max = np.max(data[:,1])

# generate boundaries of the grid

x1 = np.linspace(x1min, x1max, 500)
x2 = np.linspace(x2min, x2max, 500)

xx1, xx2 = np.meshgrid(x1, x2)

data_prediction = np.vstack((xx1.flatten(), xx2.flatten())).T


Polynomial_features_prediction = mypoly.fit_transform(data_prediction)

prediction_grid = np.matmul(Polynomial_features_prediction, Beta)

prediction = np.argmax(prediction_grid, axis=1)
In [95]:
from matplotlib.colors import ListedColormap
plt.contourf(xx1, xx2, prediction.reshape(np.shape(xx1)), cmap=plt.cm.hsv, alpha=.2)
plt.scatter(data[:,0], data[:,1], c = target,cmap=plt.cm.hsv)

plt.show()
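In matrix form, the cell above solves the ridge-regularized normal equations for the one-hot target matrix $T$: $\hat{B} = (\Phi^\top \Phi + \lambda I)^{-1}\Phi^\top T$, where $\Phi$ is the degree-6 polynomial feature matrix and $\lambda = 0.1$. A new point $\mathbf{x}$ is then assigned to the class $k$ maximizing $\phi(\mathbf{x})^\top \hat{B}_{:,k}$, which is exactly what the argmax over the columns of prediction_grid computes.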

Exercise 3.

Exercise 3.a

Use the OLS classifier from scikit-learn to classify the flowers from the iris dataset into the 3 species. Don't forget to split your dataset into a training and a test part so that you can evaluate the model properly once it has been trained (you can rely on scikit-learn's train_test_split function).

In [146]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


from sklearn import datasets
iris = datasets.load_iris()
X = iris.data 
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# plotting the first two features 
plt.scatter(X[:,0], X[:,1],c=y)
plt.title('First two features')
plt.show()
In [172]:
from sklearn import linear_model
reg = linear_model.LinearRegression()


from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore', sparse = False)
targets_trainOH = enc.fit_transform(y_train.reshape(-1,1))


# we regularize by adding a lambda*Id

lbda = .1

Xtilde_train = np.hstack((np.ones((np.shape(X_train)[0],1)), X_train))

Identity = np.identity(np.shape(np.matmul(Xtilde_train.T, Xtilde_train))[0])


tmp = np.matmul(Xtilde_train.T, Xtilde_train) + lbda*Identity

RHS = np.matmul(Xtilde_train.T,targets_trainOH)
beta = np.matmul(np.linalg.inv(tmp), RHS)

# computing the error on the test set as the misclassification rate

data_prediction = np.hstack((np.ones((np.shape(X_test)[0], 1)), X_test)) 
prediction_grid = np.matmul(data_prediction, beta)

# returning the plane that gives the highest value
prediction_grid_F = np.argmax(prediction_grid, axis=1)

error_rate = len(np.where(prediction_grid_F != y_test)[0])/len(y_test)
print('misclassification rate is %s' %error_rate)
misclassification rate is 0.06666666666666667
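As a cross-check, the same one-hot least squares fit can be obtained directly from scikit-learn's LinearRegression, which fits one output column per class and adds the intercept itself (no ridge term here, unlike the cell above). A minimal sketch on the same split:

In [ ]:
# one-hot least squares with scikit-learn directly (no regularization)
ols = LinearRegression()
ols.fit(X_train, targets_trainOH)

# one score per class and per test point; predict the class with the largest score
scores_test = ols.predict(X_test)
pred_test = np.argmax(scores_test, axis=1)

print('misclassification rate is %s' % np.mean(pred_test != y_test))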

Exercise 3.b

Do the same with the Titanic dataset from https://www.kaggle.com/c/titanic and try to learn a model that can efficiently predict which passengers will survive the wreck.

In [ ]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

myModel = LogisticRegression()

import pandas as pd
data = pd.read_csv('train.csv')  
target = data['Survived']

feature_matrix = data 

# replacing the 'Sex' column with a binary column
featureMatrix = data.iloc[:,2:]
ind = featureMatrix['Sex']=='female'
tmp = np.zeros((len(featureMatrix['Sex']),))
tmp[ind] = 1
featureMatrix['Sex'] = tmp

# removing the rows containing NANs values and relabelling

featureMatrix = featureMatrix.dropna(axis=0)
featureMatrix = featureMatrix.reset_index(drop=True)

# we also remove the name column which is beyond what we want to achieve now 

featureMatrix = featureMatrix.drop('Name', axis=1)
featureMatrix = featureMatrix.reset_index(drop=True)


tmp = featureMatrix['Embarked']
# We turn the 'Embarked' column into a binary indicator (1 if the passenger embarked at 'C', 0 otherwise)
tmp2 = np.zeros((len(tmp),1))
tmp2[np.where(tmp == 'C')] = 1
tmp2 = np.squeeze(tmp2)
featureMatrix['Embarked'] = tmp2
ls = featureMatrix['Cabin']

Cabin_number = np.zeros((len(ls), 1))

# we first create a list of all decks and return the total number of cabins on each deck

decklist = []
for i in range(len(ls)):
    decklist.append(ls[i][0])
y = list(set(decklist))
y = sorted(y)

max_numCabins = np.zeros((len(y),))

# simple auxiliary function returning the index of a character in a list
def char_index(char_list, char):
    i=0
    for i in range(len(char_list)):
        if char_list[i] == char:
            return i
        
        
        
for i in range(len(ls)):
    ind = char_index(y,ls[i][0])
    
    # if more than one cabin
    tmp = ls[i][1:]
    if tmp.count(' ')>0:
        cabin_list = []
        for j in range(tmp.count(' ')):
            cabin_list.append(tmp[:tmp.find(' ')])
            tmp = tmp[tmp.find(' '):]
    
    # then check the max over cabins... 
    
    if max_numCabins[ind] <= int(tmp[1:]):
        max_numCabins[ind] = int(tmp[1:])
    
    # to be done 
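The cabin parsing above is left unfinished. As a complement, here is a minimal end-to-end sketch (an addition, not the original solution) that drops the Cabin feature entirely, one-hot encodes Sex and Embarked with pandas, and fits the logistic regression on the remaining columns; it assumes the same train.csv file and the standard Kaggle Titanic column names.

In [ ]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

raw = pd.read_csv('train.csv')

# keep a small set of informative columns and encode the categorical ones
cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = pd.get_dummies(raw[cols], columns=['Sex', 'Embarked'], drop_first=True)
y = raw['Survived']

# drop rows with missing values (mostly 'Age') and keep the targets aligned
mask = X.notna().all(axis=1)
X, y = X[mask], y[mask]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

print('test accuracy: %s' % clf.score(X_test, y_test))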

Exercise 4.

Exercise 4.a

In this 4th exercise, we will study the robustness of the OLS approach for classification. Consider the dataset below.

  • Start by learning a simple binary OLS classifier on that dataset (you can use the LinearRegression model from scikit-learn).
  • Then try to force misclassification by adding a blue point on the far left of the dataset.
  • Once your updated dataset can be used to highlight misclassification by the OLS, replace the OLS classifier with the logistic regression classifier from scikit-learn (on the same dataset). What do you notice?
In [266]:
import scipy.io
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np

cm_bright = ListedColormap(['#FF0000', '#0000FF'])

data_class1 = scipy.io.loadmat('points_class1_Lab2_Ex4.mat')['points_class1_Lab2_Ex4']
data_class2 = scipy.io.loadmat('points_class2_Lab2_Ex4.mat')['points_class2_Lab2_Ex4']

data = np.vstack((data_class1, data_class2))
target1 = np.ones((np.shape(data_class1)[0], 1))
target2 = np.zeros((np.shape(data_class2)[0], 1))

target = np.vstack((target1, target2))

plt.scatter(data[:,0], data[:,1], c = target, cmap=cm_bright)
plt.show()

In the second part, we add an outlier in order to force misclassification.

In [267]:
# Adding an outlier

data = np.vstack((data_class1, data_class2))

target = np.zeros((np.shape(data)[0], 1))
target[:np.shape(data_class1)[0]] = 1

data_outlier = np.vstack((np.asarray([-4, 0.6]), data))
target_outlier = np.vstack((1, target))

plt.scatter(data_outlier[:,0], data_outlier[:,1], c=target_outlier, cmap=cm_bright)
plt.show()

Given the outlier, and due to the lack of an activation function (the classifier returns a real value proportional to the signed distance of the point to the plane, instead of a bounded value or a hard class label), we can easily break the least squares classifier, as shown below.

In [269]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(data_outlier, target_outlier)

x1min = np.min(data_outlier[:,0])
x1max = np.max(data_outlier[:,0])
x2min = np.min(data_outlier[:,1])
x2max = np.max(data_outlier[:,1])

x1 = np.linspace(0, x1max, 500)
x2 = np.linspace(x2min, x2max, 500)

xx1, xx2 = np.meshgrid(x1, x2)

data_prediction = np.vstack((xx1.flatten(), xx2.flatten())).T

prediction_LS = reg.predict(data_prediction)>1/2

plt.contourf(xx1, xx2, prediction_LS.reshape(np.shape(xx1)), alpha=.2, cmap=cm_bright)
plt.scatter(data_outlier[1:,0], data_outlier[1:,1], c=target_outlier[1:],cmap=cm_bright)
plt.show()

The logistic regression classifier, thanks to the sigmoid activation function (which squashes extreme values into the bounded interval [0, 1]), keeps the decision boundary at the right location.
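(Recall the logistic model $p(y = 1 \mid \mathbf{x}) = \sigma(\mathbf{w}^\top \mathbf{x} + w_0)$ with $\sigma(a) = 1/(1+e^{-a})$: a point lying far from the boundary only pushes the predicted probability towards 0 or 1, instead of contributing an arbitrarily large residual to the loss as in the least squares case.)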

In [270]:
from sklearn.linear_model import LogisticRegression

reg = LogisticRegression()
reg.fit(data_outlier, target_outlier)

x1min = np.min(data_outlier[:,0])
x1max = np.max(data_outlier[:,0])
x2min = np.min(data_outlier[:,1])
x2max = np.max(data_outlier[:,1])

x1 = np.linspace(0, x1max, 500)
x2 = np.linspace(x2min, x2max, 500)

xx1, xx2 = np.meshgrid(x1, x2)

data_prediction = np.vstack((xx1.flatten(), xx2.flatten())).T

prediction_LS = reg.predict(data_prediction)>1/2

plt.contourf(xx1, xx2, prediction_LS.reshape(np.shape(xx1)), alpha=.2, cmap=cm_bright)
plt.scatter(data_outlier[1:,0], data_outlier[1:,1], c=target_outlier[1:], cmap=cm_bright)
#plt.ylim([])

plt.show()
/Users/augustincosse/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py:72: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  return f(**kwargs)