The model below was generated using a degree 2 polynomial. Study the evolution of the MSE for various degrees from 1 to 5 and by generating your training and test sets as noisy samples from the true quadratic function. Use $K$-fold cross validation to retrieve the correct model complexity out the possible maximum degrees.
num_bins = 5
points_per_bin = 10
dataset_size = num_bins*points_per_bin
bin_list_x = np.zeros((num_bins, 10))
bin_list_t = np.zeros((num_bins, 10))
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
for b in np.arange(num_bins):
x_true = np.linspace(0,1,10)
t_true = 0.1 + 0.1*x_true + x_true**2
x_sample = np.linspace(0,1,10)
t_sample = t_true + np.random.normal(0,.1,len(x_sample))
bin_list_x[b,:] = x_sample
bin_list_t[b,:] = t_sample
training_data_x = np.zeros((num_bins, 10*(num_bins-1)))
training_data_t = np.zeros((num_bins, 10*(num_bins-1)))
for b in np.arange(num_bins):
training_data_x_b = []
training_data_t_b = []
for b2 in np.arange(num_bins):
if b2 != b:
training_data_x_b = np.hstack((training_data_x_b, bin_list_x[b2, :]))
training_data_t_b = np.hstack((training_data_t_b, bin_list_t[b2, :]))
training_data_x[b, :] = training_data_x_b
training_data_t[b, :] = training_data_t_b
maxDegree = 5
MSE = np.zeros((maxDegree,))
for degrees in np.arange(maxDegree)+1:
poly = PolynomialFeatures(degrees)
for b in np.arange(num_bins):
feature_mat = poly.fit_transform(training_data_x[b,:].reshape(-1,1))
reg = LinearRegression().fit(feature_mat, training_data_t[b,:])
Xprediction = poly.fit_transform(bin_list_x[b,:].reshape(-1,1))
MSE[degrees-1] += (1/dataset_size)*np.sum((reg.predict(Xprediction) - bin_list_t[b,:])**2)
plt.plot(MSE)
plt.show()
import scipy.io
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
data_class1 = scipy.io.loadmat('points_class1_Lab2_Ex1.mat')['points_class1_Lab2_Ex1']
data_class2 = scipy.io.loadmat('points_class2_Lab2_Ex1.mat')['points_class2_Lab2_Ex1']
plt.scatter(data_class1[:,0], data_class1[:,1], c='b')
plt.scatter(data_class2[:,0], data_class2[:,1], c='r')
plt.show()
target_class1 = np.ones((np.shape(data_class1)[0], ))
target_class2 = -np.ones((np.shape(data_class2)[0], ))
from sklearn.linear_model import LinearRegression
my_classification = LinearRegression()
target = np.vstack((target_class1.reshape(-1,1), target_class2.reshape(-1,1)))
data = np.vstack((data_class1, data_class2))
my_classification.fit(data, target)
# Generate a grid of points on which I want to compute the prediction
x1min = np.min(data[:,0])
x1max = np.max(data[:,0])
x2min = np.min(data[:,1])
x2max = np.max(data[:,1])
# generate set of equispaced points
x1 = np.linspace(x1min, x1max, 50)
x2 = np.linspace(x2min, x2max, 50)
# generate all 50*50 coordinates pairs (x1, x2) over the space
# xx1, xx2 are matrices containing the x1 (resp. x2) coordinates of
# all the points on the 50 x 50 grid
xx1, xx2 = np.meshgrid(x1, x2)
Xprediction = np.hstack((xx1.reshape(-1,1), xx2.reshape(-1,1)))
prediction_on_grid = my_classification.predict(Xprediction)
# returns a real number
final_prediction_grid = 2*(prediction_on_grid>0)-1
plt.scatter(data_class1[:,0], data_class1[:,1], c='b')
plt.scatter(data_class2[:,0], data_class2[:,1], c='r')
plt.contourf(xx1, xx2, final_prediction_grid.reshape(np.shape(xx1)), levels=0, cmap = cm_bright, alpha=0.2)
plt.show()
print(final_prediction_grid)
[[-1] [-1] [-1] ... [-1] [-1] [-1]]
How could you extend your classifier to the dataset shown below.
import scipy.io
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
data_class1 = scipy.io.loadmat('points_class1_Lab2_Ex2.mat')['points_class1_Lab2_Ex2']
data_class2 = scipy.io.loadmat('points_class2_Lab2_Ex2.mat')['points_class2_Lab2_Ex2']
plt.scatter(data_class1[:,0], data_class1[:,1], c='b')
plt.scatter(data_class2[:,0], data_class2[:,1], c='r')
plt.show()
target_class1 = np.ones((np.shape(data_class1)[0], 1))
target_class2 = -np.ones((np.shape(data_class2)[0], 1))
target = np.vstack((target_class1, target_class2))
data = np.vstack((data_class1, data_class2))
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(5)
data_polynomial = poly.fit_transform(data)
my_classification = LinearRegression()
my_classification.fit(data_polynomial, target)
x1min = np.min(data[:,0])
x1max = np.max(data[:,0])
x2min = np.min(data[:,1])
x2max = np.max(data[:,1])
x1 = np.linspace(x1min, x1max, 50)
x2 = np.linspace(x2min, x2max, 50)
# generate the grid
xx1, xx2 = np.meshgrid(x1, x2)
Xprediction = np.hstack((xx1.reshape(-1,1), xx2.reshape(-1,1)))
Xprediction_polynomial = poly.fit_transform(Xprediction)
predictions_grid = my_classification.predict(Xprediction_polynomial)
plt.scatter(data_class1[:,0], data_class1[:,1], c='b')
plt.scatter(data_class2[:,0], data_class2[:,1], c='r')
plt.contourf(xx1, xx2, predictions_grid.reshape(np.shape(xx1)), levels=0, cmap = cm_bright, alpha=0.2)
plt.show()
We now want to use the OLS to learn a multi-class classifier for the dataset below. Start by coding the one-vs-one and one-vs-rest classifiers. Then use the a single discriminant function with one hot encoding of the classes.
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
data_class1 = scipy.io.loadmat('points_class1_Lab2_Ex3.mat')['points_class1_Lab2_Ex3']
data_class2 = scipy.io.loadmat('points_class2_Lab2_Ex3.mat')['points_class2_Lab2_Ex3']
data_class3 = scipy.io.loadmat('points_class3_Lab2_Ex3.mat')['points_class3_Lab2_Ex3']
data_class4 = scipy.io.loadmat('points_class4_Lab2_Ex3.mat')['points_class4_Lab2_Ex3']
data_class5 = scipy.io.loadmat('points_class5_Lab2_Ex3.mat')['points_class5_Lab2_Ex3']
plt.scatter(data_class1[:,0], data_class1[:,1])
plt.scatter(data_class2[:,0], data_class2[:,1])
plt.scatter(data_class3[:,0], data_class3[:,1])
plt.scatter(data_class4[:,0], data_class4[:,1])
plt.scatter(data_class5[:,0], data_class5[:,1])
plt.show()
data = np.vstack((data_class1, data_class2))
data = np.vstack((data, data_class3))
data = np.vstack((data, data_class4))
data = np.vstack((data, data_class5))
num_points = np.shape(data)[0]
# generating polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(4)
data_poly = poly.fit_transform(data)
target = np.zeros((num_points, 1))
num0 = np.shape(data_class1)[0]
num1 = np.shape(data_class2)[0]
num2 = np.shape(data_class3)[0]
num3 = np.shape(data_class4)[0]
num4 = np.shape(data_class5)[0]
target[num0:num0+num1+1] = 1
target[num0+num1:num0+num1+num2+1] = 2
target[num0+num1+num2:num0+num1+num2+num3+1] = 3
target[num0+num1+num2+num3:] = 4
K = 5
from sklearn.linear_model import LinearRegression
x1min = np.min(data[:,0])
x1max = np.max(data[:,0])
x2min = np.min(data[:,1])
x2max = np.max(data[:,1])
# generate boundaries of the grid
x1 = np.linspace(x1min, x1max, 500)
x2 = np.linspace(x2min, x2max, 500)
xx1, xx2 = np.meshgrid(x1, x2)
data_prediction = np.vstack((xx1.flatten(), xx2.flatten())).T
data_prediction_poly = poly.fit_transform(data_prediction)
Prediction = np.zeros((len(xx1.flatten()), K-1))
for k in np.arange(K-1):
indices_class_k = np.where(target == k)
target_one_vs_rest = np.zeros((len(target), 1))
target_one_vs_rest[indices_class_k] = 1
reg = LinearRegression()
reg.fit(data_poly, target_one_vs_rest)
Prediction[:,k] = np.squeeze(reg.predict(data_prediction_poly)>1/2)
final_prediction = np.zeros((len(xx1.flatten()), ))
for i in np.arange(len(final_prediction)):
if np.argwhere(Prediction[i,:]==1).size ==1:
final_prediction[i] = np.argwhere(Prediction[i,:]==1)
elif np.argwhere(Prediction[i,:]==1).size >1:
final_prediction[i] = 5
else:
final_prediction[i] = 4
from matplotlib.colors import ListedColormap
plt.contourf(xx1, xx2, final_prediction.reshape(np.shape(xx1)), cmap=plt.cm.hsv, alpha=.2)
plt.scatter(data[:,0], data[:,1], c = target,cmap=plt.cm.hsv)
plt.show()
from sklearn.preprocessing import PolynomialFeatures
mypoly = PolynomialFeatures(6)
Polynomial_features = mypoly.fit_transform(data)
targetMatrix = np.zeros((len(target), K))
for i in np.arange(len(target)):
targetMatrix[i,int(target[i])] = 1
# for regularization we consider the inverse inv(X^TX + Id)
XTX = np.matmul(Polynomial_features.T, Polynomial_features)
identity = np.identity(np.shape(XTX)[0])
lbda = .1
mat_tmp = XTX + lbda*identity
invXTX = np.linalg.inv(mat_tmp)
RHS = np.matmul(Polynomial_features.T, targetMatrix)
Beta = np.matmul(invXTX,RHS)
x1min = np.min(data[:,0])
x1max = np.max(data[:,0])
x2min = np.min(data[:,1])
x2max = np.max(data[:,1])
# generate boundaries of the grid
x1 = np.linspace(x1min, x1max, 500)
x2 = np.linspace(x2min, x2max, 500)
xx1, xx2 = np.meshgrid(x1, x2)
data_prediction = np.vstack((xx1.flatten(), xx2.flatten())).T
Polynomial_features_prediction = mypoly.fit_transform(data_prediction)
prediction_grid = np.matmul(Polynomial_features_prediction, Beta)
prediction = np.argmax(prediction_grid, axis=1)
from matplotlib.colors import ListedColormap
plt.contourf(xx1, xx2, prediction.reshape(np.shape(xx1)), cmap=plt.cm.hsv, alpha=.2)
plt.scatter(data[:,0], data[:,1], c = target,cmap=plt.cm.hsv)
plt.show()
Use the OLS classifier from scikit-learn to classify the flowers from the iris dataset into the 3 species. Don't forget to split your dataset into a training and a test part so that you evaluate it properly once it has been trained (you can rely on scikit learn's train_test_split function)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# plotting the first two features
plt.scatter(X[:,0], X[:,1],c=y)
plt.title('First two features')
plt.show()
from sklearn import linear_model
reg = linear_model.LinearRegression()
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore', sparse = False)
targets_trainOH = enc.fit_transform(y_train.reshape(-1,1))
# we regularize by adding a lambda*Id
lbda = .1
Xtilde_train = np.hstack((np.ones((np.shape(X_train)[0],1)), X_train))
Identity = np.identity(np.shape(np.matmul(Xtilde_train.T, Xtilde_train))[0])
tmp = np.matmul(Xtilde_train.T, Xtilde_train) + lbda*Identity
RHS = np.matmul(Xtilde_train.T,targets_trainOH)
beta = np.matmul(np.linalg.inv(tmp), RHS)
# computing the error on the test set as the misclassification rate
data_prediction = np.hstack((np.ones((np.shape(X_test)[0], 1)), X_test))
prediction_grid = np.matmul(data_prediction, beta)
# returning the plane that gives the highest value
prediction_grid_F = np.argmax(prediction_grid, axis=1)
error_rate = len(np.where(prediction_grid_F!=y_test))/len(y_test)
print('misclassfication rate is %s' %error_rate)
misclassfication rate is 0.06666666666666667
Do the same with the [https://www.kaggle.com/c/titanic](titanic dataset) and try to learn a model that can efficiently predict which passengers are going to survive the wreck.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
myModel = LogisticRegression()
import pandas as pd
data = pd.read_csv('train.csv')
target = data['Survived']
feature_matrix = data
# replacing the 'Sex' column with a binary column
featureMatrix = data.iloc[:,2:]
ind = featureMatrix['Sex']=='female'
tmp = np.zeros((len(featureMatrix['Sex']),))
tmp[ind] = 1
featureMatrix['Sex'] = tmp
# removing the rows containing NANs values and relabelling
featureMatrix = featureMatrix.dropna(axis=0)
featureMatrix = featureMatrix.reset_index(drop=True)
# we also remove the name column which is beyond what we want to achieve now
featureMatrix = featureMatrix.drop('Name', axis=1)
featureMatrix = featureMatrix.reset_index(drop=True)
tmp = featureMatrix['Embarked']
# We turn the 'Embarked column' into a numerical column
tmp2 = np.zeros((len(tmp),1))
tmp2[np.where(tmp == 'C')] = 1
tmp2 = np.squeeze(tmp2)
featureMatrix['Embarked'] = tmp2
ls = featureMatrix['Cabin']
Cabin_number = np.zeros((len(ls), 1))
# we first create a list of all decks and return the total number of cabins on each deck
decklist = []
for i in range(len(ls)):
decklist.append(ls[i][0])
y = list(set(decklist))
y = sorted(y)
max_numCabins = np.zeros((len(y),))
# simple auxilliary function to deal with char
def char_index(char_list, char):
i=0
for i in range(len(char_list)):
if char_list[i] == char:
return i
for i in range(len(ls)):
ind = char_index(y,ls[i][0])
# if more than one cabin
tmp = ls[i][1:]
if tmp.count(' ')>0:
cabin_list = []
for j in np.range(tmp.count(' ')):
cabin_list.append(tmp[:tmp.find(' ')])
tmp = tmp[tmp.find(' '):]
# then check the max over cabins...
if max_numCabins[i]<= int(tmp[1:]):
max_numCabins[i] = int(tmp[1:])
# to be done
In this 4th exercise, we will study the robustness of the OLS approach for classification. Consider the dataset below.
import scipy.io
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
data_class1 = scipy.io.loadmat('points_class1_Lab2_Ex4.mat')['points_class1_Lab2_Ex4']
data_class2 = scipy.io.loadmat('points_class2_Lab2_Ex4.mat')['points_class2_Lab2_Ex4']
data = np.vstack((data_class1, data_class2))
target1 = np.ones((np.shape(data_class1)[0], 1))
target2 = np.zeros((np.shape(data_class2)[0], 1))
target = np.vstack((target1, target2))
plt.scatter(data[:,0], data[:,1], c = target, cmap=cm_bright)
plt.show()
As a second part, we add an outlier, in order to force misclassification
# Adding an outlier
data = np.vstack((data_class1, data_class2))
target = np.zeros((np.shape(data)[0], 1))
target[:np.shape(data_class1)[0]] = 1
data_outlier = np.vstack((np.asarray([-4, 0.6]), data))
target_outlier = np.vstack((1, target))
plt.scatter(data_outlier[:,0], data_outlier[:,1], c=target_outlier, cmap=cm_bright)
plt.show()
Given the outlier, due to the lack of an activation function (and the fact that the classifier returns a real value, the distance of the point to the plane, instead of ), we can easily break a least squares classifier as shown below.
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(data_outlier, target_outlier)
x1min = np.min(data_outlier[:,0])
x1max = np.max(data_outlier[:,0])
x2min = np.min(data_outlier[:,1])
x2max = np.max(data_outlier[:,1])
x1 = np.linspace(0, x1max, 500)
x2 = np.linspace(x2min, x2max, 500)
xx1, xx2 = np.meshgrid(x1, x2)
data_prediction = np.vstack((xx1.flatten(), xx2.flatten())).T
prediction_LS = reg.predict(data_prediction)>1/2
plt.contourf(xx1, xx2, prediction_LS.reshape(np.shape(xx1)), alpha=.2, cmap=cm_bright)
plt.scatter(data_outlier[1:,0], data_outlier[1:,1], c=target_outlier[1:],cmap=cm_bright)
plt.show()
The logistic regression classifier, thanks to the activation function (which chops off the extreme values to a bounded output in [0,1]) maintains the classifier at the right location
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()
reg.fit(data_outlier, target_outlier)
x1min = np.min(data_outlier[:,0])
x1max = np.max(data_outlier[:,0])
x2min = np.min(data_outlier[:,1])
x2max = np.max(data_outlier[:,1])
x1 = np.linspace(0, x1max, 500)
x2 = np.linspace(x2min, x2max, 500)
xx1, xx2 = np.meshgrid(x1, x2)
data_prediction = np.vstack((xx1.flatten(), xx2.flatten())).T
prediction_LS = reg.predict(data_prediction)>1/2
plt.contourf(xx1, xx2, prediction_LS.reshape(np.shape(xx1)), alpha=.2, cmap=cm_bright)
plt.scatter(data_outlier[1:,0], data_outlier[1:,1], c=target_outlier[1:], cmap=cm_bright)
#plt.ylim([])
plt.show()
/Users/augustincosse/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py:72: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). return f(**kwargs)