# imports for the tutorial
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import Ridge, Lasso
%matplotlib ipympl
We roughly divide ML algorithms to 2.5 groups:
Note: there is also Reinforcement Learning, but we will not address it in this course.
It is important to evaluate the classifier generalization performance in order to:
Train-Test Separation - The naive approach is seprating the data into train set and test set, that is, taking a portion of the data for training the model (usually about 80% of the dataset) and save another portion, that the model has not seen in order to test the model's performance. This is called the test set (usually about 20% of the dataset).
Note: Scikit-learn has a function we can use called train_test_split
that makes it easy for us to split our dataset into training and testing data.
# let's load the cancer dataset, shuffle it and speratre into train and test set
dataset = pd.read_csv('./datasets/cancer_dataset.csv')
# print the number of rows in the data set
number_of_rows = len(dataset)
print("total samples: {}".format(number_of_rows))
total_positive_samples = np.sum(dataset['diagnosis'].values == 'M')
print("total positive sampels (M): {}, total negative samples (B): {}".format(
total_positive_samples, number_of_rows - total_positive_samples))
num_train = int(0.8 * number_of_rows)
dataset.sample(10)
total samples: 569 total positive sampels (M): 212, total negative samples (B): 357
id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
61 | 858981 | B | 8.598 | 20.98 | 54.66 | 221.8 | 0.12430 | 0.08963 | 0.030000 | 0.009259 | ... | 27.04 | 62.06 | 273.9 | 0.16390 | 0.16980 | 0.09001 | 0.02778 | 0.2972 | 0.07712 | NaN |
334 | 897374 | B | 12.300 | 19.02 | 77.88 | 464.4 | 0.08313 | 0.04202 | 0.007756 | 0.008535 | ... | 28.46 | 84.53 | 544.3 | 0.12220 | 0.09052 | 0.03619 | 0.03983 | 0.2554 | 0.07207 | NaN |
494 | 914102 | B | 13.160 | 20.54 | 84.06 | 538.7 | 0.07335 | 0.05275 | 0.018000 | 0.012560 | ... | 28.46 | 95.29 | 648.3 | 0.11180 | 0.16460 | 0.07698 | 0.04195 | 0.2687 | 0.07429 | NaN |
55 | 85759902 | B | 11.520 | 18.75 | 73.34 | 409.0 | 0.09524 | 0.05473 | 0.030360 | 0.022780 | ... | 22.47 | 81.81 | 506.2 | 0.12490 | 0.08720 | 0.09076 | 0.06316 | 0.3306 | 0.07036 | NaN |
223 | 8812877 | M | 15.750 | 20.25 | 102.60 | 761.3 | 0.10250 | 0.12040 | 0.114700 | 0.064620 | ... | 30.29 | 125.90 | 1088.0 | 0.15520 | 0.44800 | 0.39760 | 0.14790 | 0.3993 | 0.10640 | NaN |
477 | 911673 | B | 13.900 | 16.62 | 88.97 | 599.4 | 0.06828 | 0.05319 | 0.022240 | 0.013390 | ... | 21.80 | 101.20 | 718.9 | 0.09384 | 0.20060 | 0.13840 | 0.06222 | 0.2679 | 0.07698 | NaN |
517 | 916838 | M | 19.890 | 20.26 | 130.50 | 1214.0 | 0.10370 | 0.13100 | 0.141100 | 0.094310 | ... | 25.23 | 160.50 | 1646.0 | 0.14170 | 0.33090 | 0.41850 | 0.16130 | 0.2549 | 0.09136 | NaN |
423 | 906878 | B | 13.660 | 19.13 | 89.46 | 575.3 | 0.09057 | 0.11470 | 0.096570 | 0.048120 | ... | 25.50 | 101.40 | 708.8 | 0.11470 | 0.31670 | 0.36600 | 0.14070 | 0.2744 | 0.08839 | NaN |
345 | 898677 | B | 10.260 | 14.71 | 66.20 | 321.6 | 0.09882 | 0.09159 | 0.035810 | 0.020370 | ... | 19.48 | 70.89 | 357.1 | 0.13600 | 0.16360 | 0.07162 | 0.04074 | 0.2434 | 0.08488 | NaN |
195 | 875878 | B | 12.910 | 16.33 | 82.53 | 516.4 | 0.07941 | 0.05366 | 0.038730 | 0.023770 | ... | 22.00 | 90.81 | 600.6 | 0.10970 | 0.15060 | 0.17640 | 0.08235 | 0.3024 | 0.06949 | NaN |
10 rows × 33 columns
# split to train-test
x = dataset[['radius_mean', 'texture_mean']].values
y = dataset['diagnosis'].values == 'M' # 1 for Malignat, 0 for Benign
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
# stratify=y: same proportions of class labels in train and test sets, we provide the labels y
print("train size: {}, test size: {}".format(len(X_train), len(X_test)))
train size: 455, test size: 114
That is why we seperate into 3 sets:
cross_val_score
- takes in a classifier, training data, $k$ (number of folds) and the scoring technique ("accuracy" for example). Read the Doc HereAn important theoretical result of statistics and classic ML is the fact the the model generalization error can be expressed as a sum of 3 different errors: bias, variance and irreducible error.
Given a true (but unknown) function $F(x)$ with noise $F(x) = f + \epsilon$, we seek to estimate it based on $n$ samples from a set $\mathcal{D}$. We denote the regression function as $g(x; \mathcal{D})$.
The error of the regression model is given by: $$ MSE = \mathbb{E}_{\mathcal{D}} \big[(F(x) - g(x; \mathcal{D}) )^2 \big] $$
The total error can be decomposed into 3 terms: $$ E[(F - g )^2] = E[(f + \epsilon -g)^2] = E[(f + \epsilon -g +E[g] - E[g])^2]$$ $$ = ... = (f - E[g])^2 +E[\epsilon^2] +E\big[(E[g] - g)^2\big]$$ $$ = Bias[g]^2 +\sigma^2 +Var[g]$$
where $x, \theta \in \mathcal{R}^m$ and $h_{\theta}:\mathcal{R}^m \to \mathcal{R}.$
# fit scaler on training data (not on test data!)
scaler = MinMaxScaler().fit(X_train)
# transform training data
X_train_norm = scaler.transform(X_train)
# transform testing data
X_test_norm = scaler.transform(X_test)
# fit scaler on training data (not on test data!)
scaler = StandardScaler().fit(X_train)
# transform training data
X_train_norm = scaler.transform(X_train)
# transform testing data
X_test_norm = scaler.transform(X_test)
A linear model makes a prediction by computing a weighted sum of the input features, plus a constant called the bias term (also called intercept sometimes).
We denote:
The Linear Regression model prediction: $$ \hat{y} = \theta_0 + \theta_1 x_1 + ... + \theta_n x_n $$ In vector form: $$ \hat{y}=h_{\theta} (x) =\theta^T \cdot x $$
How do we train a linear regression model?
To find the value of $\theta$ that minimizes the cost function, there is a closed-form solution - a mathemtical equation that gives the result directly. It is also called the Normal Equation. We will now derive it.
The general idea is to tweak parameters iteratively to minimize a cost function.
It measures the local gradient of the error function with regards to the parameter vector ($\theta$ or $w$), and it goes down in the direction of the descending gradient. Once the gradient is zero - the minimum is reached (=convergence).
Learning Rate hyperparameter - it is the size of step to be taken in each iteration.
# helper functions
def plot_3d_lls(x, y, z, lls_sol, title=""):
# plot
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, label='Y')
ax.scatter(x, y, lls_sol, label='Xw')
ax.legend()
ax.set_xlabel('Radius Mean')
ax.set_ylabel('Area Mean')
ax.set_zlabel('Perimeter Mean')
ax.set_title(title)
def batch_generator(x, y, batch_size, shuffle=True):
"""
This function generates batches for a given dataset x.
"""
N, L = x.shape
num_batches = N // batch_size
batch_x = []
batch_y = []
if shuffle:
# shuffle
rand_gen = np.random.RandomState(0)
shuffled_indices = rand_gen.permutation(np.arange(N))
x = x[shuffled_indices, :]
y = y[shuffled_indices, :]
for i in range(N):
batch_x.append(x[i, :])
batch_y.append(y[i, :])
if len(batch_x) == batch_size:
yield np.array(batch_x).reshape(batch_size, L), np.array(batch_y).reshape(batch_size, 1)
batch_x = []
batch_y = []
if batch_x:
yield np.array(batch_x).reshape(-1, L), np.array(batch_y).reshape(-1, 1)
# split to train-test
x = dataset[['radius_mean', 'area_mean']].values
y = dataset[['perimeter_mean']].values
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
N = len(X_train)
print("train size: {}, test size: {}".format(len(X_train), len(X_test)))
train size: 455, test size: 114
# multivaraite mini-batch gradient descent
# scaling (done maually, but you can also use sklearn)
X_train = (X_train - X_train.mean(axis=0, keepdims=True)) / X_train.std(axis=0, keepdims=True)
y_train = (y_train - y_train.mean(axis=0, keepdims=True)) / y_train.std(axis=0, keepdims=True)
batch_size = 16
num_batches = N // batch_size
print("total batches:", num_batches)
total batches: 28
num_epochs = 10
alpha_k = 0.001
batch_gen = batch_generator(X_train, y_train, batch_size, shuffle=True)
# initialize w
w = np.zeros((2, 1))
for i in range(num_epochs):
for batch_i, batch in enumerate(batch_gen):
batch_x, batch_y = batch
if batch_i % 50 == 0:
print("iter:", i, "batch:", batch_i, " w = ")
print(w)
gradient = 2 * batch_x.T @ batch_x @ w - 2 * batch_x.T @ batch_y
w = w - alpha_k * gradient
batch_gen = batch_generator(X_train, y_train, batch_size, shuffle=True)
lls_sol = X_train @ w
iter: 0 batch: 0 w = [[0.] [0.]] iter: 1 batch: 0 w = [[0.42729943] [0.41258379]] iter: 2 batch: 0 w = [[0.49676955] [0.47094993]] iter: 3 batch: 0 w = [[0.51174938] [0.47548105]] iter: 4 batch: 0 w = [[0.51838691] [0.47187408]] iter: 5 batch: 0 w = [[0.52370328] [0.46708033]] iter: 6 batch: 0 w = [[0.52876727] [0.4621567 ]] iter: 7 batch: 0 w = [[0.53374208] [0.45726337]] iter: 8 batch: 0 w = [[0.53865315] [0.4524241 ]] iter: 9 batch: 0 w = [[0.54350493] [0.44764194]]
# plot
%matplotlib ipympl
plot_3d_lls(X_train[:,0], X_train[:, 1], y_train, lls_sol,
"Breast Cancer - Radius Mean vs. Area Mean vs. Perimeter Mean - LLS Mini-Batch GD")
print("w:")
print(w)
w: [[0.54829871] [0.44291675]]
# ridge regression
ridge_reg = Ridge(alpha=1, solver='cholesky', fit_intercept=False)
ridge_reg.fit(X_train, y_train)
w = ridge_reg.coef_
lls_sol = X_train @ w.T
print("w:", w)
w: [[0.88003189 0.1174982 ]]
# plot
plot_3d_lls(X_train[:,0], X_train[:, 1], y_train, lls_sol,
"Breast Cancer - Radius Mean vs. Area Mean vs. Perimeter Mean - LLS Mini-Batch GD")
Animation by Dustin Kenefake
# lasso regression
lasso_reg = Lasso(alpha=0.1, fit_intercept=False)
lasso_reg.fit(X_train, y_train)
w = lasso_reg.coef_
lls_sol = X_train @ w.T
print("w:", w)
w: [8.97848395e-01 1.71905813e-04]
# plot
plot_3d_lls(X_train[:,0], X_train[:, 1], y_train, lls_sol,
"Breast Cancer - Radius Mean vs. Area Mean vs. Perimeter Mean - LLS Mini-Batch GD")