The k-nn algorithm is a simple supervised machine learning algorithm that can be used both for classification and regression. It's an instance-based algorithm. So instead of estimating a model, it stores all training examples in memory and makes predictions using a similarity measure.
Given an input example, the k-nn algorithm retrieves the k most similar instances from memory. Similarity is defined in terms of distance, that is, the training examples with the smallest (euclidean) distance to the input example are considered to be most similar.
The target value of the input example is computed as follows:
Classification:
a) unweighted: output the most common classification among the k-nearest neighbors
b) weighted: sum up the weights of the k-nearest neighbors for each classification value, output classification with highest weight
Regression:
a) unweighted: output the average of the values of the k-nearest neighbors
b) weighted: for all classification values, sum up classification value$*$weight and divide the result trough the sum of all weights
The weighted k-nn version is a refined version of the algorithm in which the contribution of each neighbor is weighted according to its distance to the query point. Below, we implement the basic unweighted version of the k-nn algorithm for the digits dataset from sklearn.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
np.random.seed(123)
%matplotlib inline
# We will use the digits dataset as an example. It consists of the 1797 images of hand-written digits. Each digit is
# represented by a 64-dimensional vector of pixel values.
digits = load_digits()
X, y = digits.data, digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')
# Example digits
fig = plt.figure(figsize=(10,8))
for i in range(10):
ax = fig.add_subplot(2, 5, i+1)
plt.imshow(X[i].reshape((8,8)), cmap='gray')
plt.show()
X_train shape: (1347, 64) y_train shape: (1347,) X_test shape: (450, 64) y_test shape: (450,)
class kNN():
def __init__(self):
pass
def fit(self, X, y):
self.data = X
self.targets = y
def euclidean_distance(self, X):
"""
Computes the euclidean distance between the training data and
a new input example or matrix of input examples X
"""
# input: single data point
if X.ndim == 1:
l2 = np.sqrt(np.sum((self.data - X)**2, axis=1))
# input: matrix of data points
if X.ndim == 2:
n_samples, _ = X.shape
l2 = [np.sqrt(np.sum((self.data - X[i])**2, axis=1)) for i in range(n_samples)]
return np.array(l2)
def predict(self, X, k=1):
"""
Predicts the classification for an input example or matrix of input examples X
"""
# step 1: compute distance between input and training data
dists = self.euclidean_distance(X)
# step 2: find the k nearest neighbors and their classifications
if X.ndim == 1:
if k == 1:
nn = np.argmin(dists)
return self.targets[nn]
else:
knn = np.argsort(dists)[:k]
y_knn = self.targets[knn]
max_vote = max(y_knn, key=list(y_knn).count)
return max_vote
if X.ndim == 2:
knn = np.argsort(dists)[:, :k]
y_knn = self.targets[knn]
if k == 1:
return y_knn.T
else:
n_samples, _ = X.shape
max_votes = [max(y_knn[i], key=list(y_knn[i]).count) for i in range(n_samples)]
return max_votes
knn = kNN()
knn.fit(X_train, y_train)
print("Testing one datapoint, k=1")
print(f"Predicted label: {knn.predict(X_test[0], k=1)}")
print(f"True label: {y_test[0]}")
print()
print("Testing one datapoint, k=5")
print(f"Predicted label: {knn.predict(X_test[20], k=5)}")
print(f"True label: {y_test[20]}")
print()
print("Testing 10 datapoint, k=1")
print(f"Predicted labels: {knn.predict(X_test[5:15], k=1)}")
print(f"True labels: {y_test[5:15]}")
print()
print("Testing 10 datapoint, k=4")
print(f"Predicted labels: {knn.predict(X_test[5:15], k=4)}")
print(f"True labels: {y_test[5:15]}")
print()
Testing one datapoint, k=1 Predicted label: 8 True label: 8 Testing one datapoint, k=5 Predicted label: 3 True label: 3 Testing 10 datapoint, k=1 Predicted labels: [[5 4 5 5 6 6 1 0 8 8]] True labels: [5 4 5 5 6 6 1 0 8 8] Testing 10 datapoint, k=4 Predicted labels: [5, 4, 5, 5, 6, 6, 1, 0, 8, 8] True labels: [5 4 5 5 6 6 1 0 8 8]
# Compute accuracy on test set
y_p_test1 = knn.predict(X_test, k=1)
test_acc1= np.sum(y_p_test1[0] == y_test)/len(y_p_test1[0]) * 100
print(f"Test accuracy with k = 1: {format(test_acc1)}")
y_p_test5 = knn.predict(X_test, k=5)
test_acc5= np.sum(y_p_test5 == y_test)/len(y_p_test5) * 100
print(f"Test accuracy with k = 5: {format(test_acc5)}")
Test accuracy with k = 1: 99.11111111111111 Test accuracy with k = 5: 98.66666666666667