import numpy as np
from bokeh.plotting import figure, show, output_notebook
output_notebook()
def PCA(X, n_components):
# normalize to zero mean
mu = X.mean(axis=0)
X = X - mu
# eigenvectors of covariance matrix
sigma = X.T @ X
eigvals, eigvecs = np.linalg.eig(sigma)
# principal components
order = np.argsort(eigvals)[::-1]
components = eigvecs[:, order[:n_components]]
# projection
Z = X @ components
# result
return Z, components
# generate points
x = np.linspace(0, 13, num=100)
y = x + np.sin(x) - np.cos(x)
# 2D data
X = np.c_[x, y]
# PCA
projection, components = PCA(X, n_components=2)
# principal components
components
array([[ 0.72299657, -0.69085162], [ 0.69085162, 0.72299657]])
# convariance matrix of projected data
(projection.T @ projection).round(3)
array([[ 2705.081, 0. ], [ 0. , 47.716]])
# prepare plot data
mean = np.mean(X, axis=0)
extent = projection.min(), projection.max()
angle = np.arctan(components[1] / components[0]) + np.pi * (components[0] < 0)
# plot original data & principal components
plot = figure()
plot.scatter(x, y)
plot.ray(*mean, length=0, angle=angle[0], line_width=2, line_color='red')
plot.ray(*mean, length=0, angle=angle[1], line_width=2, line_color='green')
show(plot)
# plot projected data
plot = figure(x_range=extent, y_range=extent)
plot.scatter(projection[:, 0], projection[:, 1])
show(plot)
# generate binary vectors
X = np.random.rand(90, 30)
X[:30, :] = X[:30, :] < ([.4] * 10 + [.1] * 10 + [.1] * 10)
X[30:60, :] = X[30:60, :] < ([.1] * 10 + [.4] * 10 + [.1] * 10)
X[60:, :] = X[60:, :] < ([.1] * 10 + [.1] * 10 + [.4] * 10)
# define 3 classes
Y = ['red'] * 30 + ['green'] * 30 + ['blue'] * 30
# PCA
projection, _ = PCA(X, n_components=2)
# plot projected data: 30D -> 2D
plot = figure()
plot.scatter(projection[:, 0], projection[:, 1], color=Y)
show(plot)