Sascha Spors, Professorship Signal Theory and Digital Signal Processing, Institute of Communications Engineering (INT), Faculty of Computer Science and Electrical Engineering (IEF), University of Rostock, Germany
Master Course #24512
Feel free to contact the lecturer: frank.schultz@uni-rostock.de
import numpy as np
import matplotlib.pyplot as plt
from scipy.linalg import svd, diagsvd
from statsmodels.multivariate.pca import PCA
We intentionally create a mean-free data matrix X, so that SVD and PCA exhibit the same concepts.
# be careful when changing the seed:
# the SVD U and V vectors might then need reflections
# to match statsmodels' results
# also, when reflections occur, the mapping original data -> PC scores
# is no longer explained by a pure rotation of the data set
# hence, we elaborate on the with_reflection = True case
with_reflection = True
if with_reflection:
    rng = np.random.default_rng(6)
    # index of specific data points to plot with specific colors
    # this helps to identify the reflection
    di1 = 12
    di2 = 80
else:
    rng = np.random.default_rng(1)
    # index of specific data points to plot with specific colors
    di1 = 12
    di2 = 91
# construct 2 features from a normal PDF
mean = [0, 0]
cov = [[1.3, 0.99], [0.99, 1]]
N = 200  # number of samples
x, y = rng.multivariate_normal(mean, cov, N).T
X = np.array([x, y]).T
# the PCA assumes mean-free columns, so we explicitly design X this way
X = X - np.mean(X, axis=0)
# we could also normalize by standard deviation,
# then each column of X has unit variance and the total variance of X is 2:
if False:
    X = X / np.std(X, axis=0, ddof=1)
print(np.var(X, axis=0, ddof=1), np.sum(np.var(X, axis=0, ddof=1)))
F = X.shape[1]
print("X.shape", X.shape) # (200, 2)
print("rank", np.linalg.matrix_rank(X))
print("condition number", np.linalg.cond(X))
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(X[:, 0], X[:, 1], "x", color="gray")
ax.plot(X[di1, 0], X[di1, 1], "C1x", ms=10, mew=3)
ax.plot(X[di2, 0], X[di2, 1], "C3x", ms=10, mew=3)
ax.axis("square")
ax.set_xlim(-5, 5)
ax.set_ylim(-5, 5)
ax.set_xlabel("original feature 1")
ax.set_ylabel("original feature 2")
ax.set_title("data in original coordinate system")
ax.grid(True)
plt.savefig('slides/pca_2d_original_data.pdf', dpi=600, bbox_inches='tight')
We use the abbreviation PC for principal component.
# we work on matrix X directly (so standardize=False); we already
# made it mean-free above (so demean=False); normalize=False gives us
# data that is nicely connected to the SVD quantities
# PCA from statsmodels.multivariate.pca
pca = PCA(X, ncomp=2, standardize=False, demean=False, normalize=False)
# SVD
[U, s, Vh] = svd(X, full_matrices=False)
S = diagsvd(s, F, F)
V = Vh.conj().T
pcs = U @ S # known as PC, PC signals, PC factors, PC scores, PC features
pcl = V # known as PC loadings, PC coefficients
# note that sometimes V.T is called loadings, coefficients
# check if statsmodels pca and our manual SVD-based PCA produce same results:
print(np.allclose(pca.scores, pcs))
print(np.allclose(pca.coeff.T, V))
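If these checks fail for another seed, the columns of U and V typically differ from statsmodels' results only by sign. A minimal alignment sketch (U_aligned, V_aligned are helper names introduced here; it works on copies so that the U, V used below stay untouched):
# flip the sign of U/V column pairs whose loadings disagree with statsmodels
U_aligned, V_aligned = U.copy(), V.copy()
for i in range(F):
    if not np.allclose(pca.coeff.T[:, i], V_aligned[:, i]):
        U_aligned[:, i] *= -1
        V_aligned[:, i] *= -1
# simultaneous sign flips leave the product, and hence X, unchanged
print(np.allclose(U_aligned @ S @ V_aligned.conj().T, X))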
For a mean-free data matrix X, the whole PCA information is contained in the SVD, but it can also be derived with a covariance-matrix mindset.
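A brief sketch of this connection in the notation used here: with the (economy-size) SVD $X = U S V^T$ of the mean-free $X$,
$$\frac{1}{N-1} X^T X = \frac{1}{N-1} V S^T U^T U S V^T = V \, \frac{S^2}{N-1} \, V^T ,$$
so the columns of $V$ (the PC loadings) are eigenvectors of the covariance matrix, and the eigenvalues $s_i^2/(N-1)$ are the variances of the PC scores $U S = X V$.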
covX = X.T @ X
covX_N_1 = covX / (N-1)
# eigval_covX are not necessarily sorted, eigvec_covX might exhibit reflections
# eigval_covX_N_1 are not necessarily sorted, eigvec_covX_N_1 might exhibit reflections
[eigval_covX, eigvec_covX] = np.linalg.eig(covX)
[eigval_covX_N_1, eigvec_covX_N_1] = np.linalg.eig(covX_N_1)
# should be similar vectors but with potential reflections
eigvec_covX_N_1, eigvec_covX, V
# squared singular values == not necessarily sorted eig values of covX:
s**2, eigval_covX
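np.linalg.eig does not guarantee any ordering, so for a direct comparison we can sort the eigenpairs in descending order. A small sketch (idx is a helper name introduced here); the eigenvectors may still differ from V by sign:
# sort eigenvalues / eigenvectors of covX in descending order
idx = np.argsort(eigval_covX)[::-1]
print(np.allclose(eigval_covX[idx], s**2))                   # squared singular values
print(np.allclose(np.abs(eigvec_covX[:, idx]), np.abs(V)))   # equal up to sign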
We compute the variances of the PC scores / PC signals.
print(np.var(pcs[:, 0], ddof=1))
print(np.var(pcs[:, 1], ddof=1))
var_pcs = np.var(pcs, axis=0, ddof=1)
# variance of PC scores == sorted eig values of covX_N_1:
var_pcs, eigval_covX_N_1
We compare with the variances of the original features.
print(np.var(X[:, 0], ddof=1))
print(np.var(X[:, 1], ddof=1))
var_X = np.var(X, axis=0, ddof=1)
We don't lose variance; we just distribute it over the signals (vectors) in another way.
np.sum(var_pcs), np.sum(var_X)
The 1st PC signal explains about 94 % of the total variance.
var_pcs[0] / np.sum(var_pcs) * 100
The 2nd PC signal explains the remaining roughly 6 % of the total variance.
var_pcs[1] / np.sum(var_pcs) * 100
Cumulative explained variance of the PC scores:
np.cumsum(var_pcs) / np.sum(var_pcs) * 100
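A typical use of this cumulative curve is to pick the smallest number of PCs that explains a desired share of the total variance. A minimal sketch (the 95 % threshold is an arbitrary choice for illustration):
explained = np.cumsum(var_pcs) / np.sum(var_pcs)
n_keep = np.searchsorted(explained, 0.95) + 1  # smallest number of PCs reaching 95 %
print(n_keep)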
In contrast to the original features, the PC scores exhibit variances sorted in descending order. In this 2D example this is not too obvious, since the original features also happen to be ordered from high to low variance.
var_X / np.sum(var_pcs) * 100
The standard deviation, i.e. sqrt(variance), of the orthogonal PC scores is helpful in the following.
std_pcs = np.std(pcs, axis=0, ddof=1)
std_pcs, np.sqrt(var_pcs)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(X[:, 0], X[:, 1], "x", color="gray")
ax.plot(X[di1, 0], X[di1, 1], "C1x", mew=3)
ax.plot(X[di2, 0], X[di2, 1], "C3x", mew=3)
# draw directions of the PC axes
# length follows the 3*sigma rule for a normal distribution
# note that V does not necessarily span a right-hand system
col = ['C1', 'C3']
for i in range(F):
    ax.plot(
        [0, 3*std_pcs[i] * V[0, i]],
        [0, 3*std_pcs[i] * V[1, i]],
        col[i], lw=3,
        label=f"direction of PC {i+1} score, col {i+1} in V, right sing vec {i+1}")
plt.axis("square")
plt.xlim(-5, 5)
plt.ylim(-5, 5)
plt.xlabel("original feature 1")
plt.ylabel("original feature 2")
plt.title("data in original coordinate system")
plt.legend()
plt.grid(True)
plt.savefig('slides/pca_2d_original_data_with_pcdir.pdf', dpi=600, bbox_inches='tight')
[U_pcs, s_pcs, Vh_pcs] = svd(pcs, full_matrices=False)
S_pcs = diagsvd(s_pcs, F, F)
V_pcs = Vh_pcs.conj().T
# make sure that correct U/V pair for pcs holds by
# introducing corresponding reflections
for i in range(F):
    if not np.allclose((U_pcs @ S_pcs)[:, i], pcs[:, i]):
        U_pcs[:, i] *= -1
        V_pcs[:, i] *= -1
# then V_pcs indicates the correct directions in the PC coordinate system
print(np.allclose(U_pcs @ S_pcs, pcs))
Nt = 2**6
t = np.arange(Nt) / Nt * 2 * np.pi
ellipse_x, ellipse_y = np.cos(t), np.sin(t)
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(pcs[:, 0], pcs[:, 1], "x", color="gray")
plt.plot(pcs[di1, 0], pcs[di1, 1], "C1x", mew=3)
plt.plot(pcs[di2, 0], pcs[di2, 1], "C3x", mew=3)
# draw directions of the PC axes
# length follows the 3*sigma rule for a normal distribution
col = ['C1', 'C3']
for i in range(F):
    ax.plot(
        [0, 3*std_pcs[i] * V_pcs[0, i]],
        [0, 3*std_pcs[i] * V_pcs[1, i]],
        col[i], lw=3,
        label=f"direction of PC {i+1}, length = 3 std(PC {i+1})")
plt.plot(1*std_pcs[0]*ellipse_x, 1*std_pcs[1]*ellipse_y,
         'k--', label=r'1$\sigma$ ellipse')
plt.plot(3*std_pcs[0]*ellipse_x, 3*std_pcs[1]*ellipse_y,
         'k:', label=r'3$\sigma$ ellipse')
plt.axis("square")
plt.xlim(-5, 5)
plt.ylim(-5, 5)
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.title("data in PC coordinate system")
plt.legend()
plt.grid(True)
plt.savefig('slides/pca_2d_pc_data.pdf', dpi=600, bbox_inches='tight')
For a rank-R data matrix, the data is rotated (and potentially reflected in some axes) such that along the axis of PC 1 the most variance occurs (largest data spread), whereas along the axis of the R-th PC (the last PC, here PC 2) the least variance occurs.
Generally, var(PC 1) > var(PC 2) > ... > var(PC R), matching the descending order of the singular values in the SVD. Recall how we calculated pcs
and the variances of pcs
...
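Since V is orthogonal, mapping the data to the PC scores via X V is a pure rotation (plus a possible reflection), so lengths and pairwise distances are preserved. A quick check of this (a sketch reusing the variables from above):
# V is orthogonal: V.T @ V = I, hence X @ V = pcs only rotates / reflects the data
print(np.allclose(V.T @ V, np.eye(F)))
print(np.allclose(np.linalg.norm(X, axis=1), np.linalg.norm(pcs, axis=1)))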
We perform rank reduction / low-rank approximation. Hence, we keep the number of features, but the data is simplified within this feature space.
r_des = 1
# SVD mindset
# sum of outer products, i.e. sum of rank-1 matrices
X_rank_red = np.zeros((N, F))
for i in range(r_des):
    X_rank_red += s[i] * np.outer(U[:, i], V[:, i])
# PCA mindset
# we might also use the PC signals and set the discarded PC loading columns to zero
X_rank_red2 = np.zeros((N, F))
pcl_rank_red = np.copy(pcl)
pcl_rank_red[:, r_des:] = 0
X_rank_red2 = pcs @ pcl_rank_red.conj().T
print(np.allclose(X_rank_red, X_rank_red2))
pcl_rank_red
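A third, equivalent way to write the low-rank approximation is a single product of the truncated SVD factors. A small sketch (X_rank_red3 is a name introduced here):
# truncated SVD as one matrix product
X_rank_red3 = U[:, :r_des] @ S[:r_des, :r_des] @ Vh[:r_des, :]
print(np.allclose(X_rank_red, X_rank_red3))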
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(X_rank_red2[:, 0], X_rank_red2[:, 1], "x", color="gray")
ax.plot(X_rank_red2[di1, 0], X_rank_red2[di1, 1], "C1x", mew=3)
ax.plot(X_rank_red2[di2, 0], X_rank_red2[di2, 1], "C3x", mew=3)
# draw directions of the PC axes
# length follows the 3*sigma rule for a normal distribution
# note that V does not necessarily span a right-hand system
col = ['C1', 'C3']
for i in range(F):
    ax.plot(
        [0, 3*std_pcs[i] * V[0, i]],
        [0, 3*std_pcs[i] * V[1, i]],
        col[i], lw=1,
        label=f"direction of PC {i+1} score, col {i+1} in V, right sing vec {i+1}")
ax.axis("square")
ax.set_xlim(-5, 5)
ax.set_ylim(-5, 5)
ax.set_xlabel("new feature 1 after rank reduction")
ax.set_ylabel("new feature 2")
ax.set_title("rank-{0:d} approximation of data".format(r_des))
ax.legend()
ax.grid(True)
plt.savefig('slides/pca_2d_truncated_svd.pdf', dpi=600, bbox_inches='tight')
We reduce the matrix size, i.e. the number of columns / features. We do this for the orthogonal PC scores (the weighted column space of X).
dim_des = 1
# PCA mindset
# keep only the first dim_des PC score columns
X_dim_red = pcs[:, :dim_des]
print('original matrix shape ', X.shape)
print('matrix shape after dim red', X_dim_red.shape)
X_dim_red_plot = np.zeros((N, F))
X_dim_red_plot[:, :dim_des] = pcs[:, :dim_des]
# check with SVD mindset
print(np.allclose((U @ S)[:, :dim_des], X_dim_red))
print(np.allclose(X @ V[:, :dim_des], X_dim_red))
# the dimensionality reduction actually yields a matrix with fewer
# columns, cf. the shape of X_dim_red
# for convenience we plot the data here in a 2D plot
# note, however, that X_dim_red_plot[:, 1] is exactly zero if
# dim_des = 1 was chosen
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(X_dim_red_plot[:, 0], X_dim_red_plot[:, 1], "x", color="gray")
ax.plot(X_dim_red_plot[di1, 0], X_dim_red_plot[di1, 1], "C1x", mew=3)
ax.plot(X_dim_red_plot[di2, 0], X_dim_red_plot[di2, 1], "C3x", mew=3)
# draw directions of the PC axes
# length follows the 3*sigma rule for a normal distribution
col = ['C1', 'C3']
for i in range(F):
    ax.plot(
        [0, 3*std_pcs[i] * V_pcs[0, i]],
        [0, 3*std_pcs[i] * V_pcs[1, i]],
        col[i], lw=1,
        label=f"direction of PC {i+1}, length = 3 std(PC {i+1})")
ax.axis("square")
ax.set_xlim(-5, 5)
ax.set_ylim(-5, 5)
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_title("PCA data dimensionality reduction to {0:d}D".format(r_des))
ax.legend()
ax.grid(True)
plt.savefig('slides/pca_2d_dim_red.pdf', dpi=600, bbox_inches='tight')
print("reduced to dimension", X_dim_red.shape)
For both cases, (i) truncated SVD / rank reduction / low-rank approximation and (ii) dimensionality reduction, the variance-sorted, orthogonal characteristics of the PC scores help to encode the data in terms of maximum feature independence and explained variance.
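The two viewpoints are directly linked: mapping the reduced scores back to the original feature space with the kept loadings reproduces the rank-reduced data matrix. A small closing check (a sketch reusing the variables from above, valid here since r_des == dim_des):
# back-projection of the reduced PC scores equals the truncated-SVD approximation
print(np.allclose(X_dim_red @ V[:, :dim_des].conj().T, X_rank_red))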