Sascha Spors, Professorship Signal Theory and Digital Signal Processing, Institute of Communications Engineering (INT), Faculty of Computer Science and Electrical Engineering (IEF), University of Rostock, Germany
Master Course #24512
Feel free to contact lecturer frank.schultz@uni-rostock.de
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.linalg import svd, diagsvd
from statsmodels.multivariate.pca import PCA
matplotlib_widget_flag = True
if matplotlib_widget_flag:
    %matplotlib widget
We intentionally create a mean-free data matrix X, so that SVD and PCA exhibit the same concepts.
# be careful when changing the seed:
# the SVD U and V vectors might then need reflections
# to match the statsmodels results;
# when reflections occur, the mapping original data -> PC scores
# is no longer explained by a pure rotation of the data set
rng = np.random.default_rng(2) #1
# construct 3 features from normal PDF
mean = [0, 0, 0]
cov = [[3, 2, 0], [2, 3, 1], [0, 1, 1]]
N = 200 # no of samples
x, y, z = rng.multivariate_normal(mean, cov, N).T
X = np.array([x, y, z]).T
# the PCA assumes mean-free columns, so we explicitly design X this way
X = X - np.mean(X, axis=0)
# we could also normalize by standard deviation,
# then each column of X has unit variance and total variance of X is 3:
if False:
    X = X / np.std(X, axis=0, ddof=1)
print(np.var(X, axis=0, ddof=1), np.sum(np.var(X, axis=0, ddof=1)))
F = X.shape[1]
print("X.shape", X.shape) # (200, 3)
print("rank", np.linalg.matrix_rank(X))
print("condition number", np.linalg.cond(X))
# index for specific data points to plot with specific colors
# this helps to identify potential reflections of U and V space vectors
di1 = 36
di2 = 23
di3 = 57
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.plot(X[:, 0], X[:, 1], X[:, 2], "x", color="gray")
ax.plot(X[di1, 0], X[di1, 1], X[di1, 2], "C3x", ms=10, mew=3)
ax.plot(X[di2, 0], X[di2, 1], X[di2, 2], "C2x", ms=10, mew=3)
ax.plot(X[di3, 0], X[di3, 1], X[di3, 2], "C0x", ms=10, mew=3)
ax.axis("square")
ax.set_xlim(-6, 6)
ax.set_ylim(-6, 6)
ax.set_zlim(-6, 6)
ax.set_xlabel("original feature 1")
ax.set_ylabel("original feature 2")
ax.set_zlabel("original feature 3")
ax.set_title("data in original coordinate system")
ax.grid(True)
plt.savefig('slides/pca_3d_original_data.pdf', dpi=600)
We use the abbreviation PC for principal component.
# we work on matrix X directly (so standardize=False), we already made it
# mean-free above (so demean=False), and normalize=False gives us
# data that is nicely connected to the SVD results
# PCA from statsmodels.multivariate.pca
pca = PCA(X, ncomp=3, standardize=False, demean=False, normalize=False)
# SVD
[U, s, Vh] = svd(X, full_matrices=False)
S = diagsvd(s, F, F)
V = Vh.conj().T
pcs = U @ S # known as PC, PC signals, PC factors, PC scores, PC features
pcl = V # known as PC loadings, PC coefficients
# note that sometimes V.T is called loadings, coefficients
# check if statsmodels pca and our manual SVD-based PCA produce same results:
print(np.allclose(pca.scores, pcs))
print(np.allclose(pca.coeff.T, V))
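If the two checks above return False for another seed, the SVD columns typically differ from the statsmodels results only by a per-column sign flip (reflection). A minimal sketch, not part of the original lecture code and assuming distinct singular values (so that columns can differ only by sign), to align the U/V pairs with the statsmodels loadings:
# align potential per-column sign flips of the SVD with the statsmodels loadings
coeffT = np.asarray(pca.coeff).T  # columns correspond to the columns of V
U_al, V_al = U.copy(), V.copy()
for i in range(F):
    if V_al[:, i] @ coeffT[:, i] < 0:  # column points in the opposite direction
        U_al[:, i] *= -1  # flip the U/V pair together, X = U S V^H is preserved
        V_al[:, i] *= -1
print(np.allclose(V_al, coeffT), np.allclose(U_al @ S @ V_al.conj().T, X))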
For a mean-free data matrix X, the whole PCA information is contained in the SVD, but it can also be derived with a covariance matrix mindset.
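With the SVD $X = U \Sigma V^\mathrm{T}$ of the mean-free $X$, the sample covariance matrix becomes $\frac{1}{N-1} X^\mathrm{T} X = V \frac{\Sigma^2}{N-1} V^\mathrm{T}$, so its eigenvectors are the right singular vectors in $V$ (the PC loadings) and its eigenvalues are $\sigma_i^2 / (N-1)$ (the variances of the PC scores). The following cells verify this numerically.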
covX = X.T @ X
covX_N_1 = covX / (N-1)
# eigval_covX are not necessarily sorted, eigvec_covX might exhibit reflections
# eigval_covX_N_1 are not necessarily sorted, eigvec_covX_N_1 might exhibit reflections
[eigval_covX, eigvec_covX] = np.linalg.eig(covX)
[eigval_covX_N_1, eigvec_covX_N_1] = np.linalg.eig(covX_N_1)
# should be similar vectors but with potential reflections
eigvec_covX_N_1, eigvec_covX, V
# squared singular values == not necessarily sorted eig values of covX:
s**2, eigval_covX
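Since np.linalg.eig does not guarantee any ordering, here is a small sketch (not in the original code) that sorts the eigenpairs in descending order for a direct comparison with the SVD; the eigenvectors may still differ from V by sign flips:
idx = np.argsort(eigval_covX)[::-1]  # indices for descending eigenvalues
eigval_sorted = eigval_covX[idx]
eigvec_sorted = eigvec_covX[:, idx]
print(np.allclose(eigval_sorted, s**2))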
We compute the variances of the PC scores / PC signals.
print(np.var(pcs[:, 0], ddof=1))
print(np.var(pcs[:, 1], ddof=1))
print(np.var(pcs[:, 2], ddof=1))
var_pcs = np.var(pcs, axis=0, ddof=1)
# variance of PC scores == sorted eig values of covX_N_1:
var_pcs, eigval_covX_N_1
We compare with the variances of the original features.
print(np.var(X[:, 0], ddof=1))
print(np.var(X[:, 1], ddof=1))
print(np.var(X[:, 2], ddof=1))
var_X = np.var(X, axis=0, ddof=1)
We don't lose variance; we just distribute it over the signals (vectors) in another way.
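This follows from $\sum_f \mathrm{var}(X_{:,f}) = \frac{\mathrm{trace}(X^\mathrm{T} X)}{N-1} = \frac{\|X\|_\mathrm{F}^2}{N-1} = \frac{\sum_i \sigma_i^2}{N-1} = \sum_i \mathrm{var}(\mathrm{PC}_i)$, which the next cell confirms numerically.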
np.sum(var_pcs), np.sum(var_X)
The 1st PC signal explains about ?? % of the total variance.
var_pcs[0] / np.sum(var_pcs) * 100
The 2nd PC signal explains about ?? % of the total variance.
var_pcs[1] / np.sum(var_pcs) * 100
The 3rd PC signal explains the remaining share of about ?? % of the total variance.
var_pcs[2] / np.sum(var_pcs) * 100
Cumulative explained variance of the PC scores:
np.cumsum(var_pcs) / np.sum(var_pcs) * 100
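In practice this cumulative curve is often used to decide how many PCs to keep. A minimal sketch, where the 95 % threshold is an arbitrary choice and not part of the lecture material:
explained_ratio = np.cumsum(var_pcs) / np.sum(var_pcs)
n_keep = int(np.searchsorted(explained_ratio, 0.95)) + 1  # first index reaching 95 %
print("keep", n_keep, "PC(s) to explain at least 95 % of the total variance")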
The PC scores exhibit variances sorted in descending order, in contrast to the original features. In this 3D example this is not too obvious, as the original features also happen to be ordered from high to low variance.
var_X / np.sum(var_pcs) * 100
The standard deviation, i.e. sqrt(variance), of the orthogonal PC scores is helpful for the following.
std_pcs = np.std(pcs, axis=0, ddof=1)
std_pcs, np.sqrt(var_pcs)
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.plot(X[:, 0], X[:, 1], X[:, 2], "x", color="gray")
ax.plot(X[di1, 0], X[di1, 1], X[di1, 2], "C3x", mew=3)
ax.plot(X[di2, 0], X[di2, 1], X[di2, 2], "C2x", mew=3)
ax.plot(X[di3, 0], X[di3, 1], X[di3, 2], "C0x", mew=3)
# draw directions of PC axis
# length follows 3*sigma rule for normal distribution
# note that V does not necessarily span a right-hand system
col = ['C3', 'C2', 'C0']
for i in range(F):
    ax.plot(
        [0, 3 * std_pcs[i] * V[0, i]],
        [0, 3 * std_pcs[i] * V[1, i]],
        [0, 3 * std_pcs[i] * V[2, i]],
        col[i], lw=3,
        label=f"direction of PC {i+1} score, col {i+1} in V, right sing vec {i+1}")
ax.axis("square")
ax.set_xlim(-6, 6)
ax.set_ylim(-6, 6)
ax.set_zlim(-6, 6)
ax.set_xlabel("original feature 1")
ax.set_ylabel("original feature 2")
ax.set_zlabel("original feature 3")
ax.set_title("data in original coordinate system")
ax.legend()
ax.grid(True)
plt.savefig('slides/pca_3d_original_data_with_pcdir.pdf', dpi=600)
[U_pcs, s_pcs, Vh_pcs] = svd(pcs, full_matrices=False)
S_pcs = diagsvd(s_pcs, F, F)
V_pcs = Vh_pcs.conj().T
# make sure that correct U/V pair for pcs holds by
# introducing corresponding reflections
for i in range(F):
    if not np.allclose((U_pcs @ S_pcs)[:, i], pcs[:, i]):
        U_pcs[:, i] *= -1
        V_pcs[:, i] *= -1
# then V_pcs indicates the correct directions in the PC coordinate system
print(np.allclose(U_pcs @ S_pcs, pcs))
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.plot(pcs[:, 0], pcs[:, 1], pcs[:, 2], "x", color="gray")
ax.plot(pcs[di1, 0], pcs[di1, 1], pcs[di1, 2], "C3x", mew=3)
ax.plot(pcs[di2, 0], pcs[di2, 1], pcs[di2, 2], "C2x", mew=3)
ax.plot(pcs[di3, 0], pcs[di3, 1], pcs[di3, 2], "C0x", mew=3)
# draw directions of PC axis
# length follows 3*sigma rule for normal distribution
col = ['C3', 'C2', 'C0']
for i in range(F):
    ax.plot(
        [0, 3 * std_pcs[i] * V_pcs[0, i]],
        [0, 3 * std_pcs[i] * V_pcs[1, i]],
        [0, 3 * std_pcs[i] * V_pcs[2, i]],
        col[i], lw=3,
        label=f"direction of PC {i+1}, length = 3 std(PC {i+1})")
ax.axis("square")
ax.set_xlim(-6, 6)
ax.set_ylim(-6, 6)
ax.set_zlim(-6, 6)
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_zlabel("PC 3")
ax.set_title("data in PC coordinate system")
ax.legend()
ax.grid(True)
plt.savefig('slides/pca_3d_pc_data.pdf', dpi=600)
For a rank-R data matrix, the data is rotated (and potentially reflected in some axes) such that the most variance occurs along the axis of PC 1 (largest data spread), whereas the least variance occurs along the axis of the R-th PC (the last PC, here PC 3).
Generally, var(PC 1) > var(PC 2) > ... > var(PC R), matching the descending order of the singular values in the SVD. Recall how we calculated pcs
and var of pcs
...
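As a compact recap of these relations (repeating computations from above):
print(np.allclose(pcs, U @ S), np.allclose(pcs, X @ V))
print(np.var(pcs, axis=0, ddof=1), s**2 / (N - 1))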
We perform rank reduction / low-rank approximation. Hence, we keep the number of features, but the data is simplified within this feature space.
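In SVD terms, the rank-$r$ approximation keeps only the $r$ largest singular values, $X_r = \sum_{i=1}^{r} \sigma_i \, u_i v_i^\mathrm{T} = U_r \Sigma_r V_r^\mathrm{T}$, which by the Eckart-Young theorem is the best rank-$r$ approximation of $X$ in the Frobenius and spectral norms.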
r_des = 2: we reduce the data information to a plane in 3D space
r_des = 1: we reduce the data information to a line in 3D space
r_des = 2  # 1 or 2
# SVD mindset
# sum of outer products, i.e. sum of rank-1 matrices
X_rank_red = np.zeros((N, F))
for i in range(r_des):
    X_rank_red += s[i] * np.outer(U[:, i], V[:, i])
# PCA mindset:
# we might also use the PC scores and set the PC loadings beyond r_des to zero
X_rank_red2 = np.zeros((N, F))
pcl_rank_red = np.copy(pcl)
pcl_rank_red[:, r_des:] = 0
X_rank_red2 = pcs @ pcl_rank_red.conj().T
print(np.allclose(X_rank_red, X_rank_red2))
pcl_rank_red
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.plot(X_rank_red2[:, 0],
X_rank_red2[:, 1],
X_rank_red2[:, 2], "x", color="gray")
ax.plot(X_rank_red2[di1, 0],
X_rank_red2[di1, 1],
X_rank_red2[di1, 2], "C3x", mew=3)
ax.plot(X_rank_red2[di2, 0],
X_rank_red2[di2, 1],
X_rank_red2[di2, 2], "C2x", mew=3)
ax.plot(X_rank_red2[di3, 0],
X_rank_red2[di3, 1],
X_rank_red2[di3, 2], "C0x", mew=3)
# draw directions of PC axis
# length follows 3*sigma rule for normal distribution
# note that V does not necessarily span a right-hand system
col = ['C3', 'C2', 'C0']
for i in range(F):
    ax.plot(
        [0, 3 * std_pcs[i] * V[0, i]],
        [0, 3 * std_pcs[i] * V[1, i]],
        [0, 3 * std_pcs[i] * V[2, i]],
        col[i], lw=1,
        label=f"direction of PC {i+1} score, col {i+1} in V, right sing vec {i+1}")
ax.axis("square")
ax.set_xlim(-6, 6)
ax.set_ylim(-6, 6)
ax.set_zlim(-6, 6)
ax.set_xlabel("new feature 1 after rank reduction")
ax.set_ylabel("new feature 2")
ax.set_zlabel("new feature 3")
ax.set_title("rank-{0:d} approximation of data".format(r_des))
ax.legend()
ax.grid(True)
plt.savefig('slides/pca_3d_truncated_svd.pdf', dpi=600)
We reduce the matrix size, i.e. we reduce the number of columns and thus the number of features. We do this for the orthogonal PC scores (the weighted column space of X).
dim_des = 2: we reduce the data to a plane in 2D space; it is, however, plotted in 3D (third variable set to zero) for convenience
dim_des = 1: we reduce the data to a line in 1D space; it is, however, plotted in 3D (second and third variables set to zero) for convenience
dim_des = 2  # 1 or 2
# PCA mindset
X_dim_red = np.zeros((N, dim_des))
X_dim_red = pcs[:, :dim_des]
print('original matrix shape ', X.shape)
print('matrix shape after dim red', X_dim_red.shape)
X_dim_red_plot = np.zeros((N, F))
X_dim_red_plot[:, :dim_des] = pcs[:, :dim_des]
# check with SVD mindset
print(np.allclose((U @ S)[:, :dim_des], X_dim_red))
print(np.allclose(X @ V[:, :dim_des], X_dim_red))
# the dimensionality reduction actually yields a matrix with smaller
# dimension, cf. the shape of X_dim_red;
# for convenience we plot the data here in a 3D plot
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.plot(
X_dim_red_plot[:, 0],
X_dim_red_plot[:, 1],
X_dim_red_plot[:, 2],
"x", color="gray")
ax.plot(
X_dim_red_plot[di1, 0],
X_dim_red_plot[di1, 1],
X_dim_red_plot[di1, 2],
"C3x", mew=3)
ax.plot(
X_dim_red_plot[di2, 0],
X_dim_red_plot[di2, 1],
X_dim_red_plot[di2, 2],
"C2x", mew=3)
ax.plot(
X_dim_red_plot[di3, 0],
X_dim_red_plot[di3, 1],
X_dim_red_plot[di3, 2],
"C0x", mew=3)
# draw directions of PC axis
# length follows 3*sigma rule for normal distribution
col = ['C3', 'C2', 'C0']
for i in range(F):
    ax.plot(
        [0, 3 * std_pcs[i] * V_pcs[0, i]],
        [0, 3 * std_pcs[i] * V_pcs[1, i]],
        [0, 3 * std_pcs[i] * V_pcs[2, i]],
        col[i], lw=1,
        label=f"direction of PC {i+1}, length = 3 std(PC {i+1})")
ax.axis("square")
ax.set_xlim(-6, 6)
ax.set_ylim(-6, 6)
ax.set_zlim(-6, 6)
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_zlabel("PC 3")
plt.title("PCA data dimensionality reduction to {0:d}D".format(dim_des))
ax.legend()
ax.grid(True)
plt.savefig('slides/pca_3d_dim_red.pdf', dpi=600)
print("reduced to dimension", X_dim_red.shape)
For both cases, (i) truncated SVD / rank reduction / low-rank approximation and (ii) dimensionality reduction, the variance-sorted, orthogonal character of the PC scores helps to encode the data in terms of maximum feature independence and explained variance.
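To make the link between (i) and (ii) explicit: projecting the reduced PC scores back onto the kept loadings reproduces the low-rank approximation. A short sketch, assuming r_des == dim_des as chosen above:
X_back = X_dim_red @ V[:, :dim_des].conj().T  # back to the original feature space
print(np.allclose(X_back, X_rank_red))  # True when r_des == dim_des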