Sascha Spors, Professorship Signal Theory and Digital Signal Processing, Institute of Communications Engineering (INT), Faculty of Computer Science and Electrical Engineering (IEF), University of Rostock, Germany
Master Course #24512
Feel free to contact the lecturer: frank.schultz@uni-rostock.de
import numpy as np
import scipy
from scipy.linalg import svd, diagsvd
import matplotlib as mpl
import matplotlib.pyplot as plt
np.set_printoptions(precision=3, sign=' ', suppress=True)
print(np.__version__) # tested with 1.26.4
print(scipy.__version__) # tested with 1.13.1
print(mpl.__version__) # tested with 3.9.2
X = np.loadtxt(open("exam_points_meanfree_unitvar.csv", "rb"), delimiter=";", skiprows=0)
N, F = X.shape
print(N, F) # 34 students, 5 tasks for exam on signals & systems, a typical course in electrical engineering bachelor studies
# columns correspond to these tasks
task_label = ['Task 1: Convolution', 'Task 2: Fourier', 'Task 3: Sampling', 'Task 4: Laplace Domain', 'Task 5: z-Domain']
# data in exam_points_meanfree_unitvar.csv is already mean-free and its columns have var=1
# so the numbers in X do not represent points or percentages,
# but rather encode each student's performance per task in a normalised way
# X is, however, sorted: the first row belongs to the best grade, the last row to the worst grade
np.mean(X, axis=0), np.std(X, axis=0, ddof=1), np.var(X, axis=0, ddof=1)
# for completeness of the PCA algorithm ->
# make X a z-score (although it already is)
mu = np.mean(X, axis=0)
X = X - mu # de-mean
sigma = np.sqrt(np.sum(X**2, axis=0) / (N-1))
X = X / sigma # normalise to std=1
np.mean(X, axis=0), np.std(X, axis=0, ddof=1), np.var(X, axis=0, ddof=1) # check
X # print mean=0 / var=1 data matrix
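# hedged side note (this cross-check is an addition, not part of the original pipeline):
# scipy.stats.zscore with ddof=1 performs the same de-mean / unit-variance scaling in one call;
# applied to the already z-scored X it should simply return X
from scipy.stats import zscore
print(np.allclose(zscore(X, axis=0, ddof=1), X))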
# get SVD / CovMatrix stuff
[U, s, Vh] = svd(X)
V = Vh.T # we don't use Vh later on!
S = diagsvd(s, N, F) # sing vals matrix
D, _ = np.linalg.eig(X.T @ X / (N-1)) # eig vals
D = -np.sort(-D) # sort eigenvalues in descending order to match the singular-value ordering
d = s**2 / (N-1)
print(np.allclose(d, D)) # so we go for d later on
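# small sanity-check sketch (added for illustration): for mean-free X the covariance matrix
# equals X.T @ X / (N-1), and V from the SVD diagonalises it with eigenvalues d
C = np.cov(X, rowvar=False)  # columns are the variables (tasks)
print(np.allclose(C, X.T @ X / (N-1)))
print(np.allclose(V @ np.diag(d) @ V.T, C))  # C = V diag(d) V^T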
# switch polarities of some columns for nicer interpretation of the exam data
V[:,0] *= -1
U[:,0] *= -1
V[:,2] *= -1
U[:,2] *= -1
V[:,3] *= -1
U[:,3] *= -1
# PCA
US = U @ S
PC_Features = US @ diagsvd(1 / np.sqrt(d), F, F) # normalised such that columns have var 1, aka (normalised) PC scores
print(np.var(PC_Features, axis=0, ddof=1))
#PC_Loadings = (diagsvd(np.sqrt(d), F, F) @ V.T).T # ==
PC_Loadings = V @ diagsvd(np.sqrt(d), F, F) # aka PC coeff, not unit-length anymore, but normalised such that it shows correlation between PC_Features and X
np.allclose(X, PC_Features @ PC_Loadings.T) # check correct matrix factorisation
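# hedged side check (added for illustration): the unnormalised PC scores U @ S are simply the
# data projected onto the principal directions, i.e. X @ V, and (U S) V^T recovers X
print(np.allclose(US, X @ V))
print(np.allclose(X, US @ V.T))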
# project each column of X onto each PC feature column -> doing this for all combinations
# yields all weights for the linear combination of PC features
# the correlation uses unit-length vectors
PC_Loadings_manual = np.zeros((F, F))
for row in range(F):
    tmp_x = X[:, row] / np.linalg.norm(X[:, row])
    for column in range(F):
        tmp_pc = PC_Features[:, column] / np.linalg.norm(PC_Features[:, column])
        PC_Loadings_manual[row, column] = np.inner(tmp_pc, tmp_x)
np.allclose(PC_Loadings_manual, PC_Loadings) # we get the PC_Loadings matrix
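# alternative sketch using numpy's correlation routine (added for illustration, not the
# original approach): since the columns of X and PC_Features all have unit variance, the
# Pearson correlation between each task and each PC reproduces the loading matrix
R = np.corrcoef(X, PC_Features, rowvar=False)[:F, F:]  # upper-right block: corr(task i, PC j)
print(np.allclose(R, PC_Loadings))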
# explained variance
d, np.var(US, axis=0, ddof=1)
# explained cum variance in %
cum_var = np.cumsum(d) / np.sum(d) * 100
cum_var
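# optional cross-check with scikit-learn (an assumption: sklearn is not imported above and
# might not be installed, hence the guarded import); its PCA on the z-scored X should report
# the same cumulative explained variance
try:
    from sklearn.decomposition import PCA
    print(np.cumsum(PCA(n_components=F).fit(X).explained_variance_ratio_) * 100)
except ImportError:
    pass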
plt.figure(figsize=(12,8))
plt.subplot(2,1,1)
for f in range(F):
    plt.plot(X[:, f], 'o-', color='C'+str(f), label='Task '+str(f+1), ms=3)
plt.legend(loc='lower left')
plt.xticks([0, N-1], labels=['best grade', 'worst grade'])
plt.ylabel('normalised points (mean-free, var=1)')
plt.grid(True)
plt.title(task_label)
plt.subplot(2,1,2)
for f in range(F):
    plt.plot(US[:, f], 'o-', color='C'+str(f), label='PCA v '+str(f+1), lw=(F-f)*2/3, ms=(F-f)*3/2)
plt.legend(loc='lower left')
plt.xticks([0, N-1], labels=['best grade', 'worst grade'])
plt.ylabel('PC features (mean-free, sorted var)')
plt.xlabel('student index (sorted grade)')
plt.grid(True)
plt.title(['cum var in %:', cum_var])
plt.tight_layout()
# correlation between task and pc
pc_label = ['PC 1', 'PC 2', 'PC 3', 'PC 4', 'PC 5']
cmap = plt.get_cmap('Spectral_r', 8)
fig = plt.figure(figsize=(6,4))
ax = fig.add_subplot(111)
cax = ax.matshow(PC_Loadings, cmap=cmap, vmin=-1, vmax=+1)
fig.colorbar(cax)
ax.set_xticks(np.arange(len(pc_label)))
ax.set_yticks(np.arange(len(task_label)))
ax.set_xticklabels(pc_label)
ax.set_yticklabels(task_label)
ax.set_title('Loading Matrix = PC x contributes to Task y')
plt.tight_layout()
# a rank-3 approximation of the data,
# i.e. using only PC1, PC2 and PC3 in the linear combination to reconstruct X,
# would change only one grade by a 1/3 grade step (see the small reconstruction sketch below)
# so 85.899 % explained variance would be enough to figure out the actual grading
# PC1 and PC2 might allow an intuitive interpretation:
# students are very well prepared for the convolution, Laplace- and z-domain tasks,
# as these tasks are always very similar and will definitely be queried in the exam
# so, PC1 indicates the performance on 'fulfilled' expectations and is
# highly correlated with the achieved grade
# the Fourier task and the sampling task were chosen from a wide range of options;
# here students have rather 'unknown' expectations, which is why we need PC2 to cover this
#
# PC3 to PC5 show positive vs. negative correlations, i.e. mostly one good task vs. one bad task performance
# some of these results are intuitive: we know that students sometimes have preferences for Laplace- vs. z-domain tasks
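# small sketch illustrating the rank-3 claim above (added for illustration): reconstruct X
# from the first three PCs only and inspect the per-student deviation from the full data
X3 = PC_Features[:, :3] @ PC_Loadings[:, :3].T  # rank-3 approximation of X
print(np.max(np.abs(X - X3), axis=1))  # largest reconstruction error per student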
np.sum(PC_Loadings**2, axis=0) # that's again the explained variance of the PCs
np.sum(PC_Loadings**2, axis=1) # communalities: for each task the squared loadings sum to 1 in our normalised handling