import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
import scipy.spatial.distance as ssd
from scipy.cluster import hierarchy
from sklearn.metrics import v_measure_score
from sklearn.decomposition import KernelPCA
from tck.TCK import TCK
from tck.datasets import DataLoader
np.random.seed(0)
# Set the color palette used to draw the dendrogram links
cmap = plt.cm.tab20(np.linspace(0, 1, 12))
hierarchy.set_link_color_palette([mpl.colors.rgb2hex(rgb[:3]) for rgb in cmap])
Load the data.
downloader = DataLoader()
Xtr, Ytr, Xte, Yte = downloader.get_data('Japanese_Vowels')
# Since we are doing clustering, we do not need the train/test split
X = np.concatenate((Xtr, Xte), axis=0)
Y = np.concatenate((Ytr, Yte), axis=0)
Loaded Japanese_Vowels dataset.
Data shapes:
 Xtr: (270, 29, 12)
 Ytr: (270, 1)
 Xte: (370, 29, 12)
 Yte: (370, 1)
Add missing data (40% of the entries are set to NaN).
mask = np.random.choice([0, 1], size=X.shape, p=[0.6, 0.4])  # each entry is masked with probability 0.4
X[mask == 1] = np.nan
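As a quick sanity check (an optional addition, not part of the original pipeline), we can verify that the fraction of NaN entries is close to the nominal 40%:
print("Fraction of missing entries: %.3f" % np.isnan(X).mean())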
Compute the TCK kernel.
tck = TCK(G=40, C=20)  # G: number of random initializations, C: maximal number of GMM mixture components
K = tck.fit(X).predict(mode='tr-tr')  # similarity between all pairs of training MTS
print(f"K shape: {K.shape}")
The dataset contains missing data
Training the TCK using the following parameters:
 C = 20, G = 40
 Number of MTS for each GMM: 512 - 640 (80 - 100 percent)
 Number of attributes sampled from [2, 11]
 Length of time segments sampled from [6, 23]
Fitting GMMs: 100%|██████████| 760/760 [00:31<00:00, 23.84it/s]
Computing TCK (tr-tr): 100%|██████████| 760/760 [00:52<00:00, 14.52it/s]
K shape: (640, 640)
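Since TCK is a kernel, K should be symmetric and positive semidefinite up to numerical error; an optional check (the eigendecomposition of a 640x640 matrix is cheap):
print("Symmetric:", np.allclose(K, K.T))
print("Smallest eigenvalue: %.2e" % np.linalg.eigvalsh((K + K.T) / 2.0).min())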
Visualize the kernel.
idx_sorted = np.argsort(Y[:,0])           # order the MTS by class label
K_sorted = K[:,idx_sorted][idx_sorted,:]  # reorder rows and columns so same-class entries are contiguous
fig = plt.figure(figsize=(6,6))
h = plt.imshow(K_sorted)
plt.title("TCK matrix (sorted)")
plt.colorbar(h)
class_num, _ = np.histogram(Y[:,0], bins=len(np.unique(Y)))  # number of MTS in each class
pos = np.cumsum(class_num)  # tick positions at the class boundaries
plt.xticks(pos, np.unique(Y))
plt.yticks(pos, np.unique(Y))
plt.xlabel("MTS class")
plt.ylabel("MTS class")
plt.show()
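The block structure visible in the plot can also be quantified numerically, e.g. by comparing the average within-class and between-class similarities (an illustrative sketch, not part of the original example):
same_class = Y[:, 0][:, None] == Y[:, 0][None, :]  # boolean mask of same-class pairs
K_sym = (K + K.T) / 2.0
print("Mean within-class similarity: %.3f" % K_sym[same_class].mean())
print("Mean between-class similarity: %.3f" % K_sym[~same_class].mean())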
Perform dimensionality reduction with Kernel PCA using TCK as the kernel.
kpca = KernelPCA(n_components=2, kernel='precomputed')  # use the TCK matrix as a precomputed kernel
embeddings_pca = kpca.fit_transform(K)
fig = plt.figure(figsize=(8,6))
plt.scatter(embeddings_pca[:,0], embeddings_pca[:,1], c=Y[:,0], s=10, cmap='tab20')
plt.title("Kernel PCA embeddings")
plt.show()
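Under the hood, kernel PCA double-centers the Gram matrix and scales its leading eigenvectors by the square root of the corresponding eigenvalues; a minimal manual sketch of the same computation (it should match the scikit-learn embedding up to sign):
n = K.shape[0]
H = np.eye(n) - np.ones((n, n)) / n  # centering matrix
w, v = np.linalg.eigh(H @ K @ H)     # eigenvalues in ascending order
emb_manual = v[:, :-3:-1] * np.sqrt(np.maximum(w[:-3:-1], 0))  # top-2 components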
Compute the dissimilarity matrix needed for clustering.
Dist = 1.0 - (K+K.T)/(2.0*K.max())  # symmetrize K and rescale it into a dissimilarity
np.fill_diagonal(Dist, 0)  # due to numerical errors, the diagonal might not be exactly 0
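The result should be a valid dissimilarity matrix: symmetric, zero on the diagonal, and with entries in [0, 1] when K is non-negative, as TCK matrices are. An optional check:
assert np.allclose(Dist, Dist.T)
print("Dist range: [%.3f, %.3f]" % (Dist.min(), Dist.max()))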
Hierarchical clustering.
distArray = ssd.squareform(Dist)  # condensed (vector-form) distances required by linkage
Z = linkage(distArray, 'ward')    # note: Ward linkage formally assumes Euclidean distances
clust = fcluster(Z, t=2.0, criterion="distance")  # cut the dendrogram at distance 2.0
print("Found %d clusters"%len(np.unique(clust)))
Found 11 clusters
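Alternatively, instead of cutting the dendrogram at a fixed distance, fcluster can be asked for a fixed number of clusters, e.g. as many clusters as there are classes (a variant of the step above, using scipy's maxclust criterion):
n_classes = len(np.unique(Y))
clust_k = fcluster(Z, t=n_classes, criterion="maxclust")
print("Found %d clusters" % len(np.unique(clust_k)))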
Evaluate the agreement between class and cluster labels.
nmi = v_measure_score(Y[:,0], clust)
print("Normalized Mutual Information (v-score): %.3f"%nmi)
Normalized Mutual Information (v-score): 0.759
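The v-measure is equivalent to NMI with arithmetic averaging, which can be cross-checked directly (a sketch; average_method='arithmetic' is the default in recent scikit-learn versions):
from sklearn.metrics import normalized_mutual_info_score
print("NMI (direct): %.3f" % normalized_mutual_info_score(Y[:, 0], clust, average_method='arithmetic'))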
Plot the dendrogram.
fig = plt.figure(figsize=(20, 10))
dn = dendrogram(Z, color_threshold=2.0, labels=None, above_threshold_color='k')  # links above the threshold are drawn in black
plt.xticks([])
plt.show()
print("N. clusters: ", np.unique(dn['color_list']).shape[0]-1)
N. clusters: 11