#!/usr/bin/env python
# coding: utf-8

# # Clustering Validation
#
# We will play with data blobs with different characteristics

# In[40]:

from pylab import *
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from amltlearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
get_ipython().run_line_magic('matplotlib', 'notebook')

# Three Gaussian blobs in three dimensions
blobs, blabels = make_blobs(n_samples=200, n_features=3,
                            centers=[[1, 1, 1], [0, 0, 0], [-1, -1, -1]],
                            cluster_std=[0.2, 0.1, 0.3])

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
plt.scatter(blobs[:, 0], blobs[:, 1], zs=blobs[:, 2], depthshade=False, c=blabels, s=100)
plt.show()

# These are very well separated blobs, very easy to discover using k-means

# In[41]:

km = KMeans(n_clusters=3, n_init=10, random_state=0)
labels = km.fit_predict(blobs)

# In[42]:

from amltlearn.metrics.cluster import calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import adjusted_mutual_info_score, silhouette_score

# Internal indices (Silhouette, Calinski-Harabasz, Davies-Bouldin) for k = 2..5
lscores = []
nclusters = 5
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(blobs)
    lscores.append((silhouette_score(blobs, labels),
                    calinski_harabasz_score(blobs, labels),
                    davies_bouldin_score(blobs, labels)))

# Left to right: Silhouette, CH, DB
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x for x, _, _ in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x for _, x, _ in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x for _, _, x in lscores])
plt.show()

# The Silhouette (higher is better), CH (higher is better) and DB (lower is better) indices agree that there are three clusters in the data
#
# Let's introduce some overlap among the clusters

# In[43]:

blobs, blabels = make_blobs(n_samples=200, n_features=3,
                            centers=[[1, 1, 1], [0, 0, 0], [-1, -1, -1]],
                            cluster_std=[0.4, 0.45, 0.3])

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.view_init(60, 90)
plt.scatter(blobs[:, 0], blobs[:, 1], zs=blobs[:, 2], depthshade=False, c=blabels, s=100)
plt.show()

# Now it is a little bit harder for k-means to discover the clusters

# In[44]:

km = KMeans(n_clusters=3, n_init=10, random_state=0)
labels = km.fit_predict(blobs)
print(adjusted_mutual_info_score(blabels, labels))

# In[45]:

lscores = []
nclusters = 5
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(blobs)
    lscores.append((silhouette_score(blobs, labels),
                    calinski_harabasz_score(blobs, labels),
                    davies_bouldin_score(blobs, labels)))

# Left to right: Silhouette, CH, DB
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x for x, _, _ in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x for _, x, _ in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x for _, _, x in lscores])
plt.show()

# Now only the CH index says that there are three clusters in the dataset; Silhouette and DB only see two.
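# As a quick cross-check (a minimal sketch added here, not part of the original notebook): since
# the blobs are synthetic we do know the true labels, so we can compare each k-means partition
# with them using the adjusted mutual information already imported above; this agreement would be
# expected to peak around k=3 even when the internal indices disagree.

# In[ ]:

for nc in range(2, 6):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(blobs)
    print(nc, adjusted_mutual_info_score(blabels, labels))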
# In[46]:

from amltlearn.metrics import scatter_matrices_scores

# Indices based on the within/between scatter matrices for k = 2..5
lscores = []
nclusters = 5
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(blobs)
    lscores.append(scatter_matrices_scores(blobs, labels, indices=['Hartigan', 'Xu', 'ZCF']))

# Left to right: Hartigan, ZCF, Xu
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x['Hartigan'] for x in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x['ZCF'] for x in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x['Xu'] for x in lscores])
plt.show()

# These are the classical Hartigan index (frequently used for hierarchical clustering) and two other recently published indices
#
# GMM does not have any more luck discovering the noisy clusters.

# In[47]:

gmm = GaussianMixture(n_components=3, covariance_type='diag', random_state=0)
gmm.fit(blobs)
labels = gmm.predict(blobs)
print(adjusted_mutual_info_score(blabels, labels))

# In[48]:

lscores = []
nclusters = 5
for nc in range(2, nclusters + 1):
    gmm = GaussianMixture(n_components=nc, covariance_type='diag', random_state=0)
    gmm.fit(blobs)
    labels = gmm.predict(blobs)
    lscores.append((silhouette_score(blobs, labels),
                    calinski_harabasz_score(blobs, labels),
                    davies_bouldin_score(blobs, labels)))

# Left to right: Silhouette, CH, DB
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x for x, _, _ in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x for _, x, _ in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x for _, _, x in lscores])
plt.show()

# And the indices give similar results
#
# Now for non-linear data

# In[49]:

from sklearn.datasets import make_moons

moons, mlabels = make_moons(n_samples=200, noise=0.1)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
plt.scatter(moons[:, 0], moons[:, 1], c=mlabels, s=100)
plt.show()

# Obviously the clusters obtained by k-means have little to do with the actual labels

# In[50]:

km = KMeans(n_clusters=2, n_init=10, random_state=0)
labels = km.fit_predict(moons)
print(adjusted_mutual_info_score(mlabels, labels))

# In[51]:

lscores = []
nclusters = 20
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(moons)
    lscores.append((silhouette_score(moons, labels),
                    calinski_harabasz_score(moons, labels),
                    davies_bouldin_score(moons, labels)))

# Left to right: Silhouette, CH, DB
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x for x, _, _ in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x for _, x, _ in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x for _, _, x in lscores])
plt.show()

# Now the scores point to a high number of clusters: Silhouette and DB are close to agreeing on (maybe) 8 clusters, while for CH the more the better (12-13?)
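# A small illustrative sketch (added here, not in the original notebook): plotting the k=2 k-means
# partition of the moons next to the true labels makes it clear why the internal indices are misled
# by the non-convex cluster shapes; `klabels` is just a local name used for this sketch.

# In[ ]:

km = KMeans(n_clusters=2, n_init=10, random_state=0)
klabels = km.fit_predict(moons)

fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(121)
ax.scatter(moons[:, 0], moons[:, 1], c=mlabels, s=50)
ax.set_title('true labels')
ax = fig.add_subplot(122)
ax.scatter(moons[:, 0], moons[:, 1], c=klabels, s=50)
ax.set_title('k-means, k=2')
plt.show()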
# In[52]:

lscores = []
nclusters = 20
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(moons)
    lscores.append(scatter_matrices_scores(moons, labels, indices=['Hartigan', 'Xu', 'ZCF']))

# Left to right: Hartigan, ZCF, Xu
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x['Hartigan'] for x in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x['ZCF'] for x in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x['Xu'] for x in lscores])
plt.show()

# The other indices also agree that the more clusters the better
#
# We can apply a non-linear transformation to the data

# In[53]:

from sklearn.manifold import Isomap

iso = Isomap(n_components=2, n_neighbors=7)
fdata = iso.fit_transform(moons)

km = KMeans(n_clusters=2, n_init=10, random_state=0)
labels = km.fit_predict(fdata)
print(adjusted_mutual_info_score(mlabels, labels))

# In[54]:

lscores = []
nclusters = 20
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(fdata)
    lscores.append((silhouette_score(fdata, labels),
                    calinski_harabasz_score(fdata, labels),
                    davies_bouldin_score(fdata, labels)))

# Left to right: Silhouette, CH, DB
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x for x, _, _ in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x for _, x, _ in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x for _, _, x in lscores])
plt.show()

# But the results are not much better

# In[55]:

lscores = []
nclusters = 20
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(fdata)
    lscores.append(scatter_matrices_scores(fdata, labels, indices=['Hartigan', 'Xu', 'ZCF']))

# Left to right: Hartigan, ZCF, Xu
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x['Hartigan'] for x in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x['ZCF'] for x in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x['Xu'] for x in lscores])
plt.show()

# Nor are they for any of the other indices

# # External Indices
#
# Another approach to cluster validation is to use external indices. This assumes that we know the true labels and we compute how "correlated" they are with the ones discovered.

# In[56]:

blobs, blabels = make_blobs(n_samples=500, n_features=10, centers=6, cluster_std=.6, center_box=(-3, 3))

# Plot only the first three of the ten features
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
plt.scatter(blobs[:, 0], blobs[:, 1], zs=blobs[:, 2], depthshade=False, c=blabels, s=100)
plt.show()

# In[57]:

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# External indices (AMI, ARI, NMI) against the true labels for k = 2..8
lscores = []
nclusters = 8
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(blobs)
    lscores.append((adjusted_mutual_info_score(blabels, labels),
                    adjusted_rand_score(blabels, labels),
                    normalized_mutual_info_score(blabels, labels)))

# Left to right: AMI, ARI, NMI
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x for x, _, _ in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x for _, x, _ in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x for _, _, x in lscores])
plt.show()

# In this case the maximum value of the scores is at the correct number of clusters
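# A minimal sketch (added for illustration, not in the original notebook): the contingency matrix
# between the true blob labels and the k=6 k-means partition shows how the discovered clusters map
# onto the true ones, which is what the external scores above summarize in a single number.
# contingency_matrix comes from sklearn.metrics.cluster.

# In[ ]:

from sklearn.metrics.cluster import contingency_matrix

km = KMeans(n_clusters=6, n_init=10, random_state=0)
labels = km.fit_predict(blobs)
print(contingency_matrix(blabels, labels))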
# We do not always have a set of labels to compare with. Another approach is to take advantage of algorithms that return different partitions depending on the initialization (e.g. k-means) and test how close the labelings are to each other; the correct number of clusters should be where the different partitions have the highest agreement.

# In[58]:

import numpy as np

lscores = []
nclusters = 8
for nc in range(2, nclusters + 1):
    # Cluster the data 10 times with different (random) initializations
    llabels = []
    for i in range(10):
        km = KMeans(n_clusters=nc, n_init=10)
        labels = km.fit_predict(blobs)
        llabels.append(labels)
    # Average pairwise AMI between the 10 partitions (self-comparisons excluded)
    mscores = []
    for i in range(10):
        for j in range(i + 1, 10):
            mscores.append(adjusted_mutual_info_score(llabels[i], llabels[j]))
    lscores.append(np.mean(mscores))

fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(121)
plt.plot(range(2, nclusters + 1), lscores)
plt.show()

# In this case there is a range of possible numbers of clusters with high agreement.
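# The same stability idea works with any external index. A minimal sketch (added for illustration,
# reusing the blobs, `nclusters` and the imports from the cells above) that uses the adjusted Rand
# score as the pairwise agreement measure instead of AMI:

# In[ ]:

lscores = []
for nc in range(2, nclusters + 1):
    # 10 k-means runs with different random initializations
    llabels = [KMeans(n_clusters=nc, n_init=10).fit_predict(blobs) for _ in range(10)]
    # Mean pairwise adjusted Rand index between the runs
    mscores = [adjusted_rand_score(llabels[i], llabels[j])
               for i in range(10) for j in range(i + 1, 10)]
    lscores.append(np.mean(mscores))
print(lscores)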