#!/usr/bin/env python
# coding: utf-8

# # Clustering Validation
#
# We will play with data blobs with different characteristics

# In[40]:

from pylab import *
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from amltlearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
get_ipython().run_line_magic('matplotlib', 'notebook')

# Three Gaussian blobs in three dimensions
blobs, blabels = make_blobs(n_samples=200, n_features=3,
                            centers=[[1, 1, 1], [0, 0, 0], [-1, -1, -1]],
                            cluster_std=[0.2, 0.1, 0.3])

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
plt.scatter(blobs[:, 0], blobs[:, 1], zs=blobs[:, 2], depthshade=False, c=blabels, s=100)
plt.show()

# These are very well separated blobs, very easy to discover using k-means

# In[41]:

km = KMeans(n_clusters=3, n_init=10, random_state=0)
labels = km.fit_predict(blobs)

# In[42]:

from amltlearn.metrics.cluster import calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import adjusted_mutual_info_score, silhouette_score

# Internal indices (Silhouette, Calinski-Harabasz, Davies-Bouldin) for k = 2..5
lscores = []
nclusters = 5
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(blobs)
    lscores.append((silhouette_score(blobs, labels),
                    calinski_harabasz_score(blobs, labels),
                    davies_bouldin_score(blobs, labels)))

# Left to right: Silhouette, CH, DB
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x for x, _, _ in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x for _, x, _ in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x for _, _, x in lscores])
plt.show()

# The Silhouette (higher is better), CH (higher is better) and DB (lower is better) indices agree that there are three clusters in the data
#
# Let's introduce some overlap among the clusters

# In[43]:

blobs, blabels = make_blobs(n_samples=200, n_features=3,
                            centers=[[1, 1, 1], [0, 0, 0], [-1, -1, -1]],
                            cluster_std=[0.4, 0.45, 0.3])

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.view_init(60, 90)
plt.scatter(blobs[:, 0], blobs[:, 1], zs=blobs[:, 2], depthshade=False, c=blabels, s=100)
plt.show()

# Now it is a little bit harder for k-means to discover the clusters

# In[44]:

km = KMeans(n_clusters=3, n_init=10, random_state=0)
labels = km.fit_predict(blobs)
print(adjusted_mutual_info_score(blabels, labels))

# In[45]:

lscores = []
nclusters = 5
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(blobs)
    lscores.append((silhouette_score(blobs, labels),
                    calinski_harabasz_score(blobs, labels),
                    davies_bouldin_score(blobs, labels)))

# Left to right: Silhouette, CH, DB
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x for x, _, _ in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x for _, x, _ in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x for _, _, x in lscores])
plt.show()

# Now only the CH index says that there are three clusters in the dataset; Silhouette and DB only see two.
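# As a quick cross-check (a minimal sketch added here, not part of the original notebook): since
# the blobs are synthetic we do know the true labels, so we can compare each k-means partition
# with them using the adjusted mutual information already imported above; this agreement would be
# expected to peak around k=3 even when the internal indices disagree.

# In[ ]:

for nc in range(2, 6):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(blobs)
    print(nc, adjusted_mutual_info_score(blabels, labels))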
# In[46]:

from amltlearn.metrics import scatter_matrices_scores

# Indices based on the within/between scatter matrices for k = 2..5
lscores = []
nclusters = 5
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(blobs)
    lscores.append(scatter_matrices_scores(blobs, labels, indices=['Hartigan', 'Xu', 'ZCF']))

# Left to right: Hartigan, ZCF, Xu
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x['Hartigan'] for x in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x['ZCF'] for x in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x['Xu'] for x in lscores])
plt.show()

# These are the classical Hartigan index (frequently used for hierarchical clustering) and two other recently published indices
#
# GMM does not have any more luck discovering the noisy clusters.

# In[47]:

gmm = GaussianMixture(n_components=3, covariance_type='diag', random_state=0)
gmm.fit(blobs)
labels = gmm.predict(blobs)
print(adjusted_mutual_info_score(blabels, labels))

# In[48]:

lscores = []
nclusters = 5
for nc in range(2, nclusters + 1):
    gmm = GaussianMixture(n_components=nc, covariance_type='diag', random_state=0)
    gmm.fit(blobs)
    labels = gmm.predict(blobs)
    lscores.append((silhouette_score(blobs, labels),
                    calinski_harabasz_score(blobs, labels),
                    davies_bouldin_score(blobs, labels)))

# Left to right: Silhouette, CH, DB
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x for x, _, _ in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x for _, x, _ in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x for _, _, x in lscores])
plt.show()

# And the indices give similar results
#
# Now for non-linear data

# In[49]:

from sklearn.datasets import make_moons

moons, mlabels = make_moons(n_samples=200, noise=0.1)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
plt.scatter(moons[:, 0], moons[:, 1], c=mlabels, s=100)
plt.show()

# Obviously the clusters obtained by k-means have little to do with the actual labels

# In[50]:

km = KMeans(n_clusters=2, n_init=10, random_state=0)
labels = km.fit_predict(moons)
print(adjusted_mutual_info_score(mlabels, labels))

# In[51]:

lscores = []
nclusters = 20
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(moons)
    lscores.append((silhouette_score(moons, labels),
                    calinski_harabasz_score(moons, labels),
                    davies_bouldin_score(moons, labels)))

# Left to right: Silhouette, CH, DB
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x for x, _, _ in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x for _, x, _ in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x for _, _, x in lscores])
plt.show()

# Now the scores point to a high number of clusters: Silhouette and DB are close to agreeing on (maybe) 8 clusters, while for CH the more the better (12-13?)
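# A small illustrative sketch (added here, not in the original notebook): plotting the k=2 k-means
# partition of the moons next to the true labels makes it clear why the internal indices are misled
# by the non-convex cluster shapes; `klabels` is just a local name used for this sketch.

# In[ ]:

km = KMeans(n_clusters=2, n_init=10, random_state=0)
klabels = km.fit_predict(moons)

fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(121)
ax.scatter(moons[:, 0], moons[:, 1], c=mlabels, s=50)
ax.set_title('true labels')
ax = fig.add_subplot(122)
ax.scatter(moons[:, 0], moons[:, 1], c=klabels, s=50)
ax.set_title('k-means, k=2')
plt.show()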
# In[52]:

lscores = []
nclusters = 20
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(moons)
    lscores.append(scatter_matrices_scores(moons, labels, indices=['Hartigan', 'Xu', 'ZCF']))

# Left to right: Hartigan, ZCF, Xu
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x['Hartigan'] for x in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x['ZCF'] for x in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x['Xu'] for x in lscores])
plt.show()

# The other indices also agree that the more clusters the better
#
# We can apply a non-linear transformation to the data

# In[53]:

from sklearn.manifold import Isomap

iso = Isomap(n_components=2, n_neighbors=7)
fdata = iso.fit_transform(moons)

km = KMeans(n_clusters=2, n_init=10, random_state=0)
labels = km.fit_predict(fdata)
print(adjusted_mutual_info_score(mlabels, labels))

# In[54]:

lscores = []
nclusters = 20
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(fdata)
    lscores.append((silhouette_score(fdata, labels),
                    calinski_harabasz_score(fdata, labels),
                    davies_bouldin_score(fdata, labels)))

# Left to right: Silhouette, CH, DB
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x for x, _, _ in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x for _, x, _ in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x for _, _, x in lscores])
plt.show()

# But the results are not much better

# In[55]:

lscores = []
nclusters = 20
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(fdata)
    lscores.append(scatter_matrices_scores(fdata, labels, indices=['Hartigan', 'Xu', 'ZCF']))

# Left to right: Hartigan, ZCF, Xu
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x['Hartigan'] for x in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x['ZCF'] for x in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x['Xu'] for x in lscores])
plt.show()

# Nor are they for any of the other indices

# # External Indices
#
# Another approach to cluster validation is to use external indices. This assumes that we know the true labels and we compute how "correlated" they are with the ones discovered.

# In[56]:

blobs, blabels = make_blobs(n_samples=500, n_features=10, centers=6, cluster_std=.6, center_box=(-3, 3))

# Plot only the first three of the ten features
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
plt.scatter(blobs[:, 0], blobs[:, 1], zs=blobs[:, 2], depthshade=False, c=blabels, s=100)
plt.show()

# In[57]:

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# External indices (AMI, ARI, NMI) against the true labels for k = 2..8
lscores = []
nclusters = 8
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(blobs)
    lscores.append((adjusted_mutual_info_score(blabels, labels),
                    adjusted_rand_score(blabels, labels),
                    normalized_mutual_info_score(blabels, labels)))

# Left to right: AMI, ARI, NMI
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
plt.plot(range(2, nclusters + 1), [x for x, _, _ in lscores])
ax = fig.add_subplot(132)
plt.plot(range(2, nclusters + 1), [x for _, x, _ in lscores])
ax = fig.add_subplot(133)
plt.plot(range(2, nclusters + 1), [x for _, _, x in lscores])
plt.show()

# In this case the maximum value of the scores is at the correct number of clusters
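# A minimal sketch (added for illustration, not in the original notebook): the contingency matrix
# between the true blob labels and the k=6 k-means partition shows how the discovered clusters map
# onto the true ones, which is what the external scores above summarize in a single number.
# contingency_matrix comes from sklearn.metrics.cluster.

# In[ ]:

from sklearn.metrics.cluster import contingency_matrix

km = KMeans(n_clusters=6, n_init=10, random_state=0)
labels = km.fit_predict(blobs)
print(contingency_matrix(blabels, labels))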
# We do not always have a set of labels to compare with. Another approach is to take advantage of algorithms that return different partitions depending on the initialization (e.g. k-means) and test how close the labelings are to each other; the correct number of clusters should be where the different partitions have the highest agreement.

# In[58]:

import numpy as np

lscores = []
nclusters = 8
for nc in range(2, nclusters + 1):
    # Cluster the data 10 times with different (random) initializations
    llabels = []
    for i in range(10):
        km = KMeans(n_clusters=nc, n_init=10)
        labels = km.fit_predict(blobs)
        llabels.append(labels)
    # Average pairwise AMI between the 10 partitions (self-comparisons excluded)
    mscores = []
    for i in range(10):
        for j in range(i + 1, 10):
            mscores.append(adjusted_mutual_info_score(llabels[i], llabels[j]))
    lscores.append(np.mean(mscores))

fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(121)
plt.plot(range(2, nclusters + 1), lscores)
plt.show()

# In this case there is a range of possible numbers of clusters with high agreement.
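# The same stability idea works with any external index. A minimal sketch (added for illustration,
# reusing the blobs, `nclusters` and the imports from the cells above) that uses the adjusted Rand
# score as the pairwise agreement measure instead of AMI:

# In[ ]:

lscores = []
for nc in range(2, nclusters + 1):
    # 10 k-means runs with different random initializations
    llabels = [KMeans(n_clusters=nc, n_init=10).fit_predict(blobs) for _ in range(10)]
    # Mean pairwise adjusted Rand index between the runs
    mscores = [adjusted_rand_score(llabels[i], llabels[j])
               for i in range(10) for j in range(i + 1, 10)]
    lscores.append(np.mean(mscores))
print(lscores)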