#!/usr/bin/env python
# coding: utf-8

# [Sebastian Raschka](http://sebastianraschka.com), 2015
#
# https://github.com/rasbt/python-machine-learning-book

# # Python Machine Learning - Code Examples

# # Chapter 11 - Working with Unlabeled Data – Clustering Analysis

# Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).

# In[1]:

get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -u -d -v -p numpy,pandas,matplotlib,scipy,scikit-learn")


# In[ ]:

# to install watermark just uncomment the following line:
#%install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
#
#
# ### Overview
#
# - [Grouping objects by similarity using k-means](#Grouping-objects-by-similarity-using-k-means)
# - [K-means++](#K-means++)
# - [Hard versus soft clustering](#Hard-versus-soft-clustering)
# - [Using the elbow method to find the optimal number of clusters](#Using-the-elbow-method-to-find-the-optimal-number-of-clusters)
# - [Quantifying the quality of clustering via silhouette plots](#Quantifying-the-quality-of-clustering-via-silhouette-plots)
# - [Organizing clusters as a hierarchical tree](#Organizing-clusters-as-a-hierarchical-tree)
# - [Performing hierarchical clustering on a distance matrix](#Performing-hierarchical-clustering-on-a-distance-matrix)
# - [Attaching dendrograms to a heat map](#Attaching-dendrograms-to-a-heat-map)
# - [Applying agglomerative clustering via scikit-learn](#Applying-agglomerative-clustering-via-scikit-learn)
# - [Locating regions of high density via DBSCAN](#Locating-regions-of-high-density-via-DBSCAN)
# - [Summary](#Summary)
#
#
# In[1]:

from IPython.display import Image


# # Grouping objects by similarity using k-means

# In[1]:

from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=150,
                  n_features=2,
                  centers=3,
                  cluster_std=0.5,
                  shuffle=True,
                  random_state=0)


# In[2]:

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

plt.scatter(X[:, 0], X[:, 1], c='white', marker='o', s=50)
plt.grid()
plt.tight_layout()
#plt.savefig('./figures/spheres.png', dpi=300)
plt.show()


# In[3]:

from sklearn.cluster import KMeans

km = KMeans(n_clusters=3,
            init='random',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

plt.scatter(X[y_km == 0, 0], X[y_km == 0, 1],
            s=50, c='lightgreen', marker='s', label='cluster 1')
plt.scatter(X[y_km == 1, 0], X[y_km == 1, 1],
            s=50, c='orange', marker='o', label='cluster 2')
plt.scatter(X[y_km == 2, 0], X[y_km == 2, 1],
            s=50, c='lightblue', marker='v', label='cluster 3')
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
            s=250, marker='*', c='red', label='centroids')
plt.legend()
plt.grid()
plt.tight_layout()
#plt.savefig('./figures/centroids.png', dpi=300)
plt.show()

#
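# A small aside (not part of the original notebook): once fitted, the `KMeans` estimator can also assign new, unseen samples to the learned centroids via its `predict` method. The two points below are made-up coordinates for illustration only.

# In[ ]:

import numpy as np

# minimal sketch: assign two hypothetical new points to the nearest centroid
X_new = np.array([[2.0, 1.0],     # made-up coordinates
                  [-1.5, 3.0]])
print('Predicted clusters: %s' % km.predict(X_new))

#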
# ## K-means++

# ...

# ## Hard versus soft clustering

# ...
# (a minimal soft-assignment sketch is included after the elbow example below)

# ## Using the elbow method to find the optimal number of clusters

# In[4]:

print('Distortion: %.2f' % km.inertia_)


# In[5]:

distortions = []
for i in range(1, 11):
    km = KMeans(n_clusters=i,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=0)
    km.fit(X)
    distortions.append(km.inertia_)
plt.plot(range(1, 11), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.tight_layout()
#plt.savefig('./figures/elbow.png', dpi=300)
plt.show()

#
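# The "Hard versus soft clustering" section above is only sketched in this notebook. As a rough illustration (my addition, not the book's fuzzy C-means implementation), `KMeans.transform` returns each sample's distance to every centroid, and those distances can be converted into soft, probability-like memberships:

# In[ ]:

# minimal soft-assignment sketch: turn centroid distances into
# normalized "membership" weights (smaller distance -> larger weight)
km3 = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=0)
km3.fit(X)
dist = km3.transform(X)                       # shape: (n_samples, n_clusters)
inv = 1.0 / (dist + 1e-12)                    # guard against division by zero
memberships = inv / inv.sum(axis=1, keepdims=True)
print(memberships[:5].round(3))               # soft memberships of the first 5 samples

#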
# ## Quantifying the quality of clustering via silhouette plots

# In[6]:

import numpy as np
from matplotlib import cm
from sklearn.metrics import silhouette_samples

km = KMeans(n_clusters=3,
            init='k-means++',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(i / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper),
             c_silhouette_vals,
             height=1.0,
             edgecolor='none',
             color=color)
    yticks.append((y_ax_lower + y_ax_upper) / 2)
    y_ax_lower += len(c_silhouette_vals)

silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.tight_layout()
# plt.savefig('./figures/silhouette.png', dpi=300)
plt.show()


# Comparison to "bad" clustering:

# In[7]:

km = KMeans(n_clusters=2,
            init='k-means++',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

plt.scatter(X[y_km == 0, 0], X[y_km == 0, 1],
            s=50, c='lightgreen', marker='s', label='cluster 1')
plt.scatter(X[y_km == 1, 0], X[y_km == 1, 1],
            s=50, c='orange', marker='o', label='cluster 2')
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
            s=250, marker='*', c='red', label='centroids')
plt.legend()
plt.grid()
plt.tight_layout()
#plt.savefig('./figures/centroids_bad.png', dpi=300)
plt.show()


# In[8]:

cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(i / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper),
             c_silhouette_vals,
             height=1.0,
             edgecolor='none',
             color=color)
    yticks.append((y_ax_lower + y_ax_upper) / 2)
    y_ax_lower += len(c_silhouette_vals)

silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.tight_layout()
# plt.savefig('./figures/silhouette_bad.png', dpi=300)
plt.show()

#
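# A small addition (not in the original notebook): scikit-learn also provides `silhouette_score`, which computes the average silhouette coefficient directly and is a convenient single-number sanity check alongside the plots above.

# In[ ]:

from sklearn.metrics import silhouette_score

# average silhouette coefficient of the (deliberately bad) 2-cluster solution above
print('Average silhouette: %.3f' % silhouette_score(X, y_km, metric='euclidean'))

#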
#
# # Organizing clusters as a hierarchical tree

# In[4]:

Image(filename='./images/11_05.png', width=400)


# In[9]:

import pandas as pd
import numpy as np

np.random.seed(123)

variables = ['X', 'Y', 'Z']
labels = ['ID_0', 'ID_1', 'ID_2', 'ID_3', 'ID_4']

X = np.random.random_sample([5, 3]) * 10
df = pd.DataFrame(X, columns=variables, index=labels)
df

#
# ## Performing hierarchical clustering on a distance matrix

# In[10]:

from scipy.spatial.distance import pdist, squareform

row_dist = pd.DataFrame(squareform(pdist(df, metric='euclidean')),
                        columns=labels, index=labels)
row_dist


# We can either pass a condensed distance matrix (upper triangular) from the `pdist` function, or we can pass the "original" data array and define the `'euclidean'` metric as a function argument in `linkage`. However, we should not pass the squareform distance matrix, which would yield different distance values although the overall clustering could be the same.

# In[11]:

# 1. incorrect approach: Squareform distance matrix
from scipy.cluster.hierarchy import linkage

row_clusters = linkage(row_dist, method='complete', metric='euclidean')
pd.DataFrame(row_clusters,
             columns=['row label 1', 'row label 2', 'distance', 'no. of items in clust.'],
             index=['cluster %d' % (i + 1) for i in range(row_clusters.shape[0])])


# In[12]:

# 2. correct approach: Condensed distance matrix
row_clusters = linkage(pdist(df, metric='euclidean'), method='complete')
pd.DataFrame(row_clusters,
             columns=['row label 1', 'row label 2', 'distance', 'no. of items in clust.'],
             index=['cluster %d' % (i + 1) for i in range(row_clusters.shape[0])])


# In[13]:

# 3. correct approach: Input sample matrix
row_clusters = linkage(df.values, method='complete', metric='euclidean')
pd.DataFrame(row_clusters,
             columns=['row label 1', 'row label 2', 'distance', 'no. of items in clust.'],
             index=['cluster %d' % (i + 1) for i in range(row_clusters.shape[0])])


# In[14]:

from scipy.cluster.hierarchy import dendrogram

# make dendrogram black (part 1/2)
# from scipy.cluster.hierarchy import set_link_color_palette
# set_link_color_palette(['black'])

row_dendr = dendrogram(row_clusters,
                       labels=labels,
                       # make dendrogram black (part 2/2)
                       # color_threshold=np.inf
                       )
plt.tight_layout()
plt.ylabel('Euclidean distance')
#plt.savefig('./figures/dendrogram.png', dpi=300,
#            bbox_inches='tight')
plt.show()

#
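# As an optional follow-up (not part of the original notebook), SciPy's `fcluster` can cut the linkage matrix computed above into a flat cluster assignment, for example by fixing the maximum number of clusters:

# In[ ]:

from scipy.cluster.hierarchy import fcluster

# cut the dendrogram so that at most two flat clusters remain
flat_clusters = fcluster(row_clusters, t=2, criterion='maxclust')
print(dict(zip(labels, flat_clusters)))

#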
# ## Attaching dendrograms to a heat map

# In[15]:

# plot row dendrogram
fig = plt.figure(figsize=(8, 8))
axd = fig.add_axes([0.09, 0.1, 0.2, 0.6])
row_dendr = dendrogram(row_clusters, orientation='right')

# reorder data with respect to clustering
# (note: the dendrogram leaves are integer positions, so positional
# indexing via `iloc` is used here; `df.ix` is deprecated)
df_rowclust = df.iloc[row_dendr['leaves'][::-1]]

axd.set_xticks([])
axd.set_yticks([])

# remove axes spines from dendrogram
for i in axd.spines.values():
    i.set_visible(False)

# plot heatmap
axm = fig.add_axes([0.23, 0.1, 0.6, 0.6])  # x-pos, y-pos, width, height
cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r')
fig.colorbar(cax)
axm.set_xticklabels([''] + list(df_rowclust.columns))
axm.set_yticklabels([''] + list(df_rowclust.index))

# plt.savefig('./figures/heatmap.png', dpi=300)
plt.show()

#
# ## Applying agglomerative clustering via scikit-learn

# In[16]:

from sklearn.cluster import AgglomerativeClustering

ac = AgglomerativeClustering(n_clusters=2,
                             affinity='euclidean',
                             linkage='complete')
labels = ac.fit_predict(X)
print('Cluster labels: %s' % labels)

#
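# A brief aside (my addition, not from the original notebook): rerunning the same estimator with `n_clusters=3` shows how the flat partition of this small dataset changes. Note that in recent scikit-learn releases the `affinity` parameter has been renamed to `metric`.

# In[ ]:

# same 5x3 toy dataset, three clusters instead of two
ac3 = AgglomerativeClustering(n_clusters=3,
                              affinity='euclidean',
                              linkage='complete')
print('Cluster labels: %s' % ac3.fit_predict(X))

#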
#
# # Locating regions of high density via DBSCAN

# In[5]:

Image(filename='./images/11_11.png', width=500)


# In[17]:

from sklearn.datasets import make_moons

X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
plt.scatter(X[:, 0], X[:, 1])
plt.tight_layout()
#plt.savefig('./figures/moons.png', dpi=300)
plt.show()


# K-means and hierarchical clustering:

# In[18]:

f, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))

km = KMeans(n_clusters=2, random_state=0)
y_km = km.fit_predict(X)
ax1.scatter(X[y_km == 0, 0], X[y_km == 0, 1],
            c='lightblue', marker='o', s=40, label='cluster 1')
ax1.scatter(X[y_km == 1, 0], X[y_km == 1, 1],
            c='red', marker='s', s=40, label='cluster 2')
ax1.set_title('K-means clustering')

ac = AgglomerativeClustering(n_clusters=2,
                             affinity='euclidean',
                             linkage='complete')
y_ac = ac.fit_predict(X)
ax2.scatter(X[y_ac == 0, 0], X[y_ac == 0, 1],
            c='lightblue', marker='o', s=40, label='cluster 1')
ax2.scatter(X[y_ac == 1, 0], X[y_ac == 1, 1],
            c='red', marker='s', s=40, label='cluster 2')
ax2.set_title('Agglomerative clustering')

plt.legend()
plt.tight_layout()
#plt.savefig('./figures/kmeans_and_ac.png', dpi=300)
plt.show()


# Density-based clustering:

# In[19]:

from sklearn.cluster import DBSCAN

db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
y_db = db.fit_predict(X)
plt.scatter(X[y_db == 0, 0], X[y_db == 0, 1],
            c='lightblue', marker='o', s=40, label='cluster 1')
plt.scatter(X[y_db == 1, 0], X[y_db == 1, 1],
            c='red', marker='s', s=40, label='cluster 2')
plt.legend()
plt.tight_layout()
#plt.savefig('./figures/moons_dbscan.png', dpi=300)
plt.show()

#
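# A small addition (not in the original notebook): DBSCAN labels samples that do not fall inside any dense region as noise, using the special label -1, so it is worth checking how many points were flagged for the chosen `eps` and `min_samples`.

# In[ ]:

import numpy as np

# count noise points (labeled -1 by DBSCAN); with eps=0.2 on this
# low-noise two-moons dataset the count may well be zero
n_noise = np.sum(y_db == -1)
print('Number of noise points: %d' % n_noise)

#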
#
# # Summary

# ...