#!/usr/bin/env python
# coding: utf-8

# ### Looking at cluster consistency
#
# After clustering some data, you often want to (and should!) inspect the results. Obviously,
# visualization is a big part of this. It is also helpful to look at some representative examples.
# HDBSCAN provides methods to get an approximate representative point for each cluster. It does
# this by calculating the centroid/medoid for each cluster, but weighting each point by its cluster
# membership strength (i.e. the probability of being in that cluster). One way to get
# representative samples for a cluster is to look for the points closest to and furthest from this
# representative point. This allows you to look at the cluster and determine how consistent it is.
# For example, when clustering text this might mean "are the documents in my cluster talking about
# one topic?".
#
# Let's get some test data and cluster it:

# In[1]:


import pandas as pd
import numpy as np
import hdbscan
from scipy.spatial.distance import cdist

# Some plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'notebook')
sns.set_context('poster')
sns.set_color_codes()
# Shared scatter styling: translucent, borderless markers.
plot_kwds = dict(alpha=0.25, s=40, linewidths=0)


# In[2]:


data = np.load('clusterable_data.npy')


# In[3]:


clusterer = hdbscan.HDBSCAN(min_cluster_size=15)
clusterer.fit(data)
labels = clusterer.labels_


# Now let's plot our sample data. We will see that the clustering is pretty good.

# In[4]:


# One palette entry per cluster label; negative labels (noise) are drawn in black.
palette = sns.color_palette('deep', labels.max() + 1)
colors = [(0.0, 0.0, 0.0) if label < 0 else palette[label] for label in labels]
plt.scatter(data[:, 0], data[:, 1], c=colors, **plot_kwds)
# Hide both axes: only the shape of the clusters matters here.
frame = plt.gca()
frame.axes.xaxis.set_visible(False)
frame.axes.yaxis.set_visible(False)


# ### Finding representative points
#
# Below is a class that you can use to find the points closest to and furthest away from the
# cluster centroids/medoids.
# In[ ]:


# In[5]:


class RankedPoints:
    """Rank the points of each cluster by their distance to the cluster's representative point.

    The representative point of a cluster is obtained from the fitted clusterer, through
    ``weighted_cluster_centroid`` or ``weighted_cluster_medoid`` depending on
    ``selection_method``. All results are pandas DataFrames whose index is the row position of
    each point in the original ``points`` array.
    """

    def __init__(self, points, clusterer, metric='euclidean', selection_method='centroid'):
        """ Rank points in a cluster based on their distance to the cluster centroid/medoid

        Parameters
        ----------

        points : array of shape (n_samples, n_features), and must be the same data passed into
                 HDBSCAN

        clusterer : Instance of HDBSCAN that has been fit to data

        metric: string or callable, optional (default='euclidean')
            The metric to use when calculating distance between points in a cluster and
            the cluster centroid/medoid. If metric is a string or callable, it must be one of
            the options allowed by scipy.spatial.distance.cdist for its metric parameter.

        selection_method: string, optional (default='centroid')
            Method to use to find the weighted cluster center. Allowed options are 'centroid'
            and 'medoid'.

        Raises
        ------
        ValueError
            If ``selection_method`` is not an allowed option, or if 'centroid' is combined
            with a metric other than 'euclidean'.
        """
        self.clusterer = clusterer
        self.metric = metric

        allowed_methods = ['centroid', 'medoid']
        if selection_method not in allowed_methods:
            raise ValueError(f'Selection method must be one of {allowed_methods}')

        if selection_method == 'centroid' and metric != 'euclidean':
            raise ValueError(f'Metric must be euclidian when using selection_method centroid. \n'
                             f'Current metric is {metric}')

        self.selection_method = selection_method

        # Feature columns are named '0', '1', ... so they can be addressed as strings later.
        self._embedding_cols = [str(i) for i in range(points.shape[1])]
        self.embedding_df = pd.DataFrame(points, columns=self._embedding_cols)
        self.embedding_df['cluster'] = clusterer.labels_

    def calculate_all_distances_to_center(self):
        """For each cluster calculate the distance from each point to the centroid/medoid

        The result is stored in the ``dist_to_rep_point`` column of ``self.embedding_df``.
        Calling this method again recomputes and overwrites that column instead of adding
        suffixed duplicates of it.
        """
        per_cluster = [self.calculate_distances_for_cluster(label)
                       for label in np.unique(self.embedding_df['cluster'])]

        if not per_cluster:
            # No points at all: still expose the column so downstream code can rely on it.
            self.embedding_df['dist_to_rep_point'] = np.nan
            return

        # Concatenate once instead of growing a frame inside the loop.
        all_distances = pd.concat(per_cluster)

        # Column assignment aligns on the index, so every point receives its own distance and
        # repeated calls stay idempotent.
        self.embedding_df['dist_to_rep_point'] = all_distances['dist_to_rep_point']

    def calculate_distances_for_cluster(self, cluster_id):
        """For a given cluster_id calculate the distance from each point to the centroid/medoid.

        Parameters
        ----------

        cluster_id : int
            The id of the cluster to compute the distances for. If the cluster id is -1 which
            corresponds to the noise point cluster, then this will return a distance of NaN.

        Returns
        -------

        df : A pandas DataFrame containing the distances from each point to the cluster
             centroid/medoid. The index of the dataframe corresponds to the index in the
             original data.

        Raises
        ------
        ValueError
            If no point carries the label ``cluster_id``.
        """
        cluster_of_interest = self.embedding_df[self.embedding_df['cluster'] == cluster_id]

        if cluster_of_interest.empty:
            raise ValueError(f'Cluster id {cluster_id} not found')

        # Don't calculate distances for the noise cluster
        if cluster_id == -1:
            return pd.DataFrame(np.nan, columns=['dist_to_rep_point'],
                                index=cluster_of_interest.index)

        if self.selection_method == 'centroid':
            rep_point = self.clusterer.weighted_cluster_centroid(cluster_id)
        elif self.selection_method == 'medoid':
            rep_point = self.clusterer.weighted_cluster_medoid(cluster_id)
        else:
            # Only reachable if the attribute was changed after construction.
            raise ValueError(f'Unknown selection method {self.selection_method}')

        # cdist expects two 2-D arrays; the representative point becomes a single-row matrix.
        dists = cdist(rep_point.reshape((1, len(self._embedding_cols))),
                      cluster_of_interest[self._embedding_cols].values,
                      metric=self.metric)

        return pd.DataFrame(dists[0], columns=['dist_to_rep_point'],
                            index=cluster_of_interest.index)

    def rank_cluster_points_by_distance(self, cluster_id):
        """For a given cluster return a pandas dataframe of points ranked by distance to the
        cluster centroid/medoid

        Points are sorted in ascending order of ``dist_to_rep_point``. The returned frame is a
        new object; ``self.embedding_df`` is not modified.

        Raises
        ------
        ValueError
            If no point carries the label ``cluster_id``.
        """
        cluster_of_interest = self.embedding_df[self.embedding_df['cluster'] == cluster_id]

        if cluster_of_interest.empty:
            raise ValueError(f'Cluster id {cluster_id} not found')

        # Reuse distances computed by calculate_all_distances_to_center when available,
        # otherwise compute them for this cluster only.
        if 'dist_to_rep_point' not in self.embedding_df.columns:
            distance_df = self.calculate_distances_for_cluster(cluster_id)
            cluster_of_interest = cluster_of_interest.merge(distance_df, left_index=True,
                                                            right_index=True)

        return cluster_of_interest.sort_values('dist_to_rep_point')

    def get_closest_samples_for_cluster(self, cluster_id, n_samples=5):
        """Get the N closest points to the cluster centroid/medoid"""
        return self.rank_cluster_points_by_distance(cluster_id).head(n_samples)

    def get_furthest_samples_for_cluster(self, cluster_id, n_samples=5):
        """Get the N points furthest away from the cluster centroid/medoid"""
        return self.rank_cluster_points_by_distance(cluster_id).tail(n_samples)


# To use it we first need to instantiate the class. We pass in our sample data and the pretrained
# HDBSCAN class instance.
# In[6]:


examples = RankedPoints(data, clusterer, metric='euclidean', selection_method='medoid')


# You can calculate the distances to the center for all clusters

# In[9]:


examples.calculate_all_distances_to_center()


# Or just one

# In[16]:


distance_to_cluster_1_center = examples.calculate_distances_for_cluster(1)
distance_to_cluster_1_center


# Perhaps more usefully, you can get a list of points ranked based on their distance

# In[17]:


cluster_1_ranked = examples.rank_cluster_points_by_distance(1)
cluster_1_ranked


# ### Getting the N "most/least representative" points for a cluster
#
# There are a couple of wrapper methods to get the N points closest/furthest to the cluster
# center. All of these methods will return a dataframe with the index corresponding to the
# original data. This makes it easy to merge back with your original data and enrich it with
# other features.

# In[10]:


examples.get_closest_samples_for_cluster(2, n_samples=5)


# In[11]:


examples.get_furthest_samples_for_cluster(2, n_samples=5)


# We can get these points for all clusters and put them into a dataframe for plotting

# In[12]:


# Collect the per-cluster frames first and concatenate once: growing a DataFrame with
# pd.concat inside the loop copies all accumulated rows on every iteration.
closest_frames = []
furthest_frames = []
for cluster in pd.unique(clusterer.labels_):
    # Negative labels mark noise points, which have no representative point.
    if cluster < 0:
        continue
    closest_frames.append(examples.get_closest_samples_for_cluster(cluster))
    furthest_frames.append(examples.get_furthest_samples_for_cluster(cluster))

# pd.concat rejects an empty list, so fall back to an empty frame when there are no clusters.
close_samples = pd.concat(closest_frames) if closest_frames else pd.DataFrame()
far_samples = pd.concat(furthest_frames) if furthest_frames else pd.DataFrame()


# In[13]:


close_samples.head()


# In[14]:


far_samples.head()


# Let's make the same plot from before but add black stars for the "most representative" points
# and blue X's for the "least representative" points

# In[15]:


palette = sns.color_palette('deep', np.unique(labels).max() + 1)
colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]

fig = plt.figure()
ax1 = fig.add_subplot(111)

ax1.scatter(data.T[0], data.T[1], c=colors, **plot_kwds)
# The feature columns of the sample frames are named '0' and '1'.
ax1.scatter(close_samples['0'], close_samples['1'], s=40, marker='*', c=[(0.0, 0.0, 0.0)])
ax1.scatter(far_samples['0'], far_samples['1'], s=40, marker='x')
frame = plt.gca()
frame.axes.get_xaxis().set_visible(False)
frame.axes.get_yaxis().set_visible(False)


# As you can see this does what we would expect. This is a fairly crude method but can be useful
# when doing exploratory data analysis.