#!/usr/bin/env python
# coding: utf-8

# # Chapter 1: Introduction to Clustering
#
# ## Exercise 1.04 - 1.05

# In[63]:


from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import math

np.random.seed(0)

get_ipython().run_line_magic('matplotlib', 'inline')


# In[64]:


# Synthetic 2-D data set with three blobs, plus hand-picked centroids that
# are used further down for the manual distance calculation.
X, y = make_blobs(n_samples=1500, centers=3, n_features=2, random_state=800)
centroids = [[-6, 2], [3, -4], [-5, 10]]


# In[65]:


X


# In[66]:


# No `c` is given here, so no colormap is passed either: Matplotlib ignores
# `cmap` (and warns) when there is no data to colormap.
plt.scatter(X[:, 0], X[:, 1], s=50)
plt.show()


# In[67]:


y


# In[68]:


plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='tab20b')
plt.show()


# In[69]:


from scipy.spatial.distance import cdist


# In[70]:


X[105:110]


# In[71]:


# Distance from each sample point to every hand-picked centroid; the index
# of the smallest distance is the cluster the point belongs to.
for x in X[105:110]:
    calcs = cdist(x.reshape([1, -1]), centroids).squeeze()
    print(calcs, "Cluster Membership: ", np.argmin(calcs))


# In[72]:


def k_means(X, K):
    """Cluster the rows of X into K groups with a basic k-means loop.

    Initial centroids are K distinct rows of X drawn with NumPy's global
    random state. Each iteration assigns every row to its nearest centroid
    (distances come from ``cdist``) and then moves each centroid to the mean
    of the rows assigned to it. The loop stops when an iteration leaves the
    centroids unchanged.

    Args:
        X: 2-D NumPy array with one row per point.
        K: Number of clusters; must satisfy ``1 <= K <= X.shape[0]``.

    Returns:
        A tuple ``(centroids, labels, centroids_history, labels_history)``:
        the final centroids, the final label of every row, the list of
        centroid arrays (initial centroids followed by one entry per
        iteration) and the list of label arrays (one entry per iteration).

    Raises:
        ValueError: If K is outside ``1..X.shape[0]``.
    """
    n_samples = X.shape[0]
    if not 1 <= K <= n_samples:
        raise ValueError(
            "K must be between 1 and the number of rows in X "
            "(got K=%r for %d rows)" % (K, n_samples)
        )

    # Keep track of history so you can see K-Means in action
    centroids_history = []
    labels_history = []

    # Sample WITHOUT replacement: drawing the same row twice would start two
    # identical centroids, leave one cluster empty, turn its mean into NaN
    # and make the convergence test below impossible to satisfy.
    rand_index = np.random.choice(n_samples, K, replace=False)
    centroids = X[rand_index]
    centroids_history.append(centroids)

    while True:
        # Euclidean distances are calculated for each point relative to
        # centroids, and then np.argmin returns the index location of the
        # minimal distance - which cluster a point is assigned to
        labels = np.argmin(cdist(X, centroids), axis=1)
        labels_history.append(labels)

        # Take mean of points within clusters to find new centroids. A
        # cluster can still end up empty (e.g. duplicated rows in X); it
        # keeps its previous centroid instead of becoming NaN.
        updated = []
        for i in range(K):
            members = X[labels == i]
            if len(members):
                updated.append(members.mean(axis=0))
            else:
                updated.append(centroids[i])
        new_centroids = np.array(updated)
        centroids_history.append(new_centroids)

        # If old centroids and new centroids no longer change, K-Means is
        # complete and end. Otherwise continue
        if np.array_equal(centroids, new_centroids):
            break
        centroids = new_centroids

    return centroids, labels, centroids_history, labels_history


centers, labels, centers_hist, labels_hist = k_means(X, 3)


# In[73]:


# One figure per iteration: points coloured by their assignment, with the
# centroids that produced that assignment drawn in red. The loop variables
# are named so they do not overwrite the ground-truth `y` defined earlier.
history = zip(centers_hist, labels_hist)

for step_centers, step_labels in history:
    plt.figure(figsize=(4, 3))
    plt.scatter(X[:, 0], X[:, 1], c=step_labels, s=50, cmap='tab20b')
    plt.scatter(step_centers[:, 0], step_centers[:, 1], c='red')
    plt.show()


# In[ ]: