#!/usr/bin/env python
# coding: utf-8
"""Minimal k-means clustering on a small 2-D data set, with a plotting demo.

Importing this module only defines the sample data ``X``, the plot palette
``colors`` and the ``K_Means`` class.  The matplotlib demo runs only when the
file is executed as a script.
"""

# In[42]:

import numpy as np


# In[43]:

# input features
X = np.array([
    [1, 3],
    [2, 1.8],
    [5, 8],
    [8, 8],
    [1., 0.8],
    [9, 11],
])


# In[44]:

# colors for our groups (repeated so that larger k values still get a color)
colors = 5*['r', 'b', 'g', 'm', 'y', 'c']


# In[52]:

class K_Means:
    """Plain k-means clustering.

    Args:
        k: Number of groups to cluster the data into.
        tol: Convergence tolerance.  A centroid counts as "moved" when the
            summed absolute per-coordinate change, expressed as a percentage
            of its previous position, exceeds this value.
        max_iter: Upper bound on the number of iterations, used when the
            tolerance is never reached (the fit did not converge).

    Attributes set by ``fit``:
        centroids: dict mapping cluster index -> centroid (1-D array).
        classifications: dict mapping cluster index -> list of the samples
            assigned to that cluster in the last iteration.
    """

    def __init__(self, k=2, tol=0.01, max_iter=300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter

    def _nearest_centroid(self, point):
        """Return the index of the centroid closest (Euclidean) to ``point``."""
        # Centroid keys are 0..k-1 in insertion order, so the position in
        # this list is also the cluster index.  Ties go to the lowest index.
        distances = [np.linalg.norm(point - self.centroids[label])
                     for label in self.centroids]
        return int(np.argmin(distances))

    @staticmethod
    def _percent_movement(previous, current):
        """Return the summed absolute percentage change between two centroids."""
        delta = np.abs(current - previous)
        # A previous coordinate of 0 would divide by zero: an unchanged
        # coordinate counts as 0 % and a changed one as infinite movement.
        with np.errstate(divide='ignore', invalid='ignore'):
            percent = np.where(delta == 0, 0.0,
                               delta / np.abs(previous) * 100.0)
        return float(np.sum(percent))

    def fit(self, data):
        """Cluster ``data`` and store the result on the instance.

        Args:
            data: Array-like of shape (n_samples, n_features).  Nested lists
                are accepted and converted to a float array.

        Raises:
            IndexError: If ``data`` has fewer than ``k`` samples.
        """
        data = np.asarray(data, dtype=float)

        # The first k centroids are chosen arbitrarily: here, the first k
        # samples.  They are copied so they never alias the caller's array.
        self.centroids = {label: data[label].copy()
                          for label in range(self.k)}

        for _ in range(self.max_iter):
            # cluster index -> samples assigned to it in this iteration
            self.classifications = {label: [] for label in range(self.k)}
            for featureset in data:
                label = self._nearest_centroid(featureset)
                self.classifications[label].append(featureset)

            prev_centroids = dict(self.centroids)

            for label, members in self.classifications.items():
                # An empty cluster has no mean (np.average would give NaN
                # and poison every later distance), so keep its centroid.
                if members:
                    self.centroids[label] = np.average(members, axis=0)

            # see if converged
            converged = True
            for label, current_centroid in self.centroids.items():
                movement = self._percent_movement(prev_centroids[label],
                                                  current_centroid)
                if movement > self.tol:
                    print('centroid moved: ', movement)
                    converged = False

            if converged:
                # No centroid moved more than the tolerance; further
                # iterations would reproduce the same assignment.
                break

    def predict(self, data):
        """Return the index of the cluster whose centroid is nearest.

        Args:
            data: A single sample (1-D array-like of n_features values).

        Returns:
            The cluster index as an ``int``.
        """
        return self._nearest_centroid(np.asarray(data, dtype=float))


if __name__ == "__main__":
    # The plotting dependencies are only needed for this demo, so they are
    # imported here: importing K_Means then neither requires matplotlib nor
    # opens a plot window.
    import matplotlib.pyplot as plt
    from matplotlib import style

    style.use('ggplot')

    # In[43]: raw input features
    plt.scatter(X[:, 0], X[:, 1], s=100)
    plt.show()

    # In[55]:

    clf = K_Means(k=2)
    clf.fit(X)

    # In[54]:

    for centroid in clf.centroids:
        plt.scatter(clf.centroids[centroid][0], clf.centroids[centroid][1],
                    marker='o', c='k', s=60)

    for classification in clf.classifications:
        color = colors[classification]
        for featureset in clf.classifications[classification]:
            plt.scatter(featureset[0], featureset[1],
                        marker='x', c=color, s=70, linewidth=3)

    # Left disabled as in the original; enable it to display the clustered
    # plot when running outside a notebook.
    # plt.show()