#!/usr/bin/env python
# coding: utf-8
"""K-Means clustering of the seed data set, scored with a silhouette coefficient.

Exported from a Jupyter notebook; the ``# In[n]:`` markers are the original
cell boundaries.
"""

# In[7]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist

# Fix the global NumPy seed so the random centroid initialisation in
# k_means() is reproducible from run to run.
np.random.seed(0)

seeds = pd.read_csv('Seed_Data.csv')


# In[2]:

# Feature columns used for clustering.
X = seeds[['A', 'P', 'C', 'LK', 'WK', 'A_Coef', 'LKG']]


# In[3]:

def k_means(X, K, max_iter=300):
    """Cluster the rows of *X* into *K* groups with Lloyd's K-Means algorithm.

    Initial centroids are *K* distinct rows of *X* drawn with the global
    NumPy random state (``np.random``).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric data to cluster. Converted to a float ``ndarray`` internally.
    K : int
        Number of clusters; must satisfy ``1 <= K <= n_samples``.
    max_iter : int, optional
        Upper bound on the number of assign/update iterations. Guards against
        a non-terminating loop (for example when *X* contains NaN, which
        never compares equal to itself).

    Returns
    -------
    centroids : ndarray of shape (K, n_features)
        Final centroid positions.
    labels : ndarray of shape (n_samples,)
        Cluster index assigned to each row in the last iteration.
    centroids_history : list of ndarray
        Centroids at every step, starting with the initial draw.
    labels_history : list of ndarray
        Label assignment produced by every iteration.

    Raises
    ------
    ValueError
        If *K* is not in ``[1, n_samples]`` or *max_iter* is less than 1.
    """
    X = np.asarray(X, dtype=float)
    n_samples = X.shape[0]

    if not 1 <= K <= n_samples:
        raise ValueError(
            f"K must be between 1 and the number of samples ({n_samples}), got {K}"
        )
    if max_iter < 1:
        raise ValueError(f"max_iter must be at least 1, got {max_iter}")

    # Sample WITHOUT replacement: drawing the same row twice would create two
    # identical centroids, one of which can never win the argmin below and
    # therefore ends up with an empty cluster.
    rand_index = np.random.choice(n_samples, K, replace=False)
    centroids = X[rand_index]

    # Keep track of history so K-Means can be inspected step by step.
    centroids_history = [centroids]
    labels_history = []

    for _ in range(max_iter):
        # Euclidean distance from every point to every centroid; argmin gives
        # the index of the nearest centroid, i.e. the cluster assignment.
        labels = np.argmin(cdist(X, centroids), axis=1)
        labels_history.append(labels)

        # New centroid = mean of the points assigned to the cluster. A cluster
        # with no members keeps its previous centroid: the mean of an empty
        # selection is NaN, and NaN centroids would never satisfy the
        # convergence test.
        new_centroids = centroids.copy()
        for i in range(K):
            members = X[labels == i]
            if members.shape[0] > 0:
                new_centroids[i] = members.mean(axis=0)
        centroids_history.append(new_centroids)

        # Converged once an update leaves every centroid unchanged.
        if np.array_equal(centroids, new_centroids):
            break
        centroids = new_centroids

    return centroids, labels, centroids_history, labels_history


# In[4]:

X_mat = X.values


# In[5]:

centroids, labels, centroids_history, labels_history = k_means(X_mat, 3)


# In[6]:

# NOTE(review): the score is computed on only the 'A' and 'LK' columns although
# the clustering above used all seven features -- confirm this is intentional.
score = silhouette_score(X[['A', 'LK']], labels)
# A bare expression is only displayed inside a notebook; print it so the value
# is also visible when this file is run as a script.
print(score)