In [7]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist

np.random.seed(0)

seeds = pd.read_csv('Seed_Data.csv')

In [2]:

X = seeds[['A','P','C','LK','WK','A_Coef','LKG']]

In [3]:

def k_means(X, K):
#Keep track of history so you can see K-Means in action
    centroids_history = []
    labels_history = []
    rand_index = np.random.choice(X.shape[0], K)  
    centroids = X[rand_index]
    centroids_history.append(centroids)
    while True:
# Euclidean distances are calculated for each point relative to centroids, #and then np.argmin returns
# the index location of the minimal distance - which cluster a point is #assigned to
        labels = np.argmin(cdist(X, centroids), axis=1)
        labels_history.append(labels)
#Take mean of points within clusters to find new centroids:
        new_centroids = np.array([X[labels == i].mean(axis=0)
                                for i in range(K)])
        centroids_history.append(new_centroids)
        
# If old centroids and new centroids no longer change, K-Means is complete and end. Otherwise continue
        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids
    
    return centroids, labels, centroids_history, labels_history

In [4]:

X_mat = X.values

In [5]:

centroids, labels, centroids_history, labels_history = k_means(X_mat, 3)

In [6]:

silhouette_score(X[['A','LK']], labels)

Out[6]:

0.5875704550892767