#!/usr/bin/env python
# coding: utf-8

# # Chapter 1: Introduction to Clustering
# 
# ## Exercise 1.04 - 1.05

# In[63]:


from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import math

# Fix NumPy's global random state so every run of the notebook/script is
# reproducible (k_means below draws its initial centroids from it).
np.random.seed(0)

# `get_ipython` only exists inside an IPython/Jupyter session. Guard the
# magic so this exported notebook also runs as a plain Python script instead
# of dying with a NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Not running under IPython: inline plotting is simply not applicable.
    pass


# In[64]:


# Generate a reproducible toy dataset: 1500 two-dimensional points drawn
# around 3 blob centres. `y` holds the blob index each point was drawn from.
X, y = make_blobs(n_samples=1500, centers=3, n_features=2, random_state=800)
# Hand-picked centroids used further down to illustrate manual assignment.
centroids = [[-6,2],[3,-4],[-5,10]]


# In[65]:


X


# In[66]:


# Plot the raw points without any label information. No `cmap` is passed
# here: without `c` there is nothing to colour-map, and matplotlib would
# ignore the argument and emit a warning.
plt.scatter(X[:, 0], X[:, 1], s=50)
plt.show()


# In[67]:


y


# In[68]:


# Same plot, now coloured by the blob index returned by make_blobs.
plt.scatter(X[:, 0], X[:, 1], c=y,s=50, cmap='tab20b')
plt.show()


# In[69]:


from scipy.spatial.distance import cdist


# In[70]:


# Five sample points that are assigned to clusters by hand below.
X[105:110]


# In[71]:


# For each sample point, measure the distance to every hand-picked centroid
# and report the index of the nearest one as its cluster membership.
for x in X[105:110]:
    # cdist expects 2-D inputs, so lift the point to a single-row matrix and
    # take that row of the resulting distance matrix.
    calcs = cdist(x[np.newaxis, :], centroids)[0]
    print(calcs, "Cluster Membership: ", calcs.argmin())


# In[72]:


def k_means(X, K):
    """Cluster the rows of ``X`` into ``K`` groups with K-Means.

    The initial centroids are ``K`` distinct rows of ``X`` chosen with
    ``np.random.choice``, so the result depends on NumPy's global random
    state. Each iteration assigns every point to its nearest centroid
    (Euclidean distance via ``cdist``) and then moves each centroid to the
    mean of the points assigned to it. Iteration stops once the centroids no
    longer change.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The points to cluster.
    K : int
        Number of clusters; must satisfy ``1 <= K <= n_samples``.

    Returns
    -------
    centroids : ndarray of shape (K, n_features)
        The final centroids.
    labels : ndarray of shape (n_samples,)
        Index of the centroid each point is assigned to.
    centroids_history : list of ndarray
        The initial centroids followed by the centroids computed in each
        iteration (one more entry than ``labels_history``).
    labels_history : list of ndarray
        The label assignment made in each iteration.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1 or larger than the number of points.
    """
    X = np.asarray(X)
    n_samples = X.shape[0]
    if not 1 <= K <= n_samples:
        raise ValueError(
            "K must be between 1 and the number of samples (%d), got %r"
            % (n_samples, K)
        )

    # Keep track of history so you can see K-Means in action
    centroids_history = []
    labels_history = []

    # Sample WITHOUT replacement: drawing the same row twice would create two
    # identical centroids, one of which could never win the argmin and would
    # therefore end up with an empty cluster.
    rand_index = np.random.choice(n_samples, K, replace=False)
    centroids = X[rand_index]
    centroids_history.append(centroids)

    while True:
        # Euclidean distances are calculated for each point relative to the
        # centroids; np.argmin then returns the index of the minimal distance,
        # i.e. which cluster the point is assigned to.
        labels = np.argmin(cdist(X, centroids), axis=1)
        labels_history.append(labels)

        # Take the mean of the points within each cluster to find the new
        # centroids. A cluster that received no points keeps its previous
        # centroid; taking the mean of an empty selection would yield NaN,
        # and because NaN != NaN the convergence test below would never pass.
        new_centroids = np.array([
            X[labels == i].mean(axis=0) if np.any(labels == i) else centroids[i]
            for i in range(K)
        ])
        centroids_history.append(new_centroids)

        # If old centroids and new centroids no longer change, K-Means is
        # complete. Otherwise continue with the updated centroids.
        if np.array_equal(centroids, new_centroids):
            break
        centroids = new_centroids

    return centroids, labels, centroids_history, labels_history

centers, labels, centers_hist, labels_hist = k_means(X, 3)


# In[73]:


# Replay K-Means step by step: each figure shows the centroids in use at the
# start of an iteration (red) together with the labels assigned from them.
# zip stops at the shorter list, so the final entry of centers_hist (the
# converged centroids, recorded after the last assignment) is not plotted.
history = zip(centers_hist, labels_hist)
# The loop variables are deliberately NOT called `x`/`y`: `y` is the
# module-level array of ground-truth labels from make_blobs, and rebinding it
# here would silently corrupt any earlier cell that is re-run afterwards.
for step_centers, step_labels in history:
    plt.figure(figsize=(4,3))
    plt.scatter(X[:, 0], X[:, 1], c=step_labels,
            s=50, cmap='tab20b')
    plt.scatter(step_centers[:, 0], step_centers[:, 1], c='red')
    plt.show()


# In[ ]: