#!/usr/bin/env python
# coding: utf-8

# # Hierarchical Clustering

# In[4]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ## The Data

# In[5]:

df = pd.read_csv(r"C:\Users\Teni\Desktop\Git-Github\DATA\cluster_mpg.csv")

# In[6]:

df = df.dropna()

# In[7]:

df.head()

# In[8]:

df.describe()

# In[9]:

df['origin'].value_counts()

# In[10]:

# Drop the free-text 'name' column and one-hot encode the categorical 'origin' column
df_w_dummies = pd.get_dummies(df.drop('name', axis=1))

# In[11]:

df_w_dummies

# -----

# In[12]:

from sklearn.preprocessing import MinMaxScaler

# In[13]:

scaler = MinMaxScaler()

# In[14]:

# Rescale every feature to the [0, 1] range so no single column dominates the distances
scaled_data = scaler.fit_transform(df_w_dummies)

# In[15]:

scaled_data

# In[16]:

scaled_df = pd.DataFrame(scaled_data, columns=df_w_dummies.columns)

# In[17]:

plt.figure(figsize=(15, 8))
sns.heatmap(scaled_df, cmap='magma');

# In[18]:

sns.clustermap(scaled_df, row_cluster=False)

# In[19]:

sns.clustermap(scaled_df, col_cluster=False)

# ## Using Scikit-Learn

# In[20]:

from sklearn.cluster import AgglomerativeClustering

# In[21]:

model = AgglomerativeClustering(n_clusters=4)

# In[22]:

cluster_labels = model.fit_predict(scaled_df)

# In[23]:

cluster_labels

# In[24]:

plt.figure(figsize=(12, 4), dpi=200)
sns.scatterplot(data=df, x='mpg', y='weight', hue=cluster_labels)

# ## Exploring Number of Clusters with Dendrograms
#
# Make sure to read the documentation online!
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.dendrogram.html

# #### Assuming every point starts as its own cluster

# In[25]:

# distance_threshold=0 keeps merging until the full tree is built, so every
# merge step is recorded on the fitted model
model = AgglomerativeClustering(n_clusters=None, distance_threshold=0)

# In[26]:

cluster_labels = model.fit_predict(scaled_df)

# In[27]:

cluster_labels

# In[28]:

from scipy.cluster.hierarchy import dendrogram
from scipy.cluster import hierarchy

# ## Linkage Model

# In[29]:

linkage_matrix = hierarchy.linkage(model.children_)

# In[30]:

linkage_matrix

# In[31]:

plt.figure(figsize=(20, 10))
# Warning! This plot will take a while!
dn = hierarchy.dendrogram(linkage_matrix)

# In[32]:

plt.figure(figsize=(20, 10))
# Truncate to the last p merged clusters to keep the plot readable
dn = hierarchy.dendrogram(linkage_matrix, truncate_mode='lastp', p=48)

# ### Choosing a Threshold Distance
#
# **What is the distance between two points?**

# In[33]:

scaled_df.describe()

# In[34]:

scaled_df['mpg'].idxmax()

# In[35]:

scaled_df['mpg'].idxmin()

# In[36]:

# https://stackoverflow.com/questions/1401712/how-can-the-euclidean-distance-be-calculated-with-numpy
a = scaled_df.iloc[320]
b = scaled_df.iloc[28]
dist = np.linalg.norm(a - b)

# In[37]:

dist

# #### Max possible distance?
#
# Recall Euclidean distance: https://en.wikipedia.org/wiki/Euclidean_distance

# In[38]:

# After MinMaxScaler every feature lies in [0, 1], so the farthest apart two
# points can be is sqrt(n_features): every coordinate differing by exactly 1
np.sqrt(len(scaled_df.columns))

# ### Creating a Model Based on Distance Threshold
#
# * distance_threshold
#     * The linkage distance threshold above which clusters will not be merged.

# In[39]:

model = AgglomerativeClustering(n_clusters=None, distance_threshold=2)

# In[40]:

cluster_labels = model.fit_predict(scaled_data)

# In[41]:

cluster_labels

# In[42]:

np.unique(cluster_labels)

# ### Linkage Matrix
#
# Source: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
#
# An (n-1) by 4 matrix Z is returned. At the i-th iteration, clusters with indices Z[i, 0] and Z[i, 1] are combined to form cluster n + i. A cluster with an index less than n corresponds to one of the original observations. The distance between clusters Z[i, 0] and Z[i, 1] is given by Z[i, 2]. The fourth value Z[i, 3] represents the number of original observations in the newly formed cluster.
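# #### Aside: Building Z from the Fitted Model
#
# `hierarchy.linkage(model.children_)` (used above and again in the next cell)
# re-clusters the rows of `children_` as if they were 2-D observations. It runs
# and produces a plottable tree, but it does not use the model's own merge
# distances. A sketch of assembling the Z matrix described above directly from
# the fitted model's `children_` and `distances_` attributes follows, adapted
# from the scikit-learn documentation example on plotting dendrograms.
# `model_to_linkage` is a helper name introduced here for illustration, and
# `distances_` is only populated when `distance_threshold` is set or
# `compute_distances=True`.

def model_to_linkage(model):
    # Count the original observations contained in each merged cluster
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # a leaf node: one original observation
            else:
                current_count += counts[child_idx - n_samples]  # an earlier merge
        counts[i] = current_count
    # Z columns: child 1, child 2, merge distance, cluster size
    return np.column_stack([model.children_, model.distances_, counts]).astype(float)

# e.g. hierarchy.dendrogram(model_to_linkage(model), truncate_mode='lastp', p=11)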
# In[43]:

linkage_matrix = hierarchy.linkage(model.children_)

# In[44]:

linkage_matrix

# In[45]:

plt.figure(figsize=(20, 10))
dn = hierarchy.dendrogram(linkage_matrix, truncate_mode='lastp', p=11)
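# ### Flat Labels from a Linkage Matrix
#
# One last sketch: given a linkage matrix, `scipy.cluster.hierarchy.fcluster`
# cuts the tree at a chosen distance and returns flat cluster labels (numbered
# from 1). Cutting the model-derived matrix from the `model_to_linkage` sketch
# above at the same threshold of 2 should broadly reproduce the partition from
# `fit_predict` earlier, up to label numbering; the threshold value here is
# illustrative, not prescriptive.

from scipy.cluster.hierarchy import fcluster

model_linkage = model_to_linkage(model)
flat_labels = fcluster(model_linkage, t=2, criterion='distance')
np.unique(flat_labels)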