#!/usr/bin/env python
# coding: utf-8

# # Hierarchical Clustering

# In[4]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ## The Data

# In[5]:

df = pd.read_csv(r"C:\Users\Teni\Desktop\Git-Github\DATA\cluster_mpg.csv")

# In[6]:

df = df.dropna()

# In[7]:

df.head()

# In[8]:

df.describe()

# In[9]:

df['origin'].value_counts()

# In[10]:

# Drop the free-text 'name' column and one-hot encode the categorical 'origin' column
df_w_dummies = pd.get_dummies(df.drop('name', axis=1))

# In[11]:

df_w_dummies

# -----

# In[12]:

from sklearn.preprocessing import MinMaxScaler

# In[13]:

scaler = MinMaxScaler()

# In[14]:

# Rescale every feature to the [0, 1] range so no single column dominates the distances
scaled_data = scaler.fit_transform(df_w_dummies)

# In[15]:

scaled_data

# In[16]:

scaled_df = pd.DataFrame(scaled_data, columns=df_w_dummies.columns)

# In[17]:

plt.figure(figsize=(15, 8))
sns.heatmap(scaled_df, cmap='magma');

# In[18]:

sns.clustermap(scaled_df, row_cluster=False)

# In[19]:

sns.clustermap(scaled_df, col_cluster=False)

# ## Using Scikit-Learn

# In[20]:

from sklearn.cluster import AgglomerativeClustering

# In[21]:

model = AgglomerativeClustering(n_clusters=4)

# In[22]:

cluster_labels = model.fit_predict(scaled_df)

# In[23]:

cluster_labels

# In[24]:

plt.figure(figsize=(12, 4), dpi=200)
sns.scatterplot(data=df, x='mpg', y='weight', hue=cluster_labels)

# ## Exploring Number of Clusters with Dendrograms
#
# Make sure to read the documentation online!
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.dendrogram.html

# #### Assuming every point starts as its own cluster

# In[25]:

# distance_threshold=0 keeps merging until the full tree is built, so every
# merge step is recorded on the fitted model
model = AgglomerativeClustering(n_clusters=None, distance_threshold=0)

# In[26]:

cluster_labels = model.fit_predict(scaled_df)

# In[27]:

cluster_labels

# In[28]:

from scipy.cluster.hierarchy import dendrogram
from scipy.cluster import hierarchy

# ## Linkage Model

# In[29]:

linkage_matrix = hierarchy.linkage(model.children_)

# In[30]:

linkage_matrix

# In[31]:

plt.figure(figsize=(20, 10))
# Warning! This plot will take a while!
dn = hierarchy.dendrogram(linkage_matrix)

# In[32]:

plt.figure(figsize=(20, 10))
# Truncate to the last p merged clusters to keep the plot readable
dn = hierarchy.dendrogram(linkage_matrix, truncate_mode='lastp', p=48)

# ### Choosing a Threshold Distance
#
# **What is the distance between two points?**

# In[33]:

scaled_df.describe()

# In[34]:

scaled_df['mpg'].idxmax()

# In[35]:

scaled_df['mpg'].idxmin()

# In[36]:

# https://stackoverflow.com/questions/1401712/how-can-the-euclidean-distance-be-calculated-with-numpy
a = scaled_df.iloc[320]
b = scaled_df.iloc[28]
dist = np.linalg.norm(a - b)

# In[37]:

dist

# #### Max possible distance?
#
# Recall Euclidean distance: https://en.wikipedia.org/wiki/Euclidean_distance

# In[38]:

# After MinMaxScaler every feature lies in [0, 1], so the farthest apart two
# points can be is sqrt(n_features): every coordinate differing by exactly 1
np.sqrt(len(scaled_df.columns))

# ### Creating a Model Based on Distance Threshold
#
# * distance_threshold
#     * The linkage distance threshold above which clusters will not be merged.

# In[39]:

model = AgglomerativeClustering(n_clusters=None, distance_threshold=2)

# In[40]:

cluster_labels = model.fit_predict(scaled_data)

# In[41]:

cluster_labels

# In[42]:

np.unique(cluster_labels)

# ### Linkage Matrix
#
# Source: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
#
# An (n-1) by 4 matrix Z is returned. At the i-th iteration, clusters with indices Z[i, 0] and Z[i, 1] are combined to form cluster n + i. A cluster with an index less than n corresponds to one of the original observations. The distance between clusters Z[i, 0] and Z[i, 1] is given by Z[i, 2]. The fourth value Z[i, 3] represents the number of original observations in the newly formed cluster.
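# #### Aside: Building Z from the Fitted Model
#
# `hierarchy.linkage(model.children_)` (used above and again in the next cell)
# re-clusters the rows of `children_` as if they were 2-D observations. It runs
# and produces a plottable tree, but it does not use the model's own merge
# distances. A sketch of assembling the Z matrix described above directly from
# the fitted model's `children_` and `distances_` attributes follows, adapted
# from the scikit-learn documentation example on plotting dendrograms.
# `model_to_linkage` is a helper name introduced here for illustration, and
# `distances_` is only populated when `distance_threshold` is set or
# `compute_distances=True`.

def model_to_linkage(model):
    # Count the original observations contained in each merged cluster
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # a leaf node: one original observation
            else:
                current_count += counts[child_idx - n_samples]  # an earlier merge
        counts[i] = current_count
    # Z columns: child 1, child 2, merge distance, cluster size
    return np.column_stack([model.children_, model.distances_, counts]).astype(float)

# e.g. hierarchy.dendrogram(model_to_linkage(model), truncate_mode='lastp', p=11)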
# In[43]:

linkage_matrix = hierarchy.linkage(model.children_)

# In[44]:

linkage_matrix

# In[45]:

plt.figure(figsize=(20, 10))
dn = hierarchy.dendrogram(linkage_matrix, truncate_mode='lastp', p=11)
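# ### Flat Labels from a Linkage Matrix
#
# One last sketch: given a linkage matrix, `scipy.cluster.hierarchy.fcluster`
# cuts the tree at a chosen distance and returns flat cluster labels (numbered
# from 1). Cutting the model-derived matrix from the `model_to_linkage` sketch
# above at the same threshold of 2 should broadly reproduce the partition from
# `fit_predict` earlier, up to label numbering; the threshold value here is
# illustrative, not prescriptive.

from scipy.cluster.hierarchy import fcluster

model_linkage = model_to_linkage(model)
flat_labels = fcluster(model_linkage, t=2, criterion='distance')
np.unique(flat_labels)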