#!/usr/bin/env python
# coding: utf-8

# # Cluster exploration and feature importance
#
# This notebook assesses the overall similarity of clusters based on Ward's
# agglomerative clustering and uses a Random Forest model to explore the
# importance of individual characters.

# In[1]:


import numpy as np
import pandas as pd
import geopandas as gpd
import dask.dataframe
import matplotlib.pyplot as plt
import urbangrammar_graphics as ugg

from matplotlib.lines import Line2D
from sklearn.ensemble import RandomForestClassifier


# In[2]:


# Load the standardized form and function characters with dask, join them
# column-wise, replace infinities and missing values with 0, drop the
# `keep_q*` columns and finally compute the result.
# NOTE: the `%time` magics are kept from the notebook, so this cell requires
# an IPython session (`get_ipython` is not defined in plain Python).
get_ipython().run_line_magic('time', 'standardized_form = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/form/standardized/").set_index(\'hindex\')')
get_ipython().run_line_magic('time', 'stand_fn = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/function/standardized/")')
get_ipython().run_line_magic('time', 'data = dask.dataframe.multi.concat([standardized_form, stand_fn], axis=1).replace([np.inf, -np.inf], np.nan).fillna(0)')
get_ipython().run_line_magic('time', 'data = data.drop(columns=["keep_q1", "keep_q2", "keep_q3"])')
get_ipython().run_line_magic('time', 'data = data.compute()')


# In[72]:


data.info()


# In[3]:


# First-level cluster labels (column `kmeans10gb`).
labels_l1 = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/KMeans10GB.pq")
labels_l1


# In[4]:


# Second-level labels for the members of first-level cluster 9 (column `9`).
labels_l2_9 = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/clustergram_cl9_labels.pq")
labels_l2_9


# In[5]:


# Second-level labels for the members of first-level cluster 2.
labels_l2_2 = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/subclustering_cluster2_k3.pq")
labels_l2_2


# In[6]:


# Replace first-level labels 9 and 2 with their second-level labels, offset by
# 90 and 20 so the combined labels do not clash with the remaining first-level
# ones. The second-level labels are assigned positionally (`.values`).
# NOTE(review): this assumes each second-level table has exactly one row per
# member of its parent cluster, in the same row order as `labels_l1` - confirm.
labels = labels_l1.copy()
labels.loc[labels.kmeans10gb == 9, 'kmeans10gb'] = labels_l2_9['9'].values + 90
labels.loc[labels.kmeans10gb == 2, 'kmeans10gb'] = labels_l2_2['subclustering_cluster2_k3'].values + 20


# In[7]:


labels.kmeans10gb.value_counts()


# In[8]:


# Labels treated as outliers; `mask` is True for every row that is kept.
outliers = [98, 93, 96, 97]
mask = ~labels.kmeans10gb.isin(outliers)


# ## Overall similarity
#
# Similarity of clusters can be represented by a hierarchical dendrogram
# generated using Ward's agglomerative clustering.

# In[50]:


from scipy.cluster import hierarchy

# Per-cluster mean (centroid) and median of every character, computed on the
# rows kept by `mask` and grouped by their combined cluster label.
group = data.loc[mask].groupby(labels.loc[mask]['kmeans10gb'].values).mean()  # cluster centroids
median = data.loc[mask].groupby(labels.loc[mask]['kmeans10gb'].values).median()


# In[ ]:


# Ward linkage on the cluster means.
Z = hierarchy.linkage(group, 'ward')
fig, ax = plt.subplots(figsize=(25, 15))
dn = hierarchy.dendrogram(Z, labels=group.index)
plt.grid(True, axis='y', which='both')


# In[ ]:


# Ward linkage on the cluster medians. The leaves are labelled from the same
# frame the linkage was built on (the original used `group.index` here).
Z = hierarchy.linkage(median, 'ward')
fig, ax = plt.subplots(figsize=(25, 15))
dn = hierarchy.dendrogram(Z, labels=median.index)
plt.grid(True, axis='y', which='both')


# ## Global feature importance
#
# Feature importance indicates which characters are more important in
# distinguishing between the signature types.

# In[11]:


clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, verbose=1)
# NOTE: `%time` magic kept from the notebook; requires an IPython session.
get_ipython().run_line_magic('time', "clf = clf.fit(data.loc[mask].values, labels.loc[mask]['kmeans10gb'].values)")


# In[ ]:


# DO NOT RUN, KILLS THE KERNEL ON OUT OF MEMORY
# clf.score(data.loc[mask].values, labels.loc[mask]['kmeans10gb'].values)


# In[12]:


importances = clf.feature_importances_


# In[52]:


clf


# In[54]:


# Spread of each feature's importance across the individual trees.
# The original iterated over the trees but read `clf.feature_importances_`
# (the forest-level vector) every time, so the resulting std was always 0.
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)


# In[86]:


to_plot = pd.DataFrame({"std": std, "imp": clf.feature_importances_}, index=data.columns).sort_values("imp", ascending=False)
to_plot


# In[94]:


# NOTE(review): the first 177 columns of `data` are taken to be the form
# characters and the rest the function characters - confirm this count still
# matches the input tables.
N_FORM_COLUMNS = 177
form_columns = set(data.columns[:N_FORM_COLUMNS])
to_plot['ff'] = ["form" if name in form_columns else "function" for name in to_plot.index]


# In[98]:


# Last two characters of every column name.
to_plot["q"] = [name[-2:] for name in to_plot.index]


# In[107]:


to_plot.ff.isna().any()


# In[ ]:


import seaborn

seaborn.set()
fig, ax = plt.subplots(figsize=(8, 60))
seaborn.barplot(x='imp', y=to_plot.index, hue='ff', data=to_plot)


# In[13]:


# Importances as a Series indexed by character name, most important first.
importances = pd.Series(importances.flatten(), index=data.columns).sort_values(ascending=False)


# In[71]:


importances.tail(50)


# In[26]:


importances.iloc[150:200]


# In[24]:


importances


# In[21]:


importances.to_csv("../../urbangrammar_samba/spatial_signatures/clustering_data/spsig_feature_importance.csv")


# ### Extremes
#
# To better understand the important characters, it is useful to check their
# actual values.

# In[ ]:


import seaborn as sns
import matplotlib.pyplot as plt

# Annotated heatmap of the per-cluster means: one row per character, one
# column per cluster, with the diverging colour map centred on 0.
fig, ax = plt.subplots(figsize=(20, 200))
sns.heatmap(group.T, cmap="vlag", center=0, annot=True, cbar=False)
# plt.savefig("../../urbangrammar_samba/spatial_signatures/clustering_data/spsig_heatmap.pdf", bbox_inches="tight")


# In[ ]:


# Same heatmap without the last four cluster columns.
# NOTE(review): going by the output file name these are presumably the
# "centre" clusters - confirm that they really are the last four columns.
fig, ax = plt.subplots(figsize=(20, 200))
sns.heatmap(group.T.iloc[:, :-4], cmap="vlag", center=0, annot=True, cbar=False)
plt.savefig("../../urbangrammar_samba/spatial_signatures/clustering_data/spsig_heatmap_no_centers.pdf", bbox_inches="tight")