#!/usr/bin/env python
# coding: utf-8

# # Cluster exploration and feature importance
# 
# This notebook assesses the overall similarity of clusters using Ward's agglomerative clustering and uses a Random Forest model to explore the importance of individual characters.

# In[1]:


import numpy as np
import pandas as pd
import geopandas as gpd
import dask.dataframe
import matplotlib.pyplot as plt
import seaborn as sns
import urbangrammar_graphics as ugg

from matplotlib.lines import Line2D
from scipy.cluster import hierarchy
from sklearn.ensemble import RandomForestClassifier


# In[2]:


# Lazily open the standardized form table with dask and index it by 'hindex'.
get_ipython().run_line_magic('time', 'standardized_form = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/form/standardized/").set_index(\'hindex\')')
# Lazily open the standardized function table; no index is set here, so it keeps
# whatever index is stored on disk (presumably also hindex - verify, the concat below relies on it).
get_ipython().run_line_magic('time', 'stand_fn = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/function/standardized/")')
# Join both tables column-wise, turn +/-inf into NaN and then fill every NaN with 0.
get_ipython().run_line_magic('time', 'data = dask.dataframe.multi.concat([standardized_form, stand_fn], axis=1).replace([np.inf, -np.inf], np.nan).fillna(0)')
# Drop the three "keep_q*" columns so they are not used as model features.
get_ipython().run_line_magic('time', 'data = data.drop(columns=["keep_q1", "keep_q2", "keep_q3"])')
# Execute the dask graph and keep the result in memory; later cells use .loc and .values on it.
get_ipython().run_line_magic('time', 'data = data.compute()')


# In[72]:


# Summarise the combined table: index, column dtypes and memory usage.
data.info()


# In[3]:


# First-level cluster labels; the 'kmeans10gb' column is the one used below.
labels_l1 = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/KMeans10GB.pq")
labels_l1


# In[4]:


# Second-level labels for the rows of first-level cluster 9 (read from column '9' below).
labels_l2_9 = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/clustergram_cl9_labels.pq")
labels_l2_9


# In[5]:


# Second-level labels for the rows of first-level cluster 2
# (read from column 'subclustering_cluster2_k3' below).
labels_l2_2 = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/subclustering_cluster2_k3.pq")
labels_l2_2


# In[6]:


# Merge both clustering levels into a single label column: rows of first-level
# cluster 9 become 90 + sub-label and rows of cluster 2 become 20 + sub-label.
# The sub-labels are written positionally (.values), so each sub-clustering table
# must list its rows in the same order as the matching rows of `labels`.
labels = labels_l1.copy()

in_cluster_9 = labels["kmeans10gb"] == 9
labels.loc[in_cluster_9, "kmeans10gb"] = labels_l2_9["9"].values + 90

# Evaluated after cluster 9 has been rewritten, exactly as in a sequential update.
in_cluster_2 = labels["kmeans10gb"] == 2
labels.loc[in_cluster_2, "kmeans10gb"] = labels_l2_2["subclustering_cluster2_k3"].values + 20


# In[7]:


# Number of rows assigned to each combined label.
labels.kmeans10gb.value_counts()


# In[8]:


# Sub-clusters of first-level cluster 9 (codes 90 + k) that are treated as
# outliers and left out of everything below.
outliers = [98, 93, 96, 97]

# True for every row that should be kept.
is_outlier = labels["kmeans10gb"].isin(outliers)
mask = ~is_outlier


# ## Overall similarity
# 
# Similarity of clusters can be represented by a hierarchical dendrogram generated using Ward's agglomerative clustering.

# In[50]:


# Select the non-outlier rows and group them by their combined cluster label
# once, then reuse the grouping for both summaries. Building the masked copy of
# `data` twice doubles peak memory for no benefit on a table this large.
# The labels are passed as a plain array, so they are matched to rows by position.
clusters = data.loc[mask].groupby(labels.loc[mask, "kmeans10gb"].values)

group = clusters.mean()  # cluster centroids
median = clusters.median()  # per-cluster medians, less sensitive to extreme values


# In[ ]:


# Ward linkage computed on the cluster centroids; each leaf of the dendrogram
# is one cluster, labelled with its code from the centroid table's index.
Z = hierarchy.linkage(group, method="ward")

fig, ax = plt.subplots(figsize=(25, 15))
dn = hierarchy.dendrogram(Z, labels=group.index)

# Horizontal grid lines make the merge heights easier to read.
ax.grid(True, axis="y", which="both")


# In[ ]:


# Same dendrogram as for the centroids, but built from the per-cluster medians.
Z = hierarchy.linkage(median, 'ward')
fig, ax = plt.subplots(figsize=(25, 15))
# Label the leaves from the table that was actually clustered, so the labels
# cannot drift if `group` and `median` are ever built from different rows.
dn = hierarchy.dendrogram(Z, labels=median.index)
plt.grid(True, axis='y', which='both')


# ## Global feature importance
# 
# Feature importance indicates which characters are more important in distinguishing between the signature types.

# In[11]:


# Random forest of 100 trees trained on all available cores (n_jobs=-1);
# random_state is fixed so the fit is repeatable and verbose=1 prints progress.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, verbose=1)
# Fit on the non-outlier rows only: feature values as a plain array against the
# combined cluster labels. The column order of `data` defines the feature order.
get_ipython().run_line_magic('time', "clf = clf.fit(data.loc[mask].values, labels.loc[mask]['kmeans10gb'].values)")


# In[ ]:


# DO NOT RUN: scoring on the full training set runs out of memory and kills the kernel.
# clf.score(data.loc[mask].values, labels.loc[mask]['kmeans10gb'].values)


# In[12]:


# Feature importances of the fitted forest, one value per column of the
# training array (same order as data.columns).
importances = clf.feature_importances_


# In[52]:


# Display the fitted estimator.
clf


# In[54]:


# Spread of each feature's importance across the individual trees of the forest.
# The importances must be read from each tree: reading the forest-level
# clf.feature_importances_ inside the loop stacks identical rows and always
# yields a standard deviation of zero.
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)


# In[86]:


# One row per feature (indexed by column name) holding the forest-level
# importance and its spread across trees, most important feature first.
to_plot = (
    pd.DataFrame(
        {"std": std, "imp": clf.feature_importances_},
        index=data.columns,
    )
    .sort_values(by="imp", ascending=False)
)
to_plot


# In[94]:


# Tag each feature as "form" or "function". The first 177 columns of `data` are
# taken to be the form characters - NOTE(review): this relies on the column
# order produced when the two tables were concatenated; confirm the count.
form_columns = set(data.columns[:177])
to_plot["ff"] = [
    "form" if name in form_columns else "function" for name in to_plot.index
]


# In[98]:


# Last two characters of each feature name (presumably the quantile suffix
# such as "q1"/"q2"/"q3" - verify against the column names).
to_plot["q"] = [name[-2:] for name in to_plot.index]


# In[107]:


# Sanity check: every feature should have received a form/function tag (expect False).
to_plot.ff.isna().any()


# In[ ]:


# Horizontal bar chart of feature importance, one bar per feature, coloured by
# whether the feature is a form or a function character.
sns.set()
fig, ax = plt.subplots(figsize=(8, 60))

# dodge=False: every feature belongs to exactly one hue level, so splitting each
# slot between the two levels would only draw half-width, offset bars.
# ax=ax draws on the figure created above instead of the implicit current axes.
sns.barplot(x='imp', y=to_plot.index, hue='ff', data=to_plot, dodge=False, ax=ax)


# In[13]:


# Importance of each feature keyed by column name, most important first.
# Built straight from the fitted model rather than from the previous value of
# `importances`, so the cell can be re-run safely: once `importances` has been
# rebound to a Series, calling .flatten() on it raises AttributeError.
importances = pd.Series(clf.feature_importances_, index=data.columns).sort_values(ascending=False)


# In[71]:


# The 50 least important features.
importances.tail(50)


# In[26]:


# Features ranked 151st to 200th by importance.
importances.iloc[150:200]


# In[24]:


# The full ranking, most important feature first.
importances


# In[21]:


# Persist the ranked importances (feature name as index, importance as value) to the shared drive.
importances.to_csv("../../urbangrammar_samba/spatial_signatures/clustering_data/spsig_feature_importance.csv")


# ### Extremes
# 
# To better understand the important characters, it is useful to check their actual values.

# In[ ]:


# Heatmap of the cluster centroids: one row per character, one column per
# cluster. The diverging palette is centred on 0 so positive and negative
# centroid values get opposite colours; the figure is very tall to fit a row
# per character, and every cell is annotated with its value.
fig, ax = plt.subplots(figsize=(20, 200))
sns.heatmap(group.T, cmap="vlag", center=0, annot=True, cbar=False, ax=ax)
# plt.savefig("../../urbangrammar_samba/spatial_signatures/clustering_data/spsig_heatmap.pdf", bbox_inches="tight")


# In[ ]:


# Same heatmap without the last four clusters of the centroid table
# (presumably the city-centre types, going by the output file name - confirm).
centroids_subset = group.iloc[:-4].T

fig, ax = plt.subplots(figsize=(20, 200))
sns.heatmap(centroids_subset, cmap="vlag", center=0, annot=True, cbar=False)
plt.savefig(
    "../../urbangrammar_samba/spatial_signatures/clustering_data/spsig_heatmap_no_centers.pdf",
    bbox_inches="tight",
)