This notebook assesses overall similarity of clusters based on Ward's agglomerative clustering and uses Random Forest model to explore the importance of individual characters.
import numpy as np
import pandas as pd
import geopandas as gpd
import dask.dataframe
import matplotlib.pyplot as plt
import urbangrammar_graphics as ugg
from matplotlib.lines import Line2D
from sklearn.ensemble import RandomForestClassifier
%time standardized_form = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/form/standardized/").set_index('hindex')
%time stand_fn = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/function/standardized/")
%time data = dask.dataframe.multi.concat([standardized_form, stand_fn], axis=1).replace([np.inf, -np.inf], np.nan).fillna(0)
%time data = data.drop(columns=["keep_q1", "keep_q2", "keep_q3"])
%time data = data.compute()
CPU times: user 38.5 s, sys: 33.9 s, total: 1min 12s Wall time: 1min 43s CPU times: user 73.3 ms, sys: 81.5 ms, total: 155 ms Wall time: 227 ms CPU times: user 43.1 ms, sys: 6.71 ms, total: 49.8 ms Wall time: 42.6 ms CPU times: user 18.7 ms, sys: 0 ns, total: 18.7 ms Wall time: 18.6 ms CPU times: user 2min 42s, sys: 1min 28s, total: 4min 10s Wall time: 2min 47s
data.info()
<class 'pandas.core.frame.DataFrame'> Index: 14539578 entries, c000e094707t0000 to c102e644989t0115 Columns: 328 entries, sdbAre_q1 to Code_18_521_q3 dtypes: float64(328) memory usage: 35.6+ GB
labels_l1 = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/KMeans10GB.pq")
labels_l1
kmeans10gb | |
---|---|
hindex | |
c000e094707t0000 | 4 |
c000e094763t0000 | 0 |
c000e094763t0001 | 0 |
c000e094763t0002 | 0 |
c000e094764t0000 | 0 |
... | ... |
c102e644989t0111 | 0 |
c102e644989t0112 | 0 |
c102e644989t0113 | 0 |
c102e644989t0114 | 0 |
c102e644989t0115 | 0 |
14539578 rows × 1 columns
labels_l2_9 = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/clustergram_cl9_labels.pq")
labels_l2_9
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 14 | 4 | 10 | 2 | 11 | 6 | 6 | 18 | 22 | 13 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 14 | 4 | 10 | 2 | 11 | 6 | 6 | 18 | 22 | 13 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 14 | 4 | 10 | 2 | 11 | 6 | 6 | 18 | 22 | 13 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 14 | 4 | 10 | 2 | 11 | 6 | 6 | 18 | 22 | 13 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 14 | 4 | 10 | 2 | 11 | 6 | 6 | 18 | 22 | 13 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
113639 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 14 | 4 | 10 | 2 | 11 | 6 | 6 | 18 | 22 | 13 |
113640 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 14 | 4 | 10 | 2 | 11 | 6 | 6 | 18 | 22 | 13 |
113641 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 14 | 4 | 10 | 2 | 11 | 6 | 6 | 18 | 22 | 13 |
113642 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 14 | 4 | 10 | 2 | 11 | 6 | 6 | 18 | 22 | 13 |
113643 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 14 | 4 | 10 | 2 | 11 | 6 | 6 | 18 | 22 | 13 |
113644 rows × 24 columns
labels_l2_2 = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/subclustering_cluster2_k3.pq")
labels_l2_2
subclustering_cluster2_k3 | |
---|---|
hindex | |
c000e097919t0003 | 1 |
c000e097919t0005 | 1 |
c000e097919t0008 | 1 |
c000e097919t0009 | 1 |
c000e097919t0015 | 1 |
... | ... |
c102e639766t0007 | 0 |
c102e639766t0010 | 0 |
c102e639766t0011 | 0 |
c102e639766t0012 | 0 |
c102e639766t0013 | 0 |
1115564 rows × 1 columns
labels = labels_l1.copy()
labels.loc[labels.kmeans10gb == 9, 'kmeans10gb'] = labels_l2_9['9'].values + 90
labels.loc[labels.kmeans10gb == 2, 'kmeans10gb'] = labels_l2_2['subclustering_cluster2_k3'].values + 20
labels.kmeans10gb.value_counts()
7 3686554 0 3022385 3 2561211 1 1962830 5 707211 4 595902 8 564318 20 502835 21 374090 22 238639 6 209959 90 86380 92 21760 94 3739 91 1390 95 264 97 98 98 8 93 3 96 2 Name: kmeans10gb, dtype: int64
outliers = [98, 93, 96, 97]
mask = ~labels.kmeans10gb.isin(outliers)
Similarity of clusters can be represented by hierarchical dendrogram generated using Ward's agglomerative clustering.
from scipy.cluster import hierarchy
group = data.loc[mask].groupby(labels.loc[mask]['kmeans10gb'].values).mean() # cluster centroids
median = data.loc[mask].groupby(labels.loc[mask]['kmeans10gb'].values).median()
Z = hierarchy.linkage(group, 'ward')
fig, ax = plt.subplots(figsize=(25, 15))
dn = hierarchy.dendrogram(Z, labels=group.index)
plt.grid(True, axis='y', which='both')
Z = hierarchy.linkage(median, 'ward')
fig, ax = plt.subplots(figsize=(25, 15))
dn = hierarchy.dendrogram(Z, labels=group.index)
plt.grid(True, axis='y', which='both')
Feature importance indicates which characters are more important in distinguishing between the signature types.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, verbose=1)
%time clf = clf.fit(data.loc[mask].values, labels.loc[mask]['kmeans10gb'].values)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers. [Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 23.0min
CPU times: user 19h 19min 43s, sys: 2min 14s, total: 19h 21min 57s Wall time: 1h 19min 23s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 79.0min finished
# DO NOT RUN, KILLS THE KERNEL ON OUT OF MEMORY
# clf.score(data.loc[mask].values, labels.loc[mask]['kmeans10gb'].values)
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
importances = clf.feature_importances_
clf
RandomForestClassifier(n_jobs=-1, random_state=42, verbose=1)
std = np.std([clf.feature_importances_ for tree in clf.estimators_], axis=0)
to_plot = pd.DataFrame({"std": std, "imp": clf.feature_importances_}, index=data.columns).sort_values("imp", ascending=False)
to_plot
std | imp | |
---|---|---|
sicCAR_q1 | 4.857226e-17 | 0.036944 |
sicCAR_q2 | 7.632783e-17 | 0.031717 |
mtcWNe_q2 | 3.122502e-17 | 0.023476 |
ltbIBD_q2 | 1.040834e-17 | 0.016662 |
sdcAre_q3 | 3.122502e-17 | 0.016005 |
... | ... | ... |
Code_18_334_q2 | 0.000000e+00 | 0.000000 |
Code_18_141_q1 | 0.000000e+00 | 0.000000 |
Code_18_124_q1 | 0.000000e+00 | 0.000000 |
Code_18_511_q2 | 0.000000e+00 | 0.000000 |
Code_18_521_q3 | 0.000000e+00 | 0.000000 |
328 rows × 2 columns
to_plot['ff'] = pd.Series(to_plot.index).apply(lambda x: "form" if x in data.columns[:177] else "function").values
to_plot["q"] = pd.Series(to_plot.index).apply(lambda x: x[-2:]).values
to_plot.ff.isna().any()
False
import seaborn
seaborn.set()
fig, ax = plt.subplots(figsize=(8, 60))
seaborn.barplot(x='imp', y=to_plot.index, hue='ff', data=to_plot)
<AxesSubplot:xlabel='imp'>
importances = pd.Series(importances.flatten(), index=data.columns).sort_values(ascending=False)
importances.tail(50)
Code_18_512_q2 4.999057e-05 Code_18_512_q3 4.940122e-05 Code_18_211_q1 4.870476e-05 Code_18_243_q1 4.857090e-05 Code_18_122_q2 4.713622e-05 Code_18_122_q3 4.587244e-05 mtdDeg_q1 4.007576e-05 Code_18_111_q1 3.632193e-05 ssbCor_q2 2.848661e-05 Code_18_131_q3 2.597117e-05 Code_18_313_q3 2.332676e-05 Code_18_313_q1 2.211369e-05 Code_18_131_q2 2.180273e-05 Code_18_313_q2 2.172385e-05 Code_18_141_q3 1.715724e-05 Code_18_142_q3 1.472706e-05 Code_18_311_q3 1.048974e-05 xcnSCl_q2 8.842498e-06 Code_18_121_q1 7.240644e-06 mtdDeg_q2 4.812494e-06 Code_18_142_q2 3.522273e-06 sdbCoA_q3 1.751870e-06 lieWCe_q3 1.667145e-06 Code_18_142_q1 1.139103e-06 Code_18_311_q2 5.870861e-07 xcnSCl_q1 1.686649e-07 Code_18_141_q2 1.031014e-07 ssbCor_q1 1.004750e-07 sdbCoA_q1 6.751301e-08 sdbCoA_q2 3.984640e-08 Code_18_311_q1 1.889368e-08 Code_18_132_q1 0.000000e+00 Code_18_334_q3 0.000000e+00 Code_18_512_q1 0.000000e+00 Code_18_521_q1 0.000000e+00 Code_18_411_q1 0.000000e+00 Code_18_132_q2 0.000000e+00 Code_18_122_q1 0.000000e+00 Code_18_244_q2 0.000000e+00 Code_18_244_q1 0.000000e+00 Code_18_244_q3 0.000000e+00 Code_18_521_q2 0.000000e+00 Code_18_131_q1 0.000000e+00 Code_18_334_q1 0.000000e+00 Code_18_511_q1 0.000000e+00 Code_18_334_q2 0.000000e+00 Code_18_141_q1 0.000000e+00 Code_18_124_q1 0.000000e+00 Code_18_511_q2 0.000000e+00 Code_18_521_q3 0.000000e+00 dtype: float64
importances.iloc[150:200]
O,P,Q. Public administration, education and health_q3 0.001755 mtbAli_q2 0.001747 mtbAli_q1 0.001720 F. Construction_q2 0.001702 ltcRea_q3 0.001690 K, L, M, N. Financial, real estate, professional and administrative activities_q2 0.001687 lseERI_q2 0.001680 lseCWA_q3 0.001677 H, J. Transport and communication_q3 0.001675 F. Construction_q1 0.001672 sdsLen_q1 0.001650 sdbPer_q1 0.001636 ldePer_q3 0.001591 sssLin_q3 0.001587 sdsSPW_q3 0.001568 O,P,Q. Public administration, education and health_q2 0.001543 H, J. Transport and communication_q2 0.001519 C. Manufacturing_q3 0.001516 C. Manufacturing_q2 0.001488 mtbAli_q3 0.001486 Code_18_112_q1 0.001482 sdbAre_q1 0.001478 K, L, M, N. Financial, real estate, professional and administrative activities_q1 0.001452 sdsSWD_q1 0.001435 R, S, T, U. Other_q2 0.001433 linP3W_q2 0.001388 G, I. Distribution, hotels and restaurants_q1 0.001360 O,P,Q. Public administration, education and health_q1 0.001359 linP3W_q3 0.001340 H, J. Transport and communication_q1 0.001331 linP3W_q1 0.001307 lteWNB_q2 0.001300 A, B, D, E. Agriculture, energy and water_q2 0.001270 sssLin_q1 0.001265 C. Manufacturing_q1 0.001256 lteWNB_q3 0.001249 lseERI_q1 0.001228 R, S, T, U. Other_q1 0.001224 A, B, D, E. Agriculture, energy and water_q3 0.001211 sscERI_q3 0.001170 culture_nearest 0.001115 nearest_retail_centre 0.001106 A, B, D, E. Agriculture, energy and water_q1 0.001077 lteWNB_q1 0.001068 lseCCo_q2 0.000898 fhrs_counts 0.000868 nearest_water 0.000867 lseCCo_q1 0.000848 lseCCo_q3 0.000825 ssbERI_q3 0.000796 dtype: float64
importances
sicCAR_q1 0.036944 sicCAR_q2 0.031717 mtcWNe_q2 0.023476 ltbIBD_q2 0.016662 sdcAre_q3 0.016005 ... Code_18_334_q2 0.000000 Code_18_141_q1 0.000000 Code_18_124_q1 0.000000 Code_18_511_q2 0.000000 Code_18_521_q3 0.000000 Length: 328, dtype: float64
importances.to_csv("../../urbangrammar_samba/spatial_signatures/clustering_data/spsig_feature_importance.csv")
To better understand the important characters, it is useful to check their actual values.
import seaborn as sns
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(20, 200))
sns.heatmap(group.T, cmap="vlag", center=0, annot=True, cbar=False)
# plt.savefig("../../urbangrammar_samba/spatial_signatures/clustering_data/spsig_heatmap.pdf", bbox_inches="tight")
<AxesSubplot:>
fig, ax = plt.subplots(figsize=(20, 200))
sns.heatmap(group.T.iloc[:, :-4], cmap="vlag", center=0, annot=True, cbar=False)
plt.savefig("../../urbangrammar_samba/spatial_signatures/clustering_data/spsig_heatmap_no_centers.pdf", bbox_inches="tight")