#!/usr/bin/env python
# coding: utf-8
"""Exploratory clustering of the ``conv_68.pq`` morphometric table.

The script standardises the table, moves it to the GPU and then tries, in
order: a t-SNE embedding, a PCA projection, DBSCAN, clustergrams, K-Means
(on the full data and on one of two coarse K-Means groups) and several
MiniSom self-organising maps, including a small hyper-parameter grid search.
Cluster labels are drawn on the ``tess_68.pq`` geometries.

The ``# In[n]:`` markers are the cell numbers of the notebook this file was
exported from; they are kept only to make it easy to map code back to cells.
"""

# In[1]:

import itertools
import time

# gpu
import cudf
import cuml
import rmm
from cuml.manifold import TSNE

# cpu
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from clustergram import Clustergram
from minisom import MiniSom
import numpy as np


def _to_host(gpu_obj):
    """Copy a cuDF Series/DataFrame to a NumPy array in host memory.

    Goes through pandas so it does not depend on the ``to_array`` /
    ``as_matrix`` helpers that newer cuDF releases no longer provide.
    """
    return gpu_obj.to_pandas().to_numpy()


def _report_elapsed(label, started):
    """Print the wall time elapsed since ``started`` (a perf_counter value)."""
    print(f"{label}: {time.perf_counter() - started:.2f} s")


def _train_som(matrix, shape, sigma, learning_rate, iterations,
               topology='rectangular', verbose=True):
    """Train a gaussian-neighbourhood MiniSom on ``matrix`` and return it.

    ``matrix`` is a 2-D host array (rows are samples), ``shape`` the
    ``(rows, cols)`` of the map. The seed is fixed so runs are repeatable.
    """
    som = MiniSom(shape[0], shape[1], matrix.shape[1], sigma=sigma,
                  learning_rate=learning_rate,
                  neighborhood_function='gaussian', random_seed=10,
                  topology=topology)
    som.train_batch(matrix, iterations, verbose=verbose)
    return som


def _winner_labels(som, matrix):
    """Return one ``"(row, col)"`` string per sample naming its winning node."""
    # A plain comprehension avoids np.apply_along_axis, which sizes its string
    # output from the first result and would truncate longer labels.
    return np.array(
        [str(tuple(int(i) for i in som.winner(row))) for row in matrix]
    )


# In[2]:

rmm.reinitialize(managed_memory=True)
# Explicit check instead of ``assert`` so it still runs under ``python -O``.
if not rmm.is_initialized():
    raise RuntimeError("RMM failed to initialise with managed memory")


# In[3]:

df = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/morphometrics/convolutions/conv_68.pq")


# In[4]:

df = df.set_index('hindex')


# In[5]:

df = df.fillna(0)


# In[6]:

# Z-score standardisation, column by column.
data = (df - df.mean()) / df.std()


# In[7]:

# The second fillna replaces NaNs produced by the division above
# (presumably zero-variance columns -- confirm against the data).
data = cudf.from_pandas(data.fillna(0))

# Host copy used by MiniSom, which works on NumPy arrays. Computed once so the
# device-to-host transfer is not repeated for every SOM call below.
data_host = _to_host(data)


# ## TSNE

# In[9]:

tsne = TSNE(n_components=2, method='barnes_hut', random_state=23,
            perplexity=50, n_neighbors=500, learning_rate=5)
_started = time.perf_counter()
embedding = tsne.fit_transform(data)
_report_elapsed("TSNE fit_transform", _started)


# In[10]:

fig, ax = plt.subplots(1, figsize=(14, 10))
scatter = plt.scatter(_to_host(embedding[0]), _to_host(embedding[1]), s=0.01)


# ## PCA

# In[12]:

# PCA is fitted on the transposed table, so each component has one entry per
# original row; the two components are then used as scatter coordinates.
pca_float = PCA(n_components=2)
pca_float.fit(data.to_pandas().T)


# In[13]:

fig, ax = plt.subplots(1, figsize=(14, 10))
scatter = plt.scatter(pca_float.components_[0], pca_float.components_[1], s=.005)


# ## DBSCAN

# In[16]:

dbscan = cuml.DBSCAN(eps=25, min_samples=1000)
dbscan.fit(data)


# In[17]:

dbscan.labels_.value_counts()


# In[18]:

gdf = gpd.read_parquet("../../urbangrammar_samba/spatial_signatures/tessellation/tess_68.pq")


# In[19]:

gdf.plot(_to_host(dbscan.labels_), legend=True, figsize=(20, 20), categorical=True)


# In[21]:

cg = Clustergram(range(1, 20), backend='cuML', n_init=10)
cg.fit(data)


# In[22]:

ax = cg.plot(figsize=(20, 20), linewidth=0.5,
             cluster_style={"edgecolor": "r", "alpha": .6}, size=1,
             line_style={"alpha": .5})


# In[23]:

# Coarse two-way split; only the rows labelled 1 are clustered further below.
# NOTE(review): K-Means label numbering is arbitrary and no random_state is
# set, so confirm that label 1 is the intended group on each run.
out_rm = cuml.KMeans(n_clusters=2, n_init=10).fit(data)
out_rm.labels_.value_counts()

kept_rows = data[out_rm.labels_ == 1]
kept_mask = _to_host(out_rm.labels_) == 1


# In[24]:

cg = Clustergram(range(1, 20), backend='cuML', n_init=10)
cg.fit(kept_rows)
ax = cg.plot(figsize=(20, 20), linewidth=0.5,
             cluster_style={"edgecolor": "r", "alpha": .6}, size=1,
             line_style={"alpha": .5})


# In[30]:

ax = cg.plot(figsize=(20, 20), linewidth=0.5,
             cluster_style={"edgecolor": "r", "alpha": .6}, size=1,
             line_style={"alpha": .5})
ax.set_ylim(-10, 25)


# In[25]:

km = cuml.KMeans(n_clusters=6, n_init=10).fit(kept_rows)


# In[28]:

# Kept rows coloured by cluster, the remaining rows drawn in grey.
ax = gdf[kept_mask].plot(_to_host(km.labels_), legend=True, figsize=(20, 20),
                         categorical=True)
gdf[~kept_mask].plot(ax=ax, color='grey')


# In[29]:

ax = gdf[kept_mask].set_geometry('buildings').plot(
    _to_host(km.labels_), legend=True, figsize=(20, 20), categorical=True)
gdf[~kept_mask].set_geometry('buildings').plot(ax=ax, color='grey')


# In[31]:

km = cuml.KMeans(n_clusters=15, n_init=10).fit(kept_rows)


# In[35]:

ax = gdf[kept_mask].set_geometry('buildings').plot(
    _to_host(km.labels_), legend=True, figsize=(20, 20), categorical=True,
    cmap='tab20')
gdf[~kept_mask].set_geometry('buildings').plot(ax=ax, color='grey')


# In[34]:

km.labels_.value_counts()


# In[36]:

km = cuml.KMeans(n_clusters=15, n_init=10).fit(data)


# In[37]:

km.labels_.value_counts()


# In[50]:

ax = gdf.set_geometry('buildings').plot(
    _to_host(km.labels_), legend=True, figsize=(30, 30), categorical=True,
    cmap='tab20')


# In[39]:

ax = gdf.plot(_to_host(km.labels_), legend=True, figsize=(20, 20),
              categorical=True, cmap='tab20')


# ## Self-organising maps

# In[41]:

som = _train_som(data_host, (3, 3), sigma=.5, learning_rate=.25, iterations=50000)


# In[43]:

hashes = _winner_labels(som, data_host)


# In[44]:

ax = gdf.set_geometry('buildings').plot(hashes, legend=True, figsize=(20, 20),
                                        categorical=True)


# In[45]:

som = _train_som(data_host, (4, 4), sigma=.5, learning_rate=.25, iterations=50000)


# In[46]:

hashes = _winner_labels(som, data_host)


# In[47]:

ax = gdf.set_geometry('buildings').plot(hashes, legend=True, figsize=(30, 30),
                                        categorical=True)


# In[48]:

som = _train_som(data_host, (4, 4), sigma=.5, learning_rate=.5, iterations=50000)
hashes = _winner_labels(som, data_host)
ax = gdf.set_geometry('buildings').plot(hashes, legend=True, figsize=(30, 30),
                                        categorical=True)


# In[51]:

som = _train_som(data_host, (3, 3), sigma=1, learning_rate=.25, iterations=50000)
hashes = _winner_labels(som, data_host)
ax = gdf.set_geometry('buildings').plot(hashes, legend=True, figsize=(30, 30),
                                        categorical=True)


# In[57]:

# Shorter run (5000 iterations); the map was left unplotted in the notebook
# and is only used for the error metrics in the next cells.
som = _train_som(data_host, (3, 3), sigma=1, learning_rate=1, iterations=5000)


# In[54]:

som.topographic_error(data_host)


# In[55]:

som.quantization_error(data_host)


# In[56]:

help(som)


# In[64]:

# Grid search over map shape, topology, sigma and learning rate.
# The original value lists read [.1, .25, 5, .75, 1]; the 5 was a typo for .5.
som_shapes = [(3, 3), (3, 4), (4, 4), (4, 5), (5, 5)]
som_topologies = ['rectangular', 'hexagonal']
som_sigmas = [.1, .25, .5, .75, 1]
som_rates = [.1, .25, .5, .75, 1]

_started = time.perf_counter()
som_records = []
# itertools.product iterates in the same order as the equivalent nested loops.
for shape, topology, sigma, rate in itertools.product(
        som_shapes, som_topologies, som_sigmas, som_rates):
    som = _train_som(data_host, shape, sigma=sigma, learning_rate=rate,
                     iterations=5000, topology=topology, verbose=False)
    # Topographic error is skipped for hexagonal maps, as in the notebook
    # (presumably unsupported by MiniSom for that topology -- confirm).
    if topology == 'hexagonal':
        topographic = None
    else:
        topographic = som.topographic_error(data_host)
    som_records.append({
        'shape': shape,
        'topology': topology,
        'sigma': sigma,
        'learning rate': rate,
        'topographic_error': topographic,
        'quantization_error': som.quantization_error(data_host),
    })

# Build the frame once rather than growing it row by row with ``.loc``.
som_comp = pd.DataFrame(
    som_records,
    columns=['shape', 'topology', 'sigma', 'learning rate',
             'topographic_error', 'quantization_error'],
)
_report_elapsed("SOM grid search", _started)


# In[67]:

som_comp.loc[som_comp.quantization_error.idxmin()]


# In[68]:

som_comp.loc[som_comp.quantization_error.idxmax()]


# In[71]:

som_comp.loc[som_comp.topographic_error.dropna().astype(float).idxmin()]


# In[74]:

som = _train_som(data_host, (5, 5), sigma=.1, learning_rate=.1, iterations=5000,
                 topology="hexagonal")
hashes = _winner_labels(som, data_host)
ax = gdf.set_geometry('buildings').plot(hashes, legend=True, figsize=(30, 30),
                                        categorical=True, cmap="tab20")


# In[ ]:

som = _train_som(data_host, (4, 4), sigma=.25, learning_rate=.25, iterations=5000,
                 topology="hexagonal")
hashes = _winner_labels(som, data_host)
ax = gdf.set_geometry('buildings').plot(hashes, legend=True, figsize=(30, 30),
                                        categorical=True, cmap="tab20")


# In[ ]: