#!/usr/bin/env python
# coding: utf-8

# In[1]:


# gpu
import cudf
import cuml
import rmm
from cuml.manifold import TSNE

# cpu
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from clustergram import Clustergram
from minisom import MiniSom
import numpy as np


# In[2]:


# Re-initialise the RAPIDS memory manager so device allocations use CUDA
# managed memory (presumably so the working set may exceed GPU memory;
# confirm against the RMM documentation for the installed version).
rmm.reinitialize(managed_memory=True)
# Explicit check instead of `assert`, which is skipped when Python runs
# with -O and would let the script continue with an unusable allocator.
if not rmm.is_initialized():
    raise RuntimeError("RMM is not initialized after rmm.reinitialize(managed_memory=True)")


# In[3]:


# Read the parquet table, use the `hindex` column as the row index and
# replace missing values with zeros.
df = (
    pd.read_parquet("../../urbangrammar_samba/spatial_signatures/morphometrics/convolutions/conv_68.pq")
    .set_index('hindex')
    .fillna(0)
)


# In[6]:


# Column-wise z-scores: subtract each column's mean and divide by its
# standard deviation.
data = df.sub(df.mean()).div(df.std())


# In[7]:


# Zero out any NaN produced by the division (e.g. a column with zero
# standard deviation), then copy the table to the GPU as a cuDF DataFrame.
data = cudf.from_pandas(data.fillna(0))


# ## TSNE

# In[8]:


from cuml.manifold import TSNE
import matplotlib.pyplot as plt


# In[9]:


# Two-dimensional Barnes-Hut t-SNE embedding of the standardised data,
# computed with cuML and timed with the IPython `%time` magic.
tsne = TSNE(
    n_components=2,
    method='barnes_hut',
    random_state=23,
    perplexity=50,
    n_neighbors=500,
    learning_rate=5,
)
get_ipython().run_line_magic('time', 'embedding = tsne.fit_transform(data)')


# In[10]:


# Bring the embedding to host memory through pandas. `Series.to_array()`
# is no longer available in current cuDF releases, whereas `to_pandas()`
# works on both old and new versions.
embedding_host = embedding.to_pandas()
fig, ax = plt.subplots(1, figsize=(14, 10))
scatter = ax.scatter(
    embedding_host[0].to_numpy(),
    embedding_host[1].to_numpy(),
    s=0.01,
)


# ## PCA

# In[12]:


# Two-component PCA computed on the CPU with scikit-learn.
# NOTE(review): the model is fitted on the *transposed* table, so each row
# of `components_` has one value per original row of `data`, and those
# values (not `transform` output) are what gets plotted below. Confirm
# this is the intended projection.
pca_float = PCA(n_components=2)
transposed = data.to_pandas().T
pca_float.fit(transposed)


# In[13]:


# Scatter the first fitted component against the second.
first_component, second_component = pca_float.components_
fig, ax = plt.subplots(1, figsize=(14, 10))
scatter = ax.scatter(first_component, second_component, s=.005)


# ## DBSCAN

# In[16]:


# GPU DBSCAN over the standardised data.
dbscan = cuml.DBSCAN(eps=25, min_samples=1000)
dbscan.fit(data)


# In[17]:


# Number of observations per DBSCAN label (shown as cell output in a notebook).
dbscan.labels_.value_counts()


# In[18]:


# Geometries used for mapping the cluster labels.
gdf = gpd.read_parquet("../../urbangrammar_samba/spatial_signatures/tessellation/tess_68.pq")


# In[19]:


# Labels are passed positionally, which assumes `gdf` rows are in the same
# order as `data` rows (TODO confirm).
# `Series.to_array()` has been removed from current cuDF, so the labels are
# copied to host memory through pandas instead.
gdf.plot(
    dbscan.labels_.to_pandas().to_numpy(),
    legend=True,
    figsize=(20, 20),
    categorical=True,
)


# In[21]:


# Clustergram over k = 1..19 on the full standardised data, using the cuML
# backend with 10 initialisations per k.
k_range = range(1, 20)
cg = Clustergram(k_range, backend='cuML', n_init=10)
cg.fit(data)


# In[22]:


# Draw the clustergram with thin, semi-transparent lines and red-edged
# cluster markers.
ax = cg.plot(
    figsize=(20, 20),
    linewidth=0.5,
    cluster_style={"edgecolor": "r", "alpha": .6},
    size=1,
    line_style={"alpha": .5},
)


# In[23]:


# Split the data into two k-means groups and inspect their sizes.
out_rm = cuml.KMeans(n_clusters=2, n_init=10)
out_rm.fit(data)
out_rm.labels_.value_counts()


# In[24]:


# Re-run the clustergram only on observations labelled 1 by the two-cluster
# model.
# NOTE(review): label 1 is hard-coded; presumably it is the group to keep
# after checking the counts above. Verify if the data changes.
label_one_rows = data[out_rm.labels_ == 1]
cg = Clustergram(range(1, 20), backend='cuML', n_init=10)
cg.fit(label_one_rows)
ax = cg.plot(
    figsize=(20, 20),
    linewidth=0.5,
    cluster_style={"edgecolor": "r", "alpha": .6},
    size=1,
    line_style={"alpha": .5},
)


# In[30]:


# Same figure again, with the y-axis limited to the range -10..25.
ax = cg.plot(
    figsize=(20, 20),
    linewidth=0.5,
    cluster_style={"edgecolor": "r", "alpha": .6},
    size=1,
    line_style={"alpha": .5},
)
ax.set_ylim(-10, 25)


# In[25]:


# Six-cluster k-means restricted to the rows labelled 1 by `out_rm`.
km = cuml.KMeans(n_clusters=6, n_init=10).fit(data[out_rm.labels_ == 1])


# In[28]:


# Host-side copies are made once and reused by both maps. `to_array()` has
# been removed from current cuDF, so the conversion goes through pandas.
in_label_one = out_rm.labels_.to_pandas().to_numpy() == 1
km_labels = km.labels_.to_pandas().to_numpy()

# Map of the clustered rows coloured by k-means label, with all remaining
# rows drawn in grey on the same axes. Assumes `gdf` rows are in the same
# order as `data` rows (TODO confirm).
ax = gdf[in_label_one].plot(km_labels, legend=True, figsize=(20, 20), categorical=True)
gdf[~in_label_one].plot(ax=ax, color='grey')


# In[29]:


# Same map drawn with the `buildings` geometry column instead of the
# active geometry.
ax = gdf[in_label_one].set_geometry('buildings').plot(km_labels, legend=True, figsize=(20, 20), categorical=True)
gdf[~in_label_one].set_geometry('buildings').plot(ax=ax, color='grey')


# In[31]:


# Fifteen-cluster k-means restricted to the rows labelled 1 by `out_rm`.
km = cuml.KMeans(n_clusters=15, n_init=10).fit(data[out_rm.labels_ == 1])


# In[35]:


# Host-side mask computed once and reused. `to_array()` has been removed
# from current cuDF, so the conversion goes through pandas.
in_label_one = out_rm.labels_.to_pandas().to_numpy() == 1

# Building footprints of the clustered rows coloured by k-means label
# (tab20 palette); all remaining rows are drawn in grey on the same axes.
ax = gdf[in_label_one].set_geometry('buildings').plot(
    km.labels_.to_pandas().to_numpy(),
    legend=True,
    figsize=(20, 20),
    categorical=True,
    cmap='tab20',
)
gdf[~in_label_one].set_geometry('buildings').plot(ax=ax, color='grey')


# In[34]:


# Number of observations per cluster (shown as cell output in a notebook).
km.labels_.value_counts()


# In[36]:


# Fifteen-cluster k-means on the full standardised data.
km = cuml.KMeans(n_clusters=15, n_init=10).fit(data)


# In[37]:


# Number of observations per cluster (shown as cell output in a notebook).
km.labels_.value_counts()


# In[50]:


# One host-side copy of the labels serves both maps. `to_array()` has been
# removed from current cuDF, so the conversion goes through pandas.
km_labels = km.labels_.to_pandas().to_numpy()

# Labels mapped onto the `buildings` geometry column.
ax = gdf.set_geometry('buildings').plot(km_labels, legend=True, figsize=(30, 30), categorical=True, cmap='tab20')


# In[39]:


# Labels mapped onto the active geometry of `gdf`.
ax = gdf.plot(km_labels, legend=True, figsize=(20, 20), categorical=True, cmap='tab20')


# In[41]:


# 3x3 self-organising map: sigma 0.5, learning rate 0.25, 50,000 iterations.
som_shape = (3, 3)
# MiniSom consumes NumPy arrays, so take a single host copy of the GPU
# table and reuse it for training and winner lookup. `DataFrame.as_matrix()`
# has been removed from current cuDF; `to_pandas()` works across versions.
som_input = data.to_pandas().to_numpy()
som = MiniSom(som_shape[0], som_shape[1], data.shape[1], sigma=.5, learning_rate=.25,
              neighborhood_function='gaussian', random_seed=10)
som.train_batch(som_input, 50000, verbose=True)


# In[43]:


# Grid coordinates of the winning neuron for every observation.
winner_coordinates = np.array([som.winner(x) for x in som_input])
# One "(row, col)" string label per observation. Built with a comprehension
# because np.apply_along_axis sizes its string output from the first result
# (later, longer labels could be truncated) and str(tuple(...)) depends on
# the NumPy scalar repr.
hashes = np.array([f"({row}, {col})" for row, col in winner_coordinates])


# In[44]:


# Map the winning-neuron labels onto the `buildings` geometry column.
ax = gdf.set_geometry('buildings').plot(hashes, legend=True, figsize=(20, 20), categorical=True)


# In[45]:


# 4x4 self-organising map: sigma 0.5, learning rate 0.25, 50,000 iterations.
som_shape = (4, 4)
# MiniSom consumes NumPy arrays, so take a single host copy of the GPU
# table and reuse it for training and winner lookup. `DataFrame.as_matrix()`
# has been removed from current cuDF; `to_pandas()` works across versions.
som_input = data.to_pandas().to_numpy()
som = MiniSom(som_shape[0], som_shape[1], data.shape[1], sigma=.5, learning_rate=.25,
              neighborhood_function='gaussian', random_seed=10)
som.train_batch(som_input, 50000, verbose=True)


# In[46]:


# Grid coordinates of the winning neuron for every observation.
winner_coordinates = np.array([som.winner(x) for x in som_input])
# One "(row, col)" string label per observation. Built with a comprehension
# because np.apply_along_axis sizes its string output from the first result
# (later, longer labels could be truncated) and str(tuple(...)) depends on
# the NumPy scalar repr.
hashes = np.array([f"({row}, {col})" for row, col in winner_coordinates])


# In[47]:


# Map the winning-neuron labels onto the `buildings` geometry column.
ax = gdf.set_geometry('buildings').plot(hashes, legend=True, figsize=(30, 30), categorical=True)


# In[48]:


# 4x4 self-organising map: sigma 0.5, learning rate 0.5, 50,000 iterations.
som_shape = (4, 4)
# MiniSom consumes NumPy arrays, so take a single host copy of the GPU
# table and reuse it for training and winner lookup. `DataFrame.as_matrix()`
# has been removed from current cuDF; `to_pandas()` works across versions.
som_input = data.to_pandas().to_numpy()
som = MiniSom(som_shape[0], som_shape[1], data.shape[1], sigma=.5, learning_rate=.5,
              neighborhood_function='gaussian', random_seed=10)
som.train_batch(som_input, 50000, verbose=True)
# Grid coordinates of the winning neuron for every observation.
winner_coordinates = np.array([som.winner(x) for x in som_input])
# One "(row, col)" string label per observation. Built with a comprehension
# because np.apply_along_axis sizes its string output from the first result
# (later, longer labels could be truncated) and str(tuple(...)) depends on
# the NumPy scalar repr.
hashes = np.array([f"({row}, {col})" for row, col in winner_coordinates])
# Map the winning-neuron labels onto the `buildings` geometry column.
ax = gdf.set_geometry('buildings').plot(hashes, legend=True, figsize=(30, 30), categorical=True)


# In[51]:


# 3x3 self-organising map: sigma 1, learning rate 0.25, 50,000 iterations.
som_shape = (3, 3)
# MiniSom consumes NumPy arrays, so take a single host copy of the GPU
# table and reuse it for training and winner lookup. `DataFrame.as_matrix()`
# has been removed from current cuDF; `to_pandas()` works across versions.
som_input = data.to_pandas().to_numpy()
som = MiniSom(som_shape[0], som_shape[1], data.shape[1], sigma=1, learning_rate=.25,
              neighborhood_function='gaussian', random_seed=10)
som.train_batch(som_input, 50000, verbose=True)
# Grid coordinates of the winning neuron for every observation.
winner_coordinates = np.array([som.winner(x) for x in som_input])
# One "(row, col)" string label per observation. Built with a comprehension
# because np.apply_along_axis sizes its string output from the first result
# (later, longer labels could be truncated) and str(tuple(...)) depends on
# the NumPy scalar repr.
hashes = np.array([f"({row}, {col})" for row, col in winner_coordinates])
# Map the winning-neuron labels onto the `buildings` geometry column.
ax = gdf.set_geometry('buildings').plot(hashes, legend=True, figsize=(30, 30), categorical=True)


# In[57]:


# 3x3 self-organising map: sigma 1, learning rate 1, 5,000 iterations.
# Winner labelling and map plotting are not run for this configuration;
# only the error metrics below are inspected.
som_shape = (3, 3)
# MiniSom consumes NumPy arrays, so take a single host copy of the GPU
# table and reuse it for training and both error metrics.
# `DataFrame.as_matrix()` has been removed from current cuDF; `to_pandas()`
# works across versions.
som_input = data.to_pandas().to_numpy()
som = MiniSom(som_shape[0], som_shape[1], data.shape[1], sigma=1, learning_rate=1,
              neighborhood_function='gaussian', random_seed=10)
som.train_batch(som_input, 5000, verbose=True)


# In[54]:


# Topographic error of the trained map on the training data.
som.topographic_error(som_input)


# In[55]:


# Quantization error of the trained map on the training data.
som.quantization_error(som_input)


# In[56]:


# Print the MiniSom API documentation for reference.
help(som)


# In[64]:


# Timed grid search over SOM shape, topology, sigma and learning rate.
# Each combination is trained for 5,000 iterations and its errors are
# stored as one row of `som_comp`.
# Changes from the previous version of this cell:
#   * the sigma and learning-rate grids contained `5` where the otherwise
#     evenly spaced sequence (.1, .25, _, .75, 1) calls for `.5`;
#   * the host copy of the data is made once, before the loops, instead of
#     up to three times per combination. `DataFrame.as_matrix()` has also
#     been removed from current cuDF, so the copy goes through `to_pandas()`.
get_ipython().run_cell_magic('time', '', """\
som_input = data.to_pandas().to_numpy()
som_comp = pd.DataFrame(columns=['shape', 'topology', 'sigma', 'learning rate', 'topographic_error', 'quantization_error'])
i = 0
for shape in [(3, 3), (3, 4), (4, 4), (4, 5), (5, 5)]:
    for topology in ['rectangular', 'hexagonal']:
        for sigma in [.1, .25, .5, .75, 1]:
            for rate in [.1, .25, .5, .75, 1]:
                som = MiniSom(shape[0], shape[1], data.shape[1], sigma=sigma, learning_rate=rate,
                              neighborhood_function='gaussian', random_seed=10, topology=topology)
                som.train_batch(som_input, 5000, verbose=False)
                # Topographic error is only recorded for the rectangular topology.
                topographic = None if topology == 'hexagonal' else som.topographic_error(som_input)
                som_comp.loc[i] = [shape, topology, sigma, rate, topographic, som.quantization_error(som_input)]
                i += 1
""")


# In[67]:


# Configuration with the lowest quantization error.
som_comp.loc[som_comp.quantization_error.idxmin()]


# In[68]:


# Configuration with the highest quantization error.
som_comp.loc[som_comp.quantization_error.idxmax()]


# In[71]:


# Configuration with the lowest topographic error. Rows without a value
# (hexagonal topology) are dropped before the comparison.
som_comp.loc[som_comp.topographic_error.dropna().astype(float).idxmin()]


# In[74]:


# 5x5 hexagonal self-organising map: sigma 0.1, learning rate 0.1,
# 5,000 iterations.
som_shape = (5, 5)
# MiniSom consumes NumPy arrays, so take a single host copy of the GPU
# table and reuse it for training and winner lookup. `DataFrame.as_matrix()`
# has been removed from current cuDF; `to_pandas()` works across versions.
som_input = data.to_pandas().to_numpy()
som = MiniSom(som_shape[0], som_shape[1], data.shape[1], sigma=.1, learning_rate=.1,
              neighborhood_function='gaussian', random_seed=10, topology="hexagonal")
som.train_batch(som_input, 5000, verbose=True)
# Grid coordinates of the winning neuron for every observation.
winner_coordinates = np.array([som.winner(x) for x in som_input])
# One "(row, col)" string label per observation. Built with a comprehension
# because np.apply_along_axis sizes its string output from the first result
# (later, longer labels could be truncated) and str(tuple(...)) depends on
# the NumPy scalar repr.
hashes = np.array([f"({row}, {col})" for row, col in winner_coordinates])
# Map the winning-neuron labels onto the `buildings` geometry column.
ax = gdf.set_geometry('buildings').plot(hashes, legend=True, figsize=(30, 30), categorical=True, cmap="tab20")


# In[ ]:


# 4x4 hexagonal self-organising map: sigma 0.25, learning rate 0.25,
# 5,000 iterations.
som_shape = (4, 4)
# MiniSom consumes NumPy arrays, so take a single host copy of the GPU
# table and reuse it for training and winner lookup. `DataFrame.as_matrix()`
# has been removed from current cuDF; `to_pandas()` works across versions.
som_input = data.to_pandas().to_numpy()
som = MiniSom(som_shape[0], som_shape[1], data.shape[1], sigma=.25, learning_rate=.25,
              neighborhood_function='gaussian', random_seed=10, topology="hexagonal")
som.train_batch(som_input, 5000, verbose=True)
# Grid coordinates of the winning neuron for every observation.
winner_coordinates = np.array([som.winner(x) for x in som_input])
# One "(row, col)" string label per observation. Built with a comprehension
# because np.apply_along_axis sizes its string output from the first result
# (later, longer labels could be truncated) and str(tuple(...)) depends on
# the NumPy scalar repr.
hashes = np.array([f"({row}, {col})" for row, col in winner_coordinates])
# Map the winning-neuron labels onto the `buildings` geometry column.
ax = gdf.set_geometry('buildings').plot(hashes, legend=True, figsize=(30, 30), categorical=True, cmap="tab20")


# In[ ]: