#!/usr/bin/env python
# coding: utf-8

# # Spectral Clustering Example

# The image loaded here is a cropped portion of a Landsat 5 image of
# [Walker Lake](Walker_Lake.ipynb).
#
# In addition to `dask-ml`, we'll use `intake` and `rasterio` to read the data and
# `holoviews` with `datashader` to plot the figures. I'm working on my laptop, so
# either the threaded or the distributed scheduler would do; here I'll use the
# distributed scheduler for its diagnostics.

# In[ ]:

import rasterio
import numpy as np
import xarray as xr
import holoviews as hv
from holoviews import opts
from holoviews.operation.datashader import regrid
import cartopy.crs as ccrs
import dask.array as da
from dask.distributed import Client

hv.extension('bokeh')

# In[ ]:

import dask_ml

# In[ ]:

dask_ml.__version__

# In[ ]:

from dask_ml.cluster import SpectralClustering

# In[ ]:

client = Client(processes=False)
# client = Client(n_workers=8, threads_per_worker=1)
client

# In[ ]:

import intake

cat = intake.open_catalog('./catalog.yml')
list(cat)

# In[ ]:

landsat_5_img = cat.landsat_5.read_chunked()
landsat_5_img

# In[ ]:

crs = ccrs.epsg(32611)

# In[ ]:

# Crop to a square region of interest centered on Walker Lake and mask nodata.
x_center, y_center = crs.transform_point(-118.7081, 38.6942, ccrs.PlateCarree())
buffer = 1.7e4

xmin = x_center - buffer
xmax = x_center + buffer
ymin = y_center - buffer
ymax = y_center + buffer

ROI = landsat_5_img.sel(x=slice(xmin, xmax), y=slice(ymax, ymin))
ROI = ROI.where(ROI > ROI.nodatavals[0])

# In[ ]:

# Standardize the data to zero mean and unit variance.
bands = ROI.astype(float)
bands = (bands - bands.mean()) / bands.std()
bands

# In[ ]:

opts.defaults(
    opts.Image(invert_yaxis=True, width=250, height=250,
               tools=['hover'], cmap='viridis'))

# In[ ]:

# Plot the first three bands.
hv.Layout([regrid(hv.Image(band, kdims=['x', 'y'])) for band in bands[:3]])

# In[ ]:

flat_input = bands.stack(z=('y', 'x'))
flat_input

# In[ ]:

flat_input.shape

# We'll reshape the image into the `(n_samples, n_features)` layout that dask-ml /
# scikit-learn expect: one row per pixel, one column per spectral band. Then we'll
# persist that in memory. We still have a small dataset at this point. The large
# dataset, which dask helps us manage, is the intermediate `n_samples x n_samples`
# affinity matrix that exact spectral clustering operates on. For our 2,500 x 2,500
# pixel subset that's about 6.25 million samples, so the dense matrix would hold
# roughly 3.9e13 entries (~300 TB in float64); dask-ml avoids materializing it by
# approximating the affinity matrix.

# In[ ]:

X = flat_input.values.astype('float').T
X.shape

# In[ ]:

X = da.from_array(X, chunks=100_000)
X = client.persist(X)

# And we'll fit the estimator.

# In[ ]:

clf = SpectralClustering(n_clusters=4, random_state=0, gamma=None,
                         kmeans_params={'init_max_iter': 5},
                         persist_embedding=True)

# In[ ]:

get_ipython().run_line_magic('time', 'clf.fit(X)')

# In[ ]:

labels = clf.assign_labels_.labels_.compute()
labels.shape

# In[ ]:

# Restore the flat label vector to the 2-D image shape.
labels = labels.reshape(bands[0].shape)

# In[ ]:

# Plot the remaining bands.
hv.Layout([regrid(hv.Image(band, kdims=['x', 'y'])) for band in bands[3:]])

# In[ ]:

hv.Image(labels)
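# A back-of-the-envelope check of the memory claim above (a sketch added here, not
# part of the original analysis): the dense pairwise affinity matrix for `n` samples
# holds `n**2` entries, 8 bytes each in float64.

# In[ ]:

n_samples = X.shape[0]
dense_affinity_tb = n_samples ** 2 * 8 / 1e12
print(f"{n_samples:,} samples -> ~{dense_affinity_tb:,.0f} TB dense float64 affinity matrix")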
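# An optional variation (a sketch; the parameter values are illustrative, not tuned):
# dask-ml's `SpectralClustering` approximates the affinity matrix from a sample of
# `n_components` rows, so raising it from the default trades extra memory and time
# for a closer approximation.

# In[ ]:

clf_wide = SpectralClustering(n_clusters=4, random_state=0, gamma=None,
                              n_components=200,
                              kmeans_params={'init_max_iter': 5},
                              persist_embedding=True)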
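# A small consistency check (an added sketch, not in the original): reshaping the
# flat labels with `bands[0].shape` relies on `stack(z=('y', 'x'))` having flattened
# the pixels in y-then-x order, which these shapes should confirm.

# In[ ]:

ny, nx = bands[0].shape
assert flat_input.shape[-1] == ny * nx
assert labels.shape == (ny, nx)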
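# Finally, a minimal sketch (not in the original notebook): `hv.Image(labels)` above
# plots the clusters in bare pixel coordinates. Wrapping them in an `xarray.DataArray`
# that reuses the ROI's `x`/`y` coordinates keeps the cluster map georeferenced like
# the input bands. The name `label_map` is illustrative.

# In[ ]:

label_map = xr.DataArray(labels,
                         coords={'y': bands['y'], 'x': bands['x']},
                         dims=('y', 'x'),
                         name='cluster')
regrid(hv.Image(label_map, kdims=['x', 'y']))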