#!/usr/bin/env python
# coding: utf-8

# # Spectral Clustering Example

# The image loaded here is a cropped portion of a Landsat 5 image of
# [Walker Lake](Walker_Lake.ipynb).
#
# In addition to `dask-ml`, we'll use `intake` and `rasterio` to read the data and
# `holoviews` with `datashader` to plot the figures. I'm working on my laptop, so
# either the threaded or the distributed scheduler would do; here I'll use the
# distributed scheduler for its diagnostics.

# In[ ]:

import rasterio
import numpy as np
import xarray as xr
import holoviews as hv
from holoviews import opts
from holoviews.operation.datashader import regrid
import cartopy.crs as ccrs
import dask.array as da
from dask.distributed import Client

hv.extension('bokeh')

# In[ ]:

import dask_ml

# In[ ]:

dask_ml.__version__

# In[ ]:

from dask_ml.cluster import SpectralClustering

# In[ ]:

client = Client(processes=False)
# client = Client(n_workers=8, threads_per_worker=1)
client

# In[ ]:

import intake

cat = intake.open_catalog('./catalog.yml')
list(cat)

# In[ ]:

landsat_5_img = cat.landsat_5.read_chunked()
landsat_5_img

# In[ ]:

crs = ccrs.epsg(32611)

# In[ ]:

# Crop to a square region of interest centered on Walker Lake and mask nodata.
x_center, y_center = crs.transform_point(-118.7081, 38.6942, ccrs.PlateCarree())
buffer = 1.7e4

xmin = x_center - buffer
xmax = x_center + buffer
ymin = y_center - buffer
ymax = y_center + buffer

ROI = landsat_5_img.sel(x=slice(xmin, xmax), y=slice(ymax, ymin))
ROI = ROI.where(ROI > ROI.nodatavals[0])

# In[ ]:

# Standardize the data to zero mean and unit variance.
bands = ROI.astype(float)
bands = (bands - bands.mean()) / bands.std()
bands

# In[ ]:

opts.defaults(
    opts.Image(invert_yaxis=True, width=250, height=250,
               tools=['hover'], cmap='viridis'))

# In[ ]:

# Plot the first three bands.
hv.Layout([regrid(hv.Image(band, kdims=['x', 'y'])) for band in bands[:3]])

# In[ ]:

flat_input = bands.stack(z=('y', 'x'))
flat_input

# In[ ]:

flat_input.shape

# We'll reshape the image into the `(n_samples, n_features)` layout that dask-ml /
# scikit-learn expect: one row per pixel, one column per spectral band. Then we'll
# persist that in memory. We still have a small dataset at this point. The large
# dataset, which dask helps us manage, is the intermediate `n_samples x n_samples`
# affinity matrix that exact spectral clustering operates on. For our 2,500 x 2,500
# pixel subset that's about 6.25 million samples, so the dense matrix would hold
# roughly 3.9e13 entries (~300 TB in float64); dask-ml avoids materializing it by
# approximating the affinity matrix.

# In[ ]:

X = flat_input.values.astype('float').T
X.shape

# In[ ]:

X = da.from_array(X, chunks=100_000)
X = client.persist(X)

# And we'll fit the estimator.

# In[ ]:

clf = SpectralClustering(n_clusters=4, random_state=0, gamma=None,
                         kmeans_params={'init_max_iter': 5},
                         persist_embedding=True)

# In[ ]:

get_ipython().run_line_magic('time', 'clf.fit(X)')

# In[ ]:

labels = clf.assign_labels_.labels_.compute()
labels.shape

# In[ ]:

# Restore the flat label vector to the 2-D image shape.
labels = labels.reshape(bands[0].shape)

# In[ ]:

# Plot the remaining bands.
hv.Layout([regrid(hv.Image(band, kdims=['x', 'y'])) for band in bands[3:]])

# In[ ]:

hv.Image(labels)
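# A back-of-the-envelope check of the memory claim above (a sketch added here, not
# part of the original analysis): the dense pairwise affinity matrix for `n` samples
# holds `n**2` entries, 8 bytes each in float64.

# In[ ]:

n_samples = X.shape[0]
dense_affinity_tb = n_samples ** 2 * 8 / 1e12
print(f"{n_samples:,} samples -> ~{dense_affinity_tb:,.0f} TB dense float64 affinity matrix")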
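# An optional variation (a sketch; the parameter values are illustrative, not tuned):
# dask-ml's `SpectralClustering` approximates the affinity matrix from a sample of
# `n_components` rows, so raising it from the default trades extra memory and time
# for a closer approximation.

# In[ ]:

clf_wide = SpectralClustering(n_clusters=4, random_state=0, gamma=None,
                              n_components=200,
                              kmeans_params={'init_max_iter': 5},
                              persist_embedding=True)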
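# A small consistency check (an added sketch, not in the original): reshaping the
# flat labels with `bands[0].shape` relies on `stack(z=('y', 'x'))` having flattened
# the pixels in y-then-x order, which these shapes should confirm.

# In[ ]:

ny, nx = bands[0].shape
assert flat_input.shape[-1] == ny * nx
assert labels.shape == (ny, nx)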
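# Finally, a minimal sketch (not in the original notebook): `hv.Image(labels)` above
# plots the clusters in bare pixel coordinates. Wrapping them in an `xarray.DataArray`
# that reuses the ROI's `x`/`y` coordinates keeps the cluster map georeferenced like
# the input bands. The name `label_map` is illustrative.

# In[ ]:

label_map = xr.DataArray(labels,
                         coords={'y': bands['y'], 'x': bands['x']},
                         dims=('y', 'x'),
                         name='cluster')
regrid(hv.Image(label_map, kdims=['x', 'y']))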