#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

import numpy as np
import pandas as pd
from pathlib import Path
import json
import matplotlib.pyplot as plt
import seaborn as sns
# from geopy.distance import distance


# In[3]:

PATH = Path('data')
list(PATH.iterdir())

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# from mpl_toolkits.basemap import Basemap
import folium
from folium.plugins import MarkerCluster, FastMarkerCluster


# In[4]:

df = pd.read_feather(PATH/'houston_ready.feather')


# # Density-based Spatial Clustering of Applications with Noise (DBSCAN)
# 
# Like mean-shift, DBSCAN is a density-based algorithm. One key difference is how it handles outliers: mean-shift assigns every point, outliers included, to some cluster, whereas DBSCAN labels them as noise.
# 
# Pros:
# - no need to specify the number of clusters up front
# - works well when clusters have similar density
# - supports arbitrary distance functions, such as haversine (great-circle) distance, which makes it a good fit for this GPS dataset
# 
# Cons:
# - struggles with clusters of varying density (the distance threshold and minimum point count become harder to estimate)

# In[4]:

# df_small = df.iloc[-100000:]


# In[5]:

coords = df[['latitude', 'longitude']].values


# In[6]:

from sklearn.cluster import DBSCAN


# In[7]:

# scikit-learn's haversine metric works in radians, so convert the desired
# 2 km neighborhood radius into radians using the mean Earth radius.
kms_per_radian = 6371.0088
epsilon = 2 / kms_per_radian


# In[ ]:

# A point is a core point if at least 100 samples fall within 2 km of it;
# coordinates are converted to radians to match the haversine metric.
db = DBSCAN(eps=epsilon, min_samples=100, algorithm='ball_tree',
            metric='haversine', n_jobs=-1)
cluster = db.fit_predict(np.radians(coords))
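

# A quick sanity check on the fit. This is a minimal sketch, not part of the original notebook; it assumes the cell above has run to completion, so `cluster` holds one label per row, with -1 marking noise.

# In[ ]:

# Count clusters (excluding the -1 noise label) and noise points.
n_clusters = len(set(cluster)) - (1 if -1 in cluster else 0)
n_noise = int(np.sum(cluster == -1))
print(f'{n_clusters} clusters found, {n_noise:,} points labeled as noise '
      f'({n_noise / len(cluster):.1%} of the data)')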
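

# The folium imports above suggest a map-based check. The sketch below is an assumption about how one might eyeball the result, not the notebook's own method: it places one marker per cluster at the arithmetic mean of its members' coordinates (a reasonable approximation at city scale), centered on an assumed Houston location.

# In[ ]:

# Map each cluster's centroid; noise points (label -1) are skipped.
m = folium.Map(location=[29.76, -95.37], zoom_start=10)  # assumed center: Houston
for label in np.unique(cluster):
    if label == -1:
        continue  # skip noise
    members = coords[cluster == label]
    lat, lon = members[:, 0].mean(), members[:, 1].mean()
    folium.Marker([lat, lon],
                  popup=f'cluster {label}: {len(members):,} points').add_to(m)
m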