#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

import numpy as np
import pandas as pd
from pathlib import Path
import json
import matplotlib.pyplot as plt
import seaborn as sns
# from geopy.distance import distance


# In[3]:

PATH = Path('data')
list(PATH.iterdir())

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# from mpl_toolkits.basemap import Basemap
import folium
from folium.plugins import MarkerCluster, FastMarkerCluster


# In[4]:

df = pd.read_feather(PATH/'houston_ready.feather')


# # Density-based Spatial Clustering of Applications with Noise (DBSCAN)
# 
# Like mean-shift, DBSCAN is a density-based algorithm. One key difference is how it handles outliers: mean-shift assigns every point, outliers included, to some cluster, whereas DBSCAN labels them as noise.
# 
# Pros:
# - no need to specify the number of clusters up front
# - works well when clusters have similar density
# - supports arbitrary distance functions, such as haversine (great-circle) distance, which makes it a good fit for this GPS dataset
# 
# Cons:
# - struggles with clusters of varying density (the distance threshold and minimum point count become harder to estimate)

# In[4]:

# df_small = df.iloc[-100000:]


# In[5]:

coords = df[['latitude', 'longitude']].values


# In[6]:

from sklearn.cluster import DBSCAN


# In[7]:

# scikit-learn's haversine metric works in radians, so convert the desired
# 2 km neighborhood radius into radians using the mean Earth radius.
kms_per_radian = 6371.0088
epsilon = 2 / kms_per_radian


# In[ ]:

# A point is a core point if at least 100 samples fall within 2 km of it;
# coordinates are converted to radians to match the haversine metric.
db = DBSCAN(eps=epsilon, min_samples=100, algorithm='ball_tree',
            metric='haversine', n_jobs=-1)
cluster = db.fit_predict(np.radians(coords))
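

# A quick sanity check on the fit. This is a minimal sketch, not part of the original notebook; it assumes the cell above has run to completion, so `cluster` holds one label per row, with -1 marking noise.

# In[ ]:

# Count clusters (excluding the -1 noise label) and noise points.
n_clusters = len(set(cluster)) - (1 if -1 in cluster else 0)
n_noise = int(np.sum(cluster == -1))
print(f'{n_clusters} clusters found, {n_noise:,} points labeled as noise '
      f'({n_noise / len(cluster):.1%} of the data)')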
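

# The folium imports above suggest a map-based check. The sketch below is an assumption about how one might eyeball the result, not the notebook's own method: it places one marker per cluster at the arithmetic mean of its members' coordinates (a reasonable approximation at city scale), centered on an assumed Houston location.

# In[ ]:

# Map each cluster's centroid; noise points (label -1) are skipped.
m = folium.Map(location=[29.76, -95.37], zoom_start=10)  # assumed center: Houston
for label in np.unique(cluster):
    if label == -1:
        continue  # skip noise
    members = coords[cluster == label]
    lat, lon = members[:, 0].mean(), members[:, 1].mean()
    folium.Marker([lat, lon],
                  popup=f'cluster {label}: {len(members):,} points').add_to(m)
m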