%matplotlib inline
%reload_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from pathlib import Path
import json
import matplotlib.pyplot as plt
import seaborn as sns
# from geopy.distance import distance
PATH = Path('data')
list(PATH.iterdir())
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# from mpl_toolkits.basemap import Basemap
import folium
from folium.plugins import MarkerCluster,FastMarkerCluster
# Output of list(PATH.iterdir()):
# [PosixPath('data/ny.html'), PosixPath('data/houston_processed.feather'), PosixPath('data/florida.html'), PosixPath('data/tx.html'), PosixPath('data/houston_ready.feather'), PosixPath('data/houston.html'), PosixPath('data/location_history.json'), PosixPath('data/houston_processed_miles_time_diff.feather')]
# Load the pre-processed Houston location history produced by an earlier step.
df = pd.read_feather(PATH / 'houston_ready.feather')
This algorithm is similar to mean-shift (both are density-based). One difference is how DBSCAN handles outliers: mean-shift assigns outliers to a cluster, whereas DBSCAN labels them as noise.
Pros:
Cons:
# df_small = df.iloc[-100000:]
from sklearn.cluster import DBSCAN

# scikit-learn's haversine metric expects (lat, lon) pairs in radians and
# returns great-circle distances in radians, so a kilometre radius must be
# divided by the Earth's radius before being used as eps.
kms_per_radian = 6371.0088  # mean Earth radius in km
epsilon = 2 / kms_per_radian  # 2 km neighbourhood radius, expressed in radians

coords = df[['latitude', 'longitude']].values  # coordinates in degrees
db = DBSCAN(eps=epsilon, min_samples=100, algorithm='ball_tree',
            metric='haversine', n_jobs=-1)
# fit_predict returns one label per row; -1 marks points DBSCAN deems noise.
cluster = db.fit_predict(np.radians(coords))