The goal of this notebook is to identify and extract clusters that reveal a user's habits and common routines, such as home, school, and work clusters.
%matplotlib inline
%reload_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from pathlib import Path
import json
import matplotlib.pyplot as plt
import seaborn as sns
# from geopy.distance import distance
# Directory holding the input data files (relative to the working directory).
PATH = Path('data')
# List the entries of the data directory.
list(PATH.iterdir())
from IPython.core.interactiveshell import InteractiveShell
# IPython option: echo the value of every top-level expression in a cell,
# not only the last one. This is why plotting calls below are assigned to `_`.
InteractiveShell.ast_node_interactivity = "all"
# from mpl_toolkits.basemap import Basemap
import folium
from folium.plugins import MarkerCluster,FastMarkerCluster
from sklearn.cluster import KMeans
k=5
import matplotlib.cm as cmx
import matplotlib.colors as mcolors
# from cycler import cycler
# def get_cmap(N):
# color_norm = mcolors.Normalize(vmin=0, vmax=N-1)
# return cmx.ScalarMappable(norm=color_norm, cmap='tab10').to_rgba
# num_colr = k
# cmap = get_cmap(num_colr)
# colr_list = [cmap(float(x)) for x in range(num_colr)]
# Load the prepared location table and take a first look at it.
df = pd.read_feather(PATH / 'houston_ready.feather')
df.head(3).transpose()
df.shape

# Scatter every recorded point: longitude on the x axis, latitude on the y axis.
fig, ax = plt.subplots(figsize=(20, 10))
_ = ax.scatter(
    df['longitude'],
    df['latitude'],
    c='blue',
    s=3,
    alpha=0.5,
)
_ = ax.set_ylabel('latitude')
_ = ax.set_xlabel('longitude')
Let's exclude flight GPS points
# recalculate miles differences
def haversine_array(lat1, lng1, lat2, lng2):
    """Return the great-circle (haversine) distance in kilometres.

    All four arguments are coordinates in degrees. They may be scalars or
    equal-length sequences/arrays; sequences are processed element-wise, so
    element ``i`` of the result is the distance from ``(lat1[i], lng1[i])``
    to ``(lat2[i], lng2[i])``.

    Returns a NumPy scalar for scalar input, or an array for sequence input.
    NaN inputs propagate to NaN outputs.
    """
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    AVG_EARTH_RADIUS = 6371  # in km
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2
    # Mathematically d lies in [0, 1], but floating-point rounding can push it
    # a hair above 1 for (near-)antipodal points, which would make arcsin
    # return NaN. Clamp it back into the valid domain.
    d = np.clip(d, 0.0, 1.0)
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))
    return h
# Current position of every row.
lat2 = df['latitude'].values.tolist()
long2 = df['longitude'].values.tolist()

# Previous position of every row (column shifted down by one). The first row
# has no predecessor, so it is paired with itself and gets a distance of 0.
lat1 = [lat2[0]] + df['latitude'].shift().values.tolist()[1:]
long1 = [long2[0]] + df['longitude'].shift().values.tolist()[1:]

# Distance between consecutive rows, in km, then stored in miles.
km_diff = haversine_array(lat1, long1, lat2, long2)
df['mile_diff'] = km_diff * 0.621371 # to miles

# Average size of the large (> 12 mile) jumps, per calendar day.
df.loc[df['mile_diff'] > 12].groupby(['year', 'month', 'day'])['mile_diff'].mean()

# A jump of more than 12 miles between consecutive rows is flagged as a flight.
df['plane'] = df['mile_diff'] > 12
# df.to_feather(PATH/'houston_ready.feather')

# Keep only the rows not flagged as flight, with a fresh 0..n-1 index.
df_ground = df.loc[~df['plane']].copy().reset_index(drop=True)
# Number of clusters to look for.
k = 5

# Fit k-means on the raw (latitude, longitude) pairs of the ground points and
# store each row's cluster label next to it.
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans = kmeans.fit(df_ground[['latitude', 'longitude']])
df_ground['cluster'] = kmeans.predict(df_ground[['latitude', 'longitude']])

# Same scatter as before, now coloured by cluster label.
fig, ax = plt.subplots(figsize=(20, 10))
_ = ax.scatter(
    df_ground['longitude'],
    df_ground['latitude'],
    c=df_ground['cluster'],
    cmap='tab10',
    s=3,
    alpha=1,
)
_ = ax.set_ylabel('latitude')
_ = ax.set_xlabel('longitude')
It looks like some residual in-flight GPS points are still left. We did get rid of most of the in-flight points on the way to Florida (bottom right) and around Dallas (above the dense Houston points).
K-means did a good job of separating the travel points (outside of Houston). Let's plot each cluster.
# One panel per cluster, stacked vertically.
fig, axes = plt.subplots(nrows=k, figsize=(15, 8 * k))
# plt.subplots returns a bare Axes (not an array) when nrows == 1; normalise
# so that axes[i] works for any k.
axes = np.atleast_1d(axes)

# Map cluster label -> RGBA colour from the 'tab10' colormap. The labels are
# spread over 0..k-1, which should match the colours of a scatter drawn with
# c=<labels>, cmap='tab10' when every label 0..k-1 is present.
# (The previous `colr_list` lookup was never defined: its only definition is
# commented out, so the loop raised NameError.)
cluster_norm = mcolors.Normalize(vmin=0, vmax=k - 1)
cluster_rgba = cmx.ScalarMappable(norm=cluster_norm, cmap='tab10').to_rgba

for i in range(k):
    # Select the cluster's rows once instead of filtering twice.
    members = df_ground[df_ground.cluster == i]
    # `color=` (not `c=`) so a single RGBA tuple is never mistaken for a
    # sequence of scalar values to be colour-mapped.
    _ = axes[i].scatter(members.longitude, members.latitude,
                        color=cluster_rgba(i), s=3, alpha=1)
After trying a few values of k, k=5 seems to fit best, as k-means picks out all the major states and cities I visited. Let's take a look at a few of them.
# Rows assigned to cluster 1 (treated here as the Florida trip).
df_flo = df_ground.loc[df_ground['cluster'] == 1].reset_index(drop=True)
df_flo.shape

# JavaScript marker factory handed to FastMarkerCluster: for each data row it
# builds a red "map-marker" AwesomeMarkers icon and places a Leaflet marker at
# (row[0], row[1]). Reused by the other map cells.
callback = """\
function (row) {
var icon, marker;
icon = L.AwesomeMarkers.icon({
icon: "map-marker", markerColor: "red"});
marker = L.marker(new L.LatLng(row[0], row[1]));
marker.setIcon(icon);
return marker;
};
"""

# Map centre as [latitude, longitude].
FLORIDA = [26.492328, -80.497977]
m_flo = folium.Map(location=FLORIDA, tiles='cartodbpositron', zoom_start=7)

# (latitude, longitude) pairs for every row of the cluster.
flo_points = list(zip(df_flo['latitude'].values, df_flo['longitude'].values))
FastMarkerCluster(data=flo_points, callback=callback).add_to(m_flo)
# Alternative rendering (disabled): one folium.CircleMarker per point with
# radius=1, color='#0080bb', fill_color='#0080bb'.
folium.LayerControl().add_to(m_flo)
# m_flo.save(str(PATH/'florida.html'))
m_flo
# Rows assigned to cluster 2 (treated here as the New York trip).
df_ny = df_ground.loc[df_ground['cluster'] == 2].reset_index(drop=True)
df_ny.shape

# Map centre as [latitude, longitude].
NY = [40.735954, -73.993896]
m_ny = folium.Map(location=NY, tiles='cartodbpositron', zoom_start=10)

# (latitude, longitude) pairs for every row of the cluster.
ny_points = list(zip(df_ny['latitude'].values, df_ny['longitude'].values))
FastMarkerCluster(data=ny_points, callback=callback).add_to(m_ny)
# Alternative rendering (disabled): one folium.CircleMarker per point with
# radius=1, color='#0080bb', fill_color='#0080bb'.
folium.LayerControl().add_to(m_ny)
# m_ny.save(str(PATH/'ny.html'))
m_ny
# Rows assigned to cluster 3 (treated here as the rest-of-Texas travel).
df_tx = df_ground.loc[df_ground['cluster'] == 3].reset_index(drop=True)
df_tx.shape

# Map centre as [latitude, longitude].
TX = [30.265253, -97.714187]
m_tx = folium.Map(location=TX, tiles='cartodbpositron', zoom_start=7)

# (latitude, longitude) pairs for every row of the cluster.
tx_points = list(zip(df_tx['latitude'].values, df_tx['longitude'].values))
FastMarkerCluster(data=tx_points, callback=callback).add_to(m_tx)
# Alternative rendering (disabled): one folium.CircleMarker per point with
# radius=1, color='#0080bb', fill_color='#0080bb'.
folium.LayerControl().add_to(m_tx)
# m_tx.save(str(PATH/'tx.html'))
m_tx
# Rows assigned to cluster 4 (treated here as the Maryland trip).
df_md = df_ground.loc[df_ground['cluster'] == 4].reset_index(drop=True)
df_md.shape

# Map centre as [latitude, longitude].
MD = [39.084967, -77.152883]
m_md = folium.Map(location=MD, tiles='cartodbpositron', zoom_start=10)

# (latitude, longitude) pairs for every row of the cluster.
md_points = list(zip(df_md['latitude'].values, df_md['longitude'].values))
FastMarkerCluster(data=md_points, callback=callback).add_to(m_md)
# Alternative rendering (disabled): one folium.CircleMarker per point with
# radius=1, color='#0080bb', fill_color='#0080bb'.
folium.LayerControl().add_to(m_md)
m_md
# Rows assigned to cluster 0 (treated here as the Houston home area).
df_h = df_ground.loc[df_ground['cluster'] == 0].reset_index(drop=True)
df_h.shape

# plot fewer data points for folium: draw at most n row positions at random
n = 50000
idxs = np.random.permutation(len(df_h))[:n]

# Map centre as [latitude, longitude].
H = [29.766672, -95.339652]
m_h = folium.Map(location=H, tiles='cartodbpositron', zoom_start=10)

# Pull the sampled rows once, then pair up their coordinates.
df_h_sample = df_h.loc[idxs, :]
h_points = list(zip(df_h_sample['latitude'].values, df_h_sample['longitude'].values))
FastMarkerCluster(data=h_points, callback=callback).add_to(m_h)
# Alternative rendering (disabled): one folium.CircleMarker per point with
# radius=1, color='#0080bb', fill_color='#0080bb'.
folium.LayerControl().add_to(m_h)
# m_h.save(str(PATH/'houston.html'))
m_h