Unsupervised modeling

The goal of this notebook is to identify and extract clusters that reveal a user's habits and common routines, such as home/school/work clusters.

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import json
import matplotlib.pyplot as plt
import seaborn as sns
# from geopy.distance import distance

# Directory holding the input feather files and the exported HTML maps.
PATH = Path('data')
# Show what is currently in the data directory.
list(PATH.iterdir())

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
Out[2]:
[PosixPath('data/ny.html'),
 PosixPath('data/houston_processed.feather'),
 PosixPath('data/florida.html'),
 PosixPath('data/tx.html'),
 PosixPath('data/houston_ready.feather'),
 PosixPath('data/houston.html'),
 PosixPath('data/location_history.json'),
 PosixPath('data/houston_processed_miles_time_diff.feather')]
In [3]:
# from mpl_toolkits.basemap import Basemap
import folium
from folium.plugins import MarkerCluster,FastMarkerCluster

from sklearn.cluster import KMeans
k=5

import matplotlib.cm as cmx
import matplotlib.colors as mcolors
# from cycler import cycler

# def get_cmap(N):
#     color_norm  = mcolors.Normalize(vmin=0, vmax=N-1)
#     return cmx.ScalarMappable(norm=color_norm, cmap='tab10').to_rgba
# num_colr = k
# cmap = get_cmap(num_colr)
# colr_list = [cmap(float(x)) for x in range(num_colr)]
In [4]:
df = pd.read_feather(PATH/'houston_ready.feather')
In [5]:
df.head(3).T
df.shape
Out[5]:
0 1 2
accuracy 30 21 1259
altitude 0 0 0
heading NaN NaN NaN
velocity NaN NaN NaN
verticalAccuracy NaN NaN NaN
act_conf1 46 NaN 100
act_type1 UNKNOWN None TILTING
act_cont2 27 NaN NaN
act_type2 IN_VEHICLE None None
extra_intVal NaN NaN NaN
extra_name None None None
extra_type None None None
date_time 2015-11-30 21:41:32.110000-06:00 2015-11-30 21:43:05.510000-06:00 2015-11-30 21:47:40.071000-06:00
year 2015 2015 2015
month 11 11 11
day 30 30 30
day_of_week 0 0 0
hour 21 21 21
minute 41 43 47
latitude 29.6894 29.6893 29.6924
longitude -95.2712 -95.2712 -95.2805
cluster 0 0 0
mile_diff 0 0.00493873 0.599316
plane False False False
Out[5]:
(341045, 24)

Clustering and plotting clusters

In [15]:
# Overview of every recorded GPS fix: longitude on x, latitude on y.
fig, ax = plt.subplots(figsize=(20, 10))
_ = ax.scatter(x=df['longitude'], y=df['latitude'], s=3, c='blue', alpha=0.5)
_ = ax.set_xlabel('longitude')
_ = ax.set_ylabel('latitude')

Let's exclude flight GPS points

In [6]:
# recalculate miles differences
def haversine_array(lat1, lng1, lat2, lng2):
    """Great-circle (haversine) distance in kilometres between paired points.

    Parameters
    ----------
    lat1, lng1 : float or array-like
        Latitude / longitude of the start points, in decimal degrees.
    lat2, lng2 : float or array-like
        Latitude / longitude of the end points, in decimal degrees.
        Inputs are combined element-wise by NumPy.

    Returns
    -------
    float or numpy.ndarray
        Distance in km, using a mean Earth radius of 6371 km.
        NaN inputs produce NaN outputs.
    """
    AVG_EARTH_RADIUS = 6371  # in km
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    dlat = lat2 - lat1
    dlng = lng2 - lng1
    d = np.sin(dlat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlng * 0.5) ** 2
    # Floating-point rounding can push d marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would turn sqrt/arcsin into NaN; clamp first.
    d = np.clip(d, 0.0, 1.0)
    return 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))

# Current position of every row.
lat2 = df['latitude'].values
long2 = df['longitude'].values

# Position of the preceding row; the first row has no predecessor,
# so it is paired with itself and gets a distance of 0.
lat1 = df['latitude'].shift().values.copy()
long1 = df['longitude'].shift().values.copy()
lat1[0], long1[0] = lat2[0], long2[0]

km_diff = haversine_array(lat1, long1, lat2, long2)
df['mile_diff'] = km_diff * 0.621371  # to miles
In [10]:
df[df.mile_diff>12].groupby(['year','month','day']).mile_diff.mean()
Out[10]:
year  month  day
2016  4      18      19.314191
      5      31     162.667397
      6      3      708.936381
      10     13     236.593397
      12     21      26.120457
             22     938.012779
             24     139.839309
             26      25.258437
             29      95.911094
2017  1      3      139.469908
Name: mile_diff, dtype: float64
In [11]:
# Flag fixes more than 12 miles away from the previous fix as flight points.
df['plane'] = df['mile_diff'].gt(12)
In [8]:
# df.to_feather(PATH/'houston_ready.feather')
In [12]:
# Independent copy of the rows not flagged as flight points, re-indexed 0..n-1.
df_ground = df.loc[~df['plane']].copy().reset_index(drop=True)

Apply kmeans clustering

In [13]:
k = 5
# K-means runs on raw (latitude, longitude) pairs, i.e. Euclidean distance in degrees.
ground_coords = df_ground[['latitude', 'longitude']]
# n_init is pinned explicitly: its default differs between scikit-learn releases,
# and the cells below hard-code cluster ids, so the fit should not depend on
# whichever default the installed version happens to use.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(ground_coords)
df_ground['cluster'] = kmeans.predict(ground_coords)

fig, ax = plt.subplots(figsize=(20, 10))
_ = ax.scatter(df_ground.longitude, df_ground.latitude,
               c=df_ground.cluster, cmap='tab10', s=3, alpha=1)

_ = ax.set_ylabel('latitude')
_ = ax.set_xlabel('longitude')

It looks like some residual flight GPS points are still left. We did get rid of most of the flight GPS points to Florida (bottom right) and around Dallas (above the dense Houston points).

K-means did a good job of separating the travel points (outside of Houston). Let's plot each cluster separately.

In [20]:
# One panel per cluster. The colours are built here with the same Normalize + 'tab10'
# mapping that scatter(c=cluster, cmap='tab10') applies, instead of relying on a
# `colr_list` global whose only definition is commented out.
color_norm = mcolors.Normalize(vmin=0, vmax=k - 1)
to_rgba = cmx.ScalarMappable(norm=color_norm, cmap='tab10').to_rgba

fig, axes = plt.subplots(nrows=k, figsize=(15, 8 * k))
# plt.subplots returns a bare Axes when nrows == 1; keep `axes` indexable.
axes = np.atleast_1d(axes)
for i in range(k):
    members = df_ground[df_ground.cluster == i]
    # `color=` rather than `c=`, so a single RGBA tuple is never read as per-point values.
    _ = axes[i].scatter(members.longitude, members.latitude,
                        color=to_rgba(float(i)), s=3, alpha=1)
    _ = axes[i].set_title('cluster {}'.format(i))

After trying a few values of k, k=5 seems to fit best, as k-means recognizes all the major states and cities I visited. Let's take a look at a few of them.

Mapping GPS points with Folium

cluster 1: Florida

In [11]:
# Rows assigned to cluster 1 (Florida), re-indexed 0..n-1.
df_flo = df_ground.loc[df_ground['cluster'] == 1].reset_index(drop=True)
df_flo.shape
Out[11]:
(8830, 24)
In [93]:
# JavaScript handed to FastMarkerCluster as its `callback`: for a data row it
# builds a Leaflet marker at (row[0], row[1]) -- the latitude/longitude pairs
# passed as `data` -- and gives it a red "map-marker" AwesomeMarkers icon.
callback = """\
function (row) {
    var icon, marker;
    icon = L.AwesomeMarkers.icon({
        icon: "map-marker", markerColor: "red"});
    marker = L.marker(new L.LatLng(row[0], row[1]));
    marker.setIcon(icon);
    return marker;
};
"""
In [14]:
FLORIDA = [26.492328, -80.497977]

# Base map centred on FLORIDA.
m_flo = folium.Map(zoom_start=7, tiles='cartodbpositron', location=FLORIDA)

# One clustered marker layer holding every (lat, lon) pair of the Florida cluster.
FastMarkerCluster(
    callback=callback,
    data=df_flo[['latitude', 'longitude']].values.tolist(),
).add_to(m_flo)

# Alternative rendering that was tried: one folium.CircleMarker per point
# (radius=1, color / fill_color '#0080bb') instead of a clustered layer.

folium.LayerControl().add_to(m_flo)
Out[14]:
<folium.plugins.fast_marker_cluster.FastMarkerCluster at 0x7f53db2b6320>
Out[14]:
<folium.map.LayerControl at 0x7f53db2b6d30>
In [13]:
# m_flo.save(str(PATH/'florida.html'))
In [15]:
m_flo
Out[15]:

cluster 2: NY

In [16]:
# Rows assigned to cluster 2 (NY), re-indexed 0..n-1.
df_ny = df_ground.loc[df_ground['cluster'] == 2].reset_index(drop=True)
df_ny.shape
Out[16]:
(3165, 24)
In [19]:
NY = [40.735954, -73.993896]

# Base map centred on NY.
m_ny = folium.Map(zoom_start=10, tiles='cartodbpositron', location=NY)

# One clustered marker layer holding every (lat, lon) pair of the NY cluster.
FastMarkerCluster(
    callback=callback,
    data=df_ny[['latitude', 'longitude']].values.tolist(),
).add_to(m_ny)

# Alternative rendering that was tried: one folium.CircleMarker per point
# (radius=1, color / fill_color '#0080bb') instead of a clustered layer.

folium.LayerControl().add_to(m_ny)
Out[19]:
<folium.plugins.fast_marker_cluster.FastMarkerCluster at 0x7f53d0ef7828>
Out[19]:
<folium.map.LayerControl at 0x7f53d0ef7710>
In [18]:
# m_ny.save(str(PATH/'ny.html'))
In [20]:
m_ny
Out[20]:

Cluster 3: Austin + San Antonio + Dallas

In [21]:
# Rows assigned to cluster 3 (Austin + San Antonio + Dallas), re-indexed 0..n-1.
df_tx = df_ground.loc[df_ground['cluster'] == 3].reset_index(drop=True)
df_tx.shape
Out[21]:
(7395, 24)
In [24]:
TX = [30.265253, -97.714187]

# Base map centred on TX.
m_tx = folium.Map(zoom_start=7, tiles='cartodbpositron', location=TX)

# One clustered marker layer holding every (lat, lon) pair of the Texas cluster.
FastMarkerCluster(
    callback=callback,
    data=df_tx[['latitude', 'longitude']].values.tolist(),
).add_to(m_tx)

# Alternative rendering that was tried: one folium.CircleMarker per point
# (radius=1, color / fill_color '#0080bb') instead of a clustered layer.

folium.LayerControl().add_to(m_tx)
Out[24]:
<folium.plugins.fast_marker_cluster.FastMarkerCluster at 0x7f53db2b6908>
Out[24]:
<folium.map.LayerControl at 0x7f53ced7fa20>
In [23]:
# m_tx.save(str(PATH/'tx.html'))
In [25]:
m_tx
Out[25]:

Cluster 4: Maryland

In [26]:
# Rows assigned to cluster 4 (Maryland), re-indexed 0..n-1.
df_md = df_ground.loc[df_ground['cluster'] == 4].reset_index(drop=True)
df_md.shape
Out[26]:
(549, 24)
In [27]:
MD = [39.084967, -77.152883]

# Base map centred on MD.
m_md = folium.Map(location=MD, tiles='cartodbpositron', zoom_start=10)

# One clustered marker layer holding every (lat, lon) pair of the Maryland cluster.
FastMarkerCluster(data=list(zip(df_md.latitude.values, df_md.longitude.values)),
                  callback=callback).add_to(m_md)

# Per-point alternative, kept for reference. It used to reference df_tx / m_tx
# (a copy-paste leftover), so re-enabling it would not have touched this map.
# for lat, long in zip(df_md.latitude.values, df_md.longitude.values):
#     _ = folium.CircleMarker([lat, long], radius=1,
#                             color='#0080bb', fill_color='#0080bb').add_to(m_md)

folium.LayerControl().add_to(m_md)
Out[27]:
<folium.plugins.fast_marker_cluster.FastMarkerCluster at 0x7f53ccbe8e48>
Out[27]:
<folium.map.LayerControl at 0x7f53cc77e748>
In [30]:
m_md
Out[30]:

Main target: hometown Houston

In [20]:
# Rows assigned to cluster 0 (Houston), re-indexed 0..n-1.
df_h = df_ground.loc[df_ground['cluster'] == 0].reset_index(drop=True)
df_h.shape
Out[20]:
(321048, 24)
In [27]:
# plot fewer data points for folium
n = 50000
# Seeded generator, so the same subset of rows is drawn on every run;
# the unseeded global RNG produced a different map after each kernel restart.
idxs = np.random.RandomState(42).permutation(len(df_h))[:n]
In [28]:
H = [29.766672, -95.339652]

# Base map centred on H.
m_h = folium.Map(zoom_start=10, tiles='cartodbpositron', location=H)

# Only the rows picked in `idxs` are drawn; select them once and take both columns.
FastMarkerCluster(
    callback=callback,
    data=df_h.loc[idxs, ['latitude', 'longitude']].values.tolist(),
).add_to(m_h)

# Alternative rendering that was tried: one folium.CircleMarker per point
# (radius=1, color / fill_color '#0080bb') instead of a clustered layer.

folium.LayerControl().add_to(m_h)
Out[28]:
<folium.plugins.fast_marker_cluster.FastMarkerCluster at 0x104b01d0>
Out[28]:
<folium.map.LayerControl at 0x18731128>
In [33]:
# m_h.save(str(PATH/'houston.html'))
In [29]:
m_h
Out[29]: