#!/usr/bin/env python
# coding: utf-8

# ## Travel time prediction in Indian Metro cities using Uber Movement data and OpenStreetMap
#
# Uber provides anonymized and aggregated travel time data through the [Uber Movement](https://movement.uber.com/) platform for many cities across the world. For India, current and historic data is available for 5 cities - Bangalore, Hyderabad, New Delhi, Mumbai and Kolkata. It also provides the ward boundaries as a GeoJSON file.

# [OpenStreetMap](https://wiki.openstreetmap.org/wiki/About_OpenStreetMap) (OSM) is a free, editable map of the whole world that is being built by volunteers largely from scratch and released with an open-content license. OSM data includes a global navigable street network dataset, and several services exist that provide routing and network analysis on top of this data.

# In this project, we use the open travel time dataset from Uber and leverage open-source routing services for OpenStreetMap to build a fairly accurate model for travel time within each of the metro cities in India. We show that the rich ecosystem of Python geospatial libraries makes it easy to consume, process, and visualize large amounts of geospatial data and to incorporate it into a machine learning model.

# **Open datasets**
# - Uber Movement - Travel times and ward boundaries
# - OpenStreetMap
#
# **Python libraries**
# - geopandas
# - shapely
# - matplotlib
# - folium
# - scikit-learn
#
# **Services**
# - Open Source Routing Machine (OSRM)
# - OpenRouteService (ORS) API

# In[16]:

import pandas as pd
import geopandas as gpd
import numpy as np
import requests
import shapely
import matplotlib.pyplot as plt
import datetime
import os
import math
import random
import folium

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

get_ipython().run_line_magic('matplotlib', 'inline')

# ## Reading Datasets

# ### Travel Times

# In[3]:

data_folder = os.path.join('data', 'uber')

# The Uber Movement Travel Times data comes as a CSV file for each quarter. Here we are using the **Travel Times By Date By Hour Buckets (All Days)** dataset. This dataset includes the arithmetic mean, geometric mean, and standard deviations of aggregated travel times between every pair of wards in the city, for every day of the quarter, grouped into time categories. This is a large dataset with over **7M rows**.
#
# We import the data as a Pandas DataFrame and call `convert_dtypes()` to select the best datatypes for each column.

# In[4]:

travel_times_file = 'bangalore-wards-2020-1-All-DatesByHourBucketsAggregate.csv'
travel_times_filepath = os.path.join(data_folder, travel_times_file)
travel_times = pd.read_csv(travel_times_filepath)
travel_times = travel_times.convert_dtypes()

# In[39]:

travel_times
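# Since the dataset has over 7M rows, it is worth confirming the memory footprint after `convert_dtypes()`. A quick optional check:

# In[ ]:

# Report the in-memory size of the DataFrame; deep=True accounts for
# string/object columns, which dominate for a dataset of this size.
print('{:.1f} MB'.format(travel_times.memory_usage(deep=True).sum() / 1024**2))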
# ### Ward Boundaries

# The travel times dataset contains details of travel between *Zones*. For Indian cities, the zones are **Wards** as defined by the local municipal corporation. This data comes as a **GeoJSON** file that contains the polygon representation of each ward. We use `geopandas` to read the file as a GeoDataFrame.

# In[6]:

wards_file = 'bangalore_wards.json'
wards_filepath = os.path.join(data_folder, wards_file)
wards = gpd.read_file(wards_filepath)

# In[7]:

wards

# In[11]:

fig, ax = plt.subplots(figsize=(10, 10))
wards['geometry'].plot(color='grey', ax=ax)

# ## Data Pre-Processing

# ### Travel Times

# The source dataset gives only aggregate statistics for each ward pair. To get a larger set of individual observations, we replicate the rows 5 times and synthesize a travel time for each row by scaling the geometric mean by a random power of the geometric standard deviation. This spreads the values between the geometric mean and one geometric standard deviation above it.

# In[49]:

np.random.seed(0)
travel_times = pd.concat([travel_times]*5, ignore_index=True)

# In[50]:

travel_times['random'] = np.random.uniform(0, 1, len(travel_times))

# In[51]:

# travel_time = geometric_mean * geometric_std ** random
travel_times['travel_time'] = np.exp(
    travel_times['random'] * np.log(travel_times['geometric_standard_deviation_travel_time'])
    + np.log(travel_times['geometric_mean_travel_time']))

# The source data contains the travel times grouped by blocks of time (peak/off-peak etc.), defined by the `start_hour` and `end_hour` columns. To allow us to model this easily, we add a `time_period` column and assign an integer category value.

# In[52]:

travel_times

# In[53]:

categories_to_hour = {
    1: [0, 6],
    2: [7, 9],
    3: [10, 15],
    4: [16, 18],
    5: [19, 23]
}

def get_time_period(hour):
    for category, (start_hour, end_hour) in categories_to_hour.items():
        if start_hour <= hour <= end_hour:
            return category

# In[54]:

travel_times['time_period'] = travel_times['start_hour'].apply(get_time_period)

# Travel time has a strong correlation with the day of the week, so we compute a new column `dow` from the `day` and `month` columns.

# In[55]:

year = 2020

def get_dow(row):
    return datetime.date(year, int(row['month']), int(row['day'])).weekday()

# In[ ]:

travel_times['dow'] = travel_times.apply(get_dow, axis=1)

# In[ ]:

travel_times

# ### Ward Boundaries

# For modeling purposes, we use the centroid of each ward to represent the ward. We use the GeoPandas `centroid` attribute to get the point geometry representing the centroid.
#
# Our source data comes in the *EPSG:4326 WGS84 Geographic Projection* - which is not suitable for geoprocessing operations. To get an accurate centroid computation, we must re-project the data to a *Planar Projection*. We use a UTM projection suitable for the region of the data - *WGS 84 UTM Zone 43N* - which is defined by the code [EPSG:32643](http://epsg.io/32643). Once computed, we transform it back to EPSG:4326 and add it to our GeoDataFrame.

# In[30]:

centroid_utm = wards.geometry.to_crs('EPSG:32643').centroid
wards['centroid'] = centroid_utm.to_crs('EPSG:4326')

# In[31]:

fig, ax = plt.subplots(figsize=(10, 10))
wards['geometry'].plot(color='grey', ax=ax)
wards['centroid'].plot(color='red', ax=ax)
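# To see why the re-projection matters, we can compare a centroid computed naively in the geographic CRS with the UTM-based one. A minimal sketch; for city-sized wards the offset is small, but the planar computation is the geometrically correct one.

# In[ ]:

# Centroid computed directly on geographic coordinates - recent GeoPandas
# versions warn that this is geometrically incorrect.
naive_centroid = wards.geometry.centroid

# Project the naive centroids to the planar CRS and measure the offset in metres.
offset_m = naive_centroid.to_crs('EPSG:32643').distance(centroid_utm)
print('Max centroid offset: {:.1f} m'.format(offset_m.max()))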
# ## Distance Computation

# We have the travel times for each pair of source and destination wards. The travel time is strongly correlated with the distance between the wards, so we need to compute the actual distance along the road network for each pair of wards.

# In[32]:

ward_no = wards['MOVEMENT_ID']
index = pd.MultiIndex.from_product([ward_no, ward_no], names=['sourceid', 'dstid'])
distancematrix = pd.DataFrame(index=index).reset_index()
distancematrix = distancematrix.query('sourceid != dstid')

# In[33]:

def get_coordinates(row):
    # Look up the centroid coordinates for the source and destination wards.
    source_ward = wards[wards['MOVEMENT_ID'] == row['sourceid']].iloc[0]
    dst_ward = wards[wards['MOVEMENT_ID'] == row['dstid']].iloc[0]
    src_lon, src_lat = source_ward['centroid'].x, source_ward['centroid'].y
    dst_lon, dst_lat = dst_ward['centroid'].x, dst_ward['centroid'].y
    return src_lon, src_lat, dst_lon, dst_lat

# In[34]:

distancematrix[['src_lon', 'src_lat', 'dst_lon', 'dst_lat']] = distancematrix.apply(
    get_coordinates, axis=1, result_type='expand')

# In[35]:

distancematrix

# We need to get the driving distance between approximately 40,000 pairs of coordinates. To do this efficiently, we ran the [Open Source Routing Machine (OSRM)](https://hub.docker.com/r/osrm/osrm-backend/) service locally using the docker images provided by the project. OSRM holds the network graph in memory, so the routing is extremely fast. We write and apply the following function to get the driving distance in meters.

# In[36]:

def get_distance(row):
    coordinates = '{},{};{},{}'.format(
        row['src_lon'], row['src_lat'], row['dst_lon'], row['dst_lat'])
    url = 'http://127.0.0.1:5000/route/v1/driving/'
    response = requests.get(url + coordinates)
    if response.status_code == 200:
        data = response.json()
        distance = data['routes'][0]['distance']
        return distance

# The resulting distance data is saved locally and used in the subsequent analysis.

# In[33]:

osrm_data_folder = os.path.join('data', 'osrm')
distancematrix_file = 'distancematrix.csv'
distancematrix_filepath = os.path.join(osrm_data_folder, distancematrix_file)
distancematrix = pd.read_csv(distancematrix_filepath)

# In[34]:

travel_times = pd.merge(travel_times, distancematrix, on=['sourceid', 'dstid'])

# In[39]:

travel_times

# ## Data Modeling

# We use the `scikit-learn` library to build and train a linear regressor.
#
# The independent variables considered are `sourceid, dstid, day, time_period, dow, src_lon, src_lat, dst_lon, dst_lat, distance`. The dependent variable is the sampled travel time `travel_time`. Of the independent variables, the categorical ones - `time_period` and `dow` - are one-hot encoded.

# We sample the travel times to get a subset that will be used for training.

# In[35]:

num_samples = 500000
samples = travel_times.sample(n=num_samples, random_state=1)

# In[36]:

sel_input = ['sourceid', 'dstid', 'day', 'time_period', 'dow',
             'src_lon', 'src_lat', 'dst_lon', 'dst_lat', 'distance']
cat_ip = ['time_period', 'dow']
scale_ip = list(set(sel_input) - set(cat_ip))

# In[37]:

x = samples[sel_input].values
y = samples['travel_time']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=1)
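# Before fitting the model, a quick sanity check confirms the claimed relationship between road distance and travel time on the training sample:

# In[ ]:

# Pearson correlation between the OSRM road distance and the sampled travel time.
print(samples['distance'].corr(samples['travel_time']))

# A scatter plot of a small random subset shows the same trend visually.
samples.sample(n=2000, random_state=1).plot.scatter(x='distance', y='travel_time', alpha=0.3)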
# The integer codes of the categorical variables `time_period` and `dow` have no numeric meaning for a linear model, so they are one-hot encoded. The remaining numeric features are standardized.

# In[ ]:

category_trans = ColumnTransformer(
    [('encoder',
      # Note: use sparse=False instead of sparse_output=False on scikit-learn < 1.2.
      OneHotEncoder(categories='auto', sparse_output=False),
      [sel_input.index(i) for i in cat_ip]),
     ('scaler', StandardScaler(),
      [sel_input.index(i) for i in scale_ip])],
    remainder='passthrough')

# In[ ]:

regressor = Pipeline(steps=[('ct', category_trans), ('model', LinearRegression())])

# In[38]:

regressor.fit(x_train, y_train)

# In[50]:

print('Training R² score: ', regressor.score(x_train, y_train))

# In[51]:

print('Prediction R² score: ', regressor.score(x_test, y_test))

# ## Checking Model Performance

# While the model performs well on the test partition, that dataset is not representative of real-world data. We want to see how the model performs on routing requests that are not between centroids of wards. To achieve this, we create a dataset with random source and destination coordinates and check the model prediction against travel times predicted by commercial data providers such as Google Maps.

# ### Random Points within a Polygon

# We generate random coordinate pairs within the bounds of the city. To ensure that the points fall within the actual city geometry, we do a spatial join to select the points that intersect the wards. After the join, we select a subset of 100 points.

# In[31]:

n_points = 200
x_min, y_min, x_max, y_max = wards.total_bounds
np.random.seed(0)
src_x = np.random.uniform(x_min, x_max, n_points)
src_y = np.random.uniform(y_min, y_max, n_points)
dst_x = np.random.uniform(x_min, x_max, n_points)
dst_y = np.random.uniform(y_min, y_max, n_points)
src_gdf = gpd.GeoDataFrame(geometry=gpd.points_from_xy(src_x, src_y), crs='EPSG:4326')
dst_gdf = gpd.GeoDataFrame(geometry=gpd.points_from_xy(dst_x, dst_y), crs='EPSG:4326')

# In[32]:

# Note: the 'op' keyword was renamed to 'predicate' in GeoPandas 0.10.
src_gdf = gpd.sjoin(src_gdf, wards, how='inner', predicate='intersects')
dst_gdf = gpd.sjoin(dst_gdf, wards, how='inner', predicate='intersects')
src_selected = src_gdf[:100].reset_index()
dst_selected = dst_gdf[:100].reset_index()

# In[33]:

fig, ax = plt.subplots(figsize=(10, 10))
wards['geometry'].plot(color='grey', ax=ax)
src_selected.geometry.plot(color='green', ax=ax)
dst_selected.geometry.plot(color='red', ax=ax)

# ### Random Days and Times

# In[34]:

start_date = datetime.date(2020, 1, 1)
end_date = datetime.date(2020, 3, 31)
n_days = (end_date - start_date).days
random.seed(0)
random_dates = [start_date + datetime.timedelta(days=random.randrange(n_days))
                for _ in range(100)]
months = [x.month for x in random_dates]
days = [x.day for x in random_dates]
dows = [x.weekday() for x in random_dates]
random_time_periods = [random.choice(range(1, 6)) for _ in range(100)]

# In[35]:

data = pd.DataFrame({
    'sourceid': src_selected['MOVEMENT_ID'],
    'dstid': dst_selected['MOVEMENT_ID'],
    'month': months,
    'day': days,
    'time_period': random_time_periods,
    'dow': dows,
    'src_lon': src_selected.geometry.x,
    'src_lat': src_selected.geometry.y,
    'dst_lon': dst_selected.geometry.x,
    'dst_lat': dst_selected.geometry.y,
})
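# The `model_test.csv` file read in the next section was presumably produced by applying the same OSRM `get_distance` routine to this random dataset. A minimal sketch, assuming the local OSRM service from the Distance Computation section is still running:

# In[ ]:

# Compute road distances for the random test set using the local OSRM instance,
# then save the result for later use (output path mirrors the earlier sections).
data['distance'] = data.apply(get_distance, axis=1)
data.to_csv(os.path.join('data', 'osrm', 'model_test.csv'), index=False)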
# ### Random Test Dataset

# The resulting distance data is saved locally and used in the subsequent analysis.

# In[36]:

osrm_data_folder = os.path.join('data', 'osrm')
model_test_file = 'model_test.csv'
model_test_filepath = os.path.join(osrm_data_folder, model_test_file)
model_test = pd.read_csv(model_test_filepath)

# In[37]:

model_test

# ### Predicted vs. Reference Travel Times

# To validate our model against real-world data, we collected reference travel times from Google Maps. Google Maps allows one to set a specific departure time in the past and get a range of travel times. We used our randomly generated source and destination pairs along with random departure times to collect the reference data.

# In[38]:

reference_data_folder = os.path.join('data', 'googlemaps')
reference_file = 'googlemaps_traveltimes.csv'
reference_filepath = os.path.join(reference_data_folder, reference_file)
reference_data = pd.read_csv(reference_filepath)

# In[39]:

reference_data

# In[40]:

def predict_time(row):
    # Assemble the features in the same order used for training and
    # convert the predicted seconds to minutes.
    features = row[['sourceid', 'dstid', 'day', 'time_period', 'dow',
                    'src_lon', 'src_lat', 'dst_lon', 'dst_lat', 'distance']]
    return round(regressor.predict([features])[0] / 60)

# In[41]:

reference_data['predicted'] = reference_data.apply(predict_time, axis=1)
results = reference_data[['sourceid', 'dstid', 'distance', 'goog_distance',
                          'goog_min', 'goog_max', 'predicted']].copy()
results['within_range'] = np.where(
    (results['predicted'] <= results['goog_max']) &
    (results['predicted'] >= results['goog_min']), 'Y', 'N')
results.head(40)

# ## Real-time Routing and Prediction
#
# To demonstrate the use of our technique in a real-world application, we show how it can be used for real-time routing.

# In[59]:

from dotenv import load_dotenv
load_dotenv()
ORS_API_KEY = os.getenv('ORS_API_KEY')

# In[61]:

def get_driving_route(source_coordinates, dest_coordinates):
    # The ORS API expects coordinates in (longitude, latitude) order,
    # while our inputs are (latitude, longitude).
    parameters = {
        'api_key': ORS_API_KEY,
        'start': '{},{}'.format(source_coordinates[1], source_coordinates[0]),
        'end': '{},{}'.format(dest_coordinates[1], dest_coordinates[0])
    }
    response = requests.get(
        'https://api.openrouteservice.org/v2/directions/driving-car',
        params=parameters)
    if response.status_code == 200:
        data = response.json()
        return data
    else:
        print('Request failed.')
        return -9999

# In[62]:

def get_ward(coordinates):
    # Find the ward containing a (latitude, longitude) pair via a
    # spatial join against the ward polygons.
    df = pd.DataFrame({'x': [coordinates[1]], 'y': [coordinates[0]]})
    src_gdf = gpd.GeoDataFrame(geometry=gpd.points_from_xy(df.x, df.y), crs='EPSG:4326')
    src_gdf = gpd.sjoin(src_gdf, wards, how='inner', predicate='intersects')
    return int(src_gdf['MOVEMENT_ID'].iloc[0])

# In[80]:

def get_route(source, destination, departure_time):
    sourceid = get_ward(source)
    dstid = get_ward(destination)
    day = departure_time.day
    time_period = get_time_period(departure_time.hour)
    dow = departure_time.weekday()

    driving_data = get_driving_route(source, destination)
    summary = driving_data['features'][0]['properties']['summary']
    distance = summary['distance']

    features = [sourceid, dstid, day, time_period, dow,
                source[1], source[0], destination[1], destination[0], distance]
    travel_time = round(regressor.predict([features])[0] / 60)
    ors_travel_time = round(summary['duration'] / 60)

    # ORS returns (lon, lat) pairs; folium expects (lat, lon).
    route = driving_data['features'][0]['geometry']['coordinates']

    def swap(coord):
        coord[0], coord[1] = coord[1], coord[0]
        return coord

    route = list(map(swap, route))

    m = folium.Map(
        location=[(source[0] + destination[0]) / 2, (source[1] + destination[1]) / 2],
        zoom_start=13)
    tooltip = 'Model predicted time = {} mins, Default travel time = {} mins'.format(
        travel_time, ors_travel_time)
    folium.PolyLine(
        route, weight=8, color='blue', opacity=0.6, tooltip=tooltip
    ).add_to(m)
    folium.Marker(
        location=(source[0], source[1]),
        icon=folium.Icon(icon='play', color='green')
    ).add_to(m)
    folium.Marker(
        location=(destination[0], destination[1]),
        icon=folium.Icon(icon='stop', color='red')
    ).add_to(m)
    return m
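# As a quick check of the ward lookup, we can query a known location. The coordinates below are approximately central Bangalore and are for illustration only:

# In[ ]:

# Illustrative check: (latitude, longitude) near central Bangalore.
get_ward((12.9716, 77.5946))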
# ### Live Demo
#
# We pick a set of coordinates within the city and show how to get turn-by-turn directions using the OpenRouteService API and predict the travel time using our model.

# In[86]:

source = 12.946538, 77.579975
destination = 12.994029, 77.661008
departure_time = datetime.datetime.now()

# In[87]:

get_route(source, destination, departure_time)

# We can check how the model performs by comparing with the travel time predicted by Google Maps.

# In[88]:

import webbrowser
url = 'https://www.google.com/maps/dir/{},{}/{},{}'.format(
    source[0], source[1], destination[0], destination[1])
webbrowser.open(url)
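# The folium map returned by `get_route` can also be saved as a standalone HTML file for sharing outside the notebook (the filename here is arbitrary):

# In[ ]:

# Save the interactive map with the predicted route and tooltip to disk.
m = get_route(source, destination, departure_time)
m.save('predicted_route.html')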