Preprocessing Madrid CRTM survey data
Madrid (CRTM: Consorcio Regional de Transportes de Madrid), unlike most other cities, provides a rich open data set on survey data on mobility behaviour of ~85.000 people with ~220.000 individual trips.
The description and results can be found here (in spanish): https://www.crtm.es/media/712934/edm18_sintesis.pdf
For our article we did some preprocessing: - merging the different data sets - translate Spanish codes to English and join with data - simplify some answer options (e.g. modal choice or trip motive)
If you want to use this data set for your own project, this preproessing might help you get started.
# import packages
import os
import requests
import pandas as pd
import numpy as np
from datetime import time
%matplotlib inline
Download all four datasets from here: https://crtm.maps.arcgis.com/apps/MinimalGallery/index.html?appid=a60bb2f0142b440eadee1a69a11693fc and store in *data\raw*
# working directory
path = os.getcwd()
print("Current working directory:", path)
# create sub-folders in 'data' folder
for x in ['raw', 'interim', 'processed']:
temp = os.path.join('../data/', x)
try:
os.mkdir(temp)
except:
print('Folder', temp, 'already exists.')
# download and save raw datasets
# HOGARES
url = 'https://crtm.maps.arcgis.com/sharing/rest/content/items/d9e8c48ae6a1474faa34083239007307/data'
r = requests.get(url, allow_redirects=True)
output = open('../data/raw/EDM2018HOGARES.xlsx', 'wb')
output.write(r.content)
output.close()
# INDIVIDUOS
url = 'https://crtm.maps.arcgis.com/sharing/rest/content/items/07dad41b543641d3964a68851fc9ad11/data'
r = requests.get(url, allow_redirects=True)
output = open('../data/raw/EDM2018INDIVIDUOS.xlsx', 'wb')
output.write(r.content)
output.close()
# VIAJES
url = 'https://crtm.maps.arcgis.com/sharing/rest/content/items/6afd4db8175d4902ada0803f08ccf50e/data'
r = requests.get(url, allow_redirects=True)
output = open('../data/raw/EDM2018VIAJES.xlsx', 'wb')
output.write(r.content)
output.close()
# XETAPAS
url = 'https://crtm.maps.arcgis.com/sharing/rest/content/items/81919e30e674422d93203a3190eafcdc/data'
r = requests.get(url, allow_redirects=True)
output = open('../data/raw/EDM2018XETAPAS.xlsx', 'wb')
output.write(r.content)
output.close()
# display content of 'raw' folder
print('\n"../data/raw" folder contains:')
print(os.listdir('../data/raw'))
Folder ../data/raw already exists. Folder ../data/interim already exists. Folder ../data/processed already exists. "../data/raw" folder contains: ['.gitkeep', 'EDM2018HOGARES.xlsx', 'EDM2018INDIVIDUOS.xlsx', 'EDM2018VIAJES.xlsx', 'EDM2018XETAPAS.xlsx', 'public_transport_madrid', 'zt1259', 'zt208', 'zt84']
# read downloaded files
ind = pd.read_excel('../data/raw/EDM2018INDIVIDUOS.xlsx')
hog = pd.read_excel('../data/raw/EDM2018HOGARES.xlsx')
via = pd.read_excel('../data/raw/EDM2018VIAJES.xlsx', dtype = {'VORIHORAINI':str, 'VDESHORAFIN':str}) # specify times to be read as strings
etap = pd.read_excel('../data/raw/EDM2018XETAPAS.xlsx')
# set unique identifiers as index for each DataFrame
via.set_index(["ID_HOGAR", "ID_IND", "ID_VIAJE"], inplace = True)
ind.set_index(["ID_HOGAR", "ID_IND"], inplace = True)
hog.set_index("ID_HOGAR", inplace = True)
etap.set_index(["ID_HOGAR", "ID_IND", "ID_VIAJE"], inplace =True)
people = hog.join(ind, lsuffix = "_hog", rsuffix = "_ind")
trips = people.join(via, rsuffix = "_via")
legs = trips.join(etap, rsuffix = "_etap")
# people is a dataset of households and unique people
people.to_csv(r'..\data\interim\people.csv', index = False)
# trips includes households, people and unique trips
trips.to_csv(r'..\data\interim\trips.csv', index = False)
# legs includes households, people, trips and unique legs
legs.to_csv(r'..\data\interim\legs.csv', index = False)
## you likely want to work with "trips" for analysing mobility patterns.
# read codes from second tab of excel files
codes_hog = pd.read_excel ('../data/raw/EDM2018HOGARES.xlsx', sheet_name = 1)
codes_ind = pd.read_excel ('../data/raw/EDM2018INDIVIDUOS.xlsx', sheet_name = 1)
codes_via = pd.read_excel ('../data/raw/EDM2018VIAJES.xlsx', sheet_name = 1)
codes_etap = pd.read_excel ('../data/raw/EDM2018XETAPAS.xlsx', sheet_name = 1)
# append all codes to a single file
codes = codes_hog.append(codes_ind, ignore_index = True, sort = False
).append(codes_via, ignore_index = True, sort = False
).append(codes_etap, ignore_index = True, sort = False
).drop(columns = ["Unnamed: 0", "Unnamed: 1"])
# forward fill varibale and specification to work as proper table (and not only visually in Excel)
codes["VARIABLE"] = codes.VARIABLE.fillna(method = "ffill")
codes["ESPECIFICACIÓN"] = codes["ESPECIFICACIÓN"].fillna(method = "ffill")
# split code and value
codes['CODE'], codes['VALUE'] = codes['VALORES'].str.split(".", 1).str
codes['CODE'], unused = codes.CODE.str.split(" '", 1).str
codes.drop(index = codes[codes.VARIABLE.isna()].index, inplace = True)
codes.to_csv('../data/interim/codes.csv', index = False)
Translate codes by using Google Sheets.
(Google API Package for Python wasnt working? https://pypi.org/project/googletrans/ ) Option to use the Google translate API (https://cloud.google.com/translate/docs/reference/rest) but for this use case wasnt really worth to set it up.
# read csv in case you already stored the csv and don't want to repeat the downloading steps
# trips = pd.read_csv(r'..\data\interim\trips.csv', dtype = {'VORIHORAINI':str, 'VDESHORAFIN':str})
codes = pd.read_csv(r'..\data\processed\codes_translated.csv', dtype = {'CODE': float})
Set value of sex to 'Man' and 'Woman' instead of numbers.
code_sex = codes[codes.VARIABLE == "C2SEXO"][['CODE', 'VALUE_en']].rename({'VALUE_en': 'sex'}, axis = 1)
trips = trips.join(code_sex.set_index('CODE'), on = "C2SEXO")
Compute duration from start (VORIHORAINI) and endtime (VDESHORAFIN).
mins = pd.to_numeric(trips.VDESHORAFIN.str.slice(2)) - pd.to_numeric(trips.VORIHORAINI.str.slice(2))
hours = (pd.to_numeric(trips.VDESHORAFIN.str.slice(0, 2)) - pd.to_numeric(trips.VORIHORAINI.str.slice(0, 2))) * 60
trips["duration"] = mins + hours
Create a datetime object from string.
trips['start_time'] = pd.to_datetime(trips.VORIHORAINI, format = '%H%M').dt.time
Endtimes sometimes exceeds 24 hours. (If a trip starts at 11pm and ends at 1am, then the endtime is 2500). Datetime does not work with times > 24h. Therefore this needs to be fixed.
trips["end_time"] = np.where(pd.to_numeric(trips['VDESHORAFIN']) > 2400,
pd.to_numeric(trips['VDESHORAFIN']) - 2400,
pd.to_numeric(trips['VDESHORAFIN']))
trips['end_time'] = pd.to_datetime(trips.end_time, format = '%H%M', errors = 'coerce').dt.time
# speed
trips["speed"] = np.where(trips.duration != 0, # skip if start and Endtime is the same
trips.DISTANCIA_VIAJE / (trips.duration / 60), np.NaN)
# remove implausible data
trips.loc[trips.speed > 150, "speed"] = None
The original survey has a 24 different modes:
These are simplified to four modes:
# simplify mode
code_mode = codes[codes['VARIABLE'] == 'MODO_PRIORITARIO'][['CODE', 'VALUE_en']].drop_duplicates()
trips = trips.join(code_mode.set_index('CODE'), on = 'MODO_PRIORITARIO', how = "left").reset_index()
trips.rename({"VALUE_en": "mode"}, axis = "columns", inplace = True)
trips["mode_simple"] = trips.MODO_PRIORITARIO
trips.loc[trips.MODO_PRIORITARIO < 10, "mode_simple"] = "public transport"
trips.loc[(trips.MODO_PRIORITARIO > 9) & (trips.MODO_PRIORITARIO < 17), "mode_simple"] = "car"
trips.loc[(trips.MODO_PRIORITARIO > 16) & (trips.MODO_PRIORITARIO < 24), "mode_simple"] = "other"
trips.loc[trips.MODO_PRIORITARIO == 24, "mode_simple"] = "walk"
The original survey has 12 trip motives:
These are simplified to 5 trip motives:
# simplify trip motive
code_motive = codes[codes['VARIABLE'] == 'MOTIVO_PRIORITARIO'][['CODE', 'VALUE_en']]
trips = trips.join(code_motive.set_index('CODE'), on = 'MOTIVO_PRIORITARIO').reset_index()
trips.rename({"VALUE_en": "motive"}, axis = "columns", inplace = True)
trips.loc[:,"motive_simple"] = trips.motive
trips.loc[(trips.MOTIVO_PRIORITARIO == 2) | (trips.MOTIVO_PRIORITARIO == 3), "motive_simple"] = "work"
trips.loc[((trips.MOTIVO_PRIORITARIO >= 5) & (trips.MOTIVO_PRIORITARIO <= 8) | (trips.MOTIVO_PRIORITARIO == 10)), "motive_simple"] = "care / errand"
trips.loc[(trips.MOTIVO_PRIORITARIO >= 8) & (trips.MOTIVO_PRIORITARIO <= 9), "motive_simple"] = "leisure"
trips.loc[(trips.MOTIVO_PRIORITARIO == 1) | (trips.MOTIVO_PRIORITARIO == 11) | (trips.MOTIVO_PRIORITARIO == 12), "motive_simple"] = "other"
Traffic can be divided into three main categories time wise:
A new category 'daytime' is created for this.
# daytime: rush hour, off-peak, low traffic
conditions = [
((trips.start_time >= time(7)) & (trips.start_time < time(9))) | ((trips.start_time >= time(17)) & (trips.start_time < time(20))),
(trips.start_time >= time(9)) & (trips.start_time < time(17)),
(trips.start_time >= time(20)) | (trips.start_time < time(7))
]
choices = ['rush hour (HVZ)', 'off-peak hour (NVZ)', 'low traffic time (SVZ)']
trips['daytime'] = np.select(conditions, choices, default= None)
Round age to 5 to get larger groups with the same age and make it easier to work with.
# create variable for rounded age
trips["rounded_age"] = 5 * (trips.EDAD_FIN / 5).round()
Create four different age groups:
# age group
conditions = [
trips.EDAD_FIN <= 18,
(trips.EDAD_FIN > 18) & (trips.EDAD_FIN <= 25),
(trips.EDAD_FIN > 25) & (trips.EDAD_FIN <= 55),
trips.EDAD_FIN > 55]
choices = ['kids', 'young adults', 'adults', 'seniors']
trips['age_group'] = np.select(conditions, choices, default= None)
trips.drop(["index"], axis = 1, inplace = True)
trips.to_csv(r'..\data\processed\trips_custom_variables.csv', index = False)