import os
import zipfile
from pathlib import Path
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy.point import Point
from tqdm.auto import tqdm
import re
tqdm.pandas()
C:\Users\16098\anaconda3\lib\site-packages\tqdm\std.py:697: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version from pandas import Panel
data_directory = Path.cwd()
for file in data_directory.iterdir():
if zipfile.is_zipfile(file):
with zipfile.ZipFile(file) as item:
item.extractall()
files = [file for file in data_directory.rglob('*.csv') if file.is_file() and file.match('CRASH_PHILADELPHIA*.csv')]
df_list = []
for file in files:
csv = pd.read_csv(file)
df_list.append(csv)
C:\Users\16098\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3146: DtypeWarning: Columns (96,97,98) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
crash_df = pd.concat(df_list)
cols = ['BELTED_DEATH_COUNT', 'BICYCLE_DEATH_COUNT', 'COLLISION_TYPE', 'CRASH_MONTH',
'CRASH_YEAR', 'DAY_OF_WEEK', 'DEC_LAT', 'DEC_LONG', 'FATAL_COUNT',
'HOUR_OF_DAY', 'INJURY_COUNT', 'INTERSECT_TYPE', 'MCYCLE_DEATH_COUNT',
'PED_DEATH_COUNT']
def transform_crash_data(crash_df):
cols = ['BELTED_DEATH_COUNT', 'BICYCLE_DEATH_COUNT', 'COLLISION_TYPE', 'CRASH_MONTH',
'CRASH_YEAR', 'DAY_OF_WEEK', 'DEC_LAT', 'DEC_LONG', 'FATAL_COUNT',
'HOUR_OF_DAY', 'INJURY_COUNT', 'INTERSECT_TYPE', 'MCYCLE_DEATH_COUNT',
'PED_DEATH_COUNT']
return (crash_df
[cols]
.assign(CRASH_REPORT_NUMBER=1,
DEC_LAT=crash_df.DEC_LAT.fillna(method='ffill'),
DEC_LONG=crash_df.DEC_LONG.fillna(method='ffill'))
.astype({'BELTED_DEATH_COUNT': 'int8', 'BICYCLE_DEATH_COUNT': 'int8', 'COLLISION_TYPE': 'int8',
'CRASH_MONTH': 'int8','CRASH_YEAR': 'int16', 'DAY_OF_WEEK': 'int8', 'FATAL_COUNT': 'int8',
'HOUR_OF_DAY': 'int8', 'INJURY_COUNT': 'int8', 'INTERSECT_TYPE': 'int8',
'MCYCLE_DEATH_COUNT': 'int8','PED_DEATH_COUNT': 'int8'})
)
transform_crash_data(crash_df)
BELTED_DEATH_COUNT | BICYCLE_DEATH_COUNT | COLLISION_TYPE | CRASH_MONTH | CRASH_YEAR | DAY_OF_WEEK | DEC_LAT | DEC_LONG | FATAL_COUNT | HOUR_OF_DAY | INJURY_COUNT | INTERSECT_TYPE | MCYCLE_DEATH_COUNT | PED_DEATH_COUNT | CRASH_REPORT_NUMBER | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 4 | 1 | 2015 | 5 | 39.9579 | -75.1623 | 0 | 17 | 0 | 1 | 0 | 0 | 1 |
1 | 0 | 0 | 7 | 1 | 2015 | 6 | 39.9553 | -75.1478 | 0 | 22 | 0 | 0 | 0 | 0 | 1 |
2 | 0 | 0 | 0 | 1 | 2015 | 5 | 40.0396 | -75.0067 | 0 | 13 | 3 | 0 | 0 | 0 | 1 |
3 | 0 | 0 | 1 | 1 | 2015 | 6 | 40.0207 | -75.0424 | 0 | 18 | 1 | 0 | 0 | 0 | 1 |
4 | 0 | 0 | 7 | 1 | 2015 | 7 | 39.9750 | -75.1174 | 0 | 2 | 1 | 0 | 0 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10131 | 0 | 0 | 4 | 12 | 2020 | 5 | 40.0107 | -75.1627 | 0 | 20 | 1 | 1 | 0 | 0 | 1 |
10132 | 0 | 0 | 5 | 5 | 2020 | 4 | 40.0308 | -75.0546 | 0 | 18 | 4 | 0 | 0 | 0 | 1 |
10133 | 0 | 0 | 5 | 5 | 2020 | 4 | 40.0277 | -75.0610 | 0 | 18 | 0 | 0 | 0 | 0 | 1 |
10134 | 0 | 0 | 8 | 11 | 2020 | 2 | 39.9695 | -75.2304 | 0 | 11 | 1 | 2 | 0 | 0 | 1 |
10135 | 0 | 0 | 1 | 8 | 2020 | 2 | 40.0010 | -75.1328 | 0 | 8 | 1 | 0 | 0 | 0 | 1 |
67333 rows × 15 columns
locator = Nominatim(user_agent="myGeolocator", timeout=10)
rgeocode = RateLimiter(locator.reverse, min_delay_seconds=.25)
def reverse_geocoding(GEOM):
try:
location = locator.reverse(Point(GEOM))
return location.raw['address']['postcode']
except:
return None
crash_df = (transform_crash_data(crash_df)
.assign(GEOM=crash_df['DEC_LAT'].map(str) + ',' + " " + crash_df['DEC_LONG'].map(str)
))
crash_df
BELTED_DEATH_COUNT | BICYCLE_DEATH_COUNT | COLLISION_TYPE | CRASH_MONTH | CRASH_YEAR | DAY_OF_WEEK | DEC_LAT | DEC_LONG | FATAL_COUNT | HOUR_OF_DAY | INJURY_COUNT | INTERSECT_TYPE | MCYCLE_DEATH_COUNT | PED_DEATH_COUNT | CRASH_REPORT_NUMBER | GEOM | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 4 | 1 | 2015 | 5 | 39.9579 | -75.1623 | 0 | 17 | 0 | 1 | 0 | 0 | 1 | 39.9579, -75.1623 |
1 | 0 | 0 | 7 | 1 | 2015 | 6 | 39.9553 | -75.1478 | 0 | 22 | 0 | 0 | 0 | 0 | 1 | 39.9553, -75.1478 |
2 | 0 | 0 | 0 | 1 | 2015 | 5 | 40.0396 | -75.0067 | 0 | 13 | 3 | 0 | 0 | 0 | 1 | 40.0396, -75.0067 |
3 | 0 | 0 | 1 | 1 | 2015 | 6 | 40.0207 | -75.0424 | 0 | 18 | 1 | 0 | 0 | 0 | 1 | 40.0207, -75.0424 |
4 | 0 | 0 | 7 | 1 | 2015 | 7 | 39.9750 | -75.1174 | 0 | 2 | 1 | 0 | 0 | 0 | 1 | 39.975, -75.1174 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10131 | 0 | 0 | 4 | 12 | 2020 | 5 | 40.0107 | -75.1627 | 0 | 20 | 1 | 1 | 0 | 0 | 1 | 40.0107, -75.1627 |
10132 | 0 | 0 | 5 | 5 | 2020 | 4 | 40.0308 | -75.0546 | 0 | 18 | 4 | 0 | 0 | 0 | 1 | 40.0308, -75.0546 |
10133 | 0 | 0 | 5 | 5 | 2020 | 4 | 40.0277 | -75.0610 | 0 | 18 | 0 | 0 | 0 | 0 | 1 | 40.0277, -75.061 |
10134 | 0 | 0 | 8 | 11 | 2020 | 2 | 39.9695 | -75.2304 | 0 | 11 | 1 | 2 | 0 | 0 | 1 | 39.9695, -75.2304 |
10135 | 0 | 0 | 1 | 8 | 2020 | 2 | 40.0010 | -75.1328 | 0 | 8 | 1 | 0 | 0 | 0 | 1 | 40.001, -75.1328 |
67333 rows × 16 columns
tqdm.pandas()
crash_df['ZIP'] = crash_df['GEOM'].progress_apply(reverse_geocoding)
crash_df.to_csv('crash.csv', index=False)
C:\Users\16098\anaconda3\lib\site-packages\tqdm\std.py:697: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version from pandas import Panel 100%|██████████| 67333/67333 [9:18:32<00:00, 2.01it/s]
crash_df = pd.read_csv('crash.csv')
crash_df = (crash_df
.assign(ZIP=crash_df.ZIP.str[0:5])
)
crash_df.to_csv('crash.csv', index=False)