# Authorize access to Google Drive
from google.colab import drive, auth
drive.mount('/content/drive',force_remount=True)
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import os
import re
from string import capwords
# Filepaths
root = '/content/drive/MyDrive/TCC (Transportation Choices Coalition)/'
data_dir = root + 'Data/'
cad_dir = data_dir + '2022-11-28 - Snohomish County 911 (SNO911) public records - CAD logs/'
# Load files
filenames = os.listdir(cad_dir)
filenames.sort()
data = []
for fn in filenames:
with open(cad_dir + fn, 'r') as file:
data_this = file.read()
data.append(data_this)
# Extract data fields
output = pd.DataFrame()
for fn_idx, fn in enumerate(filenames):
cfs = re.compile(r"Call For Service Detail Report - CFS ([0-9]+)\n")
cfs_matches = np.array(cfs.findall(data[fn_idx]))
this_output = pd.DataFrame(index=np.arange(len(cfs_matches)),data={'CFS':cfs_matches})
this_output['Filename'] = fn
address = re.compile(r"Address\n\n(.+)\n\n")
address_matches = address.findall(data[fn_idx])
this_output['Original address'] = [capwords(s) for s in address_matches]
this_output[['Original address','Town/city']] = \
this_output['Original address'].str.split(', ',expand=True)
this_output[['Primary street','Cross street']] = \
this_output['Original address'].str.split('@ ',expand=True)
this_output.loc[~this_output['Original address'].str.contains('@'),'Address'] = \
this_output.loc[~this_output['Original address'].str.contains('@'),'Primary street']
this_output.loc[~this_output['Original address'].str.contains('@'),'Primary street'] = ''
this_output.loc[this_output['Address'].isnull(),'Address'] = ''
date = re.compile(r"Create Date\n\n(.+) (.+ .+)\n\n")
date_matches = date.findall(data[fn_idx])
date_matches_dist = []
date_match_idx = 0
for sub_cad_log in data[fn_idx].split('Call For Service Detail Report - CFS')[1:]:
if date_match_idx >= len(date_matches):
date_matches_dist.append(('',''))
elif (date_matches[date_match_idx][0] in sub_cad_log) and \
(date_matches[date_match_idx][1] in sub_cad_log):
date_matches_dist.append(date_matches[date_match_idx])
date_match_idx += 1
else:
date_matches_dist.append(('',''))
this_output['Date'] = pd.to_datetime(np.array(date_matches_dist)[:,0])
this_output['Time'] = pd.to_datetime(np.array(date_matches_dist)[:,1])
this_output['Time'] = this_output['Time'].dt.floor('Min').dt.time
incnum = re.compile(r"Incident Number\n(.+(?:\n.+)*)\n\nJurisdiction")
incnum_matches = [s.rstrip("\nUnits") for s in incnum.findall(data[fn_idx])]
incnum_matches_dist = []
incnum_match_idx = 0
for sub_cad_log in data[fn_idx].split('Call For Service Detail Report - CFS')[1:]:
if incnum_match_idx >= len(incnum_matches):
incnum_matches_dist.append('')
elif incnum_matches[incnum_match_idx] in sub_cad_log:
incnum_matches_dist.append(incnum_matches[incnum_match_idx])
incnum_match_idx += 1
else:
incnum_matches_dist.append('')
incnum_matches_dist = ['CHECK_INC_NUM' if s == '' else s.replace('\n',', ') \
for s in incnum_matches_dist]
this_output['Incident'] = incnum_matches_dist
this_output['IDs'] = this_output['CFS'].str.cat(this_output['Incident'],sep=', ')
this_output = this_output[['Filename','IDs','Town/city','Address',
'Primary street','Cross street','Date','Time']]
output = output.append(this_output)
output = output.reset_index().drop(columns=['index'])
output.to_excel(cad_dir + 'SNO911_parsed.xlsx')
display(output)
Filename | IDs | Town/city | Address | Primary street | Cross street | Date | Time | |
---|---|---|---|---|---|---|---|---|
0 | 2017_CAD_Logs_R.txt | 436, CHECK_INC_NUM | Everett | 19th Ave Se | 100th St Se | 2017-09-17 | 20:56:00 | |
1 | 2017_CAD_Logs_R.txt | 452, 2017-00119029 | None | 6600 Blk Evergreen | None | 2017-09-30 | 19:07:00 | |
2 | 2017_CAD_Logs_R.txt | 884, 2017-00130003 | Everett | 221 Se Everett Mall Way 103 | None | 2017-10-25 | 11:57:00 | |
3 | 2017_CAD_Logs_R.txt | 627, CHECK_INC_NUM | Everett | 112th St Sw | Evergreen Way | 2017-11-01 | 01:33:00 | |
4 | 2017_CAD_Logs_R.txt | 621, 2017-00146009 | Everett | Broadway | 13th St | 2017-12-02 | 10:57:00 | |
... | ... | ... | ... | ... | ... | ... | ... | ... |
429 | 2022_CAD_LOGS_R.txt | 572, 2022-00148970 | Everett | 12000 Hwy 99 | None | 2022-10-23 | 12:37:00 | |
430 | 2022_CAD_LOGS_R.txt | 499, 2022-00040803 | Everett | Airport Rd | Hwy 99 | 2022-03-24 | 06:23:00 | |
431 | 2022_CAD_LOGS_R.txt | 881, 2022-00041899 | Everett | 214 128th St Sw | None | 2022-03-26 | 07:49:00 | |
432 | 2022_CAD_LOGS_R.txt | 660, 2022-00067798 | Lynnwood | 14000 Blk Hwy 99 | None | 2022-05-17 | 20:57:00 | |
433 | 2022_CAD_LOGS_R.txt | 505, 2022-00017574 | Lake Stevens | Sr 9 Ne | Market Pl | 2022-09-28 | 20:34:00 |
434 rows × 8 columns