import warnings
warnings.filterwarnings("ignore")
import requests
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape
from datetime import datetime, timedelta
df_film= pd.read_csv(r'../data/Film_Permits 2018_present.csv')
#df_film = pd.read_csv('https://data.cityofnewyork.us/api/views/tg4x-b46p/rows.csv?accessType=DOWNLOAD')
#Uncomment the code above if you need to access the dataset without download it
#divide the multiple address string into multiple lists
df_film['OFT_Text'] = df_film['ParkingHeld'].str.split(',')
df_film['OFT_Text']
0 [SHEEPSHEAD BAY ROAD between VOORHIES AVENUE a... 1 [QUEENS PLAZA SOUTH between 21 STREET and 22 S... 2 [WEST 51 STREET between 9 AVENUE and 10 AVENUE] 3 [BEDFORD AVENUE between LEFFERTS AVENUE and LI... 4 [WEST 43 STREET between AVENUE OF THE AMERIC... ... 20159 [DEGRAW STREET between 3 AVENUE and DEAD END, ... 20160 [COVERT STREET between WYCKOFF AVENUE and IRVI... 20161 [BROADWAY between GRAND STREET and HOWARD STRE... 20162 [35 STREET between 34 AVENUE and 35 AVENUE, ... 20163 [WEST 48 STREET between 6 AVENUE and 7 AVENUE] Name: OFT_Text, Length: 20164, dtype: object
# create the cache-like database to store the result of query.
# If the address has been queried, then it won't be query again and saves time.
data = {}
oft_lookup = {}
def getOFT(url):
data = oft_lookup.get(url)
if url not in oft_lookup:
r = requests.get(url)
data = r.json()
oft_lookup[url] = data
return data
segidlookup=[]
def getGeom(row):
boro = row['Borough']
#print(boro)
segids = []
for oft in row['OFT_Text']:
oft_1 = oft.strip()
try:
# split the address into on_street, from_street, to_street format, then the API can read
on_street = oft_1.split('between')[0]
from_street, to_street = oft_1.split('between')[1].split('and')
# query from geocoding service API
url = f'https://www.locationservice.nyc/locationserviceapi//api/Block?OnStreet={on_street}&CrossStreetOne={from_street}&CrossStreetTwo={to_street}&Borough={boro}&BlockType=ExtendedStretch&ExtendedStretchType=Blockface&'
data = getOFT(url)
# extract the SegementID from JSON format
if 'BlockFaceList' in data:
for i in data['BlockFaceList']:
if 'SegmentID' in i:
segids.append(i['SegmentID'])
segidlookup.append({
'EventID': row['EventID'],
'SegmentID': i['SegmentID']
})
except:
continue
return segids
df_film['segids'] = df_film[:50].apply(getGeom, axis = 1)
#I just run the first 50 data to demonstrate bc the whold dataset takes a while.
#If you hope to geocode the whole dataset, please remove [:50]
#create the dataset with eventID and Segment ID
df_segidlookup = pd.DataFrame(segidlookup)
df_segidlookup
EventID | SegmentID | |
---|---|---|
0 | 559223 | 0108149 |
1 | 559223 | 0115148 |
2 | 559223 | 0026080 |
3 | 559223 | 0026083 |
4 | 559223 | 0127321 |
... | ... | ... |
230 | 558434 | 0033164 |
231 | 558434 | 0310049 |
232 | 558745 | 0048512 |
233 | 558745 | 0048509 |
234 | 558215 | 0052376 |
235 rows × 2 columns
#import LION shapfile
lion = gpd.read_file("../vector/lion_line")
lion_sel = lion[['SegmentID','Street','geometry']]
lion_sel.head(5)
SegmentID | Street | geometry | |
---|---|---|---|
0 | 0078126 | EAST 168 STREET | LINESTRING (1010964.447 241812.261, 1011265.49... |
1 | 0079796 | WEST 192 STREET | LINESTRING (1011576.687 255023.583, 1011335.15... |
2 | 0077356 | UNION AVENUE | LINESTRING (1011600.676 239639.743, 1011785.79... |
3 | 0077356 | UNION AVENUE | LINESTRING (1011600.676 239639.743, 1011785.79... |
4 | 0077356 | UNION AVENUE | LINESTRING (1011600.676 239639.743, 1011785.79... |
#join LION to permit data
df_film_seg = df_film.merge(df_segidlookup,how='left',on='EventID')
df_film_lion = df_film_seg.merge(lion_sel,how='left',on='SegmentID')
df_film_lion
df_film_lion.head(2)
EventID | EventType | StartDateTime | EndDateTime | EnteredOn | EventAgency | ParkingHeld | Borough | CommunityBoard(s) | PolicePrecinct(s) | Category | SubCategoryName | Country | OFT_Text | segids | SegmentID | Street | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 559223 | Shooting Permit | 01/22/2021 10:00:00 AM | 01/23/2021 1:00:00 AM | 01/19/2021 1:13:21 PM | Mayor's Office of Film, Theatre & Broadcasting | SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an... | Brooklyn | 15 | 61 | Television | Episodic series | United States of America | [SHEEPSHEAD BAY ROAD between VOORHIES AVENUE a... | [0108149, 0115148, 0026080, 0026083, 0127321, ... | 0108149 | SHEEPSHEAD BAY ROAD | LINESTRING (997364.767 153166.254, 997613.031 ... |
1 | 559223 | Shooting Permit | 01/22/2021 10:00:00 AM | 01/23/2021 1:00:00 AM | 01/19/2021 1:13:21 PM | Mayor's Office of Film, Theatre & Broadcasting | SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an... | Brooklyn | 15 | 61 | Television | Episodic series | United States of America | [SHEEPSHEAD BAY ROAD between VOORHIES AVENUE a... | [0108149, 0115148, 0026080, 0026083, 0127321, ... | 0115148 | VOORHIES AVENUE | LINESTRING (997060.499 152673.840, 997307.372 ... |
#Check the number/percentage of permits with no geometry
(df_film_lion['geometry'].isnull().sum(axis = 0))/len(df_film_lion)
0.9871474123129752
#Turn permit dataframe to geodataframe
gdf_film_lion = gpd.GeoDataFrame(df_film_lion, crs="EPSG:4326").set_geometry('geometry')
gdf_film_lion_output = gdf_film_lion.drop(columns=['OFT_Text','segids'])
#Export the geodataframe to shapefile
#gdf_film_lion_output.to_file('../output/film permit_city.shp')
#Uncomment the code above if you need to download the shapefile just created
gdf_film_lion_output
EventID | EventType | StartDateTime | EndDateTime | EnteredOn | EventAgency | ParkingHeld | Borough | CommunityBoard(s) | PolicePrecinct(s) | Category | SubCategoryName | Country | SegmentID | Street | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 559223 | Shooting Permit | 01/22/2021 10:00:00 AM | 01/23/2021 1:00:00 AM | 01/19/2021 1:13:21 PM | Mayor's Office of Film, Theatre & Broadcasting | SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an... | Brooklyn | 15 | 61 | Television | Episodic series | United States of America | 0108149 | SHEEPSHEAD BAY ROAD | LINESTRING (997364.767 153166.254, 997613.031 ... |
1 | 559223 | Shooting Permit | 01/22/2021 10:00:00 AM | 01/23/2021 1:00:00 AM | 01/19/2021 1:13:21 PM | Mayor's Office of Film, Theatre & Broadcasting | SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an... | Brooklyn | 15 | 61 | Television | Episodic series | United States of America | 0115148 | VOORHIES AVENUE | LINESTRING (997060.499 152673.840, 997307.372 ... |
2 | 559223 | Shooting Permit | 01/22/2021 10:00:00 AM | 01/23/2021 1:00:00 AM | 01/19/2021 1:13:21 PM | Mayor's Office of Film, Theatre & Broadcasting | SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an... | Brooklyn | 15 | 61 | Television | Episodic series | United States of America | 0026080 | VOORHIES AVENUE | LINESTRING (997307.372 152707.114, 997613.031 ... |
3 | 559223 | Shooting Permit | 01/22/2021 10:00:00 AM | 01/23/2021 1:00:00 AM | 01/19/2021 1:13:21 PM | Mayor's Office of Film, Theatre & Broadcasting | SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an... | Brooklyn | 15 | 61 | Television | Episodic series | United States of America | 0026083 | VOORHIES AVENUE | LINESTRING (997613.031 152752.377, 997840.586 ... |
4 | 559223 | Shooting Permit | 01/22/2021 10:00:00 AM | 01/23/2021 1:00:00 AM | 01/19/2021 1:13:21 PM | Mayor's Office of Film, Theatre & Broadcasting | SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an... | Brooklyn | 15 | 61 | Television | Episodic series | United States of America | 0127321 | EAST 17 STREET | LINESTRING (997380.797 154082.381, 997400.761 ... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
20380 | 391005 | Shooting Permit | 01/04/2018 7:00:00 AM | 01/04/2018 11:00:00 PM | 01/03/2018 11:21:59 AM | Mayor's Office of Film, Theatre & Broadcasting | DEGRAW STREET between 3 AVENUE and DEAD END, ... | Brooklyn | 6 | 78 | Television | Cable-episodic | United States of America | NaN | NaN | None |
20381 | 390906 | Shooting Permit | 01/04/2018 6:30:00 AM | 01/04/2018 9:00:00 PM | 01/02/2018 2:50:09 PM | Mayor's Office of Film, Theatre & Broadcasting | COVERT STREET between WYCKOFF AVENUE and IRVIN... | Queens | 4, 5 | 104, 83 | Television | Episodic series | United States of America | NaN | NaN | None |
20382 | 390784 | Shooting Permit | 01/04/2018 6:00:00 AM | 01/04/2018 8:00:00 PM | 01/02/2018 7:41:45 AM | Mayor's Office of Film, Theatre & Broadcasting | BROADWAY between GRAND STREET and HOWARD STREE... | Manhattan | 2 | 1, 5 | WEB | Not Applicable | United States of America | NaN | NaN | None |
20383 | 391010 | Shooting Permit | 01/04/2018 6:00:00 AM | 01/04/2018 10:00:00 PM | 01/03/2018 11:31:29 AM | Mayor's Office of Film, Theatre & Broadcasting | 35 STREET between 34 AVENUE and 35 AVENUE, 35... | Queens | 1 | 114 | Television | Cable-episodic | United States of America | NaN | NaN | None |
20384 | 390863 | Shooting Permit | 01/03/2018 3:00:00 AM | 01/03/2018 11:00:00 AM | 01/02/2018 12:47:16 PM | Mayor's Office of Film, Theatre & Broadcasting | WEST 48 STREET between 6 AVENUE and 7 AVENUE | Manhattan | 5 | 18 | Television | News | United States of America | NaN | NaN | None |
20385 rows × 16 columns
df_seg_count = gdf_film_lion_output[['EventID','SegmentID']].groupby('SegmentID').count()
df_seg_count = df_seg_count.rename(columns={'EventID':'counts'}).reset_index()
df_seg_count.sort_values(by='counts', ascending=False)
SegmentID | counts | |
---|---|---|
78 | 0044931 | 6 |
34 | 0034197 | 6 |
82 | 0045053 | 6 |
73 | 0044925 | 6 |
96 | 0052376 | 5 |
... | ... | ... |
19 | 0032496 | 1 |
20 | 0032497 | 1 |
21 | 0032501 | 1 |
23 | 0032928 | 1 |
159 | 9001744 | 1 |
160 rows × 2 columns
df_film_lion_count = df_seg_count.merge(lion_sel,how='left',on='SegmentID')
df_film_lion_count = gpd.GeoDataFrame(df_film_lion_count, crs="EPSG:4326").set_geometry('geometry')
# df_film_lion_count.to_file('../output/permit_city_segment_count.shp')
df_film_lion_count = df_film_lion_count.sort_values('Counts',ascending=False)
df_film_lion_count.head(10)
SegmentID | Counts | CD | CB | Street | geometry | |
---|---|---|---|---|---|---|
6359 | 0035681 | 349 | 33 | 301 | MONITOR STREET | LINESTRING (999919.040 205536.203, 999847.829 ... |
6226 | 0035515 | 229 | 33 | 301 | CALYER STREET | LINESTRING (998361.937 205063.691, 998591.352 ... |
6233 | 0035520 | 220 | 33 | 301 | CALYER STREET | LINESTRING (998591.352 205163.176, 998820.922 ... |
6222 | 0035511 | 208 | 33 | 301 | JEWEL STREET | LINESTRING (998755.108 204787.219, 998591.352 ... |
6220 | 0035507 | 173 | 33 | 301 | DIAMOND STREET | LINESTRING (998525.443 204687.439, 998361.937 ... |
6358 | 0035676 | 170 | 33 | 301 | NORTH HENRY STREET | LINESTRING (999661.937 205507.703, 999600.658 ... |
10630 | 0048509 | 150 | 37 | 304 | COVERT STREET | LINESTRING (1010582.250 192025.794, 1010582.25... |
10628 | 0048509 | 150 | 34 | 304 | COVERT STREET | LINESTRING (1010582.250 192025.795, 1010582.25... |
10629 | 0048509 | 150 | 34 | 405 | COVERT STREET | LINESTRING (1010582.250 192025.795, 1010809.74... |
6047 | 0035023 | 149 | 33 | 301 | EAGLE STREET | LINESTRING (995324.180 206940.246, 995776.604 ... |
#A quick way to visualize the segement and see the density through opcacity settings
df_film_lion_count[pd.notna(df_film_lion_count['geometry'])][
[col for col in list(df_film_lion_count) if col not in ('StartDateTime')]
].explore(
tiles="CartoDB positron",
style_kwds={'opacity':0.02})
data = gpd.read_file("../output/film permit_city.shp" )
data_sel = data[['EventID','StartDateT','EndDateTim','SegmentID','geometry']]
# See how many duplicate data in the dataset
print(len(data_sel[data_sel.duplicated()==True]))
print(len(data_sel[data_sel.duplicated()==True])/len(data_sel))
7687 0.17144322769141557
# eliminate the duplicate data
data_sel = data_sel.drop_duplicates()
data_sel
EventID | StartDateT | EndDateTim | SegmentID | geometry | |
---|---|---|---|---|---|
0 | 628327 | 03/08/2022 07:00:00 AM | 03/08/2022 07:00:00 PM | 0065898 | LINESTRING (1000675.780 203773.602, 1000590.62... |
2 | 629351 | 03/08/2022 07:00:00 AM | 03/08/2022 10:00:00 PM | 0048509 | LINESTRING (1010582.250 192025.794, 1010809.74... |
3 | 629319 | 03/08/2022 07:00:00 AM | 03/08/2022 09:00:00 PM | 0044431 | LINESTRING (1002265.974 195997.473, 1002202.52... |
4 | 629319 | 03/08/2022 07:00:00 AM | 03/08/2022 09:00:00 PM | 0044457 | LINESTRING (1002202.526 196249.592, 1002715.21... |
5 | 629319 | 03/08/2022 07:00:00 AM | 03/08/2022 09:00:00 PM | 0044453 | LINESTRING (1002022.559 196965.059, 1002571.01... |
... | ... | ... | ... | ... | ... |
44831 | 623949 | 02/04/2022 08:00:00 AM | 02/04/2022 09:30:00 PM | 0105864 | LINESTRING (982584.908 196291.744, 982718.819 ... |
44832 | 623949 | 02/04/2022 08:00:00 AM | 02/04/2022 09:30:00 PM | 0023296 | LINESTRING (982183.843 196510.021, 982273.246 ... |
44833 | 623949 | 02/04/2022 08:00:00 AM | 02/04/2022 09:30:00 PM | 0023304 | LINESTRING (982273.246 196628.042, 982369.951 ... |
44834 | 623949 | 02/04/2022 08:00:00 AM | 02/04/2022 09:30:00 PM | 0023303 | None |
44835 | 623949 | 02/04/2022 08:00:00 AM | 02/04/2022 09:30:00 PM | 0023182 | LINESTRING (981806.626 196495.926, 981736.245 ... |
37150 rows × 5 columns
#Since the boundaries of census tract are on the central line of the street, the permit segments need to be buffered.
data_sel_buffer=data_sel
data_sel_buffer['geometry']= data_sel.buffer(3,cap_style=2)
#import census tract shapefile
ct = gpd.read_file("../vector/2020 Census Tract_NYC")
ct_sel = ct[['BoroCT2020','NTAName','GEOID','geometry']]
# Join the census tract information to permit data
ct_data = ct_sel.sjoin(data_sel_buffer, how="left")
# count the number of permits in the CT
ct_data_count = ct_data[['BoroCT2020','SegmentID']].groupby(by=['BoroCT2020']).count()
ct_data_count = ct_data_count.rename(columns={"SegmentID":"Counts"})
# join the permit number table back to the CT shapefile
ct_count = pd.merge(ct_sel, ct_data_count, how="left", on=["BoroCT2020"])
ct_count
BoroCT2020 | NTAName | GEOID | geometry | Counts | |
---|---|---|---|---|---|
0 | 1000100 | The Battery-Governors Island-Ellis Island-Libe... | 36061000100 | MULTIPOLYGON (((972081.788 190733.467, 972184.... | 0 |
1 | 1000201 | Chinatown-Two Bridges | 36061000201 | POLYGON ((988548.218 197770.375, 987978.808 19... | 8 |
2 | 1000202 | Lower East Side | 36061000202 | MULTIPOLYGON (((989137.110 196325.439, 989113.... | 31 |
3 | 1000500 | The Battery-Governors Island-Ellis Island-Libe... | 36061000500 | MULTIPOLYGON (((981219.056 188655.316, 980940.... | 0 |
4 | 1000600 | Chinatown-Two Bridges | 36061000600 | POLYGON ((986961.185 199553.643, 987206.139 19... | 95 |
... | ... | ... | ... | ... | ... |
2320 | 5030302 | Mariner's Harbor-Arlington-Graniteville | 36085030302 | POLYGON ((939696.662 167578.833, 939768.394 16... | 0 |
2321 | 5031901 | Mariner's Harbor-Arlington-Graniteville | 36085031901 | POLYGON ((939696.662 167578.833, 939508.024 16... | 0 |
2322 | 5031902 | Mariner's Harbor-Arlington-Graniteville | 36085031902 | POLYGON ((939747.491 171919.373, 939778.967 17... | 5 |
2323 | 5032300 | Mariner's Harbor-Arlington-Graniteville | 36085032300 | POLYGON ((936740.120 173125.853, 936712.951 17... | 0 |
2324 | 5990100 | Hoffman & Swinburne Islands | 36085990100 | MULTIPOLYGON (((970217.022 145643.332, 970227.... | 0 |
2325 rows × 5 columns
#Find out the unit
ct_count.crs
<Derived Projected CRS: EPSG:2263> Name: NAD83 / New York Long Island (ftUS) Axis Info [cartesian]: - [east]: Easting (US survey foot) - [north]: Northing (US survey foot) Area of Use: - undefined Coordinate Operation: - name: unnamed - method: Lambert Conic Conformal (2SP) Datum: North American Datum 1983 - Ellipsoid: GRS 1980 - Prime Meridian: Greenwich
#calculate the area to get the density of the counts
ct_count['area_mile'] = ct_count.area/2.788e+7
ct_count['count_per_mile']= ct_count['Counts']/ct_count['area_mile']
ct_count
BoroCT2020 | NTAName | GEOID | geometry | Counts | area_mile | count_per_mile | |
---|---|---|---|---|---|---|---|
0 | 1000100 | The Battery-Governors Island-Ellis Island-Libe... | 36061000100 | MULTIPOLYGON (((972081.788 190733.467, 972184.... | 0 | 0.066155 | 0.000000 |
1 | 1000201 | Chinatown-Two Bridges | 36061000201 | POLYGON ((988548.218 197770.375, 987978.808 19... | 8 | 0.034875 | 229.391366 |
2 | 1000202 | Lower East Side | 36061000202 | MULTIPOLYGON (((989137.110 196325.439, 989113.... | 31 | 0.118907 | 260.708959 |
3 | 1000500 | The Battery-Governors Island-Ellis Island-Libe... | 36061000500 | MULTIPOLYGON (((981219.056 188655.316, 980940.... | 0 | 0.325796 | 0.000000 |
4 | 1000600 | Chinatown-Two Bridges | 36061000600 | POLYGON ((986961.185 199553.643, 987206.139 19... | 95 | 0.092636 | 1025.513854 |
... | ... | ... | ... | ... | ... | ... | ... |
2320 | 5030302 | Mariner's Harbor-Arlington-Graniteville | 36085030302 | POLYGON ((939696.662 167578.833, 939768.394 16... | 0 | 0.351083 | 0.000000 |
2321 | 5031901 | Mariner's Harbor-Arlington-Graniteville | 36085031901 | POLYGON ((939696.662 167578.833, 939508.024 16... | 0 | 0.148013 | 0.000000 |
2322 | 5031902 | Mariner's Harbor-Arlington-Graniteville | 36085031902 | POLYGON ((939747.491 171919.373, 939778.967 17... | 5 | 0.251258 | 19.899850 |
2323 | 5032300 | Mariner's Harbor-Arlington-Graniteville | 36085032300 | POLYGON ((936740.120 173125.853, 936712.951 17... | 0 | 1.493005 | 0.000000 |
2324 | 5990100 | Hoffman & Swinburne Islands | 36085990100 | MULTIPOLYGON (((970217.022 145643.332, 970227.... | 0 | 0.022801 | 0.000000 |
2325 rows × 7 columns
# visualize the permit counts by census tract
ax = ct.boundary.plot(figsize=(15,12),linewidth=0.2, edgecolor='w', alpha=0.8)
ct_count.plot(ax=ax,column='Counts', cmap='Reds',legend=True, vmax=300)
<AxesSubplot:>
# visualize the permit count density by census tract
ax = ct.boundary.plot(figsize=(15,12),linewidth=0.2, edgecolor='w', alpha=0.8)
ct_count.plot(ax=ax,column='count_per_mile', cmap='Reds',legend=True, vmax=2500)
<AxesSubplot:>
data = df_film_lion
data = data.drop_duplicates(subset=['EventID'])
data.head()
EventID | EventType | StartDateTime | EndDateTime | EnteredOn | EventAgency | ParkingHeld | Borough | CommunityBoard(s) | PolicePrecinct(s) | Category | SubCategoryName | Country | OFT_Text | segids | SegmentID | Street | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 559223 | Shooting Permit | 01/22/2021 10:00:00 AM | 01/23/2021 1:00:00 AM | 01/19/2021 1:13:21 PM | Mayor's Office of Film, Theatre & Broadcasting | SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an... | Brooklyn | 15 | 61 | Television | Episodic series | United States of America | [SHEEPSHEAD BAY ROAD between VOORHIES AVENUE a... | [0108149, 0115148, 0026080, 0026083, 0127321, ... | 0108149 | SHEEPSHEAD BAY ROAD | LINESTRING (997364.767 153166.254, 997613.031 ... |
9 | 559248 | Shooting Permit | 01/22/2021 10:00:00 AM | 01/23/2021 2:00:00 AM | 01/19/2021 2:33:31 PM | Mayor's Office of Film, Theatre & Broadcasting | QUEENS PLAZA SOUTH between 21 STREET and 22 ST... | Queens | 2 | 108 | Television | Episodic series | United States of America | [QUEENS PLAZA SOUTH between 21 STREET and 22 S... | [] | NaN | NaN | None |
10 | 559290 | Shooting Permit | 01/22/2021 9:00:00 AM | 01/22/2021 11:59:00 PM | 01/20/2021 10:39:44 AM | Mayor's Office of Film, Theatre & Broadcasting | WEST 51 STREET between 9 AVENUE and 10 AVENUE | Manhattan | 4 | 18 | Television | Episodic series | United States of America | [WEST 51 STREET between 9 AVENUE and 10 AVENUE] | [0033977] | 0033977 | WEST 51 STREET | LINESTRING (987436.794 217663.877, 986650.937 ... |
11 | 558551 | Shooting Permit | 01/22/2021 8:00:00 AM | 01/22/2021 10:00:00 PM | 01/15/2021 10:21:26 AM | Mayor's Office of Film, Theatre & Broadcasting | BEDFORD AVENUE between LEFFERTS AVENUE and LIN... | Brooklyn | 9 | 71 | Television | Episodic series | United States of America | [BEDFORD AVENUE between LEFFERTS AVENUE and LI... | [0029490, 0029273, 0029280, 0109171, 0028388, ... | 0029490 | BEDFORD AVENUE | LINESTRING (996167.827 180451.067, 996191.780 ... |
22 | 558610 | Shooting Permit | 01/22/2021 8:00:00 AM | 01/23/2021 12:00:00 AM | 01/15/2021 11:35:16 AM | Mayor's Office of Film, Theatre & Broadcasting | WEST 43 STREET between AVENUE OF THE AMERICA... | Manhattan | 5 | 14, 18 | Television | Cable-episodic | United States of America | [WEST 43 STREET between AVENUE OF THE AMERIC... | [0034189, 0034194, 0035787, 0034192, 0034197, ... | 0034189 | WEST 43 STREET | LINESTRING (989681.508 214036.225, 988786.877 ... |
#create timespan for each permit
def range_to_list(a, b):
return pd.date_range(a, b, freq = 'H')[:-1]
data['StartEnd'] = [[i, j] for i, j in zip( data['StartDateTime'], data['EndDateTime'])]
# create hourly time list from the timespan
data['TimeList'] = data['StartEnd'].apply(lambda x: range_to_list(x[0], x[1]))
data['StartEnd'][0]
['01/22/2021 10:00:00 AM', '01/23/2021 1:00:00 AM']
data['TimeList'].head()
0 DatetimeIndex(['2021-01-22 10:00:00', '2021-01... 9 DatetimeIndex(['2021-01-22 10:00:00', '2021-01... 10 DatetimeIndex(['2021-01-22 09:00:00', '2021-01... 11 DatetimeIndex(['2021-01-22 08:00:00', '2021-01... 22 DatetimeIndex(['2021-01-22 08:00:00', '2021-01... Name: TimeList, dtype: object
#create new column to store the list of hour numbers and the list of date
data['hours'] = data['TimeList'].apply(lambda x: list(x.hour))
data['day'] = data['TimeList'].apply(lambda x: list(set(x.date)))
#create new column to count how many days within a permit timespan
data['day_length'] = data['TimeList'].apply(lambda x: len(list(set(x.date))))
f = lambda x: list(z for y in x for z in y)
event_hour_sum = data.groupby('EventID')['hours'].agg(f).reset_index()
event_hour_sum
EventID | hours | |
---|---|---|
0 | 390784 | [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1... |
1 | 390785 | [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1... |
2 | 390787 | [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ... |
3 | 390837 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... |
4 | 390863 | [3, 4, 5, 6, 7, 8, 9, 10] |
... | ... | ... |
20159 | 559279 | [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] |
20160 | 559290 | [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20... |
20161 | 559305 | [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17... |
20162 | 559338 | [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,... |
20163 | 559418 | [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ... |
20164 rows × 2 columns
# count the number of hours in the list
event_hour_sum['hour_sum'] = event_hour_sum['hours'].apply(lambda x: len(x))
event_hour_sum = event_hour_sum.astype({'EventID': 'int'})
#Sort the table with the hour sum and turn the hour into days
event_hour_sum = pd.DataFrame(event_hour_sum)
event_hour_sum = event_hour_sum.sort_values(by='hour_sum')
event_hour_sum['day']=event_hour_sum['hour_sum']/24
event_hour_sum
EventID | hours | hour_sum | day | |
---|---|---|---|---|
3551 | 423452 | [] | 0 | 0.000000 |
19011 | 548759 | [] | 0 | 0.000000 |
15453 | 511810 | [10] | 1 | 0.041667 |
15565 | 512208 | [8] | 1 | 0.041667 |
12616 | 492598 | [10] | 1 | 0.041667 |
... | ... | ... | ... | ... |
9988 | 465679 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | 815 | 33.958333 |
3052 | 418275 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | 1583 | 65.958333 |
9042 | 458022 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | 8610 | 358.750000 |
17473 | 525411 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | 8640 | 360.000000 |
19969 | 556250 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | 8687 | 361.958333 |
20164 rows × 4 columns
#drop outlier
from scipy import stats
event_hour_sum = event_hour_sum[(np.abs(stats.zscore(event_hour_sum['hour_sum'])) < 3)]
event_hour_sum
EventID | hours | hour_sum | day | |
---|---|---|---|---|
3551 | 423452 | [] | 0 | 0.000000 |
19011 | 548759 | [] | 0 | 0.000000 |
15453 | 511810 | [10] | 1 | 0.041667 |
15565 | 512208 | [8] | 1 | 0.041667 |
12616 | 492598 | [10] | 1 | 0.041667 |
... | ... | ... | ... | ... |
6410 | 443446 | [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1... | 336 | 14.000000 |
15618 | 512412 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | 341 | 14.208333 |
19951 | 555849 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | 348 | 14.500000 |
19948 | 555831 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | 348 | 14.500000 |
4749 | 434223 | [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1... | 352 | 14.666667 |
20095 rows × 4 columns
event_plot = event_hour_sum[['EventID','hour_sum']].groupby(['hour_sum']).count()
event_plot = event_plot.rename(columns={'EventID':'Counts'})
event_plot = event_plot.sort_values(by='Counts')
event_plot.plot(figsize=(10,8),marker='o',linestyle='',markersize=2)
<AxesSubplot:xlabel='hour_sum'>
import itertools
# extract the hour from the list and make it a dataframe
flat_hour = list(itertools.chain(*data['hours']))
flat_hour = pd.DataFrame(flat_hour)
flat_hour_new = flat_hour.rename(columns={0:"hour"})
flat_hour_new = flat_hour_new.reset_index()
# group by the hour counts
flat_hour_count = flat_hour_new.groupby('hour').count()
flat_hour_count = flat_hour_count.rename(columns= {'index':"count"})
flat_hour_count = flat_hour_count.reset_index()
flat_hour_count.plot(kind="bar",figsize=(20, 8))
<AxesSubplot:>
flat_day = list(itertools.chain(*data['day']))
flat_day = pd.DataFrame(flat_day)
flat_day = flat_day.rename(columns={0:"date"})
flat_day = flat_day.reset_index()
flat_day_count = flat_day.groupby('date').count()
flat_day_count = flat_day_count.rename(columns= {'index':"count"})
flat_day_count = flat_day_count.reset_index()
flat_day_count
date | count | |
---|---|---|
0 | 2018-01-03 | 1 |
1 | 2018-01-04 | 5 |
2 | 2018-01-05 | 3 |
3 | 2018-01-06 | 1 |
4 | 2018-01-07 | 2 |
... | ... | ... |
1454 | 2021-12-27 | 1 |
1455 | 2021-12-28 | 1 |
1456 | 2021-12-29 | 1 |
1457 | 2021-12-30 | 1 |
1458 | 2021-12-31 | 1 |
1459 rows × 2 columns
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(rc={'figure.figsize':(12,8)})
sns.lineplot(x = "date", y = "count",
data = flat_day_count)
<AxesSubplot:xlabel='date', ylabel='count'>
Avg_4days = []
for i in range(0,len(flat_day_count),4):
date = flat_day_count['date'][i]
if i >=len(flat_day_count)-2:
count = flat_day_count['count'][i]
Avg_4days.append({'date': date, 'count(avg)': count})
else:
count = round(flat_day_count['count'][i:i+3].mean(),0)
Avg_4days.append({'date': date, 'count(avg)': count})
Avg_4days = pd.DataFrame(Avg_4days)
Avg_4days
date | count(avg) | |
---|---|---|
0 | 2018-01-03 | 3.0 |
1 | 2018-01-07 | 11.0 |
2 | 2018-01-11 | 16.0 |
3 | 2018-01-15 | 21.0 |
4 | 2018-01-19 | 19.0 |
... | ... | ... |
360 | 2021-12-13 | 1.0 |
361 | 2021-12-17 | 1.0 |
362 | 2021-12-21 | 1.0 |
363 | 2021-12-25 | 1.0 |
364 | 2021-12-29 | 1.0 |
365 rows × 2 columns
Calendar_rolling = []
for i in range(0,len(flat_day_count)):
date = flat_day_count['date'][i]
if i<3 or i >= len(flat_day_count)-3:
Calendar_rolling.append({'date': date, 'count(rolling)': None})
else:
count = round(sum(flat_day_count['count'][i-3:i+3])/7)
Calendar_rolling.append({'date': date, 'count(rolling)': count})
Calendar_rolling = pd.DataFrame(Calendar_rolling)
flat_day_count['count(rolling)'] = Calendar_rolling['count(rolling)']
flat_day_count
date | count | count(rolling) | |
---|---|---|---|
0 | 2018-01-03 | 1 | NaN |
1 | 2018-01-04 | 5 | NaN |
2 | 2018-01-05 | 3 | NaN |
3 | 2018-01-06 | 1 | 3.0 |
4 | 2018-01-07 | 2 | 6.0 |
... | ... | ... | ... |
1454 | 2021-12-27 | 1 | 1.0 |
1455 | 2021-12-28 | 1 | 1.0 |
1456 | 2021-12-29 | 1 | NaN |
1457 | 2021-12-30 | 1 | NaN |
1458 | 2021-12-31 | 1 | NaN |
1459 rows × 3 columns
Avg_4days_rolling = []
for i in range(0,len(flat_day_count),4):
date = flat_day_count['date'][i]
if i >=len(flat_day_count)-2:
count_rol = flat_day_count['count(rolling)'][i]
Avg_4days_rolling.append({'date': date, 'rolling(avg)': count_rol})
else:
count_rol = round(flat_day_count['count(rolling)'][i:i+3].mean(),0)
Avg_4days_rolling.append({'date': date, 'rolling(avg)': count_rol})
Avg_4days_rolling = pd.DataFrame(Avg_4days_rolling)
Avg_4days['rolling(avg)'] = Avg_4days_rolling['rolling(avg)']
Avg_4days
date | count(avg) | rolling(avg) | |
---|---|---|---|
0 | 2018-01-03 | 3.0 | NaN |
1 | 2018-01-07 | 11.0 | 7.0 |
2 | 2018-01-11 | 16.0 | 13.0 |
3 | 2018-01-15 | 21.0 | 16.0 |
4 | 2018-01-19 | 19.0 | 20.0 |
... | ... | ... | ... |
360 | 2021-12-13 | 1.0 | 1.0 |
361 | 2021-12-17 | 1.0 | 1.0 |
362 | 2021-12-21 | 1.0 | 1.0 |
363 | 2021-12-25 | 1.0 | 1.0 |
364 | 2021-12-29 | 1.0 | NaN |
365 rows × 3 columns
street_hour=data[['SegmentID','hours']]
street_hour['hour_sum_event'] = data['hours'].apply(lambda x: len(x))
street_hour_sum = street_hour.groupby('SegmentID').sum().reset_index()
street_hour_sum = street_hour_sum.rename(columns={'hour_sum_event':'hour_sum'})
street_hour_sum
#Sort the table with the hour sum and turn the hour into days
street_hour_sum = pd.DataFrame(street_hour_sum)
street_hour_sum = street_hour_sum.sort_values(by='hour_sum')
street_hour_sum['day']=street_hour_sum['hour_sum']/24
street_hour_sum.sort_values('day',ascending=False)
SegmentID | hour_sum | day | |
---|---|---|---|
20 | 0052376 | 64 | 2.666667 |
14 | 0044912 | 43 | 1.791667 |
7 | 0035023 | 42 | 1.750000 |
6 | 0034189 | 32 | 1.333333 |
0 | 0020644 | 30 | 1.250000 |
9 | 0035681 | 30 | 1.250000 |
24 | 0070495 | 30 | 1.250000 |
17 | 0048509 | 30 | 1.250000 |
1 | 0029490 | 28 | 1.166667 |
30 | 0269872 | 25 | 1.041667 |
23 | 0067747 | 18 | 0.750000 |
21 | 0053880 | 17 | 0.708333 |
16 | 0046498 | 17 | 0.708333 |
19 | 0052369 | 16 | 0.666667 |
3 | 0032928 | 16 | 0.666667 |
15 | 0046294 | 16 | 0.666667 |
13 | 0043358 | 15 | 0.625000 |
12 | 0038710 | 15 | 0.625000 |
22 | 0066083 | 15 | 0.625000 |
25 | 0085164 | 15 | 0.625000 |
8 | 0035511 | 15 | 0.625000 |
26 | 0108149 | 15 | 0.625000 |
27 | 0165292 | 15 | 0.625000 |
4 | 0033171 | 15 | 0.625000 |
2 | 0032497 | 15 | 0.625000 |
10 | 0035970 | 15 | 0.625000 |
18 | 0048512 | 14 | 0.583333 |
29 | 0220873 | 14 | 0.583333 |
11 | 0037346 | 14 | 0.583333 |
5 | 0033977 | 14 | 0.583333 |
28 | 0215772 | 14 | 0.583333 |
# checking the top 3 duration street location
data[data['SegmentID']=='0035681'].head()
EventID | EventType | StartDateTime | EndDateTime | EnteredOn | EventAgency | ParkingHeld | Borough | CommunityBoard(s) | PolicePrecinct(s) | ... | OFT_Text | segids | SegmentID | Street | geometry | StartEnd | TimeList | hours | day | day_length | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
150 | 558287 | Shooting Permit | 01/21/2021 7:00:00 AM | 01/21/2021 9:00:00 PM | 01/14/2021 1:35:06 PM | Mayor's Office of Film, Theatre & Broadcasting | KINGSLAND AVENUE between NORMAN AVENUE and GRE... | Brooklyn | 1 | 94 | ... | [KINGSLAND AVENUE between NORMAN AVENUE and GR... | [0035681] | 0035681 | MONITOR STREET | LINESTRING (999919.040 205536.203, 999847.829 ... | [01/21/2021 7:00:00 AM, 01/21/2021 9:00:00 PM] | DatetimeIndex(['2021-01-21 07:00:00', '2021-01... | [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ... | [2021-01-21] | 1 |
192 | 559256 | Shooting Permit | 01/21/2021 6:00:00 AM | 01/21/2021 10:00:00 PM | 01/19/2021 3:44:55 PM | Mayor's Office of Film, Theatre & Broadcasting | MONITOR STREET between GREENPOINT AVENUE and N... | Brooklyn | 1 | 94 | ... | [MONITOR STREET between GREENPOINT AVENUE and ... | [0035681, 0035681] | 0035681 | MONITOR STREET | LINESTRING (999919.040 205536.203, 999847.829 ... | [01/21/2021 6:00:00 AM, 01/21/2021 10:00:00 PM] | DatetimeIndex(['2021-01-21 06:00:00', '2021-01... | [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1... | [2021-01-21] | 1 |
2 rows × 23 columns
data = gpd.read_file("../output/film permit_city.shp" )
data_fil=data.drop(columns=['OBJECTID','Join_ID'])
data_fil=data_fil.drop_duplicates().reset_index()
#data prep for getting time and date related information
data_fil['StartEnd'] = [[i, j] for i, j in zip( data_fil['StartDateT'], data_fil['EndDateTim'])]
def range_to_list(a, b):
return pd.date_range(a, b, freq = 'H')[:-1]
data_fil['TimeList'] = data_fil['StartEnd'].apply(lambda x: range_to_list(x[0], x[1]))
data_fil['hours'] = data_fil['TimeList'].apply(lambda x: list(x.hour))
data_fil['month'] = data_fil['TimeList'].apply(lambda x: list(set(x.month)))
data_fil['date'] = data_fil['TimeList'].apply(lambda x: list(set(x.date)))
data_fil['hour_sum'] = data_fil['hours'].apply(lambda x: len(x))
data_fil['day_sum']=data_fil['hour_sum']/24
data_fil['StartDateTime'] = data_fil['StartDateT'].apply(pd.to_datetime)
# change month columns from list to string
month1 = data_fil.apply(lambda x: pd.Series(x['month']),axis=1).stack().reset_index(level=1, drop=True)
month1.name = 'month'
data_fil = data_fil.drop('month', axis=1).join(month1)
data_fil['month'] = data_fil['month'].astype(str)
data_fil.head()
index | EventID | EventType | StartDateT | EndDateTim | EnteredOn | EventAgenc | ParkingHel | Borough | CommunityB | ... | SegmentID | geometry | StartEnd | TimeList | hours | date | hour_sum | day_sum | StartDateTime | month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 628327 | Shooting Permit | 03/08/2022 07:00:00 AM | 03/08/2022 07:00:00 PM | 02/28/2022 11:13:50 AM | Mayor's Office of Film, Theatre & Broadcasting | SUTTON STREET between NORMAN AVENUE and NASSAU... | Brooklyn | 1 | ... | 0065898 | LINESTRING (1000675.780 203773.602, 1000590.62... | [03/08/2022 07:00:00 AM, 03/08/2022 07:00:00 PM] | DatetimeIndex(['2022-03-08 07:00:00', '2022-03... | [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] | [2022-03-08] | 12 | 0.500000 | 2022-03-08 07:00:00 | 3.0 |
1 | 2 | 629351 | Shooting Permit | 03/08/2022 07:00:00 AM | 03/08/2022 10:00:00 PM | 03/03/2022 02:51:17 PM | Mayor's Office of Film, Theatre & Broadcasting | COVERT ST between WYCKOFF AVENUE and IRVING AV... | Queens | 5 | ... | 0048509 | LINESTRING (1010582.250 192025.794, 1010809.74... | [03/08/2022 07:00:00 AM, 03/08/2022 10:00:00 PM] | DatetimeIndex(['2022-03-08 07:00:00', '2022-03... | [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ... | [2022-03-08] | 15 | 0.625000 | 2022-03-08 07:00:00 | 3.0 |
2 | 3 | 629319 | Shooting Permit | 03/08/2022 07:00:00 AM | 03/08/2022 09:00:00 PM | 03/03/2022 01:49:15 PM | Mayor's Office of Film, Theatre & Broadcasting | WHITE STREET between SEIGEL STREET and MOORE S... | Brooklyn | 1 | ... | 0044431 | LINESTRING (1002265.974 195997.473, 1002202.52... | [03/08/2022 07:00:00 AM, 03/08/2022 09:00:00 PM] | DatetimeIndex(['2022-03-08 07:00:00', '2022-03... | [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ... | [2022-03-08] | 14 | 0.583333 | 2022-03-08 07:00:00 | 3.0 |
3 | 4 | 629319 | Shooting Permit | 03/08/2022 07:00:00 AM | 03/08/2022 09:00:00 PM | 03/03/2022 01:49:15 PM | Mayor's Office of Film, Theatre & Broadcasting | WHITE STREET between SEIGEL STREET and MOORE S... | Brooklyn | 1 | ... | 0044457 | LINESTRING (1002202.526 196249.592, 1002715.21... | [03/08/2022 07:00:00 AM, 03/08/2022 09:00:00 PM] | DatetimeIndex(['2022-03-08 07:00:00', '2022-03... | [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ... | [2022-03-08] | 14 | 0.583333 | 2022-03-08 07:00:00 | 3.0 |
4 | 5 | 629319 | Shooting Permit | 03/08/2022 07:00:00 AM | 03/08/2022 09:00:00 PM | 03/03/2022 01:49:15 PM | Mayor's Office of Film, Theatre & Broadcasting | WHITE STREET between SEIGEL STREET and MOORE S... | Brooklyn | 1 | ... | 0044453 | LINESTRING (1002022.559 196965.059, 1002571.01... | [03/08/2022 07:00:00 AM, 03/08/2022 09:00:00 PM] | DatetimeIndex(['2022-03-08 07:00:00', '2022-03... | [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ... | [2022-03-08] | 14 | 0.583333 | 2022-03-08 07:00:00 | 3.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
37145 | 44831 | 623949 | Shooting Permit | 02/04/2022 08:00:00 AM | 02/04/2022 09:30:00 PM | 02/01/2022 01:32:18 PM | Mayor's Office of Film, Theatre & Broadcasting | PEARL STREET between PECK SLIP and BEEKMAN STR... | Manhattan | 1 | ... | 0105864 | LINESTRING (982584.908 196291.744, 982718.819 ... | [02/04/2022 08:00:00 AM, 02/04/2022 09:30:00 PM] | DatetimeIndex(['2022-02-04 08:00:00', '2022-02... | [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,... | [2022-02-04] | 13 | 0.541667 | 2022-02-04 08:00:00 | 2.0 |
37146 | 44832 | 623949 | Shooting Permit | 02/04/2022 08:00:00 AM | 02/04/2022 09:30:00 PM | 02/01/2022 01:32:18 PM | Mayor's Office of Film, Theatre & Broadcasting | PEARL STREET between PECK SLIP and BEEKMAN STR... | Manhattan | 1 | ... | 0023296 | LINESTRING (982183.843 196510.021, 982273.246 ... | [02/04/2022 08:00:00 AM, 02/04/2022 09:30:00 PM] | DatetimeIndex(['2022-02-04 08:00:00', '2022-02... | [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,... | [2022-02-04] | 13 | 0.541667 | 2022-02-04 08:00:00 | 2.0 |
37147 | 44833 | 623949 | Shooting Permit | 02/04/2022 08:00:00 AM | 02/04/2022 09:30:00 PM | 02/01/2022 01:32:18 PM | Mayor's Office of Film, Theatre & Broadcasting | PEARL STREET between PECK SLIP and BEEKMAN STR... | Manhattan | 1 | ... | 0023304 | LINESTRING (982273.246 196628.042, 982369.951 ... | [02/04/2022 08:00:00 AM, 02/04/2022 09:30:00 PM] | DatetimeIndex(['2022-02-04 08:00:00', '2022-02... | [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,... | [2022-02-04] | 13 | 0.541667 | 2022-02-04 08:00:00 | 2.0 |
37148 | 44834 | 623949 | Shooting Permit | 02/04/2022 08:00:00 AM | 02/04/2022 09:30:00 PM | 02/01/2022 01:32:18 PM | Mayor's Office of Film, Theatre & Broadcasting | PEARL STREET between PECK SLIP and BEEKMAN STR... | Manhattan | 1 | ... | 0023303 | None | [02/04/2022 08:00:00 AM, 02/04/2022 09:30:00 PM] | DatetimeIndex(['2022-02-04 08:00:00', '2022-02... | [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,... | [2022-02-04] | 13 | 0.541667 | 2022-02-04 08:00:00 | 2.0 |
37149 | 44835 | 623949 | Shooting Permit | 02/04/2022 08:00:00 AM | 02/04/2022 09:30:00 PM | 02/01/2022 01:32:18 PM | Mayor's Office of Film, Theatre & Broadcasting | PEARL STREET between PECK SLIP and BEEKMAN STR... | Manhattan | 1 | ... | 0023182 | LINESTRING (981806.626 196495.926, 981736.245 ... | [02/04/2022 08:00:00 AM, 02/04/2022 09:30:00 PM] | DatetimeIndex(['2022-02-04 08:00:00', '2022-02... | [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,... | [2022-02-04] | 13 | 0.541667 | 2022-02-04 08:00:00 | 2.0 |
37451 rows × 25 columns
# create buffer for avoiding the census tract boundary and street segment overlapping conflicts
data_time_buffer=data_fil
data_time_buffer['geometry']= data_time_buffer.buffer(3,cap_style=2)
# spatial join with census tract shapefile
ct_data_time = ct_sel.sjoin(data_time_buffer, how="left")
ct_data_time=ct_data_time.dropna().reset_index()
#spatial join with Council District 33 shapefile to get CD data
cd = gpd.read_file("../vector/Council Districts/nycc.shp")
cd33 = cd[cd['CounDist']==33]
ct_time_cd33 = gpd.overlay(ct_data_time, cd33, how='intersection')
ct_time_cd33_forsum=ct_time_cd33[['BoroCT2020','EventID','hour_sum','day_sum','month']]
ct_time_cd33_forsum=ct_time_cd33_forsum.drop_duplicates()
ct_time_cd33_forsum['month']= ct_time_cd33_forsum['month'].astype(float).astype(int)
ct_time_sum = ct_time_cd33_forsum[['BoroCT2020','hour_sum','day_sum','month']].groupby(by=[
'BoroCT2020','month']).sum()
ct_time_sum = ct_time_sum.reset_index()
ct_time_sum.sort_values('day_sum')
BoroCT2020 | month | hour_sum | day_sum | |
---|---|---|---|---|
401 | 3051100 | 2 | 1.0 | 0.041667 |
96 | 3001100 | 1 | 2.0 | 0.083333 |
279 | 3012700 | 1 | 4.0 | 0.166667 |
653 | 3059300 | 8 | 5.0 | 0.208333 |
635 | 3058902 | 8 | 5.0 | 0.208333 |
... | ... | ... | ... | ... |
621 | 3058901 | 3 | 1186.0 | 49.416667 |
620 | 3058901 | 2 | 1552.0 | 64.666667 |
607 | 3057902 | 1 | 1555.0 | 64.791667 |
609 | 3057902 | 3 | 1606.0 | 66.916667 |
608 | 3057902 | 2 | 1893.0 | 78.875000 |
656 rows × 4 columns
ct_time_sum = ct_time_sum.pivot_table(index =['BoroCT2020'], values =['day_sum'],columns='month')
ct_time_sum
day_sum | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
month | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
BoroCT2020 | ||||||||||||
1000900 | 1.083333 | 2.541667 | 1.916667 | 1.958333 | 2.125000 | NaN | 4.583333 | 2.291667 | 1.916667 | 1.583333 | 3.916667 | 1.916667 |
1001501 | 1.083333 | 4.416667 | 0.333333 | 1.625000 | 2.416667 | 1.708333 | 7.000000 | 4.708333 | 4.416667 | 6.000000 | 4.166667 | 0.500000 |
1001502 | 7.625000 | 10.708333 | 1.291667 | 4.208333 | 7.541667 | 2.791667 | 11.041667 | 6.083333 | 12.541667 | 10.708333 | 3.208333 | 1.166667 |
1002400 | NaN | NaN | NaN | NaN | 0.500000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1004400 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.666667 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3057902 | 64.791667 | 78.875000 | 66.916667 | 40.333333 | 29.666667 | 15.958333 | 13.125000 | 36.083333 | 30.000000 | 38.375000 | 34.625000 | 22.708333 |
3058901 | 48.958333 | 64.666667 | 49.416667 | 29.500000 | 17.416667 | 10.250000 | 11.250000 | 19.500000 | 19.750000 | 25.791667 | 29.875000 | 17.916667 |
3058902 | 1.458333 | 3.041667 | 2.625000 | 0.750000 | NaN | NaN | NaN | 0.208333 | 1.333333 | NaN | 2.708333 | NaN |
3059100 | 1.500000 | 3.250000 | 2.083333 | NaN | NaN | 0.416667 | 0.750000 | 1.875000 | 1.291667 | 3.166667 | 3.916667 | 0.541667 |
3059300 | 0.625000 | 2.333333 | 2.625000 | 1.416667 | NaN | NaN | 0.750000 | 0.208333 | 1.333333 | NaN | 1.083333 | NaN |
84 rows × 12 columns
#Exporting pivot table dataset to csv file
# ct_time_sum.to_csv('../output/Pivot_permits_cd33_time.csv')
#Uncomment the code above if you need to access the dataset without download it
import requests
from bs4 import BeautifulSoup
import usaddress
import os
#input the webpage we try to parse
#read the page with BeautifulSoup and extract ti
url = "https://www1.nyc.gov/site/mome/industries/studios-stages.page"
req = requests.get(url)
soup = BeautifulSoup(req.text, "lxml")
#get addresses from each table row
facilities = []
content = soup.find_all('tr')
for i, row in enumerate(content):
#skip header
if i == 0:
continue
cell = row.find_all('td')[0]
facility = row.find_all('td')[-1].text
try:
title = cell.find('strong').text
except:
title = cell.find('a').text
text = cell.get_text(separator = ',', strip = True)
address = text.replace(title,'', 1)
address = address.replace(',','', 1)
facilities.append([title,address,facility])
#create the studio dataset from the parsing data
facilities_df = pd.DataFrame(facilities, columns = ['Name','Address','Type'])
facilities_df.head()
Name | Address | Type | |
---|---|---|---|
0 | 1717 Troutman | 1717 Troutman Street, #300,Ridgewood, NY 11385 | Not applicable |
1 | The 1896 Studio & Stages | 592 Johnson Avenue/,211-215 Ingraham Avenue,Br... | Level 1 |
2 | 19th Avenue Stage | 19-02 Steinway Street,Astoria, NY 11105 | Level 2 |
3 | 718 Studios | 130 Thames Street,Brooklyn, NY 11247,718studio... | Not applicable |
4 | 94 Jewel St. | 94 Jewel Street, Ground level,Brooklyn, NY 112... | Not applicable |
#data in the address is too dirty, so I manual it clean it on excel and import it again.
#but you can find better way to fix it.
facilities_df2 = pd.read_csv('facilities_df_geocoded.csv')
#clean up addresses before geocoding
def cleanAddreses(row):
addr = row['Address']
#past to usaddrees, if it fails use blank
try:
addr_tags = usaddress.tag(row['Address'])[0]
except:
return ''
#use keep array to order a return with the address items needed + comma before zipcode
keep_addr_tags = ['AddressNumber','StreetNamePreDirectional', 'StreetName', 'StreetNamePostType', '_Comma', 'ZipCode']
addr_tags['_Comma'] = ','
cleaned_addr = ''
for tag in keep_addr_tags:
if tag in addr_tags:
cleaned_addr = f'{cleaned_addr} {addr_tags[tag]}'
return cleaned_addr
facilities_df2['_Address'] = facilities_df2.apply(cleanAddreses, axis = 1)
facilities_df2['_Address']
0 1717 Troutman Street , 11385 1 592 Johnson Avenue , 11237 2 19-02 Steinway Street , 11105 3 130 Thames Street , 11247 4 94 Jewel Street , 11222 ... 126 21-10 51st Avenue , 11101 127 3905 2nd Avenue , 11232 128 320 West 66th Street , 10023 129 300 Kingsland Avenue , 11234 130 55 Broadway , 10007 Name: _Address, Length: 131, dtype: object
#if file exist then import, otherwise copy from facilities_df... this will help with manual fixes
file = 'facilities_df_geocoded.csv'
if os.path.exists(file):
facilities_df_geocoded = pd.read_csv(file)
else:
facilities_df_geocoded = facilities_df2.copy()
facilities_df_geocoded.to_csv(file, index = False)
print('facilities_df_geocoded generated')
geocoded = []
def geocode(row):
lat, lng = np.nan, np.nan
if 'bbl' in row:
bbl = row['bbl']
else:
bbl = np.nan
addr = row['_Address']
#only geocode if bbl is blank (NA) and addr is not empty
if pd.isna(bbl) and (addr != '' and pd.notna(addr)):
print(bbl, addr)
#geocode!
url = f'https://geosearch.planninglabs.nyc/v1/search?text={addr}&size=120'
r = requests.get(url)
features = r.json()['features']
if len(features):
feature = features[0]
lng, lat = feature['geometry']['coordinates']
bbl = feature['properties']['pad_bbl']
return pd.Series([lat,lng, bbl])
facilities_df_geocoded[['lat','lng','bbl']] = facilities_df_geocoded.apply(geocode, axis = 1)
#save results to file
# facilities_df_geocoded.to_csv('facilities_df_geocoded.csv', index = False)