In [63]:

import warnings
warnings.filterwarnings("ignore")

import requests
import numpy as np
import pandas as pd
import geopandas as gpd 
from shapely.geometry import shape

from datetime import datetime, timedelta

Table¶

- Geocoding ¶

Address Data Cleaning
Create cache-like database
Geocoding for street segment
Get Geometry from LION
Create permit GeoDataFrame and Export it

- Count the Number of Permit by Geographical Unit ¶

Count Permit by Street Segments
Count Permit by Census Tract

- Count the Number of Permit by Time ¶

Extract hour and day from Timeframe by Event
Count Event Duration
Count Permit Numbers in Hour of a Day
Count Permit Numbers by day
Create Rolling Avg for Daily Count
Count Time Duration by Street

- Calculate Permit Time Duration by Time by Geo Unit ¶

Extract Time Related Information from Timeframe
Obtain Geo Unit Information by Sptial Join
Sum Up Time Duration by Census Tract
Create Pivot Table

Geocoding¶

Address Data Cleaning¶

In [64]:

df_film= pd.read_csv(r'../data/Film_Permits 2018_present.csv')
#df_film = pd.read_csv('https://data.cityofnewyork.us/api/views/tg4x-b46p/rows.csv?accessType=DOWNLOAD')
#Uncomment the code above if you need to access the dataset without download it

In [65]:

#divide the multiple address string into multiple lists
df_film['OFT_Text'] = df_film['ParkingHeld'].str.split(',')
df_film['OFT_Text'] 

Out[65]:

0        [SHEEPSHEAD BAY ROAD between VOORHIES AVENUE a...
1        [QUEENS PLAZA SOUTH between 21 STREET and 22 S...
2        [WEST   51 STREET between 9 AVENUE and 10 AVENUE]
3        [BEDFORD AVENUE between LEFFERTS AVENUE and LI...
4        [WEST   43 STREET between AVENUE OF THE AMERIC...
                               ...                        
20159    [DEGRAW STREET between 3 AVENUE and DEAD END, ...
20160    [COVERT STREET between WYCKOFF AVENUE and IRVI...
20161    [BROADWAY between GRAND STREET and HOWARD STRE...
20162    [35 STREET between 34 AVENUE and 35 AVENUE,   ...
20163     [WEST   48 STREET between 6 AVENUE and 7 AVENUE]
Name: OFT_Text, Length: 20164, dtype: object

Create cache-like database¶

In [66]:

# create the cache-like database to store the result of query.
# If the address has been queried, then it won't be query again and saves time. 

data = {}
oft_lookup = {}
def getOFT(url):
    data = oft_lookup.get(url)
    if url not in oft_lookup:
        r = requests.get(url)
        data = r.json()
        oft_lookup[url] = data 
    return data    

Geocoding for street segment¶

In [67]:

segidlookup=[]

def getGeom(row):
    boro = row['Borough']
    #print(boro)
    segids = []
    for oft in row['OFT_Text']:
        oft_1 = oft.strip()
        try:
            # split the address into on_street, from_street, to_street format, then the API can read
            on_street = oft_1.split('between')[0]
            from_street, to_street  = oft_1.split('between')[1].split('and')

            # query from geocoding service API
            url = f'https://www.locationservice.nyc/locationserviceapi//api/Block?OnStreet={on_street}&CrossStreetOne={from_street}&CrossStreetTwo={to_street}&Borough={boro}&BlockType=ExtendedStretch&ExtendedStretchType=Blockface&'
            data = getOFT(url)

            # extract the SegementID from JSON format
            if 'BlockFaceList' in data:
                for i in data['BlockFaceList']:
                    if 'SegmentID' in i:
                        segids.append(i['SegmentID'])
                        segidlookup.append({
                            'EventID': row['EventID'],
                            'SegmentID': i['SegmentID']
                        })
        except:

            continue          
    return segids
      
df_film['segids'] = df_film[:50].apply(getGeom, axis = 1)
#I just run the first 50 data to demonstrate bc the whold dataset takes a while. 
#If you hope to geocode the whole dataset, please remove [:50]

Get Geometry from LION¶

In [68]:

#create the dataset with eventID and Segment ID
df_segidlookup = pd.DataFrame(segidlookup)
df_segidlookup

Out[68]:

	EventID	SegmentID
0	559223	0108149
1	559223	0115148
2	559223	0026080
3	559223	0026083
4	559223	0127321
...	...	...
230	558434	0033164
231	558434	0310049
232	558745	0048512
233	558745	0048509
234	558215	0052376

235 rows × 2 columns

In [69]:

#import LION shapfile
lion = gpd.read_file("../vector/lion_line")
lion_sel = lion[['SegmentID','Street','geometry']]
lion_sel.head(5)

Out[69]:

	SegmentID	Street	geometry
0	0078126	EAST 168 STREET	LINESTRING (1010964.447 241812.261, 1011265.49...
1	0079796	WEST 192 STREET	LINESTRING (1011576.687 255023.583, 1011335.15...
2	0077356	UNION AVENUE	LINESTRING (1011600.676 239639.743, 1011785.79...
3	0077356	UNION AVENUE	LINESTRING (1011600.676 239639.743, 1011785.79...
4	0077356	UNION AVENUE	LINESTRING (1011600.676 239639.743, 1011785.79...

In [70]:

#join LION to permit data
df_film_seg = df_film.merge(df_segidlookup,how='left',on='EventID')
df_film_lion = df_film_seg.merge(lion_sel,how='left',on='SegmentID')
df_film_lion
df_film_lion.head(2)

Out[70]:

	EventID	EventType	StartDateTime	EndDateTime	EnteredOn	EventAgency	ParkingHeld	Borough	CommunityBoard(s)	PolicePrecinct(s)	Category	SubCategoryName	Country	OFT_Text	segids	SegmentID	Street	geometry
0	559223	Shooting Permit	01/22/2021 10:00:00 AM	01/23/2021 1:00:00 AM	01/19/2021 1:13:21 PM	Mayor's Office of Film, Theatre & Broadcasting	SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an...	Brooklyn	15	61	Television	Episodic series	United States of America	[SHEEPSHEAD BAY ROAD between VOORHIES AVENUE a...	[0108149, 0115148, 0026080, 0026083, 0127321, ...	0108149	SHEEPSHEAD BAY ROAD	LINESTRING (997364.767 153166.254, 997613.031 ...
1	559223	Shooting Permit	01/22/2021 10:00:00 AM	01/23/2021 1:00:00 AM	01/19/2021 1:13:21 PM	Mayor's Office of Film, Theatre & Broadcasting	SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an...	Brooklyn	15	61	Television	Episodic series	United States of America	[SHEEPSHEAD BAY ROAD between VOORHIES AVENUE a...	[0108149, 0115148, 0026080, 0026083, 0127321, ...	0115148	VOORHIES AVENUE	LINESTRING (997060.499 152673.840, 997307.372 ...

In [71]:

#Check the number/percentage of permits with no geometry
(df_film_lion['geometry'].isnull().sum(axis = 0))/len(df_film_lion)

Out[71]:

0.9871474123129752

Create permit GeoDataFrame and Export it¶

In [72]:

#Turn permit dataframe to geodataframe
gdf_film_lion = gpd.GeoDataFrame(df_film_lion, crs="EPSG:4326").set_geometry('geometry')
gdf_film_lion_output = gdf_film_lion.drop(columns=['OFT_Text','segids'])

#Export the geodataframe to shapefile
#gdf_film_lion_output.to_file('../output/film permit_city.shp')  

#Uncomment the code above if you need to download the shapefile just created

In [118]:

gdf_film_lion_output

Out[118]:

	EventID	EventType	StartDateTime	EndDateTime	EnteredOn	EventAgency	ParkingHeld	Borough	CommunityBoard(s)	PolicePrecinct(s)	Category	SubCategoryName	Country	SegmentID	Street	geometry
0	559223	Shooting Permit	01/22/2021 10:00:00 AM	01/23/2021 1:00:00 AM	01/19/2021 1:13:21 PM	Mayor's Office of Film, Theatre & Broadcasting	SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an...	Brooklyn	15	61	Television	Episodic series	United States of America	0108149	SHEEPSHEAD BAY ROAD	LINESTRING (997364.767 153166.254, 997613.031 ...
1	559223	Shooting Permit	01/22/2021 10:00:00 AM	01/23/2021 1:00:00 AM	01/19/2021 1:13:21 PM	Mayor's Office of Film, Theatre & Broadcasting	SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an...	Brooklyn	15	61	Television	Episodic series	United States of America	0115148	VOORHIES AVENUE	LINESTRING (997060.499 152673.840, 997307.372 ...
2	559223	Shooting Permit	01/22/2021 10:00:00 AM	01/23/2021 1:00:00 AM	01/19/2021 1:13:21 PM	Mayor's Office of Film, Theatre & Broadcasting	SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an...	Brooklyn	15	61	Television	Episodic series	United States of America	0026080	VOORHIES AVENUE	LINESTRING (997307.372 152707.114, 997613.031 ...
3	559223	Shooting Permit	01/22/2021 10:00:00 AM	01/23/2021 1:00:00 AM	01/19/2021 1:13:21 PM	Mayor's Office of Film, Theatre & Broadcasting	SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an...	Brooklyn	15	61	Television	Episodic series	United States of America	0026083	VOORHIES AVENUE	LINESTRING (997613.031 152752.377, 997840.586 ...
4	559223	Shooting Permit	01/22/2021 10:00:00 AM	01/23/2021 1:00:00 AM	01/19/2021 1:13:21 PM	Mayor's Office of Film, Theatre & Broadcasting	SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an...	Brooklyn	15	61	Television	Episodic series	United States of America	0127321	EAST 17 STREET	LINESTRING (997380.797 154082.381, 997400.761 ...
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
20380	391005	Shooting Permit	01/04/2018 7:00:00 AM	01/04/2018 11:00:00 PM	01/03/2018 11:21:59 AM	Mayor's Office of Film, Theatre & Broadcasting	DEGRAW STREET between 3 AVENUE and DEAD END, ...	Brooklyn	6	78	Television	Cable-episodic	United States of America	NaN	NaN	None
20381	390906	Shooting Permit	01/04/2018 6:30:00 AM	01/04/2018 9:00:00 PM	01/02/2018 2:50:09 PM	Mayor's Office of Film, Theatre & Broadcasting	COVERT STREET between WYCKOFF AVENUE and IRVIN...	Queens	4, 5	104, 83	Television	Episodic series	United States of America	NaN	NaN	None
20382	390784	Shooting Permit	01/04/2018 6:00:00 AM	01/04/2018 8:00:00 PM	01/02/2018 7:41:45 AM	Mayor's Office of Film, Theatre & Broadcasting	BROADWAY between GRAND STREET and HOWARD STREE...	Manhattan	2	1, 5	WEB	Not Applicable	United States of America	NaN	NaN	None
20383	391010	Shooting Permit	01/04/2018 6:00:00 AM	01/04/2018 10:00:00 PM	01/03/2018 11:31:29 AM	Mayor's Office of Film, Theatre & Broadcasting	35 STREET between 34 AVENUE and 35 AVENUE, 35...	Queens	1	114	Television	Cable-episodic	United States of America	NaN	NaN	None
20384	390863	Shooting Permit	01/03/2018 3:00:00 AM	01/03/2018 11:00:00 AM	01/02/2018 12:47:16 PM	Mayor's Office of Film, Theatre & Broadcasting	WEST 48 STREET between 6 AVENUE and 7 AVENUE	Manhattan	5	18	Television	News	United States of America	NaN	NaN	None

20385 rows × 16 columns

Count the Number of Permit by Geographical Unit¶

Count Permit by Street Segments¶

In [73]:

df_seg_count = gdf_film_lion_output[['EventID','SegmentID']].groupby('SegmentID').count()
df_seg_count = df_seg_count.rename(columns={'EventID':'counts'}).reset_index()
df_seg_count.sort_values(by='counts', ascending=False)

Out[73]:

	SegmentID	counts
78	0044931	6
34	0034197	6
82	0045053	6
73	0044925	6
96	0052376	5
...	...	...
19	0032496	1
20	0032497	1
21	0032501	1
23	0032928	1
159	9001744	1

160 rows × 2 columns

In [74]:

df_film_lion_count = df_seg_count.merge(lion_sel,how='left',on='SegmentID')
df_film_lion_count = gpd.GeoDataFrame(df_film_lion_count, crs="EPSG:4326").set_geometry('geometry')
# df_film_lion_count.to_file('../output/permit_city_segment_count.shp')  

In [114]:

df_film_lion_count = df_film_lion_count.sort_values('Counts',ascending=False)
df_film_lion_count.head(10)

Out[114]:

	SegmentID	Counts	CD	CB	Street	geometry
6359	0035681	349	33	301	MONITOR STREET	LINESTRING (999919.040 205536.203, 999847.829 ...
6226	0035515	229	33	301	CALYER STREET	LINESTRING (998361.937 205063.691, 998591.352 ...
6233	0035520	220	33	301	CALYER STREET	LINESTRING (998591.352 205163.176, 998820.922 ...
6222	0035511	208	33	301	JEWEL STREET	LINESTRING (998755.108 204787.219, 998591.352 ...
6220	0035507	173	33	301	DIAMOND STREET	LINESTRING (998525.443 204687.439, 998361.937 ...
6358	0035676	170	33	301	NORTH HENRY STREET	LINESTRING (999661.937 205507.703, 999600.658 ...
10630	0048509	150	37	304	COVERT STREET	LINESTRING (1010582.250 192025.794, 1010582.25...
10628	0048509	150	34	304	COVERT STREET	LINESTRING (1010582.250 192025.795, 1010582.25...
10629	0048509	150	34	405	COVERT STREET	LINESTRING (1010582.250 192025.795, 1010809.74...
6047	0035023	149	33	301	EAGLE STREET	LINESTRING (995324.180 206940.246, 995776.604 ...

In [122]:

#A quick way to visualize the segement and see the density through opcacity settings 
df_film_lion_count[pd.notna(df_film_lion_count['geometry'])][
    [col for col in list(df_film_lion_count) if col not in ('StartDateTime')]
].explore(
    tiles="CartoDB positron",
    style_kwds={'opacity':0.02})

Out[122]:

Make this Notebook Trusted to load map: File -> Trust Notebook

Count Permit by Census Tract¶

In [77]:

data = gpd.read_file("../output/film permit_city.shp" )

In [78]:

data_sel = data[['EventID','StartDateT','EndDateTim','SegmentID','geometry']]             

In [79]:

# See how many duplicate data in the dataset
print(len(data_sel[data_sel.duplicated()==True]))
print(len(data_sel[data_sel.duplicated()==True])/len(data_sel))

7687
0.17144322769141557

In [80]:

# eliminate the duplicate data
data_sel = data_sel.drop_duplicates()
data_sel

Out[80]:

	EventID	StartDateT	EndDateTim	SegmentID	geometry
0	628327	03/08/2022 07:00:00 AM	03/08/2022 07:00:00 PM	0065898	LINESTRING (1000675.780 203773.602, 1000590.62...
2	629351	03/08/2022 07:00:00 AM	03/08/2022 10:00:00 PM	0048509	LINESTRING (1010582.250 192025.794, 1010809.74...
3	629319	03/08/2022 07:00:00 AM	03/08/2022 09:00:00 PM	0044431	LINESTRING (1002265.974 195997.473, 1002202.52...
4	629319	03/08/2022 07:00:00 AM	03/08/2022 09:00:00 PM	0044457	LINESTRING (1002202.526 196249.592, 1002715.21...
5	629319	03/08/2022 07:00:00 AM	03/08/2022 09:00:00 PM	0044453	LINESTRING (1002022.559 196965.059, 1002571.01...
...	...	...	...	...	...
44831	623949	02/04/2022 08:00:00 AM	02/04/2022 09:30:00 PM	0105864	LINESTRING (982584.908 196291.744, 982718.819 ...
44832	623949	02/04/2022 08:00:00 AM	02/04/2022 09:30:00 PM	0023296	LINESTRING (982183.843 196510.021, 982273.246 ...
44833	623949	02/04/2022 08:00:00 AM	02/04/2022 09:30:00 PM	0023304	LINESTRING (982273.246 196628.042, 982369.951 ...
44834	623949	02/04/2022 08:00:00 AM	02/04/2022 09:30:00 PM	0023303	None
44835	623949	02/04/2022 08:00:00 AM	02/04/2022 09:30:00 PM	0023182	LINESTRING (981806.626 196495.926, 981736.245 ...

37150 rows × 5 columns

In [81]:

#Since the boundaries of census tract are on the central line of the street, the permit segments need to be buffered.
data_sel_buffer=data_sel
data_sel_buffer['geometry']= data_sel.buffer(3,cap_style=2)

In [82]:

#import census tract shapefile
ct = gpd.read_file("../vector/2020 Census Tract_NYC")
ct_sel = ct[['BoroCT2020','NTAName','GEOID','geometry']]

# Join the census tract information to permit data
ct_data = ct_sel.sjoin(data_sel_buffer, how="left")

In [83]:

# count the number of permits in the CT
ct_data_count = ct_data[['BoroCT2020','SegmentID']].groupby(by=['BoroCT2020']).count()
ct_data_count = ct_data_count.rename(columns={"SegmentID":"Counts"})

# join the permit number table back to the CT shapefile
ct_count = pd.merge(ct_sel, ct_data_count, how="left", on=["BoroCT2020"])
ct_count

Out[83]:

	BoroCT2020	NTAName	GEOID	geometry	Counts
0	1000100	The Battery-Governors Island-Ellis Island-Libe...	36061000100	MULTIPOLYGON (((972081.788 190733.467, 972184....	0
1	1000201	Chinatown-Two Bridges	36061000201	POLYGON ((988548.218 197770.375, 987978.808 19...	8
2	1000202	Lower East Side	36061000202	MULTIPOLYGON (((989137.110 196325.439, 989113....	31
3	1000500	The Battery-Governors Island-Ellis Island-Libe...	36061000500	MULTIPOLYGON (((981219.056 188655.316, 980940....	0
4	1000600	Chinatown-Two Bridges	36061000600	POLYGON ((986961.185 199553.643, 987206.139 19...	95
...	...	...	...	...	...
2320	5030302	Mariner's Harbor-Arlington-Graniteville	36085030302	POLYGON ((939696.662 167578.833, 939768.394 16...	0
2321	5031901	Mariner's Harbor-Arlington-Graniteville	36085031901	POLYGON ((939696.662 167578.833, 939508.024 16...	0
2322	5031902	Mariner's Harbor-Arlington-Graniteville	36085031902	POLYGON ((939747.491 171919.373, 939778.967 17...	5
2323	5032300	Mariner's Harbor-Arlington-Graniteville	36085032300	POLYGON ((936740.120 173125.853, 936712.951 17...	0
2324	5990100	Hoffman & Swinburne Islands	36085990100	MULTIPOLYGON (((970217.022 145643.332, 970227....	0

2325 rows × 5 columns

In [84]:

#Find out the unit
ct_count.crs

Out[84]:

<Derived Projected CRS: EPSG:2263>
Name: NAD83 / New York Long Island (ftUS)
Axis Info [cartesian]:
- [east]: Easting (US survey foot)
- [north]: Northing (US survey foot)
Area of Use:
- undefined
Coordinate Operation:
- name: unnamed
- method: Lambert Conic Conformal (2SP)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [85]:

#calculate the area to get the density of the counts
ct_count['area_mile'] = ct_count.area/2.788e+7
ct_count['count_per_mile']= ct_count['Counts']/ct_count['area_mile']
ct_count

Out[85]:

	BoroCT2020	NTAName	GEOID	geometry	Counts	area_mile	count_per_mile
0	1000100	The Battery-Governors Island-Ellis Island-Libe...	36061000100	MULTIPOLYGON (((972081.788 190733.467, 972184....	0	0.066155	0.000000
1	1000201	Chinatown-Two Bridges	36061000201	POLYGON ((988548.218 197770.375, 987978.808 19...	8	0.034875	229.391366
2	1000202	Lower East Side	36061000202	MULTIPOLYGON (((989137.110 196325.439, 989113....	31	0.118907	260.708959
3	1000500	The Battery-Governors Island-Ellis Island-Libe...	36061000500	MULTIPOLYGON (((981219.056 188655.316, 980940....	0	0.325796	0.000000
4	1000600	Chinatown-Two Bridges	36061000600	POLYGON ((986961.185 199553.643, 987206.139 19...	95	0.092636	1025.513854
...	...	...	...	...	...	...	...
2320	5030302	Mariner's Harbor-Arlington-Graniteville	36085030302	POLYGON ((939696.662 167578.833, 939768.394 16...	0	0.351083	0.000000
2321	5031901	Mariner's Harbor-Arlington-Graniteville	36085031901	POLYGON ((939696.662 167578.833, 939508.024 16...	0	0.148013	0.000000
2322	5031902	Mariner's Harbor-Arlington-Graniteville	36085031902	POLYGON ((939747.491 171919.373, 939778.967 17...	5	0.251258	19.899850
2323	5032300	Mariner's Harbor-Arlington-Graniteville	36085032300	POLYGON ((936740.120 173125.853, 936712.951 17...	0	1.493005	0.000000
2324	5990100	Hoffman & Swinburne Islands	36085990100	MULTIPOLYGON (((970217.022 145643.332, 970227....	0	0.022801	0.000000

2325 rows × 7 columns

In [86]:

# visualize the permit counts by census tract 
ax = ct.boundary.plot(figsize=(15,12),linewidth=0.2, edgecolor='w', alpha=0.8)
ct_count.plot(ax=ax,column='Counts', cmap='Reds',legend=True, vmax=300)

Out[86]:

<AxesSubplot:>

In [87]:

# visualize the permit count density by census tract 
ax = ct.boundary.plot(figsize=(15,12),linewidth=0.2, edgecolor='w', alpha=0.8)
ct_count.plot(ax=ax,column='count_per_mile', cmap='Reds',legend=True, vmax=2500)

Out[87]:

<AxesSubplot:>

Count the Number of Permit by Time¶

Extract hour and day from Timeframe by Event¶

In [88]:

data = df_film_lion
data = data.drop_duplicates(subset=['EventID'])
data.head()

Out[88]:

	EventID	EventType	StartDateTime	EndDateTime	EnteredOn	EventAgency	ParkingHeld	Borough	CommunityBoard(s)	PolicePrecinct(s)	Category	SubCategoryName	Country	OFT_Text	segids	SegmentID	Street	geometry
0	559223	Shooting Permit	01/22/2021 10:00:00 AM	01/23/2021 1:00:00 AM	01/19/2021 1:13:21 PM	Mayor's Office of Film, Theatre & Broadcasting	SHEEPSHEAD BAY ROAD between VOORHIES AVENUE an...	Brooklyn	15	61	Television	Episodic series	United States of America	[SHEEPSHEAD BAY ROAD between VOORHIES AVENUE a...	[0108149, 0115148, 0026080, 0026083, 0127321, ...	0108149	SHEEPSHEAD BAY ROAD	LINESTRING (997364.767 153166.254, 997613.031 ...
9	559248	Shooting Permit	01/22/2021 10:00:00 AM	01/23/2021 2:00:00 AM	01/19/2021 2:33:31 PM	Mayor's Office of Film, Theatre & Broadcasting	QUEENS PLAZA SOUTH between 21 STREET and 22 ST...	Queens	2	108	Television	Episodic series	United States of America	[QUEENS PLAZA SOUTH between 21 STREET and 22 S...	[]	NaN	NaN	None
10	559290	Shooting Permit	01/22/2021 9:00:00 AM	01/22/2021 11:59:00 PM	01/20/2021 10:39:44 AM	Mayor's Office of Film, Theatre & Broadcasting	WEST 51 STREET between 9 AVENUE and 10 AVENUE	Manhattan	4	18	Television	Episodic series	United States of America	[WEST 51 STREET between 9 AVENUE and 10 AVENUE]	[0033977]	0033977	WEST 51 STREET	LINESTRING (987436.794 217663.877, 986650.937 ...
11	558551	Shooting Permit	01/22/2021 8:00:00 AM	01/22/2021 10:00:00 PM	01/15/2021 10:21:26 AM	Mayor's Office of Film, Theatre & Broadcasting	BEDFORD AVENUE between LEFFERTS AVENUE and LIN...	Brooklyn	9	71	Television	Episodic series	United States of America	[BEDFORD AVENUE between LEFFERTS AVENUE and LI...	[0029490, 0029273, 0029280, 0109171, 0028388, ...	0029490	BEDFORD AVENUE	LINESTRING (996167.827 180451.067, 996191.780 ...
22	558610	Shooting Permit	01/22/2021 8:00:00 AM	01/23/2021 12:00:00 AM	01/15/2021 11:35:16 AM	Mayor's Office of Film, Theatre & Broadcasting	WEST 43 STREET between AVENUE OF THE AMERICA...	Manhattan	5	14, 18	Television	Cable-episodic	United States of America	[WEST 43 STREET between AVENUE OF THE AMERIC...	[0034189, 0034194, 0035787, 0034192, 0034197, ...	0034189	WEST 43 STREET	LINESTRING (989681.508 214036.225, 988786.877 ...

In [89]:

#create timespan for each permit
def range_to_list(a, b):
    return pd.date_range(a, b, freq = 'H')[:-1]
data['StartEnd'] = [[i, j] for i, j in zip( data['StartDateTime'], data['EndDateTime'])]

# create hourly time list from the timespan
data['TimeList'] = data['StartEnd'].apply(lambda x: range_to_list(x[0], x[1]))

In [90]:

data['StartEnd'][0]

Out[90]:

['01/22/2021 10:00:00 AM', '01/23/2021 1:00:00 AM']

In [91]:

data['TimeList'].head()

Out[91]:

0     DatetimeIndex(['2021-01-22 10:00:00', '2021-01...
9     DatetimeIndex(['2021-01-22 10:00:00', '2021-01...
10    DatetimeIndex(['2021-01-22 09:00:00', '2021-01...
11    DatetimeIndex(['2021-01-22 08:00:00', '2021-01...
22    DatetimeIndex(['2021-01-22 08:00:00', '2021-01...
Name: TimeList, dtype: object

In [92]:

#create new column to store the list of hour numbers and the list of date 
data['hours'] = data['TimeList'].apply(lambda x: list(x.hour))
data['day'] = data['TimeList'].apply(lambda x: list(set(x.date)))

#create new column to count how many days within a permit timespan
data['day_length'] = data['TimeList'].apply(lambda x: len(list(set(x.date))))

In [93]:

f = lambda x: list(z for y in x for z in y)    
event_hour_sum = data.groupby('EventID')['hours'].agg(f).reset_index()

In [94]:

event_hour_sum

Out[94]:

	EventID	hours
0	390784	[6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1...
1	390785	[6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1...
2	390787	[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ...
3	390837	[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
4	390863	[3, 4, 5, 6, 7, 8, 9, 10]
...	...	...
20159	559279	[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
20160	559290	[9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20...
20161	559305	[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
20162	559338	[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...
20163	559418	[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ...

20164 rows × 2 columns

Count Event Duration¶

In [95]:

# count the number of hours in the list
event_hour_sum['hour_sum'] = event_hour_sum['hours'].apply(lambda x: len(x))  
event_hour_sum = event_hour_sum.astype({'EventID': 'int'})

#Sort the table with the hour sum and turn the hour into days
event_hour_sum = pd.DataFrame(event_hour_sum)
event_hour_sum = event_hour_sum.sort_values(by='hour_sum')
event_hour_sum['day']=event_hour_sum['hour_sum']/24
event_hour_sum

Out[95]:

	EventID	hours	hour_sum	day
3551	423452	[]	0	0.000000
19011	548759	[]	0	0.000000
15453	511810	[10]	1	0.041667
15565	512208	[8]	1	0.041667
12616	492598	[10]	1	0.041667
...	...	...	...	...
9988	465679	[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...	815	33.958333
3052	418275	[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...	1583	65.958333
9042	458022	[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...	8610	358.750000
17473	525411	[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...	8640	360.000000
19969	556250	[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...	8687	361.958333

20164 rows × 4 columns

In [96]:

#drop outlier
from scipy import stats
event_hour_sum = event_hour_sum[(np.abs(stats.zscore(event_hour_sum['hour_sum'])) < 3)]
event_hour_sum

Out[96]:

	EventID	hours	hour_sum	day
3551	423452	[]	0	0.000000
19011	548759	[]	0	0.000000
15453	511810	[10]	1	0.041667
15565	512208	[8]	1	0.041667
12616	492598	[10]	1	0.041667
...	...	...	...	...
6410	443446	[6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1...	336	14.000000
15618	512412	[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...	341	14.208333
19951	555849	[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...	348	14.500000
19948	555831	[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...	348	14.500000
4749	434223	[6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1...	352	14.666667

20095 rows × 4 columns

In [97]:

event_plot = event_hour_sum[['EventID','hour_sum']].groupby(['hour_sum']).count()
event_plot = event_plot.rename(columns={'EventID':'Counts'})
event_plot = event_plot.sort_values(by='Counts')
event_plot.plot(figsize=(10,8),marker='o',linestyle='',markersize=2)

Out[97]:

<AxesSubplot:xlabel='hour_sum'>

Count Permit Numbers in Hour of a Day¶

In [98]:

import itertools

# extract the hour from the list and make it a dataframe
flat_hour = list(itertools.chain(*data['hours']))
flat_hour = pd.DataFrame(flat_hour)
flat_hour_new = flat_hour.rename(columns={0:"hour"})
flat_hour_new = flat_hour_new.reset_index()

# group by the hour counts
flat_hour_count = flat_hour_new.groupby('hour').count()
flat_hour_count = flat_hour_count.rename(columns= {'index':"count"})
flat_hour_count = flat_hour_count.reset_index()

In [99]:

flat_hour_count.plot(kind="bar",figsize=(20, 8))

Out[99]:

<AxesSubplot:>

Count Permit Numbers by day¶

In [100]:

flat_day = list(itertools.chain(*data['day']))
flat_day = pd.DataFrame(flat_day)
flat_day = flat_day.rename(columns={0:"date"})
flat_day = flat_day.reset_index()
flat_day_count = flat_day.groupby('date').count()
flat_day_count = flat_day_count.rename(columns= {'index':"count"})
flat_day_count = flat_day_count.reset_index()
flat_day_count

Out[100]:

	date	count
0	2018-01-03	1
1	2018-01-04	5
2	2018-01-05	3
3	2018-01-06	1
4	2018-01-07	2
...	...	...
1454	2021-12-27	1
1455	2021-12-28	1
1456	2021-12-29	1
1457	2021-12-30	1
1458	2021-12-31	1

1459 rows × 2 columns

In [101]:

import seaborn as sns
import matplotlib.pyplot as plt

sns.set(rc={'figure.figsize':(12,8)})

sns.lineplot(x = "date", y = "count",
             data = flat_day_count)

Out[101]:

<AxesSubplot:xlabel='date', ylabel='count'>

Create Rolling Avg for Daily Count¶

In [102]:

Avg_4days = []

for i in range(0,len(flat_day_count),4):    
    date = flat_day_count['date'][i]
    if i >=len(flat_day_count)-2:
        count = flat_day_count['count'][i]
        Avg_4days.append({'date': date, 'count(avg)': count})   
    else:    
        count = round(flat_day_count['count'][i:i+3].mean(),0)
        Avg_4days.append({'date': date, 'count(avg)': count}) 
    
Avg_4days = pd.DataFrame(Avg_4days)
Avg_4days

Out[102]:

	date	count(avg)
0	2018-01-03	3.0
1	2018-01-07	11.0
2	2018-01-11	16.0
3	2018-01-15	21.0
4	2018-01-19	19.0
...	...	...
360	2021-12-13	1.0
361	2021-12-17	1.0
362	2021-12-21	1.0
363	2021-12-25	1.0
364	2021-12-29	1.0

365 rows × 2 columns

In [109]:

Calendar_rolling = []

for i in range(0,len(flat_day_count)):
    date = flat_day_count['date'][i]
    if i<3 or i >= len(flat_day_count)-3:
        Calendar_rolling.append({'date': date, 'count(rolling)': None}) 
    else:
        count = round(sum(flat_day_count['count'][i-3:i+3])/7)
        Calendar_rolling.append({'date': date, 'count(rolling)': count}) 
    
Calendar_rolling = pd.DataFrame(Calendar_rolling)
flat_day_count['count(rolling)'] = Calendar_rolling['count(rolling)']
flat_day_count

Out[109]:

	date	count	count(rolling)
0	2018-01-03	1	NaN
1	2018-01-04	5	NaN
2	2018-01-05	3	NaN
3	2018-01-06	1	3.0
4	2018-01-07	2	6.0
...	...	...	...
1454	2021-12-27	1	1.0
1455	2021-12-28	1	1.0
1456	2021-12-29	1	NaN
1457	2021-12-30	1	NaN
1458	2021-12-31	1	NaN

1459 rows × 3 columns

In [123]:

Avg_4days_rolling = []

for i in range(0,len(flat_day_count),4):    
    date = flat_day_count['date'][i]
    if i >=len(flat_day_count)-2:
        count_rol = flat_day_count['count(rolling)'][i]
        Avg_4days_rolling.append({'date': date, 'rolling(avg)': count_rol})   
    else:    
        count_rol = round(flat_day_count['count(rolling)'][i:i+3].mean(),0)
        Avg_4days_rolling.append({'date': date, 'rolling(avg)': count_rol}) 
    

Avg_4days_rolling = pd.DataFrame(Avg_4days_rolling)
Avg_4days['rolling(avg)'] = Avg_4days_rolling['rolling(avg)']
Avg_4days

Out[123]:

	date	count(avg)	rolling(avg)
0	2018-01-03	3.0	NaN
1	2018-01-07	11.0	7.0
2	2018-01-11	16.0	13.0
3	2018-01-15	21.0	16.0
4	2018-01-19	19.0	20.0
...	...	...	...
360	2021-12-13	1.0	1.0
361	2021-12-17	1.0	1.0
362	2021-12-21	1.0	1.0
363	2021-12-25	1.0	1.0
364	2021-12-29	1.0	NaN

365 rows × 3 columns

Count Time Duration by Street¶

In [124]:

street_hour=data[['SegmentID','hours']]
street_hour['hour_sum_event'] = data['hours'].apply(lambda x: len(x))  
street_hour_sum = street_hour.groupby('SegmentID').sum().reset_index()
street_hour_sum = street_hour_sum.rename(columns={'hour_sum_event':'hour_sum'})
street_hour_sum

#Sort the table with the hour sum and turn the hour into days
street_hour_sum = pd.DataFrame(street_hour_sum)
street_hour_sum = street_hour_sum.sort_values(by='hour_sum')
street_hour_sum['day']=street_hour_sum['hour_sum']/24

street_hour_sum.sort_values('day',ascending=False)

Out[124]:

	SegmentID	hour_sum	day
20	0052376	64	2.666667
14	0044912	43	1.791667
7	0035023	42	1.750000
6	0034189	32	1.333333
0	0020644	30	1.250000
9	0035681	30	1.250000
24	0070495	30	1.250000
17	0048509	30	1.250000
1	0029490	28	1.166667
30	0269872	25	1.041667
23	0067747	18	0.750000
21	0053880	17	0.708333
16	0046498	17	0.708333
19	0052369	16	0.666667
3	0032928	16	0.666667
15	0046294	16	0.666667
13	0043358	15	0.625000
12	0038710	15	0.625000
22	0066083	15	0.625000
25	0085164	15	0.625000
8	0035511	15	0.625000
26	0108149	15	0.625000
27	0165292	15	0.625000
4	0033171	15	0.625000
2	0032497	15	0.625000
10	0035970	15	0.625000
18	0048512	14	0.583333
29	0220873	14	0.583333
11	0037346	14	0.583333
5	0033977	14	0.583333
28	0215772	14	0.583333

In [125]:

# checking the top 3 duration street location
data[data['SegmentID']=='0035681'].head()

Out[125]:

	EventID	EventType	StartDateTime	EndDateTime	EnteredOn	EventAgency	ParkingHeld	Borough	CommunityBoard(s)	PolicePrecinct(s)	...	OFT_Text	segids	SegmentID	Street	geometry	StartEnd	TimeList	hours	day	day_length
150	558287	Shooting Permit	01/21/2021 7:00:00 AM	01/21/2021 9:00:00 PM	01/14/2021 1:35:06 PM	Mayor's Office of Film, Theatre & Broadcasting	KINGSLAND AVENUE between NORMAN AVENUE and GRE...	Brooklyn	1	94	...	[KINGSLAND AVENUE between NORMAN AVENUE and GR...	[0035681]	0035681	MONITOR STREET	LINESTRING (999919.040 205536.203, 999847.829 ...	[01/21/2021 7:00:00 AM, 01/21/2021 9:00:00 PM]	DatetimeIndex(['2021-01-21 07:00:00', '2021-01...	[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ...	[2021-01-21]	1
192	559256	Shooting Permit	01/21/2021 6:00:00 AM	01/21/2021 10:00:00 PM	01/19/2021 3:44:55 PM	Mayor's Office of Film, Theatre & Broadcasting	MONITOR STREET between GREENPOINT AVENUE and N...	Brooklyn	1	94	...	[MONITOR STREET between GREENPOINT AVENUE and ...	[0035681, 0035681]	0035681	MONITOR STREET	LINESTRING (999919.040 205536.203, 999847.829 ...	[01/21/2021 6:00:00 AM, 01/21/2021 10:00:00 PM]	DatetimeIndex(['2021-01-21 06:00:00', '2021-01...	[6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1...	[2021-01-21]	1

2 rows × 23 columns

Calculate Permit Time Duration by Time by Geo Unit¶

In [131]:

data = gpd.read_file("../output/film permit_city.shp" )
data_fil=data.drop(columns=['OBJECTID','Join_ID'])
data_fil=data_fil.drop_duplicates().reset_index()

In [134]:

#data prep for getting time and date related information
data_fil['StartEnd'] = [[i, j] for i, j in zip( data_fil['StartDateT'], data_fil['EndDateTim'])]
def range_to_list(a, b):
    return pd.date_range(a, b, freq = 'H')[:-1]
data_fil['TimeList'] = data_fil['StartEnd'].apply(lambda x: range_to_list(x[0], x[1]))
data_fil['hours'] = data_fil['TimeList'].apply(lambda x: list(x.hour))
data_fil['month'] = data_fil['TimeList'].apply(lambda x: list(set(x.month)))
data_fil['date'] = data_fil['TimeList'].apply(lambda x: list(set(x.date)))
data_fil['hour_sum'] = data_fil['hours'].apply(lambda x: len(x))  

data_fil['day_sum']=data_fil['hour_sum']/24
data_fil['StartDateTime'] = data_fil['StartDateT'].apply(pd.to_datetime)

In [135]:

# change month columns from list to string
month1 = data_fil.apply(lambda x: pd.Series(x['month']),axis=1).stack().reset_index(level=1, drop=True)
month1.name = 'month'
data_fil = data_fil.drop('month', axis=1).join(month1)
data_fil['month'] = data_fil['month'].astype(str)
data_fil.head()

Out[135]:

	index	EventID	EventType	StartDateT	EndDateTim	EnteredOn	EventAgenc	ParkingHel	Borough	CommunityB	...	SegmentID	geometry	StartEnd	TimeList	hours	date	hour_sum	day_sum	StartDateTime	month
0	0	628327	Shooting Permit	03/08/2022 07:00:00 AM	03/08/2022 07:00:00 PM	02/28/2022 11:13:50 AM	Mayor's Office of Film, Theatre & Broadcasting	SUTTON STREET between NORMAN AVENUE and NASSAU...	Brooklyn	1	...	0065898	LINESTRING (1000675.780 203773.602, 1000590.62...	[03/08/2022 07:00:00 AM, 03/08/2022 07:00:00 PM]	DatetimeIndex(['2022-03-08 07:00:00', '2022-03...	[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]	[2022-03-08]	12	0.500000	2022-03-08 07:00:00	3.0
1	2	629351	Shooting Permit	03/08/2022 07:00:00 AM	03/08/2022 10:00:00 PM	03/03/2022 02:51:17 PM	Mayor's Office of Film, Theatre & Broadcasting	COVERT ST between WYCKOFF AVENUE and IRVING AV...	Queens	5	...	0048509	LINESTRING (1010582.250 192025.794, 1010809.74...	[03/08/2022 07:00:00 AM, 03/08/2022 10:00:00 PM]	DatetimeIndex(['2022-03-08 07:00:00', '2022-03...	[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ...	[2022-03-08]	15	0.625000	2022-03-08 07:00:00	3.0
2	3	629319	Shooting Permit	03/08/2022 07:00:00 AM	03/08/2022 09:00:00 PM	03/03/2022 01:49:15 PM	Mayor's Office of Film, Theatre & Broadcasting	WHITE STREET between SEIGEL STREET and MOORE S...	Brooklyn	1	...	0044431	LINESTRING (1002265.974 195997.473, 1002202.52...	[03/08/2022 07:00:00 AM, 03/08/2022 09:00:00 PM]	DatetimeIndex(['2022-03-08 07:00:00', '2022-03...	[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ...	[2022-03-08]	14	0.583333	2022-03-08 07:00:00	3.0
3	4	629319	Shooting Permit	03/08/2022 07:00:00 AM	03/08/2022 09:00:00 PM	03/03/2022 01:49:15 PM	Mayor's Office of Film, Theatre & Broadcasting	WHITE STREET between SEIGEL STREET and MOORE S...	Brooklyn	1	...	0044457	LINESTRING (1002202.526 196249.592, 1002715.21...	[03/08/2022 07:00:00 AM, 03/08/2022 09:00:00 PM]	DatetimeIndex(['2022-03-08 07:00:00', '2022-03...	[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ...	[2022-03-08]	14	0.583333	2022-03-08 07:00:00	3.0
4	5	629319	Shooting Permit	03/08/2022 07:00:00 AM	03/08/2022 09:00:00 PM	03/03/2022 01:49:15 PM	Mayor's Office of Film, Theatre & Broadcasting	WHITE STREET between SEIGEL STREET and MOORE S...	Brooklyn	1	...	0044453	LINESTRING (1002022.559 196965.059, 1002571.01...	[03/08/2022 07:00:00 AM, 03/08/2022 09:00:00 PM]	DatetimeIndex(['2022-03-08 07:00:00', '2022-03...	[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ...	[2022-03-08]	14	0.583333	2022-03-08 07:00:00	3.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
37145	44831	623949	Shooting Permit	02/04/2022 08:00:00 AM	02/04/2022 09:30:00 PM	02/01/2022 01:32:18 PM	Mayor's Office of Film, Theatre & Broadcasting	PEARL STREET between PECK SLIP and BEEKMAN STR...	Manhattan	1	...	0105864	LINESTRING (982584.908 196291.744, 982718.819 ...	[02/04/2022 08:00:00 AM, 02/04/2022 09:30:00 PM]	DatetimeIndex(['2022-02-04 08:00:00', '2022-02...	[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...	[2022-02-04]	13	0.541667	2022-02-04 08:00:00	2.0
37146	44832	623949	Shooting Permit	02/04/2022 08:00:00 AM	02/04/2022 09:30:00 PM	02/01/2022 01:32:18 PM	Mayor's Office of Film, Theatre & Broadcasting	PEARL STREET between PECK SLIP and BEEKMAN STR...	Manhattan	1	...	0023296	LINESTRING (982183.843 196510.021, 982273.246 ...	[02/04/2022 08:00:00 AM, 02/04/2022 09:30:00 PM]	DatetimeIndex(['2022-02-04 08:00:00', '2022-02...	[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...	[2022-02-04]	13	0.541667	2022-02-04 08:00:00	2.0
37147	44833	623949	Shooting Permit	02/04/2022 08:00:00 AM	02/04/2022 09:30:00 PM	02/01/2022 01:32:18 PM	Mayor's Office of Film, Theatre & Broadcasting	PEARL STREET between PECK SLIP and BEEKMAN STR...	Manhattan	1	...	0023304	LINESTRING (982273.246 196628.042, 982369.951 ...	[02/04/2022 08:00:00 AM, 02/04/2022 09:30:00 PM]	DatetimeIndex(['2022-02-04 08:00:00', '2022-02...	[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...	[2022-02-04]	13	0.541667	2022-02-04 08:00:00	2.0
37148	44834	623949	Shooting Permit	02/04/2022 08:00:00 AM	02/04/2022 09:30:00 PM	02/01/2022 01:32:18 PM	Mayor's Office of Film, Theatre & Broadcasting	PEARL STREET between PECK SLIP and BEEKMAN STR...	Manhattan	1	...	0023303	None	[02/04/2022 08:00:00 AM, 02/04/2022 09:30:00 PM]	DatetimeIndex(['2022-02-04 08:00:00', '2022-02...	[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...	[2022-02-04]	13	0.541667	2022-02-04 08:00:00	2.0
37149	44835	623949	Shooting Permit	02/04/2022 08:00:00 AM	02/04/2022 09:30:00 PM	02/01/2022 01:32:18 PM	Mayor's Office of Film, Theatre & Broadcasting	PEARL STREET between PECK SLIP and BEEKMAN STR...	Manhattan	1	...	0023182	LINESTRING (981806.626 196495.926, 981736.245 ...	[02/04/2022 08:00:00 AM, 02/04/2022 09:30:00 PM]	DatetimeIndex(['2022-02-04 08:00:00', '2022-02...	[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...	[2022-02-04]	13	0.541667	2022-02-04 08:00:00	2.0

37451 rows × 25 columns

Obtain Geo Unit Information by Sptial Join¶

In [136]:

# create buffer for avoiding the census tract boundary and street segment overlapping conflicts
data_time_buffer=data_fil
data_time_buffer['geometry']= data_time_buffer.buffer(3,cap_style=2)

In [137]:

# spatial join with census tract shapefile
ct_data_time = ct_sel.sjoin(data_time_buffer, how="left")
ct_data_time=ct_data_time.dropna().reset_index()

In [138]:

#spatial join with Council District 33 shapefile to get CD data 
cd = gpd.read_file("../vector/Council Districts/nycc.shp")
cd33 = cd[cd['CounDist']==33]
ct_time_cd33 = gpd.overlay(ct_data_time, cd33, how='intersection')

Sum Up Time Duration by Census Tract¶

In [139]:

ct_time_cd33_forsum=ct_time_cd33[['BoroCT2020','EventID','hour_sum','day_sum','month']]
ct_time_cd33_forsum=ct_time_cd33_forsum.drop_duplicates()
ct_time_cd33_forsum['month']= ct_time_cd33_forsum['month'].astype(float).astype(int)

ct_time_sum = ct_time_cd33_forsum[['BoroCT2020','hour_sum','day_sum','month']].groupby(by=[
    'BoroCT2020','month']).sum()
ct_time_sum = ct_time_sum.reset_index()
ct_time_sum.sort_values('day_sum')

Out[139]:

	BoroCT2020	month	hour_sum	day_sum
401	3051100	2	1.0	0.041667
96	3001100	1	2.0	0.083333
279	3012700	1	4.0	0.166667
653	3059300	8	5.0	0.208333
635	3058902	8	5.0	0.208333
...	...	...	...	...
621	3058901	3	1186.0	49.416667
620	3058901	2	1552.0	64.666667
607	3057902	1	1555.0	64.791667
609	3057902	3	1606.0	66.916667
608	3057902	2	1893.0	78.875000

656 rows × 4 columns

Create Pivot Table¶

In [140]:

ct_time_sum = ct_time_sum.pivot_table(index =['BoroCT2020'], values =['day_sum'],columns='month')
ct_time_sum

Out[140]:

	day_sum
month	1	2	3	4	5	6	7	8	9	10	11	12
BoroCT2020
1000900	1.083333	2.541667	1.916667	1.958333	2.125000	NaN	4.583333	2.291667	1.916667	1.583333	3.916667	1.916667
1001501	1.083333	4.416667	0.333333	1.625000	2.416667	1.708333	7.000000	4.708333	4.416667	6.000000	4.166667	0.500000
1001502	7.625000	10.708333	1.291667	4.208333	7.541667	2.791667	11.041667	6.083333	12.541667	10.708333	3.208333	1.166667
1002400	NaN	NaN	NaN	NaN	0.500000	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1004400	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.666667	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...
3057902	64.791667	78.875000	66.916667	40.333333	29.666667	15.958333	13.125000	36.083333	30.000000	38.375000	34.625000	22.708333
3058901	48.958333	64.666667	49.416667	29.500000	17.416667	10.250000	11.250000	19.500000	19.750000	25.791667	29.875000	17.916667
3058902	1.458333	3.041667	2.625000	0.750000	NaN	NaN	NaN	0.208333	1.333333	NaN	2.708333	NaN
3059100	1.500000	3.250000	2.083333	NaN	NaN	0.416667	0.750000	1.875000	1.291667	3.166667	3.916667	0.541667
3059300	0.625000	2.333333	2.625000	1.416667	NaN	NaN	0.750000	0.208333	1.333333	NaN	1.083333	NaN

84 rows × 12 columns

In [141]:

#Exporting pivot table dataset to csv file
# ct_time_sum.to_csv('../output/Pivot_permits_cd33_time.csv')

#Uncomment the code above if you need to access the dataset without download it

Data Parsing¶

Parsing the Film Studio List¶

In [142]:

import requests
from bs4 import BeautifulSoup
import usaddress
import os

In [143]:

#input the webpage we try to parse
#read the page with BeautifulSoup and extract ti

url = "https://www1.nyc.gov/site/mome/industries/studios-stages.page"
req = requests.get(url)
soup = BeautifulSoup(req.text, "lxml")

In [144]:

#get addresses from each table row
facilities = []

content = soup.find_all('tr')
for i, row in enumerate(content):
    #skip header
    if i == 0:
        continue
        
        
    cell = row.find_all('td')[0]
    facility = row.find_all('td')[-1].text
    try:
        title = cell.find('strong').text
    except:
        title = cell.find('a').text   

    text = cell.get_text(separator = ',', strip = True)
    address = text.replace(title,'', 1)
    address = address.replace(',','', 1)
    facilities.append([title,address,facility])

In [145]:

#create the studio dataset from the parsing data
facilities_df = pd.DataFrame(facilities, columns = ['Name','Address','Type'])
facilities_df.head()

Out[145]:

	Name	Address	Type
0	1717 Troutman	1717 Troutman Street, #300,Ridgewood, NY 11385	Not applicable
1	The 1896 Studio & Stages	592 Johnson Avenue/,211-215 Ingraham Avenue,Br...	Level 1
2	19th Avenue Stage	19-02 Steinway Street,Astoria, NY 11105	Level 2
3	718 Studios	130 Thames Street,Brooklyn, NY 11247,718studio...	Not applicable
4	94 Jewel St.	94 Jewel Street, Ground level,Brooklyn, NY 112...	Not applicable

In [146]:

#data in the address is too dirty, so I manual it clean it on excel and import it again.
#but you can find better way to fix it.
facilities_df2 = pd.read_csv('facilities_df_geocoded.csv')

Geocoding for Parsing Data¶

In [147]:

#clean up addresses before geocoding
def cleanAddreses(row):
    addr = row['Address']
    #past to usaddrees, if it fails use blank
    try:
        addr_tags = usaddress.tag(row['Address'])[0]
    except:
        return ''
    
    #use keep array to order a return with the address items needed + comma before zipcode
    keep_addr_tags = ['AddressNumber','StreetNamePreDirectional', 'StreetName', 'StreetNamePostType', '_Comma', 'ZipCode']
    addr_tags['_Comma'] = ','
    cleaned_addr = ''
    
    for tag in keep_addr_tags:
        if tag in addr_tags:
            cleaned_addr = f'{cleaned_addr} {addr_tags[tag]}'
    return cleaned_addr
        
facilities_df2['_Address'] = facilities_df2.apply(cleanAddreses, axis = 1)

facilities_df2['_Address']

Out[147]:

0        1717 Troutman Street , 11385
1          592 Johnson Avenue , 11237
2       19-02 Steinway Street , 11105
3           130 Thames Street , 11247
4             94 Jewel Street , 11222
                    ...              
126         21-10 51st Avenue , 11101
127           3905 2nd Avenue , 11232
128      320 West 66th Street , 10023
129      300 Kingsland Avenue , 11234
130               55 Broadway , 10007
Name: _Address, Length: 131, dtype: object

In [148]:

#if file exist then import, otherwise copy from facilities_df... this will help with manual fixes
file = 'facilities_df_geocoded.csv'
if os.path.exists(file):
    facilities_df_geocoded = pd.read_csv(file)
else:
    facilities_df_geocoded = facilities_df2.copy()
    facilities_df_geocoded.to_csv(file, index = False)
    print('facilities_df_geocoded generated')

geocoded = []
def geocode(row):
    lat, lng = np.nan, np.nan
    if 'bbl' in row:
        bbl = row['bbl']
    else:
        bbl = np.nan
    addr = row['_Address']
    #only geocode if bbl is blank (NA) and addr is not empty
    if pd.isna(bbl) and (addr != '' and pd.notna(addr)):
        print(bbl, addr)
        
        #geocode! 
        url = f'https://geosearch.planninglabs.nyc/v1/search?text={addr}&size=120'
        r = requests.get(url)
        features = r.json()['features']
        if len(features):
            feature = features[0]
            lng, lat = feature['geometry']['coordinates']
            bbl = feature['properties']['pad_bbl']

    
    return pd.Series([lat,lng, bbl])
    

facilities_df_geocoded[['lat','lng','bbl']] = facilities_df_geocoded.apply(geocode, axis = 1)

#save results to file
# facilities_df_geocoded.to_csv('facilities_df_geocoded.csv', index = False)