#!/usr/bin/env python
# coding: utf-8

# # Analyzing NYC's 311 Street Flooding Complaints from 2010 to 2020

# ## Streets with the Most Street Flooding Complaints
#
# Author: Mark Bauer

# In[1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.ticker import FuncFormatter
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns
import geopandas as gpd

plt.rcParams['savefig.facecolor'] = 'white'
get_ipython().run_line_magic('matplotlib', 'inline')

# Printing versions of Python modules and packages with **watermark** - the IPython magic extension.
#
# Documentation for installing watermark: https://github.com/rasbt/watermark

# In[2]:

get_ipython().run_line_magic('reload_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-v -p numpy,pandas,geopandas,matplotlib,seaborn')

# # Read in Data

# In[3]:

# list items in data folder
get_ipython().run_line_magic('ls', 'data/')

# ## Street Flooding Complaints

# In[4]:

# read data as a dataframe
df = pd.read_csv('data/street-flooding-complaints.csv', low_memory=False)

# preview data
print('shape of data: {}'.format(df.shape))
df.head()

# In[5]:

# summarize columns
df.info()

# ## Neighborhood Tabulation Areas (NTAs)

# In[6]:

# importing nta boundaries
url = 'https://data.cityofnewyork.us/api/geospatial/cpf4-rkhq?method=export&format=GeoJSON'
nta_gdf = gpd.read_file(url)
nta_gdf = nta_gdf.to_crs(epsg=2263)

# previewing first five rows in data
print('shape of data: {}'.format(nta_gdf.shape))
nta_gdf.head()

# In[7]:

# sanity check plot
nta_gdf.plot()

# ## Streets

# In[8]:

# read in street center lines
path = 'data/streets-clipped.gpkg'
streets = gpd.read_file(path)

print('shape of data: {}'.format(streets.shape))
print('street id is unique: {}'.format(streets['physicalid'].is_unique))
print(streets.crs)
streets.head()

# In[9]:

# summarize columns
streets.info()

# In[10]:

# examine counts of geom types
streets.geom_type.value_counts()

# In[11]:

# sanity check plot
streets.plot()

# # Assigning NTA Information to Street Complaints

# In[12]:

# convert to geodataframe from x, y points
crs = 2263
geometry = gpd.points_from_xy(
    df['x_coordinate_state_plane'],
    df['y_coordinate_state_plane']
)

# make geodataframe
gdf = gpd.GeoDataFrame(df, geometry=geometry, crs=crs)

# preview geodataframe
gdf.iloc[:, [0, -1]].head()

# In[13]:

# sanity check plot
gdf.plot()

# In[14]:

# spatial join nta polygons to complaint points
gdf = gpd.sjoin(
    gdf,
    nta_gdf,
    how="inner",
    predicate='within'
)

print('shape of data: {}'.format(gdf.shape))
gdf.head()

# In[15]:

# summarize columns
gdf.info()

# In[16]:

# exclude specified columns
cols = ['shape_leng', 'shape_area', 'index_right']
exclude = gdf.columns.isin(cols)
gdf = gdf.loc[:, gdf.columns[~exclude]]

# sanity check
gdf.info()
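# A quick aside (not part of the original analysis): a minimal sketch of how
# `gpd.sjoin` with `predicate='within'` attaches polygon attributes to points.
# The toy frames `pts` and `polys` below are hypothetical.

# In[ ]:

from shapely.geometry import Point, box

# one point inside the unit square, one outside
pts = gpd.GeoDataFrame({'id': [1, 2]},
                       geometry=[Point(0.5, 0.5), Point(2, 2)],
                       crs='EPSG:2263')
polys = gpd.GeoDataFrame({'name': ['unit square']},
                         geometry=[box(0, 0, 1, 1)],
                         crs='EPSG:2263')

# an inner join keeps only the points that fall within a polygon,
# carrying the polygon's attributes onto those rows
gpd.sjoin(pts, polys, how='inner', predicate='within')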
# # Snap Complaints to Streets

# Methodology: https://medium.com/@brendan_ward/how-to-leverage-geopandas-for-faster-snapping-of-points-to-lines-6113c94e59aa
#
# The code below is from Brendan's awesome post.

# In[17]:

# offset of match (ft.)
offset = 80
bbox = gdf.bounds + [-offset, -offset, offset, offset]

# match points to streets based on distance
hits = bbox.apply(lambda row: list(streets.sindex.intersection(row)), axis=1)

print('shape of data: {}'.format(hits.shape))
hits.head(5)

# In[18]:

# position1: index of points table
# position2: ordinal position of line - access via iloc later
points_to_lines_dict = {
    'pt_idx': np.repeat(hits.index, hits.apply(len)),
    'line_i': np.concatenate(hits.values)
}
tmp = pd.DataFrame(points_to_lines_dict)

# join back to the lines on line_i
# join back to the original points to get their geometry, rename the point geometry as "point"
tmp = (
    tmp
    .join(streets, on="line_i")
    .join(gdf['geometry'].rename("point"), on="pt_idx")
)

print('shape of data: {}'.format(tmp.shape))
tmp.head()

# In[19]:

# convert back to a GeoDataFrame, so we can do spatial ops
tmp = gpd.GeoDataFrame(
    tmp,
    geometry='geometry',
    crs=gdf.crs
)

print('shape of data: {}'.format(tmp.shape))
tmp.head()

# In[20]:

# discard any lines that are greater than the tolerance from points;
# sort on ascending snap distance, so that the closest goes to the top
tmp["snap_dist"] = tmp['geometry'].distance(gpd.GeoSeries(tmp.point))
tmp = (
    tmp
    .loc[tmp.snap_dist <= offset]
    .sort_values(by=["snap_dist"])
)

# sanity check distance ceiling
tmp.loc[:, ['snap_dist']].describe()

# In[21]:

# group by the index of the points and take the first, which is the closest line
closest = (
    tmp
    .groupby("pt_idx")
    .first()
)

# construct a GeoDataFrame of the closest lines
closest = gpd.GeoDataFrame(closest, geometry="geometry")
closest.head()

# In[22]:

counts = gdf.shape[0] - closest.shape[0]
counts_perc = round((1 - (closest.shape[0] / gdf.shape[0])) * 100, 2)

print('Dropped {} rows or {}% of street flooding complaint points, \
which were more than 80 feet from the closest street center line.'.format(counts, counts_perc))

# In[23]:

# position of the nearest point from the start of the line; get the new point location geometry
pos = closest['geometry'].project(gpd.GeoSeries(closest.point))
new_pts = closest['geometry'].interpolate(pos)

# create a new GeoDataFrame from the columns of the closest line and the
# new point geometries (which will be called "geometry")
snapped = gpd.GeoDataFrame(closest, geometry=new_pts)
snapped.head()

# In[24]:

# summarize columns
gdf.info()

# In[25]:

# join back to the original points and drop any that did not join
updated_points = (
    gdf
    .drop(columns=["geometry"])
    .join(snapped)
    .dropna(subset=["geometry"])
    .reset_index(drop=True)
)

updated_points.head()

# In[26]:

# distribution of snap distances
plt.figure(figsize=(8, 6))
sns.histplot(updated_points['snap_dist'])
plt.title('Histogram of snap_dist (ft.)')
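# A quick illustration (hypothetical values, not from the data above) of the
# project/interpolate idiom used in the snapping step: `project` returns the
# distance along the line to the nearest point, and `interpolate` turns that
# distance back into a point on the line.

# In[ ]:

from shapely.geometry import Point, LineString

line = LineString([(0, 0), (10, 0)])
pt = Point(3, 4)  # 4 ft. off the line

dist_along = line.project(pt)              # 3.0 -- distance along the line
snapped_pt = line.interpolate(dist_along)  # POINT (3 0)
print(dist_along, snapped_pt)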
# In[27]:

# examine counts per street id
gdf_count = (
    updated_points
    .groupby(by='physicalid')['created_date']
    .count()
    .reset_index()
    .rename(columns={"created_date": "count"})
)

gdf_count.head()

# In[28]:

# join our street data to our flood complaints data
streets_with_count = streets.merge(
    gdf_count,
    on='physicalid',
    how='left'
)
streets_with_count['count'] = streets_with_count['count'].fillna(0).astype(int)

print('shape of data: {}'.format(streets_with_count.shape))
streets_with_count.head()

# In[29]:

# summarize columns
streets_with_count.info()

# In[30]:

# examine highest counts
streets_with_count.sort_values(by='count', ascending=False).head()

# In[31]:

# normalize counts
streets_with_count['shape_leng'] = streets_with_count['geometry'].length
count_norm = (streets_with_count['count']
              / streets_with_count['shape_leng'].replace(0, np.nan)
              * 100)  # counts per 100 ft.

streets_with_count['count_per_100ft'] = round(count_norm, 2)
streets_with_count.head()

# In[32]:

# examine values
streets_with_count.loc[:, ['count_per_100ft']].describe()

# In[33]:

# sort descending
streets_with_count.sort_values(by='count_per_100ft', ascending=False).head()

# # Joining Streets and Counts to Neighborhoods

# In[34]:

print('shape of data: {}'.format(updated_points.shape))
updated_points.head()

# In[35]:

# summarize columns
updated_points.info()

# In[36]:

# retrieve specific columns
cols = [
    'unique_key', 'ntacode', 'county_fips',
    'ntaname', 'boro_name', 'boro_code',
    'shape_leng', 'physicalid'
]

streets_with_nta = updated_points.loc[:, cols]
streets_with_nta.head()

# In[37]:

# check for duplicates
checking_for_duplicates = (
    streets_with_nta
    .groupby(by=['physicalid', 'ntaname', 'boro_name'])['shape_leng']
    .count()
    .reset_index()
    .rename(columns={"shape_leng": "count_complaints"})
)

checking_for_duplicates.head()

# In[38]:

print('street id is unique: {}'.format(checking_for_duplicates['physicalid'].is_unique))

# In[39]:

# preview streets that fall in more than one NTA
(checking_for_duplicates
 .loc[checking_for_duplicates.duplicated(subset=['physicalid'], keep=False)]
 .sort_values(by=['physicalid', 'count_complaints'], ascending=[True, False])
 .head(10)
)

# In[40]:

count_duplicates = (
    checking_for_duplicates
    .loc[checking_for_duplicates.duplicated(subset=['physicalid'], keep=False)]
    .shape[0]
)
counts = round(count_duplicates / streets_with_count.shape[0] * 100, 2)

print('count of duplicates: {:,}'.format(count_duplicates))
print('percent duplicates: {}%'.format(counts))

# In[41]:

# sort descending by number of complaints on a street in a given NTA,
# then remove duplicates so each street keeps the NTA with the most complaints
unique_streets = (
    checking_for_duplicates
    .sort_values(by=['physicalid', 'count_complaints'], ascending=[True, False])
    .drop_duplicates('physicalid')
    .reset_index(drop=True)
)

print('physical id is unique: {}'.format(unique_streets['physicalid'].is_unique))
unique_streets.head()
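# A small illustration (toy data, not from the analysis) of the
# sort_values/drop_duplicates idiom above: sorting descending within each id
# and keeping the first occurrence retains the row with the highest count.

# In[ ]:

toy = pd.DataFrame({
    'physicalid': ['a', 'a', 'b'],
    'ntaname': ['NTA 1', 'NTA 2', 'NTA 3'],
    'count_complaints': [2, 5, 7]
})

# keeps ('a', 'NTA 2', 5) and ('b', 'NTA 3', 7)
(toy
 .sort_values(by=['physicalid', 'count_complaints'], ascending=[True, False])
 .drop_duplicates('physicalid'))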
# In[42]:

# joining streets with count complaints to unique streets
streets_with_count_nta = streets_with_count.merge(
    unique_streets,
    on='physicalid',
    how='left'
)

print('shape of data: {}'.format(streets_with_count_nta.shape))
streets_with_count_nta.head()

# In[43]:

# examine columns
streets_with_count_nta.info()

# In[44]:

# retrieve desired columns
cols = [
    'physicalid', 'full_stree', 'ntaname',
    'boro_name', 'count', 'count_per_100ft'
]

count_by_nta = (
    streets_with_count_nta
    .loc[:, cols]
    .reset_index(drop=True)
)

# sort on count desc
count_by_nta.sort_values(by='count', ascending=False).head()

# In[45]:

# sort on count per 100 ft. desc
count_by_nta.sort_values(by='count_per_100ft', ascending=False).head()

# In[46]:

# summary statistics
(streets_with_count
 .groupby(by=['physicalid', 'full_stree'])[['count', 'count_per_100ft']]
 .sum()
 .reset_index()
 .describe()
)

# In[47]:

# adding nta information
count_by_nta['ntaname_full'] = (
    count_by_nta['full_stree']
    + " (id: " + count_by_nta['physicalid'] + "), "
    + count_by_nta['ntaname'] + ", "
    + count_by_nta['boro_name']
)

count_by_nta.sort_values(by='count', ascending=False).head()

# In[48]:

fig, ax = plt.subplots(figsize=(8, 6))

data = count_by_nta.sort_values(by='count', ascending=False).head(10)
sns.barplot(
    data=data,
    y='ntaname_full',
    x='count',
    color='#1f77b4'
)

label = 'Count of NYC 311 Street Flooding Complaints by Street Segment (2010 - 2020)'
fig.suptitle(label, fontsize=12)
plt.xlabel('Count', fontsize=12)
plt.ylabel('Street Segment\n', fontsize=12)

plt.tight_layout()

# In[49]:

fig, ax = plt.subplots(figsize=(10, 6))

data = count_by_nta.sort_values(by='count_per_100ft', ascending=False).head(10)
sns.barplot(
    data=data,
    y='ntaname_full',
    x='count_per_100ft',
    color='#1f77b4'
)

label = 'Count of NYC 311 Street Flooding Complaints per 100 ft. by Street Segment (2010 - 2020)'
fig.suptitle(label, fontsize=12)
plt.xlabel('Count per 100 ft.', fontsize=12)
plt.ylabel('Street Segment', fontsize=12)

plt.tight_layout()

# In[50]:

fig, axs = plt.subplots(2, 1, figsize=(10, 8))

# first plot
data = count_by_nta.sort_values(by='count', ascending=False).head(10)
sns.barplot(
    data=data,
    y='ntaname_full',
    x='count',
    color='#1f77b4',
    ax=axs[0]
)

label = 'Count of NYC 311 Street Flooding Complaints by Street Segment from 2010 to 2020'
axs[0].set_title(label, fontsize=12, pad=10, x=-.1)
axs[0].set_xlabel('Count', fontsize=12)
axs[0].set_ylabel('Street Segment\n', fontsize=12, labelpad=10)

# second plot
data = count_by_nta.sort_values(by='count_per_100ft', ascending=False).head(10)
sns.barplot(
    data=data,
    y='ntaname_full',
    x='count_per_100ft',
    color='#1f77b4',
    ax=axs[1]
)

label = 'Count of NYC 311 Street Flooding Complaints per 100 ft. by Street Segment from 2010 to 2020'
axs[1].set_title(label, fontsize=12, pad=10, x=-.24)
axs[1].set_xlabel('Count per 100 ft.', fontsize=12, labelpad=10)
axs[1].set_ylabel('Street Segment', fontsize=12, labelpad=10)

fig.tight_layout(pad=.9)
plt.savefig('figures/count-street-segment.png', dpi=250, bbox_inches='tight')
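# Note: `plt.savefig` typically raises a FileNotFoundError if the `figures/`
# directory does not already exist. A minimal guard (an addition, not in the
# original notebook) would be:

# In[ ]:

from pathlib import Path

# create the output directory if it is missing, then save the figure
Path('figures').mkdir(exist_ok=True)
fig.savefig('figures/count-street-segment.png', dpi=250, bbox_inches='tight')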