#!/usr/bin/env python
# coding: utf-8

# # Analyzing NYC's 311 Street Flooding Complaints from 2010 to 2020

# ## Streets with the Most Street Flooding Complaints
#
# Author: Mark Bauer

# In[1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.ticker import FuncFormatter
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns
import geopandas as gpd

plt.rcParams['savefig.facecolor'] = 'white'
get_ipython().run_line_magic('matplotlib', 'inline')

# Printing versions of Python modules and packages with **watermark** - the IPython magic extension.
#
# Documentation for installing watermark: https://github.com/rasbt/watermark

# In[2]:

get_ipython().run_line_magic('reload_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-v -p numpy,pandas,geopandas,matplotlib,seaborn')

# # Read in Data

# In[3]:

# list items in data folder
get_ipython().run_line_magic('ls', 'data/')

# ## Street Flooding Complaints

# In[4]:

# read data as a dataframe
df = pd.read_csv('data/street-flooding-complaints.csv', low_memory=False)

# preview data
print('shape of data: {}'.format(df.shape))
df.head()

# In[5]:

# summarize columns
df.info()

# ## Neighborhood Tabulation Areas (NTAs)

# In[6]:

# importing nta boundaries
url = 'https://data.cityofnewyork.us/api/geospatial/cpf4-rkhq?method=export&format=GeoJSON'
nta_gdf = gpd.read_file(url)
nta_gdf = nta_gdf.to_crs(epsg=2263)

# previewing first five rows in data
print('shape of data: {}'.format(nta_gdf.shape))
nta_gdf.head()

# In[7]:

# sanity check plot
nta_gdf.plot()

# ## Streets

# In[8]:

# read in street center lines
path = 'data/streets-clipped.gpkg'
streets = gpd.read_file(path)

print('shape of data: {}'.format(streets.shape))
print('street id is unique: {}'.format(streets['physicalid'].is_unique))
print(streets.crs)
streets.head()

# In[9]:

# summarize columns
streets.info()

# In[10]:

# examine counts of geom types
streets.geom_type.value_counts()

# In[11]:

# sanity check plot
streets.plot()

# # Assigning NTA Information to Street Complaints

# In[12]:

# convert to geodataframe from x, y points
crs = 2263
geometry = gpd.points_from_xy(
    df['x_coordinate_state_plane'],
    df['y_coordinate_state_plane']
)

# make geodataframe
gdf = gpd.GeoDataFrame(df, geometry=geometry, crs=crs)

# preview geodataframe
gdf.iloc[:, [0, -1]].head()

# In[13]:

# sanity check plot
gdf.plot()

# In[14]:

# spatial join nta polygons to complaint points
gdf = gpd.sjoin(
    gdf,
    nta_gdf,
    how="inner",
    predicate='within'
)

print('shape of data: {}'.format(gdf.shape))
gdf.head()

# In[15]:

# summarize columns
gdf.info()

# In[16]:

# exclude specified columns
cols = ['shape_leng', 'shape_area', 'index_right']
exclude = gdf.columns.isin(cols)
gdf = gdf.loc[:, gdf.columns[~exclude]]

# sanity check
gdf.info()
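# A quick aside (not part of the original analysis): a minimal sketch of how
# `gpd.sjoin` with `predicate='within'` attaches polygon attributes to points.
# The toy frames `pts` and `polys` below are hypothetical.

# In[ ]:

from shapely.geometry import Point, box

# one point inside the unit square, one outside
pts = gpd.GeoDataFrame({'id': [1, 2]},
                       geometry=[Point(0.5, 0.5), Point(2, 2)],
                       crs='EPSG:2263')
polys = gpd.GeoDataFrame({'name': ['unit square']},
                         geometry=[box(0, 0, 1, 1)],
                         crs='EPSG:2263')

# an inner join keeps only the points that fall within a polygon,
# carrying the polygon's attributes onto those rows
gpd.sjoin(pts, polys, how='inner', predicate='within')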
# # Snap Complaints to Streets

# Methodology: https://medium.com/@brendan_ward/how-to-leverage-geopandas-for-faster-snapping-of-points-to-lines-6113c94e59aa
#
# The code below is from Brendan's awesome post.

# In[17]:

# offset of match (ft.)
offset = 80
bbox = gdf.bounds + [-offset, -offset, offset, offset]

# match points to streets based on distance
hits = bbox.apply(lambda row: list(streets.sindex.intersection(row)), axis=1)

print('shape of data: {}'.format(hits.shape))
hits.head(5)

# In[18]:

# position1: index of points table
# position2: ordinal position of line - access via iloc later
points_to_lines_dict = {
    'pt_idx': np.repeat(hits.index, hits.apply(len)),
    'line_i': np.concatenate(hits.values)
}
tmp = pd.DataFrame(points_to_lines_dict)

# join back to the lines on line_i
# join back to the original points to get their geometry, rename the point geometry as "point"
tmp = (
    tmp
    .join(streets, on="line_i")
    .join(gdf['geometry'].rename("point"), on="pt_idx")
)

print('shape of data: {}'.format(tmp.shape))
tmp.head()

# In[19]:

# convert back to a GeoDataFrame, so we can do spatial ops
tmp = gpd.GeoDataFrame(
    tmp,
    geometry='geometry',
    crs=gdf.crs
)

print('shape of data: {}'.format(tmp.shape))
tmp.head()

# In[20]:

# discard any lines that are greater than the tolerance from points;
# sort on ascending snap distance, so that the closest goes to the top
tmp["snap_dist"] = tmp['geometry'].distance(gpd.GeoSeries(tmp.point))
tmp = (
    tmp
    .loc[tmp.snap_dist <= offset]
    .sort_values(by=["snap_dist"])
)

# sanity check distance ceiling
tmp.loc[:, ['snap_dist']].describe()

# In[21]:

# group by the index of the points and take the first, which is the closest line
closest = (
    tmp
    .groupby("pt_idx")
    .first()
)

# construct a GeoDataFrame of the closest lines
closest = gpd.GeoDataFrame(closest, geometry="geometry")
closest.head()

# In[22]:

counts = gdf.shape[0] - closest.shape[0]
counts_perc = round((1 - (closest.shape[0] / gdf.shape[0])) * 100, 2)

print('Dropped {} rows or {}% of street flooding complaint points, \
which were more than 80 feet from the closest street center line.'.format(counts, counts_perc))

# In[23]:

# position of the nearest point from the start of the line; get the new point location geometry
pos = closest['geometry'].project(gpd.GeoSeries(closest.point))
new_pts = closest['geometry'].interpolate(pos)

# create a new GeoDataFrame from the columns of the closest line and the
# new point geometries (which will be called "geometry")
snapped = gpd.GeoDataFrame(closest, geometry=new_pts)
snapped.head()

# In[24]:

# summarize columns
gdf.info()

# In[25]:

# join back to the original points and drop any that did not join
updated_points = (
    gdf
    .drop(columns=["geometry"])
    .join(snapped)
    .dropna(subset=["geometry"])
    .reset_index(drop=True)
)

updated_points.head()

# In[26]:

# distribution of snap distances
plt.figure(figsize=(8, 6))
sns.histplot(updated_points['snap_dist'])
plt.title('Histogram of snap_dist (ft.)')
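# A quick illustration (hypothetical values, not from the data above) of the
# project/interpolate idiom used in the snapping step: `project` returns the
# distance along the line to the nearest point, and `interpolate` turns that
# distance back into a point on the line.

# In[ ]:

from shapely.geometry import Point, LineString

line = LineString([(0, 0), (10, 0)])
pt = Point(3, 4)  # 4 ft. off the line

dist_along = line.project(pt)              # 3.0 -- distance along the line
snapped_pt = line.interpolate(dist_along)  # POINT (3 0)
print(dist_along, snapped_pt)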
# In[27]:

# examine counts per street id
gdf_count = (
    updated_points
    .groupby(by='physicalid')['created_date']
    .count()
    .reset_index()
    .rename(columns={"created_date": "count"})
)

gdf_count.head()

# In[28]:

# join our street data to our flood complaints data
streets_with_count = streets.merge(
    gdf_count,
    on='physicalid',
    how='left'
)
streets_with_count['count'] = streets_with_count['count'].fillna(0).astype(int)

print('shape of data: {}'.format(streets_with_count.shape))
streets_with_count.head()

# In[29]:

# summarize columns
streets_with_count.info()

# In[30]:

# examine highest counts
streets_with_count.sort_values(by='count', ascending=False).head()

# In[31]:

# normalize counts
streets_with_count['shape_leng'] = streets_with_count['geometry'].length
count_norm = (streets_with_count['count']
              / streets_with_count['shape_leng'].replace(0, np.nan)
              * 100)  # counts per 100 ft.

streets_with_count['count_per_100ft'] = round(count_norm, 2)
streets_with_count.head()

# In[32]:

# examine values
streets_with_count.loc[:, ['count_per_100ft']].describe()

# In[33]:

# sort descending
streets_with_count.sort_values(by='count_per_100ft', ascending=False).head()

# # Joining Streets and Counts to Neighborhoods

# In[34]:

print('shape of data: {}'.format(updated_points.shape))
updated_points.head()

# In[35]:

# summarize columns
updated_points.info()

# In[36]:

# retrieve specific columns
cols = [
    'unique_key', 'ntacode', 'county_fips',
    'ntaname', 'boro_name', 'boro_code',
    'shape_leng', 'physicalid'
]

streets_with_nta = updated_points.loc[:, cols]
streets_with_nta.head()

# In[37]:

# check for duplicates
checking_for_duplicates = (
    streets_with_nta
    .groupby(by=['physicalid', 'ntaname', 'boro_name'])['shape_leng']
    .count()
    .reset_index()
    .rename(columns={"shape_leng": "count_complaints"})
)

checking_for_duplicates.head()

# In[38]:

print('street id is unique: {}'.format(checking_for_duplicates['physicalid'].is_unique))

# In[39]:

# preview streets that fall in more than one NTA
(checking_for_duplicates
 .loc[checking_for_duplicates.duplicated(subset=['physicalid'], keep=False)]
 .sort_values(by=['physicalid', 'count_complaints'], ascending=[True, False])
 .head(10)
)

# In[40]:

count_duplicates = (
    checking_for_duplicates
    .loc[checking_for_duplicates.duplicated(subset=['physicalid'], keep=False)]
    .shape[0]
)
counts = round(count_duplicates / streets_with_count.shape[0] * 100, 2)

print('count of duplicates: {:,}'.format(count_duplicates))
print('percent duplicates: {}%'.format(counts))

# In[41]:

# sort descending by number of complaints on a street in a given NTA,
# then remove duplicates so each street keeps the NTA with the most complaints
unique_streets = (
    checking_for_duplicates
    .sort_values(by=['physicalid', 'count_complaints'], ascending=[True, False])
    .drop_duplicates('physicalid')
    .reset_index(drop=True)
)

print('physical id is unique: {}'.format(unique_streets['physicalid'].is_unique))
unique_streets.head()
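# A small illustration (toy data, not from the analysis) of the
# sort_values/drop_duplicates idiom above: sorting descending within each id
# and keeping the first occurrence retains the row with the highest count.

# In[ ]:

toy = pd.DataFrame({
    'physicalid': ['a', 'a', 'b'],
    'ntaname': ['NTA 1', 'NTA 2', 'NTA 3'],
    'count_complaints': [2, 5, 7]
})

# keeps ('a', 'NTA 2', 5) and ('b', 'NTA 3', 7)
(toy
 .sort_values(by=['physicalid', 'count_complaints'], ascending=[True, False])
 .drop_duplicates('physicalid'))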
# In[42]:

# joining streets with count complaints to unique streets
streets_with_count_nta = streets_with_count.merge(
    unique_streets,
    on='physicalid',
    how='left'
)

print('shape of data: {}'.format(streets_with_count_nta.shape))
streets_with_count_nta.head()

# In[43]:

# examine columns
streets_with_count_nta.info()

# In[44]:

# retrieve desired columns
cols = [
    'physicalid', 'full_stree', 'ntaname',
    'boro_name', 'count', 'count_per_100ft'
]

count_by_nta = (
    streets_with_count_nta
    .loc[:, cols]
    .reset_index(drop=True)
)

# sort on count desc
count_by_nta.sort_values(by='count', ascending=False).head()

# In[45]:

# sort on count per 100 ft. desc
count_by_nta.sort_values(by='count_per_100ft', ascending=False).head()

# In[46]:

# summary statistics
(streets_with_count
 .groupby(by=['physicalid', 'full_stree'])[['count', 'count_per_100ft']]
 .sum()
 .reset_index()
 .describe()
)

# In[47]:

# adding nta information
count_by_nta['ntaname_full'] = (
    count_by_nta['full_stree']
    + " (id: " + count_by_nta['physicalid'] + "), "
    + count_by_nta['ntaname'] + ", "
    + count_by_nta['boro_name']
)

count_by_nta.sort_values(by='count', ascending=False).head()

# In[48]:

fig, ax = plt.subplots(figsize=(8, 6))

data = count_by_nta.sort_values(by='count', ascending=False).head(10)
sns.barplot(
    data=data,
    y='ntaname_full',
    x='count',
    color='#1f77b4'
)

label = 'Count of NYC 311 Street Flooding Complaints by Street Segment (2010 - 2020)'
fig.suptitle(label, fontsize=12)
plt.xlabel('Count', fontsize=12)
plt.ylabel('Street Segment\n', fontsize=12)

plt.tight_layout()

# In[49]:

fig, ax = plt.subplots(figsize=(10, 6))

data = count_by_nta.sort_values(by='count_per_100ft', ascending=False).head(10)
sns.barplot(
    data=data,
    y='ntaname_full',
    x='count_per_100ft',
    color='#1f77b4'
)

label = 'Count of NYC 311 Street Flooding Complaints per 100 ft. by Street Segment (2010 - 2020)'
fig.suptitle(label, fontsize=12)
plt.xlabel('Count per 100 ft.', fontsize=12)
plt.ylabel('Street Segment', fontsize=12)

plt.tight_layout()

# In[50]:

fig, axs = plt.subplots(2, 1, figsize=(10, 8))

# first plot
data = count_by_nta.sort_values(by='count', ascending=False).head(10)
sns.barplot(
    data=data,
    y='ntaname_full',
    x='count',
    color='#1f77b4',
    ax=axs[0]
)

label = 'Count of NYC 311 Street Flooding Complaints by Street Segment from 2010 to 2020'
axs[0].set_title(label, fontsize=12, pad=10, x=-.1)
axs[0].set_xlabel('Count', fontsize=12)
axs[0].set_ylabel('Street Segment\n', fontsize=12, labelpad=10)

# second plot
data = count_by_nta.sort_values(by='count_per_100ft', ascending=False).head(10)
sns.barplot(
    data=data,
    y='ntaname_full',
    x='count_per_100ft',
    color='#1f77b4',
    ax=axs[1]
)

label = 'Count of NYC 311 Street Flooding Complaints per 100 ft. by Street Segment from 2010 to 2020'
axs[1].set_title(label, fontsize=12, pad=10, x=-.24)
axs[1].set_xlabel('Count per 100 ft.', fontsize=12, labelpad=10)
axs[1].set_ylabel('Street Segment', fontsize=12, labelpad=10)

fig.tight_layout(pad=.9)
plt.savefig('figures/count-street-segment.png', dpi=250, bbox_inches='tight')
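# Note: `plt.savefig` typically raises a FileNotFoundError if the `figures/`
# directory does not already exist. A minimal guard (an addition, not in the
# original notebook) would be:

# In[ ]:

from pathlib import Path

# create the output directory if it is missing, then save the figure
Path('figures').mkdir(exist_ok=True)
fig.savefig('figures/count-street-segment.png', dpi=250, bbox_inches='tight')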