#!/usr/bin/env python
# coding: utf-8

# # K-means Cluster Analysis - Philadelphia's Neighborhoods
# Jack Rummler | MUSA550: Final Project

# Different neighborhoods possess different characteristics that attract new residents. Whether it's access or distance to amenities, demographic makeup, or affordability, many factors inform people's decisions to live in certain neighborhoods of a city. This web app is a cluster analysis of Philadelphia County, Pennsylvania, intended to inform both current residents who may be looking to move within the city and transplants considering a move to Philadelphia.

# Cluster analysis is a technique used in unsupervised machine learning, data mining, statistics, and other data-adjacent fields that groups objects into clusters based on shared features. It is a data-driven partitioning technique that assigns observations to a chosen number of clusters k according to their degree of similarity.

# This Jupyter notebook walks through the process of data gathering, data engineering, and cluster analysis.

# In[1]:

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas
import esri2gpd
import carto2gpd
import requests
import cenpy
import osmnx as ox
import holoviews as hv
import altair as alt
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from kneed import KneeLocator
from sklearn.cluster import dbscan

hv.extension("bokeh")
np.random.seed(42)
get_ipython().run_line_magic('matplotlib', 'inline')

pd.options.display.max_columns = 999
pd.options.display.max_rows = 1200
pd.set_option('display.float_format', lambda x: '%.8f' % x)

# ## Step 1: Importing census data

# I use the census tracts GeoJSON URL to get the name and geometry of each tract. Since the data comes in EPSG:4326, I reproject it to EPSG:3857 and calculate each tract's area in square kilometers. Given that some tracts are much larger than others, this allows us to calculate the amount of a certain amenity per square kilometer, i.e. the density of that amenity. The census geometries will later be re-aggregated to the neighborhood level using Zillow neighborhood data.

# In[2]:

tracts_url = "https://opendata.arcgis.com/datasets/8bc0786524a4486bb3cf0f9862ad0fbf_0.geojson"
tracts = requests.get(tracts_url).json()
tracts = gpd.GeoDataFrame.from_features(tracts, crs="EPSG:4326")
tractsCleaned = tracts[['geometry', 'TRACTCE10']]
z = tractsCleaned.rename(columns={"TRACTCE10": "tract"})

# In[3]:

# Calculate the area of each tract in square kilometers
z_3857 = z.to_crs("epsg:3857")
z_3857["area"] = z_3857['geometry'].area / 1000000

# Convert back to the 4326 projection
z = z_3857.to_crs("epsg:4326")
z

# ## Step 2: Open Data Philly

# I use the Open Data Philly API to load commercial and residential permits issued since 2016, as well as thefts data for the last three months. This helps us understand which neighborhoods are developing hot spots for growth, and which areas may be perceived as unsafe; both are important characteristics that could influence a potential resident's choice of neighborhood.
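# Note: `carto2gpd.get`, used throughout this step, is a thin wrapper around
# Carto's SQL API. For reference, a roughly equivalent raw request might look
# like the sketch below (illustrative only; the notebook relies on carto2gpd):

query = "SELECT * FROM permits WHERE permitdescription = 'COMMERCIAL BUILDING PERMIT'"
response = requests.get(
    "https://phl.carto.com/api/v2/sql",
    params={"q": query, "format": "GeoJSON"},
)
raw_permits = gpd.GeoDataFrame.from_features(response.json()["features"], crs="EPSG:4326")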
# In[4]:

# API arguments
carto_url = "https://phl.carto.com/api/v2/sql"
table_name = "permits"

# In[5]:

# Commercial building permits since 2016
commercial = "permitdescription IN ('COMMERCIAL BUILDING PERMIT')"
commercialPermits = carto2gpd.get(carto_url, table_name, where=commercial)
commercialPermits = commercialPermits.to_crs("epsg:4326")
commercialPermitsCleaned = commercialPermits[commercialPermits['geometry'].notna()]

# In[6]:

# Joining commercial permits to census tracts
commercialJoin = gpd.sjoin(commercialPermitsCleaned, z, how='inner', predicate='intersects')
commercialJoin = commercialJoin.drop(labels=['index_right'], axis=1)

# In[7]:

# Counting permits per tract
commercialJoinCount = commercialJoin.groupby(["tract"], dropna=False)['tract'].count().reset_index(name='commercialCount')
commercialJoinCount = pd.DataFrame(commercialJoinCount)

# In[8]:

# Joining the commercial count back to the original data frame
z = pd.merge(z, commercialJoinCount, on="tract", how="left").fillna(0)

# In[9]:

z['commercialSqKm'] = z['commercialCount'] / z['area']
z = z.drop(['commercialCount'], axis=1)

# In[10]:

z = gpd.GeoDataFrame(z, geometry='geometry')
type(z)

# In[11]:

# Residential building permits since 2016
residential = "permitdescription IN ('RESIDENTIAL BUILDING PERMIT')"
residentialPermits = carto2gpd.get(carto_url, table_name, where=residential)
residentialPermits = residentialPermits.to_crs("epsg:4326")
residentialPermitsCleaned = residentialPermits[residentialPermits['geometry'].notna()]

# In[12]:

# Determining which tract each permit belongs to
residentialJoin = gpd.sjoin(residentialPermitsCleaned, z, how='left', predicate='intersects')
residentialJoin = residentialJoin.drop(labels=['index_right'], axis=1)

# In[13]:

# Counting permits per tract
residentialJoinCount = residentialJoin.groupby(["tract"])['tract'].count().reset_index(name='residentialCount')
residentialJoinCount = pd.DataFrame(residentialJoinCount)

# In[14]:

# Joining the residential count back to the original data frame
z = pd.merge(z, residentialJoinCount, on="tract", how="left").fillna(0)

# In[15]:

z['residentialSqKm'] = z['residentialCount'] / z['area']
z = z.drop(['residentialCount'], axis=1)

# In[16]:

z = gpd.GeoDataFrame(z, geometry='geometry')
type(z)

# In[17]:

# Thefts for the last three months, from the crime incidents table
crime_table = "incidents_part1_part2"
a = "dispatch_date >= '2022-10-01'"
b = " and text_general_code IN ('Thefts')"
crime = a + b

crimeData = carto2gpd.get(carto_url, crime_table, where=crime)
crimeData = crimeData.to_crs("epsg:4326")
crimeDataCleaned = crimeData[['geometry', 'text_general_code']]
crimeDataCleaned = crimeDataCleaned[crimeDataCleaned['geometry'].notna()]

# In[18]:

# Joining thefts to census tracts
theftsJoin = gpd.sjoin(crimeDataCleaned, z, how='inner', predicate='intersects')
theftsJoin = theftsJoin.drop(labels=['index_right'], axis=1)

# In[19]:

theftsCount = theftsJoin.groupby(["tract"])['tract'].count().reset_index(name='theftsCount')
theftsCount = pd.DataFrame(theftsCount)

# In[20]:

z = pd.merge(z, theftsCount, on="tract", how="left").fillna(0)

# In[21]:

z['theftsSqKm'] = z['theftsCount'] / z['area']
z = z.drop(['theftsCount'], axis=1)

# In[22]:

z = gpd.GeoDataFrame(z, geometry='geometry')
type(z)

# In[23]:

z
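# Aside: the spatial-join / count / per-square-kilometer pattern above repeats
# for every layer in this step and in Step 3. It could be factored into a small
# helper along these lines (a sketch with a hypothetical name, shown for
# reference; the notebook keeps the explicit cells):

def add_density_feature(points, polygons, name):
    """Count `points` per tract and add a `<name>SqKm` density column to `polygons`."""
    joined = gpd.sjoin(points[points['geometry'].notna()], polygons,
                       how='inner', predicate='intersects')
    counts = joined.groupby('tract')['tract'].count().rename(f"{name}Count").reset_index()
    out = polygons.merge(counts, on='tract', how='left').fillna(0)
    out[f"{name}SqKm"] = out[f"{name}Count"] / out['area']
    return gpd.GeoDataFrame(out.drop([f"{name}Count"], axis=1), geometry='geometry')

# e.g.: z = add_density_feature(commercialPermits, z, 'commercial')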
# ## Step 3: OpenStreetMap

# OpenStreetMap (OSM) is a collaborative mapping resource that provides point data on various features. Here, I use OSM to get information on amenities such as restaurants, theatres, parks, and more. As above, each amenity is expressed as a count per square kilometer of census tract.

# In[24]:

# City limits to use as the query polygon for OSMnx
city_limits_url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/City_Limits/FeatureServer/0"
city_limits = esri2gpd.get(city_limits_url).to_crs(epsg=3857)
city_limits_outline = city_limits.to_crs(epsg=4326).squeeze().geometry

# In[25]:

# Restaurants, pubs, and bars
restaurant = ox.geometries_from_polygon(city_limits_outline, tags={"amenity": ["restaurant", "pub", "bar"]})
restaurant = restaurant.to_crs(epsg=4326)
restaurantJoin = gpd.sjoin(restaurant, z, how='inner', predicate='intersects')
restaurantJoin = restaurantJoin.drop(labels=['index_right'], axis=1)
restaurantCount = restaurantJoin.groupby(["tract"])['tract'].count().reset_index(name='restaurantCount')
restaurantCount = pd.DataFrame(restaurantCount)
z = pd.merge(z, restaurantCount, on="tract", how="left").fillna(0)
z['restaurantSqKm'] = z['restaurantCount'] / z['area']
z = z.drop(['restaurantCount'], axis=1)

# In[26]:

# Arts centres and theatres
arts = ox.geometries_from_polygon(city_limits_outline, tags={"amenity": ["arts_centre", "theatre"]})
arts = arts.to_crs(epsg=4326)
artsJoin = gpd.sjoin(arts, z, how='inner', predicate='intersects')
artsJoin = artsJoin.drop(labels=['index_right'], axis=1)
artsCount = artsJoin.groupby(["tract"])['tract'].count().reset_index(name='artsCount')
artsCount = pd.DataFrame(artsCount)
z = pd.merge(z, artsCount, on="tract", how="left").fillna(0)
z['artsSqKm'] = z['artsCount'] / z['area']
z = z.drop(['artsCount'], axis=1)

# In[27]:

# Green spaces: parks, gardens, and playgrounds
parks = ox.geometries_from_polygon(city_limits_outline, tags={"leisure": ["park", "garden", "playground"]})
parks = parks.to_crs(epsg=4326)
parksJoin = gpd.sjoin(parks, z, how='inner', predicate='intersects')
parksJoin = parksJoin.drop(labels=['index_right'], axis=1)
parksCount = parksJoin.groupby(["tract"])['tract'].count().reset_index(name='parksCount')
parksCount = pd.DataFrame(parksCount)
z = pd.merge(z, parksCount, on="tract", how="left").fillna(0)
z['parksSqKm'] = z['parksCount'] / z['area']
z = z.drop(['parksCount'], axis=1)

# In[28]:

# Healthcare facilities: doctors, clinics, and hospitals
health = ox.geometries_from_polygon(city_limits_outline, tags={"healthcare": ['doctors', 'clinic', 'hospital']})
health = health.to_crs(epsg=4326)
healthJoin = gpd.sjoin(health, z, how='inner', predicate='intersects')
healthJoin = healthJoin.drop(labels=['index_right'], axis=1)
healthCount = healthJoin.groupby(["tract"])['tract'].count().reset_index(name='healthCount')
healthCount = pd.DataFrame(healthCount)
z = pd.merge(z, healthCount, on="tract", how="left").fillna(0)
z['healthSqKm'] = z['healthCount'] / z['area']
z = z.drop(['healthCount'], axis=1)
z

# ## Step 4: Census Data

# The final data source is the census, accessed through the cenpy package's remote API connection. I use the American Community Survey's 5-year estimates for 2019; I chose 2019 because the census tracts maintained the same geometries. I look at a variety of demographic characteristics, expressed primarily as percentages of each tract's population.
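# Aside: the long run of B01001 age variables in the mapping below can also be
# generated programmatically, which is less error-prone than typing each code.
# This sketch reproduces exactly the codes that are listed out by hand below:

male_age_vars = [f"B01001_{i:03d}E" for i in range(3, 26)]     # B01001_003E ... B01001_025E
female_age_vars = [f"B01001_{i:03d}E" for i in range(27, 50)]  # B01001_027E ... B01001_049E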
# In[29]:

# Connecting to the API
acs2019 = cenpy.remote.APIConnection("ACSDT5Y2019")

# In[30]:

# ACS variable codes and the human-readable names they will be renamed to.
# Defining the mapping once lets us reuse it for the query, the rename,
# and the type conversion below.
rename_map = {
    "B25026_001E": "TotalPop",        # total population in housing units
    "B19013_001E": "MedHHIncome",     # median household income
    "B25058_001E": "MedRent",         # median rent
    "B06012_002E": "Poverty",         # total in poverty
    "B03002_003E": "White",           # Not Hispanic, White
    "B03002_004E": "Black",           # Not Hispanic, Black
    "B03002_005E": "AmerIndian",      # Not Hispanic, American Indian
    "B03002_006E": "Asian",           # Not Hispanic, Asian
    "B03002_007E": "NativeHawaiian",  # Not Hispanic, Native Hawaiian
    "B03002_008E": "Other",           # Not Hispanic, Other
    "B03002_009E": "TwoPlusRaces",    # Not Hispanic, Two or More Races
    "B03002_012E": "Hispanic",        # Hispanic
    "B01001_003E": "MaleUnder5",
    "B01001_004E": "Male5to9",
    "B01001_005E": "Male10to14",
    "B01001_006E": "Male15to17",
    "B01001_007E": "Male18to19",
    "B01001_008E": "Male20",
    "B01001_009E": "Male21",
    "B01001_010E": "Male22to24",
    "B01001_011E": "Male25to29",
    "B01001_012E": "Male30to34",
    "B01001_013E": "Male35to39",
    "B01001_014E": "Male40to44",
    "B01001_015E": "Male45to49",
    "B01001_016E": "Male50to54",
    "B01001_017E": "Male55to59",
    "B01001_018E": "Male60to61",
    "B01001_019E": "Male62to64",
    "B01001_020E": "Male65to66",
    "B01001_021E": "Male67to69",
    "B01001_022E": "Male70to74",
    "B01001_023E": "Male75to79",
    "B01001_024E": "Male80to84",
    "B01001_025E": "MaleOver85",
    "B01001_027E": "FemaleUnder5",
    "B01001_028E": "Female5to9",
    "B01001_029E": "Female10to14",
    "B01001_030E": "Female15to17",
    "B01001_031E": "Female18to19",
    "B01001_032E": "Female20",
    "B01001_033E": "Female21",
    "B01001_034E": "Female22to24",
    "B01001_035E": "Female25to29",
    "B01001_036E": "Female30to34",
    "B01001_037E": "Female35to39",
    "B01001_038E": "Female40to44",
    "B01001_039E": "Female45to49",
    "B01001_040E": "Female50to54",
    "B01001_041E": "Female55to59",
    "B01001_042E": "Female60to61",
    "B01001_043E": "Female62to64",
    "B01001_044E": "Female65to66",
    "B01001_045E": "Female67to69",
    "B01001_046E": "Female70to74",
    "B01001_047E": "Female75to79",
    "B01001_048E": "Female80to84",
    "B01001_049E": "FemaleOver85",
    "B06009_002E": "UnderHS",          # Less than high school
    "B06009_003E": "HS",               # High school or equivalent
    "B06009_004E": "SomeCollege",      # Some college or Associates
    "B06009_005E": "Bachelors",        # Bachelor's degree
    "B06009_006E": "Graduate",         # Graduate/professional degree
    "B07003_004E": "DidNotMove",       # Lived in same house last year
    "B07003_007E": "MovedInCounty",    # Moved within county
    "B07003_010E": "MovedDiffCounty",  # Moved from different county
    "B07003_013E": "MovedDiffState",   # Moved from different state
    "B07003_016E": "MovedDiffCountry", # Moved from abroad
}

census_vars = ["NAME", "GEO_ID"] + list(rename_map)

# In[31]:

philly_census = acs2019.query(
    cols=census_vars,
    geo_unit="tract:*",
    geo_filter={"state": "42", "county": "101"},
)
philly_census

# In[32]:

philly_census = philly_census.rename(columns=rename_map)

# In[33]:

# Converting the queried columns from strings to integers
numeric_cols = list(rename_map.values())
philly_census[numeric_cols] = philly_census[numeric_cols].astype(int)

# ### Percentage calculations

# In[34]:

philly_census['ImmigrantPct'] = philly_census['MovedDiffCountry'] / philly_census['TotalPop']

# In[35]:

philly_census['TransplantPct'] = philly_census['MovedDiffState'] / philly_census['TotalPop']

# In[36]:

philly_census['BachPct'] = (philly_census['Bachelors'] + philly_census['Graduate']) / philly_census['TotalPop']

# In[37]:

philly_census['Under14Pct'] = (philly_census['MaleUnder5'] + philly_census['FemaleUnder5']
                               + philly_census['Male5to9'] + philly_census['Female5to9']
                               + philly_census['Male10to14'] + philly_census['Female10to14']) / philly_census['TotalPop']

# In[38]:

philly_census['15to29Pct'] = (philly_census['Male15to17'] + philly_census['Male18to19']
                              + philly_census['Male20'] + philly_census['Male21']
                              + philly_census['Male22to24'] + philly_census['Male25to29']
                              + philly_census['Female15to17'] + philly_census['Female18to19']
                              + philly_census['Female20'] + philly_census['Female21']
                              + philly_census['Female22to24'] + philly_census['Female25to29']) / philly_census['TotalPop']

# In[39]:

philly_census['30to44Pct'] = (philly_census['Male30to34'] + philly_census['Male35to39']
                              + philly_census['Male40to44'] + philly_census['Female30to34']
                              + philly_census['Female35to39'] + philly_census['Female40to44']) / philly_census['TotalPop']

# In[40]:

philly_census['45to64Pct'] = (philly_census['Male45to49'] + philly_census['Male50to54']
                              + philly_census['Male55to59'] + philly_census['Male60to61']
                              + philly_census['Male62to64'] + philly_census['Female45to49']
                              + philly_census['Female50to54'] + philly_census['Female55to59']
                              + philly_census['Female60to61'] + philly_census['Female62to64']) / philly_census['TotalPop']

# In[41]:

philly_census['Over65Pct'] = (philly_census['Male65to66'] + philly_census['Male67to69']
                              + philly_census['Male70to74'] + philly_census['Male75to79']
                              + philly_census['Male80to84'] + philly_census['MaleOver85']
                              + philly_census['Female65to66'] + philly_census['Female67to69']
                              + philly_census['Female70to74'] + philly_census['Female75to79']
                              + philly_census['Female80to84'] + philly_census['FemaleOver85']) / philly_census['TotalPop']

# In[42]:

philly_census['WhitePct'] = philly_census['White'] / philly_census['TotalPop']

# In[43]:

philly_census['BlackPct'] = philly_census['Black'] / philly_census['TotalPop']

# In[44]:

philly_census['AsianPct'] = philly_census['Asian'] / philly_census['TotalPop']

# In[45]:

philly_census['HispanicPct'] = philly_census['Hispanic'] / philly_census['TotalPop']
# In[46]:

philly_census['OtherRacePct'] = (philly_census['NativeHawaiian'] + philly_census['AmerIndian']
                                 + philly_census['Other'] + philly_census['TwoPlusRaces']) / philly_census['TotalPop']

# In[47]:

philly_census['PovertyPct'] = philly_census['Poverty'] / philly_census['TotalPop']

# In[48]:

# Dropping GEO_ID and the raw count columns (everything after MedRent in
# rename_map) now that the percentages are calculated
censusCleaned = philly_census.drop(numeric_cols[3:] + ['GEO_ID'], axis=1)
censusCleaned

# In[49]:

# The census API response has no geometry, so merge it with the data frame
# holding the Open Data Philly and OSM features
x = pd.merge(censusCleaned, z, on='tract', how='left')
x

# In[50]:

# Calculating population density
x['PopDensity'] = x['TotalPop'] / x['area']
x

# In[51]:

x = gpd.GeoDataFrame(x, geometry='geometry')
type(x)

# ## Step 5: Joining census data with neighborhood data

# This step joins the tract-level data to Zillow's neighborhood boundaries. Both layers are polygons, so a spatial join assigns each tract to the neighborhood(s) it intersects. The tract values sharing a neighborhood name are then averaged.

# In[52]:

# Load in the Zillow data
zillow = "https://raw.githubusercontent.com/MUSA-550-Fall-2021/week-3/main/data/zillow_neighborhoods.geojson"
y = gpd.read_file(zillow)
y = y.set_crs("epsg:4326")
y

# In[53]:

# Join the data from steps 2-4 to the Zillow data frame
join = gpd.sjoin(x, y, how='inner', predicate='intersects')
join

# In[54]:

# Removing rows carrying the census sentinel value for missing data
join = join[join.MedHHIncome != -666666666]
join = join[join.MedRent != -666666666]
join

# In[55]:

len(join)

# In[56]:

# Average the tract values within each neighborhood. The geometry column is
# excluded from the mean; neighborhood geometries are merged back in from the
# Zillow data frame below.
feature_cols = ['PopDensity', 'TotalPop', 'MedHHIncome', 'MedRent',
                'ImmigrantPct', 'TransplantPct', 'BachPct', 'Under14Pct',
                '15to29Pct', '30to44Pct', '45to64Pct', 'Over65Pct',
                'WhitePct', 'BlackPct', 'AsianPct', 'HispanicPct',
                'OtherRacePct', 'PovertyPct', 'commercialSqKm',
                'residentialSqKm', 'theftsSqKm', 'restaurantSqKm',
                'artsSqKm', 'parksSqKm', 'healthSqKm']
join_mean = join.groupby(["ZillowName"], dropna=False)[feature_cols].mean().reset_index()
join_mean

# In[57]:

# Merging back onto the original Zillow data frame to recover the geometries
data = pd.merge(y, join_mean, on="ZillowName", how="left").fillna(0)
data

# In[58]:

data = data.rename(columns={'ZillowName': 'Neighborhood'})
data

# In[59]:

type(data)

# ## Step 6: K-means Cluster Analysis

# With our final data frame, we can now run the cluster analysis. K-means clustering aims to partition n observations into k clusters, with each observation assigned to the cluster with the nearest mean. A limitation of k-means is that the number of clusters k must be chosen in advance, but the knee (elbow) test helps identify a reasonable value. There are other, more sophisticated clustering methods, such as DBSCAN; however, k-means fits the scope of this analysis.
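# Aside: a minimal sketch of the DBSCAN alternative mentioned above, applied to
# the same features (illustrative only: `eps` and `min_samples` are untuned
# guesses, and the result is not used elsewhere in the notebook). Unlike
# k-means, DBSCAN infers the number of clusters from density; labels of -1
# mark "noise" observations that belong to no cluster.

from sklearn.cluster import DBSCAN

db_features = StandardScaler().fit_transform(data.drop(columns=['Neighborhood', 'geometry']))
db_labels = DBSCAN(eps=2.0, min_samples=5).fit_predict(db_features)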
# In[60]:

# Standardize the features so no single variable dominates the distance metric
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data[feature_cols])

# In[61]:

# Number of clusters to try out
n_clusters = list(range(2, 10))

# Run k-means for each value of k and save the "inertia"
inertias = []
for k in n_clusters:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(scaled_data)
    inertias.append(kmeans.inertia_)

plt.plot(n_clusters, inertias, marker='o', ms=10, mfc='white', lw=4, mew=3);

# In[62]:

kn = KneeLocator(n_clusters, inertias, curve='convex', direction='decreasing')
print(kn.knee)

# The knee falls at k = 5, so we will use 5 clusters

# In[63]:

# Cluster analysis
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(scaled_data)

# Save the cluster labels
data['Cluster'] = kmeans.labels_
data

# In[64]:

# Looking at the mean value of each variable within each cluster
cluster_analysis = data.groupby('Cluster', as_index=False).mean(numeric_only=True).sort_values(by='Cluster')
cluster_analysis

# In[65]:

# Ranking the variables across clusters to identify trends
rank = cluster_analysis.rank(axis=0)
rank

# I wanted to give each cluster a unique name that captures some aspect of the data, so I renamed all 5 clusters based on the trends I saw in the rankings.

# In[66]:

# Changing cluster names
data = data.replace({'Cluster': {0: "Suburban Life",
                                 1: "University Crescent",
                                 2: "Minority-Majority",
                                 3: "Young and Fun",
                                 4: "Downtown Money Pocket"}})

# In[67]:

# Describing each cluster: these descriptions feed an interactive feature of the app
description = {
    "Suburban Life": "Suburban life is exactly how it sounds – suburban. These neighborhoods expand across Northeast and Northwest Philadelphia and are the least densely populated neighborhoods in the city proper. They are characterized by high median household income, predominantly home ownership, and fewer options for rentals (though the average rental price is on the lower end). These neighborhoods are racially homogeneous, predominantly white households, with a diversity of young families, couples, and retirees. Considering the land use trends of suburban neighborhoods, this cluster sees the least new development and has the least access to amenities.",
    "University Crescent": "The University Crescent is a conglomerate of neighborhoods expanding along the Schuylkill River. Several of the neighborhoods are in or around University City, so they are popular among college students, professional students, and young professionals. These neighborhoods are some of the most diverse, with the 2nd highest percentage of Black, Hispanic, and Asian populations out of all clusters, and have the highest concentration of transplants and immigrants. Some of these neighborhoods are being gentrified, so more residential and commercial development is happening here.",
    "Minority-Majority": "Since Philadelphia is a minority-majority city, most neighborhoods fall into this category. These neighborhoods have high concentrations of Black and Hispanic populations, and are also frequently multi-generational, with high concentrations of under-14 and over-65 populations. Minority-majority neighborhoods exemplify Philadelphia’s history of redlining and segregation: these neighborhoods have minimal new development and minimal access to amenities.",
    "Young and Fun": "Young and fun neighborhoods are how they sound - young and fun. They are found mainly throughout South Philadelphia and downtown and are defined by a high concentration of millennials, racial diversity, college-educated residents, and moderate affordability. These neighborhoods often have great access to restaurants, pubs, arts establishments, and parks, and have high residential and commercial development activity.",
    "Downtown Money Pocket": "The downtown money pocket is a small collection of neighborhoods defined by residential density and household income. These neighborhoods are in historic districts like Old City, or the heart of downtown like Center City. These neighborhoods have the highest median household income, median rent, and percentage of college-educated individuals. They are the most connected to amenities and are attracting lots of new commercial and residential development.",
}

# In[68]:

data['Description'] = data['Cluster'].replace(description)
data

# In[69]:

# Altair map of the clusters
data = data.to_crs(epsg=4326)

domain = ["Downtown Money Pocket", "Minority-Majority", "Suburban Life",
          "University Crescent", "Young and Fun"]
range_ = ['#4053d3', '#00b25d', '#fb49b0', "#b51d14", "#ddb310"]

chart = alt.Chart(data, title="Neighborhood Cluster Map\n(Philadelphia, PA)").mark_geoshape().properties(
    width=600, height=600
).encode(
    tooltip=["Neighborhood:N", "Cluster:N", "Description:N"],
    color=alt.Color("Cluster:N", scale=alt.Scale(domain=domain, range=range_)),
)
chart = chart.configure_title(fontSize=26, font='Rockwell', anchor='start', color='black')
chart = chart.configure_legend(
    strokeColor='gray', fillColor='black', padding=10, cornerRadius=10,
    orient='bottom-right', labelColor='white', titleColor='white',
    titleFontSize=18, labelFontSize=12, titleFont="Helvetica", labelFont="Helvetica")
chart = chart.configure_view(strokeWidth=0, fill='black')
chart

# In[120]:

chart.save("cluster.json")

# In[121]:

chart.save("cluster.html")

# ## Interactive Scatterplot of Cluster Variables

# In[99]:

hvdata = data.drop(labels=['geometry', 'TotalPop', 'Description'], axis=1)
hvdata

# In[100]:

hvdata = hvdata.round(decimals=3)
hvdata
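# The `pd.melt` call below reshapes the wide table (one column per variable)
# into long format (one row per neighborhood-variable pair) so hvplot can group
# the scatterplot by variable. A toy illustration on a hypothetical mini-frame:

demo = pd.DataFrame({"Neighborhood": ["A", "B"], "Cluster": [0, 1], "MedRent": [900, 1500]})
pd.melt(demo, id_vars=["Neighborhood", "Cluster"])
#   Neighborhood  Cluster variable  value
# 0            A        0  MedRent    900
# 1            B        1  MedRent   1500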
""" data = data.to_crs(epsg=4326) domain = ["Downtown Money Pocket", "Minority-Majority", "Suburban Life", "University Crescent", "Young and Fun"] range_ = ['#4053d3', '#00b25d', '#fb49b0', "#b51d14", "#ddb310"] chart = alt.Chart(data, title="Neighborhood Cluster Map (Philadelphia, PA)").mark_geoshape().properties(width=600, height=600).encode( tooltip=["Neighborhood:N", "Cluster:N", "Description:N"], color=alt.Color("Cluster:N", scale=alt.Scale(domain=domain, range=range_))) chart = chart.configure_title( fontSize=26, font='Rockwell', anchor='start', color='black') chart = chart.configure_legend( strokeColor='gray', fillColor='black', padding=10, cornerRadius=10, orient='bottom-right', labelColor='white', titleColor='white', titleFontSize=18, labelFontSize=12, titleFont="Helvetica", labelFont="Helvetica") chart = chart.configure_view( strokeWidth=0, fill='black') return pn.Pane(chart) def scatterplot(self): """ Return a scatterplot of grouped variables. """ hover = HoverTool(tooltips=[("amount", "@amount{0,0.000}"), ("amount", "@amount{.000}"), ("amount", "@amount{.0000}"), ("amount", "@amount{.00}")]) table = hv.hvplot( kind="scatter", color=['#4053d3', '#00b25d', '#fb49b0', "#b51d14", "#ddb310"], size=40, by=["Cluster"], x="Neighborhood", groupby="variable", height=500).opts(tools=[hover]) return pn.Pane(table) # In[117]: app = PhillyClustersApp(name="") # In[118]: panel = pn.template.BootstrapTemplate( title="Neighborhood Clusters in Philadelphia", sidebar_width=0 # No sidebar needed for this app ) # In[119]: panel.main.append( pn.Column( pn.Row(app.altairmap, app.scatterplot) ) )