import sys as sys
sys.path.append('./code')
import pandas as pd
import numpy as np
import os
## available via pip install nyc_geoclient
from nyc_geoclient import Geoclient
# need API keys from Geoclient, substitute in your own here
# https://developer.cityofnewyork.us/api/geoclient-api
g = Geoclient(os.environ['GEOSUPPORT_ID'],os.environ['GEOSUPPORT_KEY'])
## Some geocoding helper functions MODA wrote
from geocoding import GetBBLFromBIN
%matplotlib inline
import warnings; warnings.simplefilter('ignore') # for presentation purposes
###Some helper functions
def Remove_Dummy_BINS(df):
'''
removes illegitimate BINs from input dataframe.
'''
dummy_BINs = [1000000 , 2000000, 3000000, 4000000, 5000000]
df = df[~df.BIN.isin(dummy_BINs)]
return df
Several of the source datasets are large and impractical to store in with the repository. Their locations are linked below. Getting the data from the provided links will also ensure getting the most up-to-date versions.
To re-run the code, download these files to the '/input' directory and check that the file-path variables in the code match the actual file name.
Note: The Cooling Towers dataset linked above was derived by DOITT GIS from planimetrics data; the latest planimetrics images were taken in 2014 and the dataset was published in 2016 (more information on planimetrics can be found in this GitHub repository.) The dataset used during the 2015 Cooling Towers activation was not based on this planimetrics data.
path_towers = "../input/Cooling_Towers.csv" # or another location for the file
if not os.path.isfile(path_towers):
print "Please download Cooling Tower Data to /input directory"
else:
Towers = pd.read_csv(path_towers)
print len(Towers)
Towers = Remove_Dummy_BINS(Towers)
## Drop any duplicate buildings ( a building may have more than one cooling tower)
Towers.drop_duplicates(subset=['BIN'], inplace=True)
Towers['Identified'] = 1 # noting that these buildings have a cooling tower
print len(Towers)
3204 2035
## This is PLUTO data from the relevant timeperiod in 2015, updated versions are available.
path_PLUTO = "../input/nyc_pluto_15v1"
if not os.path.exists(path_PLUTO):
print 'Please download PLUTO data to input/ directory'
boros = ['BK','BX','Mn','QN','SI']
PLUTO = pd.DataFrame()
for b in boros:
filename = path_PLUTO + "/"+ b + ".csv"
print 'reading borough file: ' + b
temp = pd.read_csv(filename)
print str(len(temp)) + ' rows in ' + b
PLUTO = pd.concat([PLUTO,temp],axis=0)
print str(len(PLUTO)) + ' rows in total PLUTO data'
reading borough file: BK 277748 rows in BK 277748 rows in total PLUTO data reading borough file: BX 89963 rows in BX 367711 rows in total PLUTO data reading borough file: Mn 43231 rows in Mn 410942 rows in total PLUTO data reading borough file: QN 324630 rows in QN 735572 rows in total PLUTO data reading borough file: SI 123892 rows in SI 859464 rows in total PLUTO data
## PAD file is also from 2015
path_PAD = "../input/pad15b/bobaadr.txt"
if not os.path.isfile(path_PAD):
print 'Please download PAD to /input directory'
pad = pd.read_csv(path_PAD)
pad['BBL'] = (pad['boro'].astype(str) + \
pad['block'].apply('{0:0>5}'.format) + \
pad['lot'].apply('{0:0>4}'.format)).astype(float)
pad.rename(columns={'bin':'BIN'},inplace=True)
pad.drop_duplicates(subset = ['BIN','BBL'], inplace=True)
pad = Remove_Dummy_BINS(pad)
# using Building Identification Number (BIN) as index, checking to make sure there's no duplicates.
pad.set_index('BIN',inplace=True,verify_integrity=True)
Towers.set_index('BIN',inplace=True,verify_integrity=True)
In New York City a BIN identifies a building while a BBL identifies a lot of land. Some lots have more than one building on them, so one BBL may have multiple associated BINs.
As the code below shows, we joined a list of buildings to a list of lots using BBL. This was a simplifying assumption made in order to make use of as much data as possible as quickly as possible. Two things make this a palatable assumption:
## Run this code to test the BIN vs. BBL assumptions from above
#PLUTO[PLUTO.NumBldgs > 1].BldgClass.value_counts(normalize = True).cumsum().head(10)
#PLUTO.NumBldgs.value_counts(normalize=1)
print str(len(Towers)) + ' buildings in tower data'
Towers = Towers.join(pad[['BBL']],how='left')
print str(len(Towers)) + ' rows joined to PAD'
Towers.reset_index(inplace=True)
Towers.drop_duplicates(subset = ['BBL'],inplace=True) # collapsing towers to lot level
print str(len(Towers)) + ' unique BBLs in tower set'
orphan_towers = Towers[~Towers.BBL.isin(PLUTO.BBL)] ## these seem to be mostly condo BBLs
print str(len(orphan_towers)) + ' tower BBLs not in PLUTO'
2035 buildings in tower data 2035 rows joined to PAD 1929 unique BBLs in tower set 390 tower BBLs not in PLUTO
Sometimes identifiers can be inconsistent across datasets and going 'to the source' of DCP's GeoSupport geocoding service is the easiest way to reconcile differences.
Note: Running this geocoding process now may produce results inconsistent with when it was originally run in 2015, since BINs and BBLs may have changed.
orphan_towers_geosupport, errors = GetBBLFromBIN(orphan_towers,'BIN', geoclient_instance=g); # geoclient returns the most up-to-date BBL
orphan_towers_geosupport[['BBL']] = orphan_towers_geosupport[['BBL']].astype(float)
Towers.set_index('BIN',inplace=True,verify_integrity=True)
orphan_towers_geosupport.set_index('BIN',inplace=True,verify_integrity=True)
Towers.update(orphan_towers_geosupport) # sub in the geosupport BBLs
2118405 2096626 2096857 2116415 2117335 2080263 3393449 3392416 3396842 3388709 3348849 3397489 3058390 3393008 3061628 3061652 3396506 3397446 3391008 3391222 3329418 3054615 3397042 3398702 3388496 3006148 3396696 3346104 3002008 3000259 3392184 3413889 3391746 3397720 3000484 3347736 3392968 3394020 3000274 3398101 3391379 1001531 1002114 1087716 1087847 1087848 1087170 1085867 1085789 1087238 1082634 1087700 1083346 1087842 1000005 1010899 1088864 1012223 1086105 1012238 1067116 1010382 1009746 1088431 1080855 1087241 1088698 1087833 1016065 1089677 1088500 1087725 1016163 1017954 1069344 1007638 1007251 1087488 1002734 1002167 1002125 1002814 1020579 1087667 1088222 1001877 1001883 1001838 1060526 1060788 1087917 1085673 1015937 1017178 1017247 1087746 1027472 1027366 1013039 1013011 1087701 1013854 1087260 1087721 1080359 1088885 1015006 1087767 1087723 1015146 1087989 1085695 1076166 1021925 1016212 1087902 1088910 1016877 1087263 1016884 1089380 1019668 1019951 1016907 1087770 1087726 1087536 1087841 1018549 1018474 1087309 1087537 1080794 1017097 1022051 1020338 1020214 1087674 1035776 1035052 1076272 1087269 1076271 1035078 1038550 1086160 1036086 1027191 1087510 1077844 1070358 1028950 1084564 1028839 1028840 1027074 1087835 1030315 1030329 1087704 1070721 1070689 1070711 1070703 1070696 1088576 1081034 1030352 1086065 1087072 1088786 1087660 1030903 1088301 1087548 1088733 1087691 1081578 1088438 1088432 1088631 1088152 1088481 1032491 1087768 1089333 1087782 1089046 1051486 1086570 1048264 1087920 1045197 1087912 1082303 1087742 1076331 1081307 1086440 1036153 1028827 1088175 1025451 1024900 1070014 1026318 1034856 1076843 1023398 1023760 1024878 1024898 1086135 1025450 1023163 1025211 1087538 1026676 1026728 1035253 1071457 1034537 1071418 1089376 1036469 1087513 1035481 1082746 1038040 1076268 1087455 1035407 1038896 1086068 1071414 1034238 1034205 1034227 1035385 1038761 1038673 1034212 1037550 1038658 1088126 1087672 1038935 1038909 1071989 1040356 1044390 1045271 1045290 1087926 1087313 1043870 1072696 1072694 1023166 1076262 1088760 1087978 1087264 1022706 1022699 1087187 1024757 1076193 1024736 1024771 1076844 1087119 1022578 1022579 1015272 1087186 1088437 1012987 <type 'exceptions.KeyError'> BUILDING IDENTIFICATION NUMBER (BIN) NOT FOUND 1013558 1015198 1015191 1088533 1088726 1085962 1078988 1078985 1001050 1001026 1001041 1001022 1000794 1088750 1088442 1001068 1087167 1001102 1001193 1001115 1087755 1001249 1001215 1001263 1001199 1075701 1088793 1072630 1076299 1043871 1089700 1040756 1076296 1043780 1087103 1043366 1040766 1041959 1086581 1087494 1037174 1042771 1034520 1014736 1086104 1085936 1082441 1089686 <type 'exceptions.KeyError'> BUILDING IDENTIFICATION NUMBER (BIN) NOT FOUND 1088577 1001137 1001162 1001292 1087484 1000830 1000827 1001003 1001007 1087459 1087651 1087518 1087715 1087549 1007197 4534816 4536853 1001389 1087485 1087756 1079153 1087951 1001629 1001451 1088224 1001228 1087081 1008455 1083503 1089381 1009706 1045282 1042799 1088565 1056465 1087671 1015487 1023732 1088713 1036458 1087267 1035762 4312614 4534920 4003522 4540105 4312083 4307915 4592106 4539469 4050411 <type 'exceptions.KeyError'> BUILDING IDENTIFICATION NUMBER (BIN) NOT FOUND 4045397 4533805 4080169 4312087 4538327 4539429 4536823 4540156 4539589 4536915 4014554 4024498 5089354
# Join with PLUTO DATA
Towers.drop_duplicates(subset = ['BBL'],inplace=True) # collapsing towers to lot level
tower_data = PLUTO.merge(Towers,how='left',on='BBL')
print str(Towers.Identified.sum() - tower_data.Identified.sum()) + ' towers not accounted for in PLUTO'
4.0 towers not accounted for in PLUTO
BC: first character in building class, e.g., A is single-family home
BC_num: second character in building class. Not used.
Zone: First character in 'AllZoning1' field.
tower_data['Identified'] = tower_data['Identified'].fillna(0)
tower_data['BC'] = tower_data['BldgClass'].str[0]
tower_data['BC_num'] = tower_data['BldgClass'].str[1]
tower_data['Zone'] = tower_data['AllZoning1'].str[0]
tower_data['Log_BldgArea'] = np.log(tower_data['BldgArea'])
tower_data['Root_BldgArea'] = np.sqrt(tower_data['BldgArea'])
tower_data['Log_NumFloors'] = np.log(tower_data['NumFloors'])
# save the file locally
if not os.path.exists("../data"):
os.makedirs("../data")
tower_data.to_csv('../data/tower_data.csv')