import pandas as pd
import geopandas as gpd
# %matplotlib inline  -- IPython magic; invalid syntax in a plain .py file,
# so it is kept only as a comment (re-enable it in a notebook session).

#Read in blocks and assign a unique index
gdfBlocks = gpd.read_file('../data/DURHAM/DURHAM_blocks.shp')
gdfBlocks["OrgID"] = 0
#Select blocks that are majority black
gdfMajBlack = gdfBlocks.query('PctBlack >= 50')
#Of those, select blocks that have at least 50 BHH, these we'll keep (1)
# fix: was 'BlackHH > 50', which -- combined with the 'BlackHH < 50'
# complement below -- silently dropped blocks with exactly 50 BHH.
gdf_Org1 = gdfMajBlack.query('BlackHH >= 50').reset_index()
#Drop census bookkeeping columns not needed in the org-unit output
gdf_Org1.drop(['index', 'STATEFP10', 'COUNTYFP10',
               'TRACTCE10', 'BLOCKCE', 'BLOCKID10',
               'GEOID10', 'PARTFLG'], axis=1, inplace=True)
#Org units are numbered from 1
gdf_Org1['OrgID'] = gdf_Org1.index + 1
gdf_Org1['OrgType'] = 'OriginalBlock'
# fix: 'keep1' was never defined (NameError); path chosen to match the
# keep2.shp convention below -- TODO confirm intended filename
keep1 = '../data/DURHAM/keep1.shp'
gdf_Org1.to_file(keep1)
#Of those, select blocks that have fewer than 50 BHH; these we'll cluster
gdfMajBlack_LT50 = gdfMajBlack.query('BlackHH < 50')
#Cluster: union all sub-50-BHH blocks into contiguous shapes
blockUnion = gdfMajBlack_LT50.unary_union
# fix: list(<MultiPolygon>) relies on direct MultiPolygon iteration, which
# Shapely 2 removed, and crashes outright if the union collapses to a
# single Polygon. Normalize via .geoms with a single-polygon fallback.
if hasattr(blockUnion, 'geoms'):
    clusterGeoms = list(blockUnion.geoms)
else:
    clusterGeoms = [blockUnion]
gdfClusters = gpd.GeoDataFrame(geometry=clusterGeoms)
gdfClusters['ClusterID'] = gdfClusters.index
#Clusters inherit the CRS of the source blocks
gdfClusters.crs = gdfMajBlack_LT50.crs
#gdfClusters.to_file('../data/DURHAM/clusters.shp')
#Spatially join the cluster ID to the original blocks
gdfMajBlack_LT50_2 = gpd.sjoin(gdfMajBlack_LT50, gdfClusters,
                               how='left', op='within').drop("index_right", axis=1)
#gdfMajBlack_LT50_2.to_file('../data/DURHAM/MajBlack1.shp')
#Compute the total BHH for the dissolved blocks and add as block attribute
gdfClusters_2 = gdfMajBlack_LT50_2.dissolve(by='ClusterID', aggfunc='sum')
# The dissolve SUMMED the percentage columns (meaningless);
# recompute them from the summed population counts
gdfClusters_2['PctBlack'] = gdfClusters_2['P003003'] / gdfClusters_2['P003001'] * 100
gdfClusters_2['PctBlack18'] = gdfClusters_2['P010004'] / gdfClusters_2['P010001'] * 100
#Remove block clusters with fewer than 50 BHH; these are impractical
gdfClusters_2 = gdfClusters_2.query('BlackHH >= 50')
#gdfClusters_2.to_file('../data/DURHAM/clusters2.shp')

#Clusters holding no more than 100 BHH are kept whole as org units (2)
gdf_Org2 = gdfClusters_2.query('BlackHH <= 100').reset_index()
#Continue the OrgID numbering where the original-block org units left off
gdf_Org2['OrgID'] = gdf_Org2.index + gdf_Org1['OrgID'].max() + 1
gdf_Org2['OrgType'] = 'Full block cluster'
gdf_Org2.to_file('../data/DURHAM/keep2.shp')

#Clusters above 100 BHH must be subdivided block-by-block (done below);
#record their IDs and save them for inspection
gdfOversized = gdfClusters_2.query('BlackHH > 100')
clusterIDs = gdfOversized.index.unique()
gdfOversized.to_file('../data/DURHAM/cluster.shp')
#--- Exploratory prototype: subdivide ONE oversized cluster (ID 5) ---
# NOTE(review): scratch notebook cells superseded by the production loop
# below; logic preserved, but the loop indentation (lost in the flat-file
# export, leaving invalid syntax) has been restored.
clusterID = 5
gdfBlksAll = gdfMajBlack_LT50_2.query('ClusterID == {}'.format(clusterID)).reset_index()
#X centroid coordinate, used to pick the western-most block as a seed
gdfBlksAll['X'] = gdfBlksAll.geometry.centroid.x
#0 = block not yet assigned to a sub-cluster
gdfBlksAll['claimed'] = 0
geomDict = {}
gdfList = []
unclaimedCount = gdfBlksAll.query('claimed == 0')['X'].count()
#print(unclaimedCount)
stopIt2 = 0
while unclaimedCount > 0:
    #Work only with still-unclaimed blocks
    gdfBlks = gdfBlksAll[gdfBlksAll.claimed == 0].reset_index()
    #Seed the sub-cluster with the western-most unclaimed block
    gdfNbrs = gdfBlks[gdfBlks.X == gdfBlks.X.min()]#; print(gdfNbrs.BLOCKID10.unique())
    BHH = gdfNbrs.BlackHH.sum()
    geom = gdfNbrs.geometry.unary_union
    #gdfNbrs.plot();
    stopIt = 0
    #Grow the shape with intersecting unclaimed blocks until it holds 100 BHH
    while BHH < 100:
        gdfNbrs = gdfBlksAll[(gdfBlksAll.intersects(geom)) &
                             (gdfBlksAll.claimed == 0)
                             ]#; print(gdfNbrs.BLOCKID10.unique())
        BHH = gdfNbrs.BlackHH.sum()
        geom = gdfNbrs.geometry.unary_union
        #gdfNbrs.plot();
        stopIt += 1
        if stopIt > 100:
            print("BHH never reached 100")
            break
    #Mark everything intersecting the grown shape as claimed
    gdfBlksAll.loc[gdfBlksAll.geometry.intersects(geom), 'claimed'] = 1
    unclaimedCount = gdfBlksAll.query('claimed == 0')['X'].count()
    #print(stopIt2,unclaimedCount)
    stopIt2 += 1
    if stopIt2 > 100: break
    geomDict[stopIt2] = geom
    #Dissolve the claimed blocks (selected by centroid) into one feature
    gdfSelect = (gdfBlksAll[(gdfBlksAll.centroid.within(geom))]
                 .reset_index()
                 .dissolve(by='ClusterID', aggfunc='sum')
                 .drop(['level_0', 'index', 'X'], axis=1)
                 )
    #gdfSelect['OrgID'] = stopIt2
    gdfList.append(gdfSelect)
gdf5 = pd.concat(gdfList)
gdf5.to_file("../data/DURHAM/foo5.shp")
#Leftover scratch cell: one more dissolve/claim pass, then report how many
#blocks remain unclaimed
gdfSelect = (gdfBlksAll[(gdfBlksAll.geometry.intersects(geom)) & (gdfBlksAll.claimed == 0)]
             .reset_index()
             .dissolve(by='ClusterID', aggfunc='sum')
             .drop(['level_0', 'index', 'X'], axis=1)
             )
#gdfSelect.plot()
gdfBlksAll.loc[gdfBlksAll.geometry.intersects(geom), 'claimed'] = 1
print(gdfBlksAll.query('claimed == 0')['X'].count())
#Iterate through each clusterID, carving each oversized cluster into
#sub-clusters of roughly 100 Black households (BHH).
#(Loop-body indentation was lost in the flat-file export; restored here
#with logic unchanged.)
gdfs = []
for clusterID in clusterIDs:
    #Get all the blocks in the selected cluster
    gdfBlksAll = gdfMajBlack_LT50_2.query('ClusterID == {}'.format(clusterID)).reset_index()
    #Assign the X coordinate, used to select the first feature in a sub-cluster
    gdfBlksAll['X'] = gdfBlksAll.geometry.centroid.x
    #Set all blocks to "unclaimed"
    gdfBlksAll['claimed'] = 0
    #Determine how many blocks are unclaimed
    unclaimedCount = gdfBlksAll.query('claimed == 0')['X'].count()
    #Initialize the loop catch variable
    stopLoop = 0
    #Run until all blocks have been "claimed"
    while unclaimedCount > 0:
        #Extract all unclaimed blocks
        gdfBlks = gdfBlksAll[gdfBlksAll.claimed == 0].reset_index()
        #Get the initial block (the western most one); get its BHH and geometry
        gdfBlock = gdfBlks[gdfBlks.X == gdfBlks.X.min()]
        BHH = gdfBlock.BlackHH.sum()
        geom = gdfBlock.geometry.unary_union
        #Expand the geometry until 100 BHH are found
        stopLoop2 = 0  #Loop break check
        while BHH < 100:
            #Select blocks touching the current shape
            # NOTE(review): unlike the claim step below, this does NOT filter
            # on claimed == 0, so already-claimed neighbors can be re-added
            # (and double-counted via the concat) -- confirm intended.
            gdfNbrs = gdfBlksAll[(gdfBlksAll.touches(geom))]
            gdfBoth = pd.concat((gdfBlock, gdfNbrs), axis='rows', sort=False)
            gdfBlock = gdfBoth.copy(deep=True)
            #Tally the BHHs in the area and update the area shape
            BHH = gdfBoth.BlackHH.sum()
            geom = gdfBoth.geometry.unary_union
            #Catch if run 100 times without getting to 100 BHH
            stopLoop2 += 1
            if stopLoop2 > 100:
                print("BHH never reached 100")
                break
        #Dissolve unclaimed features whose centroid falls inside the geometry
        gdfSelect = (gdfBlksAll[(gdfBlksAll.centroid.within(geom)) &
                                (gdfBlksAll.claimed == 0)
                                ]
                     .reset_index()
                     .dissolve(by='ClusterID', aggfunc='sum')
                     .drop(['level_0', 'index', 'X'], axis=1)
                     )
        #Set all features whose centroid is within the shape as "claimed"
        gdfBlksAll.loc[gdfBlksAll.geometry.centroid.within(geom), 'claimed'] = 1
        unclaimedCount = gdfBlksAll.query('claimed == 0')['X'].count()
        #Add the dataframe to the list, keeping only sub-clusters of >= 50 BHH
        gdfs.append(gdfSelect[gdfSelect['BlackHH'] >= 50])
        #Stop the loop if run for over 100 iterations
        stopLoop += 1
        if stopLoop > 100: break
#Write the raw sub-cluster units for inspection
# fix: path case was '../data/Durham/...' here but '../data/DURHAM/...'
# everywhere else -- a different folder on case-sensitive filesystems
pd.concat(gdfs).to_file('../data/DURHAM/Foo5x.shp')
#Combine all sub-cluster units into one dataframe
# (fix: the concat was duplicated; pasted notebook output of
#  `gdf_Orgx.columns` removed -- it was a SyntaxError in a .py file:)
# Index(['geometry', 'HOUSING10', 'POP10', 'P003001', 'P003003', 'P010001',
#        'P010004', 'PctBlack', 'PctBlack18', 'BlackHH', 'OrgID', 'claimed'],
#       dtype='object')
gdf_Orgx = pd.concat(gdfs)
#Keep only units that actually claimed blocks
gdf_Org3 = gdf_Orgx.query('claimed > 0').reset_index()
#(scratch check: gdf_Org3.BlackHH.min() should be >= 50)
#Percentages were summed by the dissolve; recompute from summed counts
gdf_Org3['PctBlack'] = gdf_Org3['P003003'] / gdf_Org3['P003001'] * 100
gdf_Org3['PctBlack18'] = gdf_Org3['P010004'] / gdf_Org3['P010001'] * 100
gdf_Org3.to_file('../data/DURHAM/Keep3.shp')
#Assign a unique ID to each org unit
for i, df in enumerate(gdfs):
    df['OrgIDx'] = i
#Combine all together
# NOTE(review): this rebuilds gdf_Org3 from gdfs, discarding the
# claimed-filter and recomputed percentages written just above -- the two
# Keep3.shp writes disagree; confirm which is intended.
gdf_Org3 = pd.concat(gdfs).reset_index()
#gdf_Org3 = gdfsClaimed.dissolve(by='OrgIDx',aggfunc='sum')
#Continue OrgID numbering after the full-cluster org units
gdf_Org3['OrgID'] = gdf_Org2['OrgID'].max() + gdf_Org3.index + 1
gdf_Org3['OrgType'] = 'Partial block cluster'
# fix: path case was '../data/Durham/...' (see note in the block above)
gdf_Org3.to_file('../data/DURHAM/Keep3.shp')
#Merge all three keepers into the final org-unit layer
gdfAllOrgs = pd.concat((gdf_Org1, gdf_Org2, gdf_Org3), axis=0, sort=True)
gdfAllOrgs.to_file('../data/DURHAM/Orgs.shp')