Notes:
To simulate "What Happens If a Nuclear Bomb Goes Off in Manhattan?" we first need to create the population (~20 million) living in the area. This notebook shows how we generate our synthetic population on a sample of two counties in New York (Sullivan and Ulster):
We use five datasets to synthesize our population:
The output of this notebook is essentially a population described by 6-tuples:
individual id, age, sex, social network (a list of ids), household id, workplace/school id
The files in particular are
iid,age,sex,hhid,sid
sid,point,stype
(hhold(1-11), school(12), wp(13)). To verify and validate the synthesized population, we examine:
# load the per-tract percentage errors and plot them across all tracts
with open('errors.pkl', 'rb') as f:
    errors = pd.concat(pickle.load(f), axis=1).T
# BUG FIX: the frame is named `errors`, not `err`
ax = errors.plot(style=['x','+','*','.'], figsize=(12,3))
ax.set(xticklabels='', xlabel='Census Tracts (N=71)', ylabel='Percentage Error (%)', #ylim=(-45,45),
       xlim=(-1,72), title='Synthesized Household Metrics w.r.t. Actual (Ulster & Sullivan Counties, NY)');
# load the synthesized population and the combined social network,
# then report the usual global graph metrics
with open('population.pkl', 'rb') as f:
    people = pd.concat(pickle.load(f))
G = gt.load_graph('k4p3-2.gml')
print(f'# of nodes: {G.num_vertices()}\n# of edges: {G.num_edges()}')
print(f'Clustering coefficient: {gt.clustering.global_clustering(G)[0]:.2f}')
print(f'Pseudo diameter: {int(gt.topology.pseudo_diameter(G)[0])}')
print(f'Average degree: {gt.stats.vertex_average(G, "total")[0]:.2f}')
# average path length estimated from a 1000-sample distance histogram
counts, bins = gt.stats.distance_histogram(G, samples=1000)
apl = (counts * bins[:-1]).sum() / counts.sum()
print(f'Average path length (N=1000): {apl:.2f}')
# of nodes: 258672 # of edges: 709213 Clustering coefficient: 0.28 Pseudo diameter: 20 Average degree: 5.48 Average path length (N=1000): 9.17
# load the three network layers and the workplace/school-annotated population
hhold, work, school = (gt.load_graph(f'{name}-2.gml')
                       for name in ('hhold', 'work', 'school'))
hcounts, hbins = gt.stats.vertex_hist(hhold, 'total', float_count=False)
wcounts, wbins = gt.stats.vertex_hist(work, 'total', float_count=False)
scounts, sbins = gt.stats.vertex_hist(school, 'total', float_count=False)
with open('population2.pkl', 'rb') as f:
    people = pd.concat(pickle.load(f))
print('# of employed:', len(people[(people.age >= 18) & (people.wp.notnull())]))
print('# of students:', len(people[people.age < 18]))
# of employed: 117488 # of students: 54320
# per-layer edge totals; summing degree*frequency counts each edge at both
# endpoints, so these figures are twice the actual edge counts
counts, bins = gt.stats.vertex_hist(G, 'total')
print('total number of edges: {:.0f}'.format(sum(counts*bins[:-1])))
for name, layer in (('household', hhold), ('work', work), ('school', school)):
    counts, bins = gt.stats.vertex_hist(layer, 'total', float_count=False)
    print('{} edges: {:.0f}'.format(name, sum(counts*bins[:-1])))
total number of edges: 1418426 household edges: 569912 work edges: 565500 school edges: 283014
# share of each relation type among all (endpoint-counted) edges
layer_names = ['household', 'work', 'school']
layer_sizes = [569912, 565500, 283014]
fig, ax = plt.subplots(figsize=(2,2))
ax.pie(layer_sizes, labels=layer_names, autopct='%1.2f%%')
ax.axis('equal');
#prepare a 2-hops (FoF) ego network
g = nx.read_gml('k4p3-2.gml')
#start with a single individual
i0 = '36105952300i550'
#get friends + friends of friends (FoF)
# NOTE(review): list concatenation with g.neighbors() assumes networkx 1.x,
# where neighbors() returns a list (2.x returns an iterator) -- confirm version
hop2 = set().union(*[[n]+g.neighbors(n) for n in g.neighbors(i0)])
g2 = nx.Graph(g.edges(hop2,data=True))
# i0's household: i0 plus every neighbor connected by an 'hhold' edge
hholdm = [i0] + [k for k,v in g[i0].items() if v['etype']=='hhold']
# split the household members' edges by relation type for coloring below
hhold_edges = [e for e in g2.edges(hholdm,data=True) if e[2]['etype'] == 'hhold']
work_edges = [e for e in g2.edges(hholdm,data=True) if e[2]['etype'] == 'work']
school_edges = [e for e in g2.edges(hholdm,data=True) if e[2]['etype'] == 'school']
#draw the graph
# node labels show 'age,sex' for i0 and its direct neighbors
labels = people.loc[[i0] + g.neighbors(i0)] #[k for k,v in g[i0].items()]
labels = (labels['age'].astype(str)+','+labels['sex']).to_dict()
pos = nx.spring_layout(g2)
fig,ax = plt.subplots();
blues = plt.get_cmap('Blues')
nx.draw_networkx_nodes(g2,pos,nodelist=[i0],alpha=.5)
nx.draw_networkx(g2,pos,nodelist=hop2, node_color=[1]*len(hop2),cmap=blues,vmin=0,vmax=1,alpha=0.15,
node_size=50,font_size=13, labels=labels, ax=ax)
# overlay the household members' edges, colored by relation type
nx.draw_networkx_edges(g2,pos, edge_color='red',edgelist = hhold_edges,width=2,alpha=.3,ax=ax)
nx.draw_networkx_edges(g2,pos, edge_color='blue',edgelist = school_edges,width=2,alpha=.3,ax=ax)
nx.draw_networkx_edges(g2,pos, edge_color='green',edgelist = work_edges,width=2,alpha=.3,ax=ax)
ax.axis('off');
f,axes = plt.subplots(nrows=2, ncols=2,figsize=(18,11))
# 2x2 grid: total / household / work / school degree distributions
ax = axes[0,0]
counts,bins = gt.stats.vertex_hist(G, 'total')
ax.bar(bins[:-1],counts,tick_label=bins[:-1])
ax.set(xlabel='Degree',ylabel='Frequency',#yscale='log',
title='Total Degree Distribution',xlim=(-0.5,len(counts)-0.5));
ax = axes[0,1]
counts,bins = gt.stats.vertex_hist(hhold, 'total', float_count=False)
# reset the zero-degree bin so people absent from this layer count as degree 0
counts[0] = G.num_vertices() - sum(counts)
ax.bar(bins[:-1],counts,tick_label=bins[:-1])
ax.set(xlabel='Degree',ylabel='Frequency',#yscale='log',
title='Household Degree Distribution',xlim=(-0.5,len(counts)-0.5));
ax = axes[1,0]
counts,bins = gt.stats.vertex_hist(work, 'total', float_count=False)
# zero-degree bin relative to all employed adults (see population2 counts)
counts[0] = len(people[(people.age >= 18) & (people.wp.notnull())]) - sum(counts)
ax.bar(bins[:-1],counts,tick_label=bins[:-1])
ax.set(xlabel='Degree',ylabel='Frequency',#yscale='log',
title='Work Degree Distribution',xlim=(-0.5,len(counts)-0.5));
ax = axes[1,1]
counts,bins = gt.stats.vertex_hist(school, 'total', float_count=False)
# zero-degree bin relative to all students (age < 18)
counts[0] = len(people[people.age < 18]) - sum(counts)
ax.bar(bins[:-1],counts,tick_label=bins[:-1])
ax.set(xlabel='Degree',ylabel='Frequency',#yscale='log',
title='School Degree Distribution',xlim=(-0.5,len(counts)-0.5));
# load workplace locations and map one tract's houses, workplaces and schools
with open('wps2.pkl', 'rb') as f:
    wps = pd.concat(pickle.load(f))
tract = dp.iloc[-5]
# roads = road.intersection(tract.geometry)
# houses / workplaces belonging to this tract (their ids start with the tract id)
thouses = gpd.GeoSeries(people[people.hhold.str.startswith(tract.name)].geometry)
twps = gpd.GeoSeries(wps[wps.index.str.startswith(tract.name)])
ax = gpd.GeoDataFrame(tract).T.plot(ls='--', color='white')               # tract borders
ax = roads.plot(color='grey', ax=ax)                                      # roads
markers = {}
ax = thouses.plot(ax=ax, markersize=3, color='orange', linestyle='None')  # houses
markers['house'] = ax.get_lines()[-1]
ax = twps.plot(ax=ax, markersize=4, color='purple', linestyle='None')     # workplaces
markers['workplace'] = ax.get_lines()[-1]
# NOTE(review): `school` here is the schools GeoDataFrame, not the school graph
edu = school[school.intersects(tract.geometry)]  # schools and daycares
ax = edu.plot(ax=ax, markersize=5, color='red', linestyle='None')
markers['school'] = ax.get_lines()[-1]
ax.axis('off')
ax.legend(title=tract.NAMELSAD10, *list(zip(*markers.items()))[::-1]);
Three steps to clean the road shapefile and extract its giant connected component:
1. Run v.clean.advanced with tools snap,break,rmdupl,rmsa and tolerance values 0.0001,0.0,0.0,0.0; save the result to cleaned.shp.
2. Run v.net.components (weak or strong does not matter since the network is undirected); save the result as giant_component.csv.
3. Join the component ids back onto the cleaned roads and save the giant component as gcc.shp:
# join each cleaned road segment with its component id and keep only the
# giant connected component (component id 1610 from v.net.components)
# BUG FIX: removed the stray leading ':' (markdown artifact) that broke the line
components = pd.read_csv('../nWMDmap2/giant_component.csv', usecols=[0])
cleaned = gpd.read_file('../nWMDmap2/cleaned.shp')
roads = cleaned[['LINEARID','MTFCC','STATEFP','COUNTYFP','geometry']].join(components)
roads = roads[roads.comp == 1610].drop('comp',axis=1)
roads.to_file('../nWMDmap2/gcc.shp')
The output:
To get inter-tract commuting data at census-tract level:
# DOWNLOAD SCRIPT: LODES7 origin-destination commute files
base = 'https://lehd.ces.census.gov/data/lodes/LODES7/'
# 'main' = workers living in-state, 'aux' = workers living out of state
for state in ['ny','nj','ct','pa','ri','ma']:
    for res in ['main','aux']:
        # MA's latest LODES7 release is 2011; the other states have 2010
        tail = '_JT00_2010.csv.gz' if state != 'ma' else '_JT00_2011.csv.gz'
        os.system('wget {0:}{1:}/od/{1:}_od_{2:}{3:}'.format(base, state, res, tail))
We are interested in these columns only (ripping off the rest by usecols=range(6)
):
# CREATE TRACT LEVEL O-D PAIRS
#GEOID: state(2)-county(3)-tract(6): e.g. 09-001-030300
census = gpd.read_file('../nWMDmap2/censusclip1.shp').set_index('GEOID10') #demographic profiles
# read only the first six columns; keep the two block geocodes as strings
read_workflow = partial(pd.read_csv,usecols=range(6),dtype={0:str,1:str})
wf = pd.concat([read_workflow(f) for f in glob('../od/*JT00*')]) #workflow
# truncate 15-digit block geocodes to 11-digit tract GEOIDs
wf['work'] = wf.w_geocode.str[:11]
wf['home'] = wf.h_geocode.str[:11]
# keep flows touching the study area at either end, then aggregate to tract pairs
od = wf[(wf.work.isin(census.index)) | (wf.home.isin(census.index))]
od = od.groupby(['work','home']).sum()
od.reset_index().to_csv('../od/tract-od.csv',index=False)
Workplace count in tract = |wp in county| * |# of employed in tract| / |# of employed in county|
Establishment numbers per size at county level is available in CBP dataset. Note: "An establishment is a single physical location at which business is conducted or services or industrial operations are performed" ref
In this section we create (tract_id,wp_count) tuples/series.
def number_of_wp(dp, od, cbp):
    """Estimate the number of workplaces in each census tract.

    wp_tract = wp_county * (tract_employed / county_employed), where the
    county workplace count comes from the CBP establishment-size columns
    and tract employment from the LODES O-D flows. Returns an int Series
    indexed like `dp` (ceiled, so any tract with workers gets at least 1).
    """
    size_classes = ["n1_4","n5_9","n10_19","n20_49","n50_99","n100_249","n250_499","n500_999","n1000","n1000_1","n1000_2","n1000_3","n1000_4"]
    # employees working in each tract
    employed = od[['work','S000']].groupby('work').sum()
    est = dp.portion.to_frame().merge(employed, left_index=True, right_index=True, how='left').fillna(0)
    est['county'] = est.index.str[:5]  # GEOID prefix: state(2)+county(3)
    county = est.groupby('county').sum()
    county = county.join(cbp[size_classes].sum(axis=1).to_frame('wpcty'))
    # note: as Dr. Crooks suggested, commuters living outside our region are included
    shares = (est.portion * est.S000 / est.county.map(county.S000)) * est.county.map(county.wpcty)
    return shares.apply(np.ceil).astype(int)
def wp_proba(x):
    """Return a sorted probability vector over x workplaces.

    Establishment sizes are lognormally distributed
    (http://www.haas.berkeley.edu/faculty/pdf/wallace_dynamics.pdf), so the
    probability of an employee working at a given workplace is a normalized
    lognormal draw. Returns an empty array when the tract has no workplaces.
    """
    if x == 0:
        return np.zeros(0)
    # l1-normalizing positive draws is just dividing by their sum --
    # no need for sklearn's normalize / the reshape-ravel dance
    b = np.random.lognormal(mean=2, size=x)
    return np.sort(b / b.sum())
There were two columns for each attribute, and for every pair at least one of the two values was present — ANDing their null masks matched no rows, i.e.
the result of this test was 0: [(p[0],len(school[school[p[0]].isnull() & school[p[1]].isnull()])) for p in pairs]
I cleaned these pairs and removed the rest, and saved the results as a new shapefile:
# each school attribute comes in two columns; coalesce every pair into the first
school = gpd.read_file('../nWMDmap2/schoolsclip1.shp')
pairs = [('ENROLLMENT','ENROLLME_2'), ('START_GRAD','START_GR_2'),
         ('END_GRADE','END_GRAD_2'), ('LEVEL','LEVEL_2')]
for primary, backup in pairs:
    school[primary] = school[primary].fillna(school[backup])
school[[p[0] for p in pairs] + ['geometry']].to_file('../nWMDmap2/school.shp')
And, for the daycares, filled the null values with the median:
# daycares: impute missing capacity with the median and keep only what we use
dc = gpd.read_file('../nWMDmap2/daycareclip1.shp')
dc['POPULATION'] = dc.POPULATION.fillna(dc.POPULATION.median()).astype(int)
dc[['POPULATION','geometry']].to_file('../nWMDmap2/daycare.shp')
def clean_schools(school, daycare):
    """Normalize school and daycare records into one frame.

    Maps grade codes to ages, imputes bad enrollments with the median, gives
    daycares the 0-5 age span, and returns a frame with columns
    start_age, end_age, ENROLLMENT, geometry and a 'current' enrollment
    counter (used later while assigning students).
    """
    school = school[school.START_GRAD != 'N']  # 'N' = unknown grade span
    # Grade-level codes: PK=PreKindergarten, KG=Kindergarten,
    # TK=Transitional Kindergarten, T1=Transitional First Grade,
    # 01-12 = Grades 1-12 (UG treated like KG)
    grade2age = {'PK':3,'KG':5,'UG':5, 'TK':5,'T1':6}
    grade2age.update({'{:02d}'.format(i+1): i+6 for i in range(12)})
    school = school.assign(start_age = school.START_GRAD.map(grade2age))
    school = school.assign(end_age = school.END_GRADE.map(grade2age) + 1)
    school.loc[school.END_GRADE=='PK','end_age'] = 6  # PK-only schools still cover ages 3-5, not 4
    mask = school.ENROLLMENT <= 0  # impute non-positive enrollments with the median
    school.loc[mask,'ENROLLMENT'] = school[~mask].ENROLLMENT.median()
    school = school[['start_age','end_age','ENROLLMENT','geometry']]
    daycare = daycare.assign(start_age = 0, end_age = 5)
    daycare = daycare.rename(columns={'POPULATION':'ENROLLMENT'})
    daycare = daycare[['start_age','end_age','ENROLLMENT','geometry']]
    # BUG FIX: DataFrame.append was removed in pandas 2.0 -- use pd.concat
    school = pd.concat([school, daycare], ignore_index=True)
    school['current'] = 0  # running enrollment counter
    return school
Each individual gets an age and a sex. For each census-tract, DP table provides the number of individuals for 18 age bands for two sexes. In age creation, we assume a uniform distribution within each band (Note: if the number of individuals in a band are not divisible by 5, then the remainder is added to the latest age in the band)
def create_individuals(tract):
    """Generate a population of ages and sexes as a DataFrame.

    Given the number of individuals in the 18 five-year age bands of each
    sex (DP table columns at positions 22-58, minus the DP0010039 subtotal),
    return a DataFrame with columns code (age-sex band), age ([0,89]),
    sex ('m'/'f') and an empty `friends` set per person, indexed
    '<tract>i<n>'. Ages are uniform within a band; the remainder of a count
    not divisible by 5 goes to the band's last age.
    """
    portion = tract.geometry.area / tract.Shape_Area  # fraction of the tract inside the study area
    age_sex_groups = (tract.iloc[22:59].drop('DP0010039') * portion).astype(int)
    dfs = []
    for code, count in enumerate(age_sex_groups):
        base_age = (code % 18) * 5          # first age of the 5-year band
        gender = 'm' if code < 18 else 'f'  # first 18 bands male, next 18 female
        ages = []
        for offset in range(4):
            ages.extend([offset + base_age] * (count // 5))
        ages.extend([base_age + 4] * (count - len(ages)))  # remainder -> last age of the band
        dfs.append(pd.DataFrame({'code': code, 'age': ages, 'sex': [gender] * count}))
    df = pd.concat(dfs).sample(frac=1, random_state=123).reset_index(drop=True)
    df.index = tract.name + 'i' + df.index.to_series().astype(str)
    # BUG FIX: [set()]*len(df) put the SAME set object in every row, so one
    # person's friends leaked into everyone's -- build a distinct set per row
    df['friends'] = [set() for _ in range(len(df))]
    return df
Number of households for 11 types are given in the DP table:
0 h&w (no<18)
1 h&w&ch (ch<18)
2 male (no<18)
3 male (ch<18)
4 female (no<18)
5 female (ch<18)
6 nonfamily group
7 lone male <65
8 lone male >65
9 lone female<65
10 lone female>65
Using this information as a constraint, we assign individuals to households. We also added other constraints such as
-4 < husband.age - wife.age < 9
Also, those who live in group quarters are all given one place. No assumption is made on their ages since we do not have information about the institutions within the tract, such as the number of universities (college dorms), nursing homes, jails, or military bases.
def create_households(tract, people):
    """Build the tract's household table and populate it with individuals.

    Expands the 11 household-type counts into one row per household,
    ordering nonfamily groups (type 6) last so family households get first
    pick of the population, then delegates to populate_households.
    """
    hh_cnt = get_hh_cnts(tract)
    hholds = pd.DataFrame()
    hholds['htype'] = np.repeat(hh_cnt.index, hh_cnt)
    # BUG FIX: DataFrame.append was removed in pandas 2.0 -- use pd.concat
    hholds = pd.concat([hholds[hholds.htype != 6].sort_values('htype', ascending=False),
                        hholds[hholds.htype == 6]])
    populate_households(tract, people, hholds)
def get_hh_cnts(tract):
    """Return the number of households of each of the 11 types as a Series.

    Eleven household types:
    0 h&w (no<18)        4 female (no<18)      8 lone male >65
    1 h&w&ch (ch<18)     5 female (ch<18)      9 lone female<65
    2 male (no<18)       6 nonfamily group    10 lone female>65
    3 male (ch<18)       7 lone male <65

    Counts come from the HOUSEHOLDS BY TYPE section of the DP table
    (positions 150-165), scaled by the portion of the tract inside the
    study area.
    """
    portion = tract.geometry.area / tract.Shape_Area  # fraction of the tract included
    hc = (tract.iloc[150:166] * portion).astype(int)  # HOUSEHOLDS BY TYPE
    hh_cnt = pd.Series(np.zeros(11), dtype=int)  # group quarters are not a household type
    # FIX: use .iloc -- integer keys on a label index fell back to positional
    # lookup, which is deprecated and removed in recent pandas
    # husband/wife families
    hh_cnt[0] = hc.iloc[4] - hc.iloc[5]    # husband-wife family, no own children < 18
    hh_cnt[1] = hc.iloc[5]                 # husband-wife family, OWN CHILDREN < 18
    # male householders
    hh_cnt[2] = hc.iloc[6] - hc.iloc[7]    # single male householder
    hh_cnt[3] = hc.iloc[7]                 # single male householder, OWN CHILDREN < 18
    # female householders
    hh_cnt[4] = hc.iloc[8] - hc.iloc[9]    # single female householder
    hh_cnt[5] = hc.iloc[9]                 # single female householder, OWN CHILDREN < 18
    # nonfamily householders
    hh_cnt[6] = hc.iloc[10] - hc.iloc[11]  # nonfamily group living
    hh_cnt[7] = hc.iloc[12] - hc.iloc[13]  # lone male < 65
    hh_cnt[8] = hc.iloc[13]                # lone male >= 65
    hh_cnt[9] = hc.iloc[14] - hc.iloc[15]  # lone female < 65
    hh_cnt[10] = hc.iloc[15]               # lone female >= 65
    return hh_cnt
def populate_households(tract, people, hholds):
    """Assign each person to a household; write hhold id and htype onto people.

    Household types 0-10 are filled by gen_households under the census
    constraints; group-quarters residents (DP0120014) share a single
    pseudo-household of type 11; any leftover people are attached to random
    family households as relatives. Mutates `people` and `hholds` in place.
    """
    mask = pd.Series(True, index=people.index)  # who is still unassigned
    hholds['members'] = hholds.htype.apply(gen_households, args=(people, mask,))
    # Group quarters are institutional (correctional facilities, juvenile
    # facilities, nursing facilities, ...) or noninstitutional
    # (college/university housing, military quarters, ...).
    portion = tract.geometry.area / tract.Shape_Area  # fraction of the tract included
    group_population = int(tract.DP0120014 * portion)  # residents of group quarters
    gq_indices = people[mask].index[:group_population]
    mask.loc[gq_indices] = False
    # leftover individuals join random family households as relatives
    relatives = people[mask].index
    extra = iter(relatives)
    hosts = hholds[hholds.htype < 7].sample(n=len(relatives), replace=True)
    hosts.members.apply(lambda m: m.append(next(extra)))  # appends on mutable lists
    mask.loc[relatives] = False
    # everyone in group quarters lives together in a house of the 12th type
    if group_population > 0:
        hholds.loc[len(hholds)] = {'htype': 11, 'members': gq_indices}
    hholds = hholds.set_index(tract.name + 'h' + pd.Series(np.arange(len(hholds)).astype(str)))
    def _flush(hh, ppl):
        for member in hh.members:
            ppl.loc[member, 'hhold'] = hh.name
            ppl.loc[member, 'htype'] = hh.htype
    hholds.apply(_flush, args=(people,), axis=1)
    people['htype'] = people.htype.astype(int)
def gen_households(hh_type, people, mask):
    """Draw the members of one household of `hh_type` from the unassigned pool.

    people : DataFrame with code/age/sex columns; mask : boolean Series
    marking who is still unassigned (mutated in place). Returns the list of
    member ids, householder first.
    """
    members = []
    # admissible age-sex band (code) of the householder, per household type
    head_ranges = [range(4,18), range(4,14), range(4,18), range(4,14), range(22,36), range(22,30),
                   chain(range(1,18),range(19,36)), range(4,13), range(13,18), range(21,31), range(31,36)]
    # add the householder
    pot = people[mask].code
    iindex = pot[pot.isin(head_ranges[hh_type])].index[0]
    h1 = people.loc[iindex]  # householder's age & sex
    mask[iindex] = False
    members.append(iindex)
    # if living alone then return the household
    if hh_type > 6:
        return members
    # if husband and wife, add the wife
    if hh_type in (0, 1):
        pot = people[mask].code
        # wife's band is 17-18 codes above: -4 < husband.age - wife.age < 9
        iindex = pot[pot.isin(range(h1.code+17, h1.code+19))].index[0]
        h2 = people.loc[iindex]
        mask[iindex] = False
        members.append(iindex)
    # A 'child' is a son or daughter by birth, a stepchild, or an adopted
    # child of the householder, regardless of age or marital status
    # (excludes in-laws and foster children).
    # household types with at least one child (<18)
    if hh_type in (1, 3, 5):
        # mean number of children per family type:
        # https://www.census.gov/hhes/families/files/graphics/FM-3.pdf
        if hh_type == 1:
            num_of_child = max(1, abs(int(np.random.normal(2))))    # gaussian touch
        elif hh_type == 3:
            num_of_child = max(1, abs(int(np.random.normal(1.6))))  # gaussian touch
        else:
            num_of_child = max(1, abs(int(np.random.normal(1.8))))  # gaussian touch
        pot = people[mask]
        if hh_type == 1:
            iindices = pot[(pot.age < 18) & (45 > h2.age - pot.age)].index[:num_of_child]
        else:  # parent-child age gap capped at 44 (mother) / 54 (father)
            age_diff = 45 if hh_type == 5 else 55
            iindices = pot[(pot.age < 18) & (age_diff > h1.age - pot.age)].index[:num_of_child]
        for i in iindices:
            mask[i] = False
            members.append(i)
    # nonfamily group: friends or unmarried partners of a similar age band
    if hh_type == 6:
        pot = people[mask].code
        num_of_friends = max(1, abs(int(np.random.normal(1.3))))  # gaussian touch
        iindices = pot[pot.isin(range(h1.code-2, h1.code+3))].index[:num_of_friends]
        for i in iindices:
            mask[i] = False
            members.append(i)
    return members
def assign_workplaces(tract, people, od):
    """Give a sample of adults a workplace id based on the O-D commute flows.

    If a worker's destination tract is not in the DP dataset the id is just
    '<dtract>w'; otherwise a specific establishment '<dtract>w<k>' is drawn
    with the tract's lognormal workplace probabilities (dp.WP_PROBA).
    The actual size distribution of establishments is lognormal:
    https://www.princeton.edu/~erossi/fsdae.pdf
    Mutates people['wp'] in place.
    """
    # scale the O-D flows to the portion of the tract inside the study area
    portion = tract.geometry.area / tract.Shape_Area
    flows = od[od['home'] == tract.name].set_index('work').S000
    flows = (flows * portion).apply(np.ceil).astype(int)  # from this tract to others
    # 58.5%: US population (16+) employment rate
    # https://data.bls.gov/timeseries/LNS12300000
    workers = people[people.age >= 18].sample(flows.sum()).index
    dest = pd.Series(np.repeat(flows.index.values, flows.values))  # destination tract per worker
    def _wp_label(dtract):
        if dtract in dp.index:
            return dtract + 'w' + str(np.random.choice(dp.loc[dtract, 'WP_CNT'], p=dp.loc[dtract, 'WP_PROBA']))
        return dtract + 'w'
    people.loc[workers, 'wp'] = dest.apply(_wp_label).values
The first step of population synthesis is creating the environment (buildings):
Note that around NY (40N,74W), 1 degree is ~100km (1-lat = 111km, 1-lon = 85km).
#shapely geometries are not hashable, here is my hash function to check the duplicates
def hash_geom(x):
    """Return a hashable, rounded fingerprint of a (Multi)LineString.

    Coordinates are rounded to 6 decimals so segments differing only by
    floating-point noise hash equal.
    NOTE(review): coords are (x, y); the lat/lon naming looks swapped, but
    it is harmless for duplicate detection.
    """
    if x.geom_type == 'MultiLineString':
        # BUG FIX: iterate .geoms -- direct iteration over multi-part
        # geometries was removed in shapely 2.0 (.geoms works in 1.x too)
        return tuple((round(lat,6),round(lon,6)) for s in x.geoms for lat,lon in s.coords[:])
    else:
        return tuple((round(lat,6),round(lon,6)) for lat,lon in x.coords[:])
# create spaces
def create_spaces(tract, hcnt, od, road, HD=0.0005, WD=0.0002, avg_wp=10):
    """Place houses and workplaces along the tract's roads.

    Houses are interpolated every HD degrees (~50 m) along residential roads
    (MTFCC S1400/S1740); workplaces every WD (~20 m) along S1200 plus at the
    joints of the residential roads. Both sets are then resampled to the
    required counts (hcnt households, tract.WP_CNT workplaces).
    Returns (houses, wps) GeoSeries indexed '<tract>h<n>' / '<tract>w<n>'.
    """
    # DP0180001: Total housing units, DP0180002: Occupied housing units
    if tract.DP0120014 > 0:
        hcnt += 1  # one extra dwelling for the group-quarters pseudo-household
    mask = road.intersects(tract.geometry)
    # houses go on local/private roads
    hmask = mask & road.MTFCC.str.contains('S1400|S1740')
    hrd = road[hmask].intersection(tract.geometry)
    hrd = hrd[hrd.geom_type.isin(['LinearRing', 'LineString', 'MultiLineString'])]
    hrd = hrd[~hrd.apply(hash_geom).duplicated()]
    houses = hrd.apply(lambda x: pd.Series([x.interpolate(seg) for seg in np.arange(HD, x.length, HD)]))
    houses = houses.unstack().dropna().reset_index(drop=True)  # flatten
    houses = houses.sample(n=hcnt, replace=True).reset_index(drop=True)
    houses.index = tract.name + 'h' + houses.index.to_series().astype(str)
    # workplaces on secondary roads (S1200)
    wmask = mask & road.MTFCC.str.contains('S1200')
    wrd = road[wmask].intersection(tract.geometry)
    wrd = wrd[wrd.geom_type.isin(['LinearRing', 'LineString', 'MultiLineString'])]
    wrd = wrd[~wrd.apply(hash_geom).duplicated()]
    # BUG FIX: np.arange(x.length, WD) is empty whenever x.length > WD;
    # mirror the house placement and drop a point every WD along the road
    swps = wrd.apply(lambda x: pd.Series([x.interpolate(seg) for seg in np.arange(WD, x.length, WD)]))
    # workplaces on the joints of S1400|S1740
    # (.geoms[0]: direct indexing of multi-part geometries was removed in shapely 2.0)
    rwps = hrd.apply(lambda x: Point(x.coords[0]) if type(x) != MultiLineString else Point(x.geoms[0].coords[0]))
    # BUG FIX: Series.append was removed in pandas 2.0 -- use pd.concat
    wps = pd.concat([rwps, swps.unstack().dropna().reset_index(drop=True)])
    wps = wps.sample(n=tract.WP_CNT, replace=True).reset_index(drop=True)
    wps.index = tract.name + 'w' + wps.index.to_series().astype(str)
    return gpd.GeoSeries(houses), gpd.GeoSeries(wps)
Individuals populate the households
def populate_households(tract, people, hholds):
    """Assign every individual in `people` to a household in `hholds`.

    Household types:
    0 h&w (no<18)         6 nonfamily group
    1 h&w&ch (ch<18)      7 lone male <65
    2 male (no<18)        8 lone male >65
    3 male (ch<18)        9 lone female<65
    4 female (no<18)     10 lone female>65
    5 female (ch<18)     11 group quarters (added here)

    Fills hholds['members'], re-indexes hholds as '<tract>h<n>' and writes
    each person's hhold id and htype back onto `people` (in place).
    """
    def gen_households(hh_type, mask):
        """Draw the members of one household of `hh_type`; mutates `mask`
        (boolean Series of still-unassigned people)."""
        members = []
        # admissible age-sex band (code) of the householder, per household type
        head_ranges = [range(4,18), range(4,14), range(4,18), range(4,14), range(22,36), range(22,30),
                       chain(range(1,18),range(19,36)), range(4,13), range(13,18), range(21,31), range(31,36)]
        # add the householder
        pot = people[mask].code
        iindex = pot[pot.isin(head_ranges[hh_type])].index[0]
        h1 = people.loc[iindex]  # householder's age & sex
        mask[iindex] = False
        members.append(iindex)
        if hh_type > 6:  # living alone
            return members
        if hh_type in (0, 1):  # husband and wife: add the wife
            pot = people[mask].code
            # wife's band is 17-18 codes above: -4 < husband.age - wife.age < 9
            iindex = pot[pot.isin(range(h1.code+17, h1.code+19))].index[0]
            h2 = people.loc[iindex]
            mask[iindex] = False
            members.append(iindex)
        # A 'child' is a son/daughter by birth, a stepchild or an adopted child
        # of the householder, of any age (excludes in-laws and foster children).
        if hh_type in (1, 3, 5):  # at least one own child under 18
            # mean number of children per family type:
            # https://www.census.gov/hhes/families/files/graphics/FM-3.pdf
            if hh_type == 1:
                num_of_child = max(1, abs(int(np.random.normal(2))))    # gaussian touch
            elif hh_type == 3:
                num_of_child = max(1, abs(int(np.random.normal(1.6))))  # gaussian touch
            else:
                num_of_child = max(1, abs(int(np.random.normal(1.8))))  # gaussian touch
            pot = people[mask]
            if hh_type == 1:
                iindices = pot[(pot.age < 18) & (45 > h2.age - pot.age)].index[:num_of_child]
            else:  # parent-child age gap capped at 44 (mother) / 54 (father)
                age_diff = 45 if hh_type == 5 else 55
                iindices = pot[(pot.age < 18) & (age_diff > h1.age - pot.age)].index[:num_of_child]
            for i in iindices:
                mask[i] = False
                members.append(i)
        if hh_type == 6:  # nonfamily group: friends / unmarried partners of a similar band
            pot = people[mask].code
            num_of_friends = max(1, abs(int(np.random.normal(1.3))))  # gaussian touch
            iindices = pot[pot.isin(range(h1.code-2, h1.code+3))].index[:num_of_friends]
            for i in iindices:
                mask[i] = False
                members.append(i)
        return members

    mask = pd.Series(True, index=people.index)  # who is still unassigned
    hholds['members'] = hholds.htype.apply(gen_households, args=(mask,))
    # Group quarters are institutional (correctional facilities, juvenile
    # facilities, nursing facilities, ...) or noninstitutional
    # (college/university housing, military quarters, ...).
    portion = tract.geometry.area / tract.Shape_Area  # fraction of the tract included
    group_population = int(tract.DP0120014 * portion)  # residents of group quarters
    gq_indices = people[mask].index[:group_population]
    mask.loc[gq_indices] = False
    # distribute the remaining unassigned people as relatives of family households
    relatives = people[mask].index
    it = iter(relatives)
    relative_hhs = hholds[hholds.htype < 7].sample(n=len(relatives), replace=True)
    relative_hhs.members.apply(lambda x: x.append(next(it)))  # appends on mutable lists
    mask.loc[relatives] = False
    # group-quarters residents all live together in a house of the 12th type
    if group_population > 0:
        hholds.loc[len(hholds)] = {'htype': 11, 'members': gq_indices}
    # BUG FIX: set_index returns a *new* frame (the result used to be thrown
    # away) and 'str' + an int index raised -- build the string ids explicitly
    hholds = hholds.set_index(tract.name + 'h' + hholds.index.astype(str))
    def hh_2_people(hh, ppl):
        for m in hh.members:
            ppl.loc[m, 'hhold'] = hh.name
            ppl.loc[m, 'htype'] = hh.htype
    hholds.apply(hh_2_people, args=(people,), axis=1)
def get_errors(tract, people):
    """Percentage errors of the synthesized households vs the census DP table.

    Returns a Series (named after the tract) with the tract population and
    group-quarters count plus percentage errors for average family size,
    average household size, and the number of households containing seniors
    (65+) / minors (<18). Actual counts are scaled to the included tract
    portion; a zero actual count yields NaN instead of ZeroDivisionError.
    """
    err = {}
    portion = tract.geometry.area / tract.Shape_Area  # fraction of the tract included
    senior_actual = int(tract.DP0150001 * portion)  # households with individuals 65 and over
    minor_actual = int(tract.DP0140001 * portion)   # households with individuals under 18
    err['population'] = tract.DP0010001
    err['in_gq'] = tract.DP0120014
    avg_synthetic_family = people[people.htype < 6].groupby('hhold').size().mean()
    err['avg_family'] = 100 * (avg_synthetic_family - tract.DP0170001) / tract.DP0170001
    err['avg_hh'] = 100 * (people[people.htype != 11].groupby('hhold').size().mean()
                           - tract.DP0160001) / tract.DP0160001
    # guard against zero actual counts in thin tract slivers
    err['senior_hh'] = (100 * (people[people.age >= 65].hhold.nunique() - senior_actual)
                        / senior_actual) if senior_actual else np.nan
    err['minor_hh'] = (100 * (people[people.age < 18].hhold.nunique() - minor_actual)
                       / minor_actual) if minor_actual else np.nan
    return pd.Series(err, name=tract.name)
def assign_hholds_to_houses(hholds, houses, people):
    """Attach each household to a house and propagate it to the members.

    Houses are shuffled (fixed seed) and matched to households by position;
    everyone starts 'homeless'/'unemployed' and gets overwritten via the
    membership lists. Returns `people` as a GeoDataFrame with a geometry
    column mapped from the assigned house.
    """
    hholds['house'] = houses.sample(frac=1, random_state=123).index
    # defaults for anyone not reached through a membership list
    people['house'] = 'homeless'
    people['htype'] = 'homeless'
    people['work'] = 'unemployed'
    def _spread(hh, ppl):
        for member in hh.members:
            ppl.loc[member, 'house'] = hh.house
            ppl.loc[member, 'htype'] = hh.htype
    hholds.apply(_spread, args=(people,), axis=1)
    people['geometry'] = people.house.map(houses)
    return gpd.GeoDataFrame(people)
def assign_students_to_schools(tract, people, school, buffer=0.3):
    """Enroll every minor (<18) in a school or daycare.

    Candidates are the institutions intersecting a `buffer`-degree (~30 km)
    buffer around the tract. Each kid is sent to the nearest candidate with
    spare capacity; if all are full, to the candidate with the lowest
    enrollment ratio. Mutates people['wp'] and school['current'] in place.
    """
    def _enroll(kid, candidates, sch):
        by_dist = candidates.distance(kid.geometry).sort_values()
        for sid in by_dist.index:  # nearest first
            if sch.loc[sid, 'current'] < sch.loc[sid, 'ENROLLMENT']:
                sch.loc[sid, 'current'] += 1
                return sid
        # no space left at any school within the buffer
        fallback = (candidates.current / candidates.ENROLLMENT).idxmin()
        sch.loc[fallback, 'current'] += 1
        return fallback
    minors = people.age < 18
    zone = tract.geometry.buffer(buffer)
    candidates = school[school.intersects(zone)]  # potential schools and daycares
    people.loc[minors, 'wp'] = people[minors].apply(_enroll, args=(candidates, school), axis=1)
def synthesize(tract, od, road, school, errors, population, wps):
    """Run the full per-tract synthesis pipeline.

    Creates individuals, households, workplaces, houses and school
    assignments for one tract, then appends the people, the workplace
    locations and the error metrics to the shared result lists.
    """
    t0 = timeit.default_timer()
    print(tract.name, 'started...', end=' ')
    people = create_individuals(tract)
    create_households(tract, people)
    assign_workplaces(tract, people, od)
    # create the physical spaces and put each household in a house
    houses, wp = create_spaces(tract, people.hhold.nunique(), od, road)
    people['geometry'] = people.hhold.map(houses)
    assign_students_to_schools(tract, people, school)
    err = get_errors(tract, people)
    wps.append(wp)
    population.append(people)
    errors.append(err)
    print(tract.name, 'now ended ({:.1f} secs)'.format(timeit.default_timer() - t0))
Individuals living in the same house, or working in the same workplace, or attending the same school have relations at different levels. It has been argued that the intimate social network size is 5 (Dunbar). Following it, we create Newman–Watts–Strogatz small-world graphs with k=4 and p=0.3 for each network (home/work/school).
def create_networks(people, k=4, p=.3):
    """Build the three-layer social network (household / work / school).

    Small-world edges are generated per household, per workplace (adults)
    and per school (minors); see create_edges for the per-group model.
    Returns the combined networkx graph over all individuals.
    """
    net = nx.Graph()
    net.add_nodes_from(people.index)
    layers = ((people, 'hhold', 'hhold'),
              (people[people.age >= 18], 'wp', 'work'),
              (people[people.age < 18], 'wp', 'school'))
    for subset, col, etype in layers:
        subset.groupby(col).apply(lambda grp, et=etype: create_edges(grp, net, etype=et, k=k, p=p))
    return net
def create_edges(x, g, etype=None, k=4, p=.3):
    """Create social ties among the members of group `x` and add them to graph `g`.

    Groups of 5 or fewer become a complete graph (Dunbar's intimate-circle
    size); larger groups get a Newman–Watts–Strogatz small-world graph with
    the given k and p. Edges are tagged with `etype` when one is supplied.
    http://www.sciencedirect.com/science/article/pii/S0375960199007574
    """
    n = len(x)
    sw = nx.complete_graph(n) if n <= 5 else nx.newman_watts_strogatz_graph(n, k, p)
    # relabel the generator's integer nodes with the members' individual ids
    sw = nx.relabel_nodes(sw, dict(zip(sw.nodes(), x.index.values)))
    if etype:
        g.add_edges_from(sw.edges(), etype=etype)
    else:
        g.add_edges_from(sw.edges())
#imports
import networkx as nx
# NOTE(review): unusual import form — graph_tool is normally imported as
# `import graph_tool.all as gt`; confirm this resolves in the environment.
from graph_tool.all import graph_tool as gt
import pickle
from shapely.prepared import prep
from shapely.ops import snap, linemerge, nearest_points
from shapely.geometry import MultiLineString, LineString, Point, Polygon, GeometryCollection
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from itertools import chain
from glob import glob
import sys
import timeit
import os
from io import StringIO
from IPython.display import display, HTML, Image
from simpledbf import Dbf5
import geopandas as gpd
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
import random
import matplotlib.pyplot as plt
import matplotlib as mpl
# reset any previously customized rcParams before applying our own
mpl.rcParams.update(mpl.rcParamsDefault)
import seaborn as sns
plt.style.use('ggplot')
Set1 = plt.get_cmap('Set1')
%matplotlib inline
# render inline and saved figures at the same (print-quality) resolution
mpl.rcParams['savefig.dpi'] = 150
mpl.rcParams['figure.dpi'] = 150
mpl.rcParams['figure.figsize'] = 12, 8
mpl.rcParams['axes.facecolor']='white'
mpl.rcParams['savefig.facecolor']='white'
# show full tick values rather than an offset (e.g. avoid "+1e6" labels)
mpl.rcParams['axes.formatter.useoffset']=False
PyTables is not installed. No support for HDF output. SQLalchemy is not installed. No support for SQL output.
For the demo, we work on three census tracts, so we created a clipped road shapefile for faster reading.
# Clip the road network to the three demo tracts and save it for reuse.
# NOTE(review): this cell uses `dp` and `road`, which are loaded in later
# cells — notebook cells were executed out of listing order.
tracts = '36105952300 36105952400 36105952500'.split()
fields = dp.loc[tracts,:].unary_union  # merged geometry of the demo tracts
road = road[road.intersects(fields)]  # keep only segments touching the demo area
road.to_file('../road.shp')
#read in the preprocessed files
# road = gpd.read_file('../nWMDmap2/gcc.shp') #road file
# road = gpd.read_file('../road.shp') #road file
road = gpd.read_file('../data/SU/comp1.shp') #road file
# demographic profiles
# dp = gpd.read_file('../nWMDmap2/censusclip1.shp').set_index('GEOID10')
dp = gpd.read_file('../data/SU/SU_census.shp').set_index('GEOID10')
# fraction of each tract's original area kept after clipping
dp['portion'] = dp.apply(lambda tract: tract.geometry.area / tract.Shape_Area, axis=1)
# origin (home) - destination (job) at census-tract level
# first two columns are FIPS codes: read as str to preserve leading zeros
od = pd.read_csv('../od/tract-od.csv',dtype={i:(str if i<2 else int) for i in range(6)})
#Number of establishments per county per size
cbp = pd.read_csv('../data/cbp10co.zip')
cbp = cbp[(cbp.naics.str.startswith('-'))] #All types of establishments included
# zero-padded state (2) + county (3) FIPS -> 5-digit county key
cbp['fips'] = cbp.fipstate.map("{:02}".format) + cbp.fipscty.map("{:03}".format)
cbp = cbp.set_index('fips')
#add workplace counts and sizes to dp
dp['WP_CNT'] = number_of_wp(dp,od,cbp)
dp['WP_PROBA'] = dp.WP_CNT.map(wp_proba)
# school = gpd.read_file('../nWMDmap2/school.shp')
# daycare = gpd.read_file('../nWMDmap2/daycare.shp')
# school = clean_schools(school,daycare)
# school.to_file('../nWMDmap2/education.shp')
school = gpd.read_file('../nWMDmap2/education.shp')
# preview the three demo tracts
tracts = '36105952300 36105952400 36105952500'.split()
dp.loc[tracts]
ALAND10 | AWATER10 | DP0010001 | DP0010002 | DP0010003 | DP0010004 | DP0010005 | DP0010006 | DP0010007 | DP0010008 | ... | DP0230002 | INTPTLAT10 | INTPTLON10 | NAMELSAD10 | Shape_Area | Shape_Leng | geometry | portion | WP_CNT | WP_PROBA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
GEOID10 | |||||||||||||||||||||
36105952300 | 122261325.0 | 4092105.0 | 1515 | 56 | 72 | 91 | 87 | 69 | 48 | 44 | ... | 2.26 | +41.5703095 | -074.9953079 | Census Tract 9523 | 0.013643 | 0.573302 | POLYGON ((-74.97426417153829 41.62098150620275... | 0.397886 | 8 | [0.0587462790098, 0.0630977256478, 0.070657904... |
36105952400 | 129644506.0 | 4835408.0 | 2530 | 101 | 126 | 142 | 166 | 139 | 107 | 122 | ... | 2.25 | +41.5673658 | -074.8865188 | Census Tract 9524 | 0.014510 | 0.648611 | POLYGON ((-74.96596599999992 41.47684400000003... | 0.979805 | 36 | [0.00210997329252, 0.00279947830991, 0.0034001... |
36105952500 | 120534573.0 | 7269494.0 | 2468 | 135 | 137 | 176 | 170 | 102 | 114 | 118 | ... | 2.74 | +41.4924235 | -074.8122203 | Census Tract 9525 | 0.013780 | 0.658278 | POLYGON ((-74.75637999999998 41.42807199999999... | 1.000000 | 10 | [0.0250950173172, 0.0366161842029, 0.042072704... |
3 rows × 197 columns
# Single-tract demo run of the full synthesis pipeline.
tract = dp.loc['36111952600']
people = create_individuals(tract)
create_households(tract,people)
assign_workplaces(tract,people,od)
#create spaces
houses, wp = create_spaces(tract, people.hhold.nunique(), od, road)
#assign households to houses
people['geometry'] = people.hhold.map(houses)
assign_students_to_schools(tract,people,school)
g = create_networks(people)
# validation errors for this tract (was computed twice; duplicate call removed)
err = get_errors(tract,people)
# plot the per-metric percentage errors, dropping the absolute-count rows
ax = err.drop(['population','in_gq']).plot.bar()
ax.set(ylabel='Percentage errors',title=err.name);
#networks
# Build the per-tract social graph by hand (same grouping as create_networks)
# and report the size of its giant component.
g = nx.Graph()
nodes = people[people.index.str.startswith(tract.name)]
grouped = nodes.groupby('hhold')
grouped.apply(lambda x: create_edges(x,g,etype='hhold'))
grouped = nodes[nodes.age>=18].groupby('wp')
grouped.apply(lambda x: create_edges(x,g,etype='work'))
grouped = nodes[nodes.age<18].groupby('wp')
grouped.apply(lambda x: create_edges(x,g,etype='school'))
# nx.connected_component_subgraphs was removed in networkx 2.4;
# derive the giant component from connected_components instead.
giant = g.subgraph(max(nx.connected_components(g), key=len))
giant.number_of_nodes()
# Accumulators filled in-place by synthesize(), one entry per tract.
population = []
errors = []
wps = []
# %prun
# run the full pipeline for every tract in the demographic profiles
dp.apply(lambda t: synthesize(t,od,road,school,errors, population, wps),axis=1);
36111952600 started... 36111952600 now ended (27.0 secs) 36111952400 started... 36111952400 now ended (12.1 secs) 36111952700 started... 36111952700 now ended (13.7 secs) 36111953400 started... 36111953400 now ended (20.3 secs) 36111954800 started... 36111954800 now ended (22.9 secs) 36111953700 started... 36111953700 now ended (27.3 secs) 36111954000 started... 36111954000 now ended (25.7 secs) 36111950100 started... 36111950100 now ended (24.3 secs) 36111951600 started... 36111951600 now ended (10.9 secs) 36111951900 started... 36111951900 now ended (13.9 secs) 36111954100 started... 36111954100 now ended (31.5 secs) 36111951200 started... 36111951200 now ended (17.4 secs) 36111954700 started... 36111954700 now ended (10.9 secs) 36111954200 started... 36111954200 now ended (33.1 secs) 36111954500 started... 36111954500 now ended (23.6 secs) 36111950500 started... 36111950500 now ended (18.2 secs) 36111953600 started... 36111953600 now ended (32.4 secs) 36111952900 started... 36111952900 now ended (29.4 secs) 36111952500 started... 36111952500 now ended (15.4 secs) 36111952100 started... 36111952100 now ended (14.9 secs) 36111951500 started... 36111951500 now ended (14.6 secs) 36111951400 started... 36111951400 now ended (15.3 secs) 36111950900 started... 36111950900 now ended (11.4 secs) 36111951700 started... 36111951700 now ended (18.7 secs) 36111951000 started... 36111951000 now ended (21.6 secs) 36111954900 started... 36111954900 now ended (11.5 secs) 36111954400 started... 36111954400 now ended (80.0 secs) 36111951100 started... 36111951100 now ended (14.9 secs) 36111953500 started... 36111953500 now ended (27.7 secs) 36111955000 started... 36111955000 now ended (25.2 secs) 36111954600 started... 36111954600 now ended (9.7 secs) 36111950600 started... 36111950600 now ended (14.8 secs) 36111952800 started... 36111952800 now ended (14.9 secs) 36111953900 started... 36111953900 now ended (23.2 secs) 36111950300 started... 
36111950300 now ended (15.6 secs) 36111950200 started... 36111950200 now ended (14.2 secs) 36111950400 started... 36111950400 now ended (21.0 secs) 36111953300 started... 36111953300 now ended (25.6 secs) 36111951300 started... 36111951300 now ended (18.2 secs) 36111952300 started... 36111952300 now ended (7.3 secs) 36111951800 started... 36111951800 now ended (7.8 secs) 36111952200 started... 36111952200 now ended (19.1 secs) 36111953000 started... 36111953000 now ended (11.3 secs) 36111953800 started... 36111953800 now ended (18.9 secs) 36111952000 started... 36111952000 now ended (18.9 secs) 36111955300 started... 36111955300 now ended (16.9 secs) 36111955400 started... 36111955400 now ended (26.8 secs) 36105951900 started... 36105951900 now ended (13.1 secs) 36105952000 started... 36105952000 now ended (16.8 secs) 36105950400 started... 36105950400 now ended (17.0 secs) 36105952200 started... 36105952200 now ended (7.2 secs) 36105952100 started... 36105952100 now ended (12.8 secs) 36105950800 started... 36105950800 now ended (22.7 secs) 36105950900 started... 36105950900 now ended (19.3 secs) 36105951000 started... 36105951000 now ended (14.3 secs) 36105951100 started... 36105951100 now ended (14.6 secs) 36105951300 started... 36105951300 now ended (22.2 secs) 36105951700 started... 36105951700 now ended (24.4 secs) 36105950300 started... 36105950300 now ended (8.1 secs) 36105952400 started... 36105952400 now ended (11.5 secs) 36105950500 started... 36105950500 now ended (13.3 secs) 36105950600 started... 36105950600 now ended (10.1 secs) 36105950700 started... 36105950700 now ended (17.2 secs) 36105952500 started... 36105952500 now ended (10.2 secs) 36105951200 started... 36105951200 now ended (45.0 secs) 36105950100 started... 36105950100 now ended (14.2 secs) 36105950200 started... 36105950200 now ended (14.1 secs) 36105951500 started... 36105951500 now ended (10.9 secs) 36105951600 started... 36105951600 now ended (17.1 secs) 36105951800 started... 
36105951800 now ended (31.4 secs) 36105952300 started... 36105952300 now ended (7.9 secs)
#save the results
# Each pickle holds a list with one entry per tract (see synthesize()).
with open('errors2.pkl', 'wb') as f:
    pickle.dump(errors, f)
with open('population2.pkl', 'wb') as f:
    pickle.dump(population, f)
with open('wps2.pkl', 'wb') as f:
    pickle.dump(wps, f)
# create and save the networks
# BUG FIX: `people` here held only the last tract processed in the demo cell,
# but the saved graph must span the whole synthesized population (the analysis
# section reports 258,672 nodes, i.e. all tracts). Concatenate the per-tract
# frames accumulated in `population` first.
all_people = pd.concat(population)
g = create_networks(all_people,k=4,p=.3)
nx.write_gml(g,'k4p3-2.gml')
# all_people['friends'] = all_people.index.map(lambda x: set(g.neighbors(x)))
# persist one subgraph per relation type, plus a combined work+school graph
for etype in ['hhold','work','school']:
    sg = nx.Graph([(u,v) for u,v,d in g.edges(data=True) if d['etype']==etype])
    nx.write_gml(sg,etype+'-2.gml')
work_school = nx.Graph([(u,v) for u,v,d in g.edges(data=True) if d['etype'] in ['work','school']])
nx.write_gml(work_school,'work_school-2.gml')