This is currently only intended for use with block-level data as base units.
James Gaboardi (jgaboardi@gmail.com), 2020-05
%load_ext watermark
%watermark
2020-06-13T12:56:41-04:00 CPython 3.7.6 IPython 7.15.0 compiler : Clang 9.0.1 system : Darwin release : 19.5.0 machine : x86_64 processor : i386 CPU cores : 8 interpreter: 64bit
import inspect
import nhgisxwalk
import numpy
import pandas
%load_ext autoreload
%autoreload 2
%watermark -w
%watermark -iv
watermark 2.0.2 pandas 1.0.4 numpy 1.18.5 nhgisxwalk 0.0.2
# Years the crosswalk runs from (source) and to (target).
source_year = "1990"
target_year = "2010"

# Names of the per-year ID columns found in the base crosswalk.
gj_src = f"GJOIN{source_year}"
gj_trg = f"GJOIN{target_year}"

# Path to the block-to-block base crosswalk in the testing subsets directory.
base_xwalk_name = f"nhgis_blk{source_year}_blk{target_year}_gj.csv.zip"
base_xwalk_file = f"../testing_data_subsets/{base_xwalk_name}"
base_xwalk_file
'../testing_data_subsets/nhgis_blk1990_blk2010_gj.csv.zip'
# Block-level table for the source year (zipped CSV) in the testing subsets
# directory.
base_source_name = f"{source_year}_block.csv.zip"
base_source_file = f"../testing_data_subsets/{base_source_name}"
base_source_file
'../testing_data_subsets/1990_block.csv.zip'
# Supplementary table for the source year; passed to the crosswalk builder
# as `supp_source_table`.
supp_source_name = f"{source_year}_blck_grp_598_103.csv.zip"
supp_source_file = f"../testing_data_subsets/{supp_source_name}"
supp_source_file
'../testing_data_subsets/1990_blck_grp_598_103.csv.zip'
# Build the `dtype` argument for the two GJOIN ID columns before reading.
data_types = nhgisxwalk.str_types([gj_src, gj_trg])

# Read the base crosswalk; the first CSV column becomes the row index.
base_xwalk = pandas.read_csv(
    base_xwalk_file,
    index_col=0,
    dtype=data_types,
)
base_xwalk
GJOIN1990 | GJOIN2010 | WEIGHT | PAREA_VIA_BLK00 | |
---|---|---|---|---|
0 | NaN | G10000100432021078 | 0.0 | 0.0 |
1 | NaN | G10000100432023014 | 0.0 | 0.0 |
2 | NaN | G10000100432023015 | 0.0 | 0.0 |
3 | NaN | G10000109900000011 | 0.0 | 0.0 |
4 | NaN | G10000109900000012 | 0.0 | 0.0 |
... | ... | ... | ... | ... |
38292 | G10000500519289 | G10000500519002125 | 1.0 | 1.0 |
38293 | G34003300204401A | G10000309901000007 | 0.0 | 0.0 |
38294 | G34003300204418 | G10000309901000007 | 0.0 | 0.0 |
38295 | G34003300204419 | G10000309901000007 | 0.0 | 0.0 |
38296 | G34003300204420 | G10000309901000007 | 0.0 | 0.0 |
38297 rows × 4 columns
Not needed for creating a subset per se, but should be done regardless.
# Look up the "Total" entry of `desc_code_1990` for each universe of interest.
input_vars = [
    nhgisxwalk.desc_code_1990[universe]["Total"]
    for universe in ("Persons", "Families", "Households", "Housing Units")
]
# Short tags, one per entry of `input_vars`, in the same order.
input_var_tags = ["pop", "fam", "hh", "hu"]
# Build the source-year -> target-year crosswalk from the block-level base
# crosswalk loaded above.
# NOTE(review): "bgp" and "trt" presumably denote block group parts and
# tracts -- confirm against the nhgisxwalk documentation.
state_bgp1990trt2010 = nhgisxwalk.GeoCrossWalk(
    base_xwalk,
    # years and geographies
    source_year=source_year,
    source_geo="bgp",
    target_year=target_year,
    target_geo="trt",
    # source-year input tables
    base_source_table=base_source_file,
    supp_source_table=supp_source_file,
    # variables to weight by and the tags used to label them
    # (the resulting table shows one `wt_<tag>` column per tag)
    input_var=input_vars,
    weight_var=input_var_tags,
    # output options
    keep_base=True,
    add_geoid=True,
)
# Uncomment to release the base crosswalk once it is no longer needed:
#del base_xwalk
state_bgp1990trt2010.xwalk
bgp1990gj | trt2010gj | trt2010ge | wt_pop | wt_fam | wt_hh | wt_hu | |
---|---|---|---|---|---|---|---|
0 | G100001090444072500423009999999999921 | G1000010043202 | 10001043202 | 1.0 | 1.0 | 1.0 | 1.0 |
1 | G100001090444444300422009999999999926 | G1000010042202 | 10001042202 | 1.0 | 1.0 | 1.0 | 1.0 |
2 | G100001090444612650422009999999219011 | G1000010041200 | 10001041200 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | G100001090444612650422009999999219011 | G1000010042201 | 10001042201 | 1.0 | 1.0 | 1.0 | 1.0 |
4 | G100001090444612650422009999999219012 | G1000010042201 | 10001042201 | 1.0 | 1.0 | 1.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... |
1058 | G100005093552999990515009999999999923 | G1000050051500 | 10005051500 | 1.0 | 1.0 | 1.0 | 1.0 |
1059 | G100005093552999990515009999999999924 | G1000050051500 | 10005051500 | 1.0 | 1.0 | 1.0 | 1.0 |
1060 | G100005093552999990516009999999999921 | G1000050051702 | 10005051702 | 1.0 | 1.0 | 1.0 | 1.0 |
1061 | G340033010610106000204029999999916014 | G1000030990100 | 10003990100 | 0.0 | 0.0 | 0.0 | 0.0 |
1062 | NaN | G1000050990000 | 10005990000 | 0.0 | 0.0 | 0.0 | 0.0 |
1063 rows × 7 columns
# Display the crosswalk de-duplicated on the (source ID, target ID) pair.
# The result is not assigned, so the stored crosswalk is left as it is.
state_bgp1990trt2010.xwalk.drop_duplicates(
    subset=["bgp1990gj", "trt2010gj"]
)
bgp1990gj | trt2010gj | trt2010ge | wt_pop | wt_fam | wt_hh | wt_hu | |
---|---|---|---|---|---|---|---|
0 | G100001090444072500423009999999999921 | G1000010043202 | 10001043202 | 1.0 | 1.0 | 1.0 | 1.0 |
1 | G100001090444444300422009999999999926 | G1000010042202 | 10001042202 | 1.0 | 1.0 | 1.0 | 1.0 |
2 | G100001090444612650422009999999219011 | G1000010041200 | 10001041200 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | G100001090444612650422009999999219011 | G1000010042201 | 10001042201 | 1.0 | 1.0 | 1.0 | 1.0 |
4 | G100001090444612650422009999999219012 | G1000010042201 | 10001042201 | 1.0 | 1.0 | 1.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... |
1058 | G100005093552999990515009999999999923 | G1000050051500 | 10005051500 | 1.0 | 1.0 | 1.0 | 1.0 |
1059 | G100005093552999990515009999999999924 | G1000050051500 | 10005051500 | 1.0 | 1.0 | 1.0 | 1.0 |
1060 | G100005093552999990516009999999999921 | G1000050051702 | 10005051702 | 1.0 | 1.0 | 1.0 | 1.0 |
1061 | G340033010610106000204029999999916014 | G1000030990100 | 10003990100 | 0.0 | 0.0 | 0.0 | 0.0 |
1062 | NaN | G1000050990000 | 10005990000 | 0.0 | 0.0 | 0.0 | 0.0 |
1063 rows × 7 columns
# Row window to inspect; other cells slice it as `[ix1:ix2]`.
ix1 = 13
ix2 = 17
# `.loc` slices by label and includes the end label, hence `ix2 - 1`.
state_bgp1990trt2010.xwalk.loc[ix1 : ix2 - 1]
bgp1990gj | trt2010gj | trt2010ge | wt_pop | wt_fam | wt_hh | wt_hu | |
---|---|---|---|---|---|---|---|
13 | G100001090444999990421009999999219012 | G1000010042100 | 10001042100 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
14 | G100001090444999990421009999999999921 | G1000010042100 | 10001042100 | 0.997664 | 0.997166 | 0.997148 | 0.997278 |
15 | G100001090444999990421009999999999921 | G1000010042201 | 10001042201 | 0.002336 | 0.002834 | 0.002852 | 0.002722 |
16 | G100001090444999990421009999999999922 | G1000010042100 | 10001042100 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# ID columns of the crosswalk.
id_cols = ["bgp1990gj", "trt2010gj", "trt2010ge"]
# Rows at positions ix1 .. ix2-1 of the ID columns, as a NumPy array.
obs_str_vals = (
    state_bgp1990trt2010.xwalk[id_cols].iloc[ix1:ix2].to_numpy()
)
obs_str_vals
array([['G100001090444999990421009999999219012', 'G1000010042100', '10001042100'], ['G100001090444999990421009999999999921', 'G1000010042100', '10001042100'], ['G100001090444999990421009999999999921', 'G1000010042201', '10001042201'], ['G100001090444999990421009999999999922', 'G1000010042100', '10001042100']], dtype=object)
# Weight columns of the crosswalk.
wgt_cols = ["wt_pop", "wt_fam", "wt_hh", "wt_hu"]
# Rows at positions ix1 .. ix2-1 of the weight columns, as a NumPy array.
obs_num_vals = (
    state_bgp1990trt2010.xwalk[wgt_cols].iloc[ix1:ix2].to_numpy()
)
obs_num_vals
array([[1. , 1. , 1. , 1. ], [0.99766436, 0.99716625, 0.99714829, 0.99727768], [0.00233564, 0.00283375, 0.00285171, 0.00272232], [1. , 1. , 1. , 1. ]])
# Display the weight columns for rows at positions ix1 .. ix2-1 as a DataFrame.
state_bgp1990trt2010.xwalk[wgt_cols].iloc[ix1:ix2]
wt_pop | wt_fam | wt_hh | wt_hu | |
---|---|---|---|---|
13 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
14 | 0.997664 | 0.997166 | 0.997148 | 0.997278 |
15 | 0.002336 | 0.002834 | 0.002852 | 0.002722 |
16 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |