This is currently only intended for use with block-level data as base units.
James Gaboardi (jgaboardi@gmail.com), 2020-05
%load_ext watermark
%watermark
2020-07-29T16:05:04-04:00 CPython 3.7.6 IPython 7.15.0 compiler : Clang 9.0.1 system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 8 interpreter: 64bit
import inspect
import nhgisxwalk
import numpy
import pandas
%load_ext autoreload
%autoreload 2
%watermark -w
%watermark -iv
watermark 2.0.2 nhgisxwalk 0.0.6 pandas 1.0.4 numpy 1.18.5
subset_state = "10" # Delaware
#subset_state = "11" # DC
#subset_state = "15" # Hawaii
source_year, target_year = "2000", "2010"
gj_src, gj_trg = "GJOIN%s"%source_year, "GJOIN%s"%target_year
base_xwalk_name = "nhgis_blk%s_blk%s_gj.zip" % (source_year, target_year)
base_xwalk_file = "../../crosswalks/%s" % base_xwalk_name
base_xwalk_file
'../../crosswalks/nhgis_blk2000_blk2010_gj.zip'
base_source_name = "%s_block/%s_block.csv" % (source_year, source_year)
base_source_file = "../../tabular_data/%s" % base_source_name
base_source_file
'../../tabular_data/2000_block/2000_block.csv'
data_types = nhgisxwalk.str_types([gj_src, gj_trg])
base_xwalk = pandas.read_csv(base_xwalk_file, dtype=data_types)
base_xwalk.head()
GJOIN2000 | GJOIN2010 | WEIGHT | PAREA | |
---|---|---|---|---|
0 | G01000100201001000 | G01000100201002000 | 0.035897 | 0.008988 |
1 | G01000100201001000 | G01000100201002001 | 0.253330 | 0.263725 |
2 | G01000100201001000 | G01000100201002002 | 0.000000 | 0.000385 |
3 | G01000100201001000 | G01000100201002003 | 0.076297 | 0.055430 |
4 | G01000100201001000 | G01000100201002004 | 0.032441 | 0.007543 |
ss_base = base_xwalk[
base_xwalk["GJOIN2010"].map(lambda x: x[1:3] == subset_state)
].copy()
ss_base.reset_index(drop=True, inplace=True)
not needed for creating a subset perse, but should do regardless
input_vars = [
nhgisxwalk.desc_code_2000_SF1b["Persons"]["Total"],
nhgisxwalk.desc_code_2000_SF1b["Families"]["Total"],
nhgisxwalk.desc_code_2000_SF1b["Households"]["Total"],
nhgisxwalk.desc_code_2000_SF1b["Housing Units"]["Total"]
]
input_var_tags = ["pop", "fam", "hh", "hu"]
state_bgp2000tr2010 = nhgisxwalk.GeoCrossWalk(
base_xwalk,
source_year=source_year,
target_year=target_year,
source_geo="bgp",
target_geo="tr",
base_source_table=base_source_file,
input_var=input_vars,
weight_var=input_var_tags,
keep_base=True,
add_geoid=True,
stfips=subset_state
)
del base_xwalk
state_bgp2000tr2010.xwalk
bgp2000gj | tr2010gj | tr2010ge | wt_pop | wt_fam | wt_hh | wt_hu | |
---|---|---|---|---|---|---|---|
0 | G10000109044444430042202U1 | G1000010042202 | 10001042202 | 1.0 | 1.0 | 1.0 | 1.0 |
1 | G10000109044461265042201R1 | G1000010042201 | 10001042201 | 1.0 | 1.0 | 1.0 | 1.0 |
2 | G10000109044461265042201U1 | G1000010042201 | 10001042201 | 1.0 | 1.0 | 1.0 | 1.0 |
3 | G10000109044461265042201U2 | G1000010042201 | 10001042201 | 1.0 | 1.0 | 1.0 | 1.0 |
4 | G10000109044461480042202R2 | G1000010042202 | 10001042202 | 1.0 | 1.0 | 1.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... |
1038 | G10000509355299999051500R4 | G1000050051500 | 10005051500 | 1.0 | 1.0 | 1.0 | 1.0 |
1039 | G10000509355299999051500U1 | G1000050051500 | 10005051500 | 1.0 | 1.0 | 1.0 | 1.0 |
1040 | G10000509355299999051500U3 | G1000050051500 | 10005051500 | 1.0 | 1.0 | 1.0 | 1.0 |
1041 | G10000509355299999051500U4 | G1000050051500 | 10005051500 | 1.0 | 1.0 | 1.0 | 1.0 |
1042 | G34003301061010600020400U2 | G1000030990100 | 10003990100 | 0.0 | 0.0 | 0.0 | 0.0 |
1043 rows × 7 columns
state_bgp2000tr2010.xwalk.drop_duplicates(subset=["bgp2000gj", "tr2010gj"])
bgp2000gj | tr2010gj | tr2010ge | wt_pop | wt_fam | wt_hh | wt_hu | |
---|---|---|---|---|---|---|---|
0 | G10000109044444430042202U1 | G1000010042202 | 10001042202 | 1.0 | 1.0 | 1.0 | 1.0 |
1 | G10000109044461265042201R1 | G1000010042201 | 10001042201 | 1.0 | 1.0 | 1.0 | 1.0 |
2 | G10000109044461265042201U1 | G1000010042201 | 10001042201 | 1.0 | 1.0 | 1.0 | 1.0 |
3 | G10000109044461265042201U2 | G1000010042201 | 10001042201 | 1.0 | 1.0 | 1.0 | 1.0 |
4 | G10000109044461480042202R2 | G1000010042202 | 10001042202 | 1.0 | 1.0 | 1.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... |
1038 | G10000509355299999051500R4 | G1000050051500 | 10005051500 | 1.0 | 1.0 | 1.0 | 1.0 |
1039 | G10000509355299999051500U1 | G1000050051500 | 10005051500 | 1.0 | 1.0 | 1.0 | 1.0 |
1040 | G10000509355299999051500U3 | G1000050051500 | 10005051500 | 1.0 | 1.0 | 1.0 | 1.0 |
1041 | G10000509355299999051500U4 | G1000050051500 | 10005051500 | 1.0 | 1.0 | 1.0 | 1.0 |
1042 | G34003301061010600020400U2 | G1000030990100 | 10003990100 | 0.0 | 0.0 | 0.0 | 0.0 |
1043 rows × 7 columns
state_bgp2000tr2010.xwalk["bgp2000gj"].nunique()
908
state_bgp2000tr2010.xwalk["tr2010gj"].nunique()
218
ss_base.to_csv("../testing_data_subsets/%s" % base_xwalk_name)
ss_base
GJOIN2000 | GJOIN2010 | WEIGHT | PAREA | |
---|---|---|---|---|
0 | G10000100401001000 | G10000100401001000 | 1.000000 | 1.000000 |
1 | G10000100401001001 | G10000100401001001 | 0.999981 | 0.999988 |
2 | G10000100401001001 | G10000100401001003 | 0.000019 | 0.000012 |
3 | G10000100401001002 | G10000100401001002 | 1.000000 | 1.000000 |
4 | G10000100401001003 | G10000100401001003 | 1.000000 | 1.000000 |
... | ... | ... | ... | ... |
28471 | G10000500519002095 | G10000500519002075 | 1.000000 | 1.000000 |
28472 | G10000500519002096 | G10000500519002131 | 1.000000 | 1.000000 |
28473 | G10000500519002097 | G10000500519002130 | 1.000000 | 1.000000 |
28474 | G10000500519002098 | G10000500519002079 | 1.000000 | 1.000000 |
28475 | G34003300204002001 | G10000309901000007 | 0.000000 | 0.000000 |
28476 rows × 4 columns
blk2000 = ss_base[~ss_base["GJOIN2000"].isna()]["GJOIN2000"]
blk2000
0 G10000100401001000 1 G10000100401001001 2 G10000100401001001 3 G10000100401001002 4 G10000100401001003 ... 28471 G10000500519002095 28472 G10000500519002096 28473 G10000500519002097 28474 G10000500519002098 28475 G34003300204002001 Name: GJOIN2000, Length: 28476, dtype: object
blk2000.shape
(28476,)
blk2000.nunique()
17484
# read in base source file
base_source_df = pandas.read_csv(base_source_file, dtype=str)
base_source_df
GISJOIN | YEAR | STATE | STATEA | COUNTY | COUNTYA | CTY_SUBA | PLACEA | TRACTA | BLCK_GRPA | BLOCKA | AIANHHA | URBRURALA | NAME | FXS001 | FY4001 | F2V001 | FV5001 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | G01000100201001000 | 2000 | Alabama | 01 | Autauga | 001 | 92628 | 99999 | 020100 | 1 | 1000 | 9999 | R | Block 1000 | 16 | 7 | 4 | 7 |
1 | G01000100201001001 | 2000 | Alabama | 01 | Autauga | 001 | 92628 | 62328 | 020100 | 1 | 1001 | 9999 | R | Block 1001 | 40 | 13 | 13 | 14 |
2 | G01000100201001002 | 2000 | Alabama | 01 | Autauga | 001 | 92628 | 62328 | 020100 | 1 | 1002 | 9999 | U | Block 1002 | 284 | 101 | 88 | 104 |
3 | G01000100201001003 | 2000 | Alabama | 01 | Autauga | 001 | 92628 | 62328 | 020100 | 1 | 1003 | 9999 | U | Block 1003 | 47 | 16 | 13 | 17 |
4 | G01000100201001004 | 2000 | Alabama | 01 | Autauga | 001 | 92628 | 62328 | 020100 | 1 | 1004 | 9999 | U | Block 1004 | 18 | 7 | 4 | 9 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8205577 | G56004509513003080 | 2000 | Wyoming | 56 | Weston | 045 | 92255 | 99999 | 951300 | 3 | 3080 | 9999 | R | Block 3080 | 0 | 0 | 0 | 0 |
8205578 | G56004509513003081 | 2000 | Wyoming | 56 | Weston | 045 | 92255 | 99999 | 951300 | 3 | 3081 | 9999 | R | Block 3081 | 0 | 0 | 0 | 0 |
8205579 | G56004509513003082 | 2000 | Wyoming | 56 | Weston | 045 | 92255 | 99999 | 951300 | 3 | 3082 | 9999 | R | Block 3082 | 0 | 0 | 0 | 0 |
8205580 | G56004509513003083 | 2000 | Wyoming | 56 | Weston | 045 | 92255 | 99999 | 951300 | 3 | 3083 | 9999 | R | Block 3083 | 1 | 1 | 0 | 1 |
8205581 | G56004509513003084 | 2000 | Wyoming | 56 | Weston | 045 | 92255 | 99999 | 951300 | 3 | 3084 | 9999 | R | Block 3084 | 34 | 14 | 11 | 16 |
8205582 rows × 18 columns
base_source_df = base_source_df[base_source_df["GISJOIN"].isin(blk2000.unique())]
base_source_df.reset_index(drop=True, inplace=True)
base_source_df
GISJOIN | YEAR | STATE | STATEA | COUNTY | COUNTYA | CTY_SUBA | PLACEA | TRACTA | BLCK_GRPA | BLOCKA | AIANHHA | URBRURALA | NAME | FXS001 | FY4001 | F2V001 | FV5001 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | G10000100401001000 | 2000 | Delaware | 10 | Kent | 001 | 91480 | 99999 | 040100 | 1 | 1000 | 9999 | R | Block 1000 | 15 | 9 | 6 | 9 |
1 | G10000100401001001 | 2000 | Delaware | 10 | Kent | 001 | 91480 | 99999 | 040100 | 1 | 1001 | 9999 | R | Block 1001 | 364 | 104 | 94 | 104 |
2 | G10000100401001002 | 2000 | Delaware | 10 | Kent | 001 | 91480 | 99999 | 040100 | 1 | 1002 | 9999 | R | Block 1002 | 16 | 6 | 6 | 6 |
3 | G10000100401001003 | 2000 | Delaware | 10 | Kent | 001 | 91480 | 99999 | 040100 | 1 | 1003 | 9999 | R | Block 1003 | 80 | 27 | 24 | 27 |
4 | G10000100401001004 | 2000 | Delaware | 10 | Kent | 001 | 91480 | 99999 | 040100 | 1 | 1004 | 9999 | R | Block 1004 | 55 | 20 | 15 | 21 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17479 | G10000500519002095 | 2000 | Delaware | 10 | Sussex | 005 | 91628 | 20380 | 051900 | 2 | 2095 | 9999 | U | Block 2095 | 18 | 9 | 4 | 12 |
17480 | G10000500519002096 | 2000 | Delaware | 10 | Sussex | 005 | 91628 | 20380 | 051900 | 2 | 2096 | 9999 | R | Block 2096 | 3 | 2 | 1 | 2 |
17481 | G10000500519002097 | 2000 | Delaware | 10 | Sussex | 005 | 91628 | 99999 | 051900 | 2 | 2097 | 9999 | U | Block 2097 | 0 | 0 | 0 | 0 |
17482 | G10000500519002098 | 2000 | Delaware | 10 | Sussex | 005 | 91628 | 99999 | 051900 | 2 | 2098 | 9999 | U | Block 2098 | 18 | 5 | 3 | 5 |
17483 | G34003300204002001 | 2000 | New Jersey | 34 | Salem | 033 | 10610 | 10600 | 020400 | 2 | 2001 | 9999 | U | Block 2001 | 418 | 234 | 92 | 257 |
17484 rows × 18 columns
base_source_df.to_csv("../testing_data_subsets/%s_block.csv.zip" % source_year)