In [ ]:

# This file is part of the Minnesota Population Center's NHGISXWALK.
# For copyright and licensing information, see the NOTICE and LICENSE files
# in this project's top-level directory, and also on-line at:
#   https://github.com/ipums/nhgisxwalk

Sample workflow: 2000 block group parts to 2010 block groups¶

Starting from a subset of 2010 Delaware blocks¶

For further background information see:

Schroeder, J. P. 2007. Target-density weighting interpolation and uncertainty evaluation for temporal analysis of census data. Geographical Analysis 39 (3):311–335.

NHGIS block crosswalks ¶

In [1]:

# Load the `watermark` IPython extension and stamp the notebook with the
# execution timestamp plus interpreter and machine details (see output below).
%load_ext watermark
%watermark

2020-08-19T18:07:25-04:00

CPython 3.8.5
IPython 7.16.1

compiler   : Clang 10.0.1 
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 8
interpreter: 64bit

In [2]:

import nhgisxwalk
import inspect
import pandas

# Reload modules automatically before each cell runs, so edits to the
# package source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Record the watermark version (-w) and the versions of the imported
# packages (-iv) for provenance.
%watermark -w
%watermark -iv

watermark 2.0.2
pandas     1.1.0
nhgisxwalk 0.0.9

Source and target years for the crosswalk¶

In [3]:

# Crosswalk endpoints; kept as strings because they are embedded directly
# in column names and file names below.
source_year = "2000"
target_year = "2010"

# ID column names of the base crosswalk, one per year
# (they appear as `GJOIN2000` / `GJOIN2010` in the table loaded below).
gj_src = f"GJOIN{source_year}"
gj_trg = f"GJOIN{target_year}"

Base source-to-target crosswalk (block level)¶

In [4]:

# Directory holding the zipped testing subsets, relative to this notebook.
subset_data_dir = "../testing_data_subsets/"

# File stem of the block-to-block crosswalk for the chosen pair of years.
base_xwalk_name = f"nhgis_blk{source_year}_blk{target_year}_gj"

# dtype specification for the two ID columns, forwarded as `dtype` below.
# NOTE(review): presumably maps each column to `str` so the IDs are not
# parsed as numbers -- confirm against `nhgisxwalk.str_types`.
data_types = nhgisxwalk.str_types([gj_src, gj_trg])

# Keyword arguments are kept in two dicts: one describing where/how the
# archive is read, one carrying the dtype specification.
from_csv_kws = dict(path=subset_data_dir, archived=True, remove_unpacked=True)
read_csv_kws = dict(dtype=data_types)

base_xwalk = nhgisxwalk.xwalk_df_from_csv(
    base_xwalk_name, **from_csv_kws, **read_csv_kws
)

# Preview only the first rows of the base crosswalk.
base_xwalk.head()

Out[4]:

	GJOIN2000	GJOIN2010	WEIGHT	PAREA
0	G10000100401001000	G10000100401001000	1.000000	1.000000
1	G10000100401001001	G10000100401001001	0.999981	0.999988
2	G10000100401001001	G10000100401001003	0.000019	0.000012
3	G10000100401001002	G10000100401001002	1.000000	1.000000
4	G10000100401001003	G10000100401001003	1.000000	1.000000

Convenience code shorthand/lookup¶

In [5]:

# Show the library's own source for the geography shorthand lookup;
# `print` is used so the returned source string renders with real line breaks.
print(inspect.getsource(nhgisxwalk.valid_geo_shorthand))

def valid_geo_shorthand(shorthand_name=True):
    """Shorthand lookups for census geographies."""
    lookup = {
        "blk": "block",
        "bgp": "block group part",
        "bg": "block group",
        "tr": "tract",
        "co": "county",
    }
    if not shorthand_name:
        lookup = {v: k for k, v in lookup.items()}
    return lookup

In [6]:

# With `shorthand_name=False` the lookup is inverted:
# full geography name -> shorthand code.
nhgisxwalk.valid_geo_shorthand(shorthand_name=False)

Out[6]:

{'block': 'blk',
 'block group part': 'bgp',
 'block group': 'bg',
 'tract': 'tr',
 'county': 'co'}

Instantiate an `nhgisxwalk.GeoCrossWalk` object¶

See `nhgisxwalk.GeoCrossWalk` for full details¶

In [7]:

# Display the 2000 SF1b description/code lookup. Each universe's "Total"
# entry holds the NHGIS variable code that is selected in the next cell.
nhgisxwalk.desc_code_2000_SF1b

Out[7]:

{'Persons': {'Persons': 'Universe',
  'NP001A': 'Source code',
  'FXS': 'NHGIS code',
  'Total': 'FXS001'},
 'Families': {'Families': 'Universe',
  'NP031A': 'Source code',
  'F2V': 'NHGIS code',
  'Total': 'F2V001'},
 'Households': {'Households': 'Universe',
  'NP010A': 'Source code',
  'FY4': 'NHGIS code',
  'Total': 'FY4001'},
 'Housing Units': {'Housing Units': 'Universe',
  'NH001A': 'Source code',
  'FV5': 'NHGIS code',
  'Total': 'FV5001'}}

In [8]:

# Collect the NHGIS code stored under "Total" for each universe of interest.
# The order here matches the tags defined in the following cell
# (pop, fam, hh, hu).
input_vars = [
    nhgisxwalk.desc_code_2000_SF1b[universe]["Total"]
    for universe in ("Persons", "Families", "Households", "Housing Units")
]
input_vars

Out[8]:

['FXS001', 'F2V001', 'FY4001', 'FV5001']

In [9]:

# Short tags for the weight columns, listed in the same order as `input_vars`;
# they show up as the `wt_<tag>` columns of the resulting crosswalk.
input_var_tags = [
    "pop",  # Persons
    "fam",  # Families
    "hh",  # Households
    "hu",  # Housing Units
]

In [10]:

# State FIPS code of the subset used in this notebook (Delaware).
subset_state = "10"

# Build the 2000 block group part -> 2010 block group crosswalk from the
# base block-level crosswalk.
bgp2000_to_bg2010 = nhgisxwalk.GeoCrossWalk(
    base_xwalk,
    source_year=source_year,
    target_year=target_year,
    source_geo="bgp",
    target_geo="bg",
    # `subset_data_dir` already ends with "/", so the file name is appended
    # directly; prefixing another "/" produced a doubled path separator.
    base_source_table=subset_data_dir + "2000_block.csv.zip",
    # One weight column is produced per input variable, labelled by its tag.
    input_var=input_vars,
    weight_var=input_var_tags,
    stfips=subset_state,
    keep_base=True,
    add_geoid=True,
)

# Display the resulting crosswalk (pandas truncates the middle rows).
bgp2000_to_bg2010.xwalk

Out[10]:

	bgp2000gj	bg2010gj	bg2010ge	wt_pop	wt_fam	wt_hh	wt_hu
0	G10000109044444430042202U1	G10000100422021	100010422021	1.0	1.0	1.0	1.0
1	G10000109044461265042201R1	G10000100422011	100010422011	1.0	1.0	1.0	1.0
2	G10000109044461265042201U1	G10000100422011	100010422011	1.0	1.0	1.0	1.0
3	G10000109044461265042201U2	G10000100422012	100010422012	1.0	1.0	1.0	1.0
4	G10000109044461480042202R2	G10000100422022	100010422022	1.0	1.0	1.0	1.0
...	...	...	...	...	...	...	...
1220	G10000509355299999051500R4	G10000500515004	100050515004	1.0	1.0	1.0	1.0
1221	G10000509355299999051500U1	G10000500515001	100050515001	1.0	1.0	1.0	1.0
1222	G10000509355299999051500U3	G10000500515003	100050515003	1.0	1.0	1.0	1.0
1223	G10000509355299999051500U4	G10000500515004	100050515004	1.0	1.0	1.0	1.0
1224	G34003301061010600020400U2	G10000309901000	100039901000	0.0	0.0	0.0	0.0

1225 rows × 7 columns

Prepare a single data product with a `README.txt`¶

In [11]:

# Keep direct handles on the crosswalk table and its file name.
xwalk = bgp2000_to_bg2010.xwalk
xwalk_name = bgp2000_to_bg2010.xwalk_name

# Strip the final "_"-delimited token from the name to get the base name
# shared by the output directory.
# NOTE(review): the dropped token looks like the state-specific suffix -- confirm.
name_tokens = xwalk_name.split("_")
xwalk_name_base = "_".join(name_tokens[:-1])

In [12]:

# Root directory for generated crosswalk products, relative to this notebook.
out_data_dir = "../../crosswalks/"

# Products are grouped under "<base name>_state/" and named after the
# crosswalk itself.
out_path = f"{out_data_dir}{xwalk_name_base}_state/{xwalk_name}"

# Package the crosswalk as a data product at `out_path`.
# NOTE(review): `remove=True` presumably cleans up intermediate files once
# the archive is written -- confirm against `nhgisxwalk.prepare_data_product`.
nhgisxwalk.prepare_data_product(xwalk, xwalk_name, out_path, remove=True)

Read crosswalk from a `.zip` archive¶

In [13]:

# Directory the data product was written to in the previous step.
in_path = f"{out_data_dir}{xwalk_name_base}_state/"

# Every column that is not a weight ("wt*") is treated as an ID column and
# gets an explicit dtype when the CSV is read back.
id_cols = list(filter(lambda col: not col.startswith("wt"), xwalk.columns))
data_types = nhgisxwalk.str_types(id_cols)

# Same two-dict split as for the base crosswalk: archive handling vs. dtypes.
from_csv_kws = dict(path=in_path, archived=True, remove_unpacked=True)
read_csv_kws = dict(dtype=data_types)

bgp2000_to_bg2010_df = nhgisxwalk.xwalk_df_from_csv(
    xwalk_name, **from_csv_kws, **read_csv_kws
)

# Display the round-tripped crosswalk for comparison with the in-memory one.
bgp2000_to_bg2010_df

Out[13]:

	bgp2000gj	bg2010gj	bg2010ge	wt_pop	wt_fam	wt_hh	wt_hu
0	G10000109044444430042202U1	G10000100422021	100010422021	1.0	1.0	1.0	1.0
1	G10000109044461265042201R1	G10000100422011	100010422011	1.0	1.0	1.0	1.0
2	G10000109044461265042201U1	G10000100422011	100010422011	1.0	1.0	1.0	1.0
3	G10000109044461265042201U2	G10000100422012	100010422012	1.0	1.0	1.0	1.0
4	G10000109044461480042202R2	G10000100422022	100010422022	1.0	1.0	1.0	1.0
...	...	...	...	...	...	...	...
1220	G10000509355299999051500R4	G10000500515004	100050515004	1.0	1.0	1.0	1.0
1221	G10000509355299999051500U1	G10000500515001	100050515001	1.0	1.0	1.0	1.0
1222	G10000509355299999051500U3	G10000500515003	100050515003	1.0	1.0	1.0	1.0
1223	G10000509355299999051500U4	G10000500515004	100050515004	1.0	1.0	1.0	1.0
1224	G34003301061010600020400U2	G10000309901000	100039901000	0.0	0.0	0.0	0.0