weighted-portion-synthetic-atoms¶

toy example¶

In [1]:

# record when and where this notebook was run (timestamp, Python/IPython
# versions, and system details) so the results can be reproduced
%load_ext watermark
%watermark

2020-06-20T21:05:26-04:00

CPython 3.7.6
IPython 7.15.0

compiler   : Clang 9.0.1 
system     : Darwin
release    : 19.5.0
machine    : x86_64
processor  : i386
CPU cores  : 8
interpreter: 64bit

In [2]:

# nhgisxwalk provides the function demonstrated below;
# inspect is only used to print that function's source
import nhgisxwalk
import inspect

# automatically reload edited modules before each cell is executed
%load_ext autoreload
%autoreload 2
# report the watermark version and the versions of the imported packages
%watermark -w
%watermark -iv

watermark 2.0.2
nhgisxwalk 0.0.4

In [3]:

# show the library implementation of ``calculate_atoms`` for reference;
# ``print`` is needed so the newlines in the source string are rendered
print(inspect.getsource(nhgisxwalk.calculate_atoms))

def calculate_atoms(
    df,
    weight=None,
    input_var=None,
    weight_var=None,
    weight_prefix=None,
    source_id=None,
    groupby_cols=None,
    overwrite_attrs=None,
):
    """Calculate the atoms (intersecting parts) of census geographies
    and interpolate a proportional weight of the source attribute that
    lies within the target geography.

    Parameters
    ----------

    df : pandas.DataFrame
        The input data. See ``GeoCrossWalk.base``. Note that the weighted
        numerator column(s) (named after ``weight_prefix`` + ``weight_var``)
        are written onto this dataframe in place.

    weight : str
        The weight column name.

    input_var : str or iterable
        The input variable column name(s).

    weight_var : str or iterable
        The name(s) used for the resultant weight column(s). Must be
        the same length as ``input_var``.

    weight_prefix : str
        Prepend this prefix to the ``weight_var`` column name(s).
        No prefix is used when this is ``None`` or empty.

    source_id : str
        The source ID column name. Weights are normalized so they are
        proportions of each source unit's total.

    groupby_cols : list
        The dataframe columns on which to perform groupby.

    overwrite_attrs : None or GeoCrossWalk
        Setting this parameter to a ``GeoCrossWalk`` object overwrites its
        ``input_var`` and ``weight_col`` attributes. Default is ``None``.

    Returns
    -------

    atoms : pandas.DataFrame
        All intersections between ``source`` and ``target`` geographies, and
        the interpolated weight calculations for the proportion of
        source area attributes that are in the target area.

    Raises
    ------

    RuntimeError
        If ``input_var`` and ``weight_var`` are not the same length.

    Notes
    -----

    See example 1 in the ``GeoCrossWalk`` Examples section.

    """

    # confirm variable data types
    input_var, weight_var = _check_vars(input_var), _check_vars(weight_var)

    # determine length of variable lists
    n_input_var, n_weight_var = len(input_var), len(weight_var)

    # check variable lists are equal length
    if n_input_var != n_weight_var:
        msg = "The 'input_var' and 'weight_var' should be the same length. "
        msg += "%s != %s" % (n_input_var, n_weight_var)
        raise RuntimeError(msg)

    # add prefix (if desired)
    weight_col = _weight_columns(weight_prefix if weight_prefix else "", weight_var)

    # record the variables used on the crosswalk object (if provided)
    if overwrite_attrs is not None:
        overwrite_attrs.input_var = input_var
        overwrite_attrs.weight_col = weight_col

    # iterate over each pair of input/interpolation variables
    for ix, (ivar, wvar) in enumerate(zip(input_var, weight_col)):

        # calculate numerators (written onto the input dataframe)
        df[wvar] = df[weight] * df[ivar]
        summed = df.groupby(groupby_cols)[wvar].sum()

        if ix == 0:
            # on the first iteration create an atom dataframe
            atoms = summed.to_frame().reset_index()
        else:
            # on subsequent iterations add weights as a column; the same
            # groupby keys are used, so the row order matches ``atoms``
            atoms[wvar] = summed.values

        # calculate denominators (total per source unit)
        denominators = atoms.groupby(source_id)[wvar].sum()

        # interpolate weights; if any weights are NaN (e.g. a zero
        # denominator), replace with 0. Assign the result back rather than
        # calling ``fillna(..., inplace=True)`` on the column selection,
        # which does not update ``atoms`` under pandas Copy-on-Write.
        proportions = atoms[wvar] / atoms[source_id].map(denominators)
        atoms[wvar] = proportions.fillna(0.0)

    return atoms

toy data¶

In [4]:

# build the small example crosswalk table provided by nhgisxwalk and display it
toy_df = nhgisxwalk.example_crosswalk_data()
toy_df

Out[4]:

	bgp1990	blk1990	blk2010	trt2010	wt	pop_1990	hh_1990
0	A	A.1	X.1	X	1.0	60.0	25.0
1	A	A.2	X.2	X	0.3	100.0	40.0
2	A	A.2	Y.1	Y	0.7	100.0	40.0
3	B	B.1	X.3	X	1.0	50.0	20.0
4	B	B.2	Y.2	Y	1.0	80.0	30.0

demo atom crosswalk calculation¶

In [5]:

# ``calculate_atoms`` writes the weighted numerator columns onto the frame it
# is given, so pass a copy to keep ``toy_df`` identical to what is shown above
toy_atoms = nhgisxwalk.calculate_atoms(
    toy_df.copy(),
    weight="wt",
    input_var=["pop_1990", "hh_1990"],
    weight_var=["pop", "hh"],
    weight_prefix="wt_",
    source_id="bgp1990",
    groupby_cols=["bgp1990", "trt2010"]
)
toy_atoms

Out[5]:

	bgp1990	trt2010	wt_pop	wt_hh
0	A	X	0.562500	0.569231
1	A	Y	0.437500	0.430769
2	B	X	0.384615	0.400000
3	B	Y	0.615385	0.600000