import pandas as pd
from causallib.datasets import load_nhefs
from causallib.simulation import CausalSimulator
from causallib.simulation import generate_random_topology
# Load the NHEFS data through causallib's loader and keep its `X` attribute
# as the "given" variables the simulation is built on. Only `X_given.columns`
# and `X_given` itself are used further down.
# NOTE(review): `data.X` is presumably the covariate DataFrame - confirm
# against the causallib `load_nhefs` documentation.
data = load_nhefs()
X_given = data.X
Say we want to create three more variables: a covariate, a treatment and an outcome. It would be difficult to hardwire a graph with this many variables, so let's use the random topology generator:
# Draw a random causal graph that adds one covariate, one treatment and one
# outcome on top of the columns already present in X_given. The call returns
# the graph (`topology`) together with the type of every variable in it
# (`var_types`).
# NOTE(review): `p` is presumably the edge probability of the random graph -
# confirm against the causallib documentation.
topology, var_types = generate_random_topology(
    n_covariates=1,
    n_treatments=1,
    n_outcomes=1,
    p=0.4,
    given_vars=X_given.columns,
)
Now we create the simulator based on the variables' topology:
# --- Simulator settings -----------------------------------------------------
# Scalar settings that are handed to CausalSimulator unchanged.
snr = 0.9
treatment_importance = 0.8
effect_sizes = None
outcome_types = "categorical"
treatment_methods = "gaussian"

# One link type per variable in the graph; every variable uses a linear link.
link_types = ["linear" for _ in range(len(var_types))]

# Treatment and outcome variables get two equally likely categories; every
# other variable gets None. The Series shares var_types' index so that each
# entry lines up with its variable.
two_category_roles = ("treatment", "outcome")
prob_categories = pd.Series(
    data=[
        [0.5, 0.5] if role in two_category_roles else None
        for role in var_types
    ],
    index=var_types.index,
)

# --- Build the simulator ----------------------------------------------------
# The topology is passed as a plain array (`.values`), not as the pandas
# object returned by the topology generator.
sim = CausalSimulator(
    topology=topology.values,
    var_types=var_types,
    prob_categories=prob_categories,
    link_types=link_types,
    snr=snr,
    treatment_importances=treatment_importance,
    treatment_methods=treatment_methods,
    outcome_types=outcome_types,
    effect_sizes=effect_sizes,
)
Now, in order to generate data based on the given data, we need to pass that data to the simulator through the `X_given` argument:
# Run the simulation on top of the given data. The three returned objects
# (X, prop, y) are handed as-is to sim.format_for_training further down.
# NOTE(review): presumably X holds all variables, prop the treatment
# propensities and y the outcomes - confirm against the causallib docs.
X, prop, y = sim.generate_data(X_given=X_given)
Now that we have generated some data, we can format it so that it is easier to train and validate on:
# Split the simulated data into two tables: `observed_set` (its columns are
# selected by the "x_", "t_" and "y_" prefixes further down) and
# `validation_set` (shown further down with treatment, propensity and
# counterfactual columns).
observed_set, validation_set = sim.format_for_training(X, prop, y)
`observed_set` is the observed dataset (excluding hidden variables). `validation_set` is for validation purposes - it has the counterfactuals, the treatment assignment and the propensity for every sample. You can save the datasets into csv:
# Covariate columns are the ones whose name starts with "x_".
is_covariate_column = observed_set.columns.str.startswith("x_")
covariates = observed_set.loc[:, is_covariate_column]

# Report the table size, then show the first rows (the bare expression is
# only rendered in an interactive / notebook session).
print(covariates.shape)
covariates.head()
(1566, 19)
| | x_18 | x_active_1 | x_active_2 | x_age | x_age^2 | x_education_2 | x_education_3 | x_education_4 | x_education_5 | x_exercise_1 | x_exercise_2 | x_race | x_sex | x_smokeintensity | x_smokeintensity^2 | x_smokeyrs | x_smokeyrs^2 | x_wt71 | x_wt71^2 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 153.760252 | 0 | 0 | 42 | 1764 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 30 | 900 | 29 | 841 | 79.04 | 6247.3216 |
1 | 94.762203 | 0 | 0 | 36 | 1296 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | 400 | 24 | 576 | 58.63 | 3437.4769 |
2 | 669.486191 | 0 | 0 | 56 | 3136 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 20 | 400 | 26 | 676 | 56.81 | 3227.3761 |
3 | -865.113582 | 1 | 0 | 68 | 4624 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 3 | 9 | 53 | 2809 | 59.42 | 3530.7364 |
4 | 634.638630 | 1 | 0 | 40 | 1600 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 20 | 400 | 19 | 361 | 87.09 | 7584.6681 |
# Treatment columns are prefixed with "t_" and outcome columns with "y_";
# keep every column that matches either prefix.
is_treatment_column = observed_set.columns.str.startswith("t_")
is_outcome_column = observed_set.columns.str.startswith("y_")
treatment_outcome = observed_set.loc[:, is_treatment_column | is_outcome_column]

# Report the table size, then show the first rows (the bare expression is
# only rendered in an interactive / notebook session).
print(treatment_outcome.shape)
treatment_outcome.head()
(1566, 2)
| | t_19 | y_20 |
---|---|---|
0 | 0 | 0 |
1 | 0 | 1 |
2 | 0 | 1 |
3 | 1 | 1 |
4 | 1 | 0 |
# Report the size of the validation table, then show its first rows. The
# bare `.head()` expression is only rendered in an interactive / notebook
# session; run as a plain script it produces no output.
print(validation_set.shape)
validation_set.head()
(1566, 5)
| | t_19 | p_19_0 | p_19_1 | cf_20_0 | cf_20_1 |
---|---|---|---|---|---|
0 | 0 | 1.0 | 0.0 | 0 | 0 |
1 | 0 | 1.0 | 0.0 | 1 | 1 |
2 | 0 | 1.0 | 0.0 | 1 | 1 |
3 | 1 | 1.0 | 0.0 | 1 | 1 |
4 | 1 | 1.0 | 0.0 | 0 | 0 |