#!/usr/bin/env python
# coding: utf-8

# ## Running a simulator using existing data
# Consider the case where input data already exists, and that data already
# has a causal structure.
# We would like to simulate treatment assignment and outcomes based on this data.
#
# ### Initialize the data
# First we load the desired data into a pandas DataFrame:

# In[7]:

import pandas as pd
from causallib.datasets import load_smoking_weight
from causallib.simulation import CausalSimulator
from causallib.simulation import generate_random_topology

# In[4]:

data = load_smoking_weight()
X_given = data.X

# Say we want to create three more variables: a covariate, a treatment and an
# outcome. Hard-wiring a graph with many variables would be tedious, so let's
# use the random topology generator and hand it the existing column names:

# In[5]:

topology, var_types = generate_random_topology(
    n_covariates=1,
    p=0.4,
    n_treatments=1,
    n_outcomes=1,
    given_vars=X_given.columns,
)

# Now we create the simulator based on the variables topology:

# In[8]:

outcome_types = "categorical"
# One link type per variable in the topology.
link_types = ['linear'] * len(var_types)
# Treatment and outcome variables get two equally weighted categories;
# every other variable gets no category probabilities (None).
prob_categories = pd.Series(
    data=[[0.5, 0.5] if typ in ["treatment", "outcome"] else None for typ in var_types],
    index=var_types.index,
)
treatment_methods = "gaussian"
snr = 0.9
treatment_importance = 0.8
effect_sizes = None
sim = CausalSimulator(
    topology=topology.values,
    prob_categories=prob_categories,
    link_types=link_types,
    snr=snr,
    var_types=var_types,
    treatment_importances=treatment_importance,
    outcome_types=outcome_types,
    treatment_methods=treatment_methods,
    effect_sizes=effect_sizes,
)

# Now, in order to generate data based on the given data, we pass the existing
# data to the simulator through the `X_given` argument:

# In[9]:

X, prop, y = sim.generate_data(X_given=X_given)

# ### Format the data for training and save it
#
# Now that we generated some data, we can format it so it would be easier to
# train and validate:

# In[10]:

observed_set, validation_set = sim.format_for_training(X, prop, y)

# observed_set is the observed dataset (excluding hidden variables).
# validation_set is for validation purposes - it has the counterfactuals,
# the treatment assignment and the propensity for every sample.
# You can save the datasets into csv:

# In[20]:

# Columns whose name starts with "x_" are selected as the covariates.
covariates = observed_set.loc[:, observed_set.columns.str.startswith("x_")]
print(covariates.shape)
# A bare `.head()` only renders inside a notebook; print it so the preview
# is also visible when this file is run as a script.
print(covariates.head())

# In[22]:

# Columns whose name starts with "t_" or "y_" are selected as treatment/outcome.
treatment_outcome = observed_set.loc[
    :,
    (observed_set.columns.str.startswith("t_") | observed_set.columns.str.startswith("y_")),
]
print(treatment_outcome.shape)
print(treatment_outcome.head())

# In[23]:

print(validation_set.shape)
print(validation_set.head())
sim.to_csv(observed_set, 'training_set.csv')
sim.to_csv(validation_set, 'validation_set.csv')