#!/usr/bin/env python
# coding: utf-8

# ## Running a simulator using existing data
# Consider the case where input data already exists and already has a causal structure.
# We would like to simulate treatment assignment and outcomes on top of this data.
# 
# ### Initialize the data
# First we load the desired data into a pandas DataFrame:

# In[7]:


import pandas as pd
from causallib.datasets import load_smoking_weight
from causallib.simulation import CausalSimulator
from causallib.simulation import generate_random_topology


# In[4]:


# Load the smoking/weight dataset and keep its `X` attribute as the
# pre-existing ("given") data that is later handed to the simulator.
# NOTE(review): `X` is presumably the covariate DataFrame (its `.columns`
# is used when building the topology) - confirm against causallib.
data = load_smoking_weight()
X_given = data.X


# Say we want to create three more variables: a covariate, a treatment and an outcome.
# Hardwiring a graph with this many variables by hand would be difficult, so let's use the random topology generator:

# In[5]:


# Build a random topology that adds one covariate, one treatment and one
# outcome to the columns that already exist in the given data.
topology, var_types = generate_random_topology(
    n_covariates=1,
    n_treatments=1,
    n_outcomes=1,
    given_vars=X_given.columns,
    p=0.4,
)


# Now we create the simulator based on the variables topology:

# In[8]:


# Scalar / string settings forwarded verbatim to the simulator.
snr = 0.9
treatment_importance = 0.8
effect_sizes = None
outcome_types = "categorical"
treatment_methods = "gaussian"

# One 'linear' entry for every entry of `var_types`.
link_types = len(var_types) * ["linear"]

# Treatment and outcome variables get two equally likely categories;
# every other variable gets None.
prob_categories = pd.Series(
    data=[
        [0.5, 0.5] if var_type in ["treatment", "outcome"] else None
        for var_type in var_types
    ],
    index=var_types.index,
)

# Assemble the simulator from the random topology and the settings above.
sim = CausalSimulator(
    topology=topology.values,
    var_types=var_types,
    prob_categories=prob_categories,
    link_types=link_types,
    snr=snr,
    treatment_importances=treatment_importance,
    treatment_methods=treatment_methods,
    outcome_types=outcome_types,
    effect_sizes=effect_sizes,
)


# Now, in order to generate data based on the given data, we pass it through the `X_given` argument:

# In[9]:


# Run the simulator, supplying the pre-existing data through `X_given`.
# The three results are passed on, in this order, to `format_for_training`.
# NOTE(review): `prop` presumably holds propensities and `y` the simulated
# outcomes - confirm against the CausalSimulator documentation.
X, prop, y = sim.generate_data(X_given=X_given)


# ### Format the data for training and save it
# 
# Now that we generated some data, we can format it so it would be easier to train and validate:

# In[10]:


# Repackage the generated data into two datasets: one to train on
# (`observed_set`) and one to validate against (`validation_set`).
observed_set, validation_set = sim.format_for_training(X, prop, y)


# `observed_set` is the observed dataset (excluding hidden variables).
# `validation_set` is for validation purposes - it has the counterfactuals, the treatment assignment and the propensity for every sample.
# Below we inspect both datasets and then save them to CSV:

# In[20]:


# Keep only the columns whose name starts with "x_" and preview them.
covariate_mask = observed_set.columns.str.startswith("x_")
covariates = observed_set.loc[:, covariate_mask]
print(covariates.shape)
covariates.head()


# In[22]:


# Keep only the columns whose name starts with "t_" or "y_" and preview them.
observed_columns = observed_set.columns
treatment_mask = observed_columns.str.startswith("t_")
outcome_mask = observed_columns.str.startswith("y_")
treatment_outcome = observed_set.loc[:, treatment_mask | outcome_mask]
print(treatment_outcome.shape)
treatment_outcome.head()


# In[23]:


# Preview the validation set.
print(validation_set.shape)
validation_set.head()

# Write both datasets to CSV through the simulator's helper, training set first.
for dataset, csv_path in (
    (observed_set, 'training_set.csv'),
    (validation_set, 'validation_set.csv'),
):
    sim.to_csv(dataset, csv_path)