This notebook presents several models that perform overlap exclusion.
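Trimming excludes samples whose estimated propensity score falls outside a band around 0 and 1: a fixed threshold alpha keeps only samples with propensity inside [alpha, 1 - alpha], while the 'crump' option picks the cutoff adaptively following Crump et al. (2009). As a rough sketch of the fixed-threshold idea (the `propensity_trim_mask` helper below is hypothetical, not causallib's implementation):
import numpy as np
from sklearn.linear_model import LogisticRegression

# A minimal sketch of propensity-score trimming, assuming a logistic-regression
# propensity model; illustration only, not causallib's Trimming.
def propensity_trim_mask(X, a, threshold=0.05):
    # estimated probability of treatment for each sample
    propensity = LogisticRegression().fit(X, a).predict_proba(X)[:, 1]
    # keep samples whose propensity lies inside [threshold, 1 - threshold]
    return (propensity >= threshold) & (propensity <= 1 - threshold)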
from causallib.positivity import Trimming
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
def create_trimmed_data(X, a, threshold=[0.01, 0.05, 'crump']):
    # Fit the trimming model once, then collect the keep/drop mask per threshold
    filtered_series = dict()
    tr = Trimming()
    tr.fit(X, a)
    for th in threshold:
        filtered_series['filtered_' + str(th)] = tr.predict(X, a, threshold=th)
    df_filtered = pd.DataFrame.from_dict(filtered_series)
    df = pd.concat([X, a, df_filtered], axis=1)
    df.columns = ['X_1', 'X_2', 'a', *list(filtered_series.keys())]
    return df
def plot_trimmed_obs_2d_space(data, number_of_methods=3):
    fig, ax = plt.subplots(nrows=1, ncols=number_of_methods, figsize=(15, 6), sharey=True)
    for ind, filter_type in enumerate(data.columns[3:]):
        sns.scatterplot(data=data, x="X_1", y="X_2", hue="a", style=filter_type,
                        ax=ax[ind], style_order=[True, False])
        ax[ind].set_title('2d space with treatment assignment\nx marks observations that were filtered out ({})'
                          .format(filter_type), fontsize=11)
    plt.tight_layout()
# create 2d overlapping data
X, a = make_classification(n_samples=120, n_features=2, n_redundant=0, n_informative=1,
                           random_state=1, n_clusters_per_class=1, class_sep=0)
X, a = pd.DataFrame(X), pd.Series(a)
df = create_trimmed_data(X, a)
plot_trimmed_obs_2d_space(df, number_of_methods=3)
# create a partially separated dataset
X, a = make_classification(n_samples=120, n_features=2, n_redundant=0, n_informative=1,
                           random_state=1, n_clusters_per_class=1, class_sep=1)
X, a = pd.DataFrame(X), pd.Series(a)
df = create_trimmed_data(X, a)
plot_trimmed_obs_2d_space(df, number_of_methods=3)
One way to decide whether samples violate the positivity assumption is to match them across treatment groups and keep only the samples that were successfully matched. Here we show how a positivity checker based on the matching logic in causallib can be used.
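To build intuition for the matching logic (a hypothetical sketch, not causallib's Matching class): a 1:1 nearest-neighbor matcher without replacement and without a caliper pairs each treated sample with its closest unused control and flags both as overlapping. Since nothing limits the match distance, every member of the smaller group ends up matched, which is what the assert below verifies.
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

# A minimal sketch of matching-based overlap detection (illustration only)
def greedy_match_mask(X, a):
    treated, control = X[a == 1], X[a == 0]
    dist = cdist(treated, control)          # pairwise Euclidean distances
    kept = pd.Series(False, index=X.index)  # overlap flag per sample
    used = set()
    for i, t_label in enumerate(treated.index):
        # take the closest control that has not been matched yet
        for j in np.argsort(dist[i]):
            if j not in used:
                used.add(j)
                kept.loc[[t_label, control.index[j]]] = True
                break
    return kept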
import pandas as pd
import numpy as np
import seaborn as sb
from causallib.positivity import Matching
from causallib.datasets import load_nhefs
data = load_nhefs(augment=False, onehot=False)
X, a, y = data.X, data.a, data.y
m = Matching(with_replacement=False)
m.fit(X, a)
overlap = m.predict(X, a)
# Matching without replacement pairs every sample in the smaller treatment
# group, so the number of kept samples is twice the minority group size
assert min(len(a) - a.sum(), a.sum()) * 2 == overlap.sum()
from causallib.evaluation.weight_evaluator import calculate_covariate_balance
covbalance = calculate_covariate_balance(X, a, overlap)
sb.heatmap(covbalance, annot=True);
from sklearn.manifold import TSNE
# Project the covariates to 2d for visualization (t-SNE components, named pc1/pc2 for plotting)
pc1, pc2 = TSNE(n_components=2).fit_transform(X).T
Xaug = X.join(a).assign(pc1=pc1, pc2=pc2, overlap=overlap)
sb.set("notebook")
sb.scatterplot(x="pc1", y="pc2", hue="overlap", style="qsmk", data=Xaug)
<AxesSubplot:xlabel='pc1', ylabel='pc2'>
Another way to decide whether samples violate the positivity assumption is to check for overlap on each variable separately. As a univariate problem this is straightforward: we can estimate each group's support using the raw min/max, quantiles, or other methods. Here we have implemented the raw and quantile versions.
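As a rough sketch of the quantile variant (the `univariate_overlap_mask` helper below is hypothetical and treats every column as continuous; the real class also handles categorical variables):
import numpy as np
import pandas as pd

# A minimal sketch, not the causallib implementation: estimate each group's
# per-variable support as [q_alpha, q_(1-alpha)] and keep samples that fall
# inside the intersection of the two supports on every variable.
def univariate_overlap_mask(X, a, alpha=0.05):
    lo = np.maximum(X[a == 0].quantile(alpha), X[a == 1].quantile(alpha))
    hi = np.minimum(X[a == 0].quantile(1 - alpha), X[a == 1].quantile(1 - alpha))
    return ((X >= lo) & (X <= hi)).all(axis=1)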
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from causallib.positivity import UnivariateBoundingBox
from causallib.datasets import load_nhefs
data = load_nhefs(augment=False, onehot=False)
X, a, y = data.X, data.a, data.y
The basic usage is like the other modules, supporting `fit` and `predict`:
uvbb = UnivariateBoundingBox(quantile_alpha=0.05, continuous_columns=["age", "smokeintensity", "smokeyrs", "wt71"])
uvbb.fit(X, a)
overlap = uvbb.predict(X, a)
We can view the results in a table:
uvbb.supports_table_
| | treatment | control | joint |
|---|---|---|---|
| active | support: {0, 1, 2} | support: {0, 1, 2} | support: {0, 1, 2} |
| age | support: [26.0, 69.0] | support: [25.0, 67.0] | support: [26.0, 67.0] |
| education | support: {1, 2, 3, 4, 5} | support: {1, 2, 3, 4, 5} | support: {1, 2, 3, 4, 5} |
| exercise | support: {0, 1, 2} | support: {0, 1, 2} | support: {0, 1, 2} |
| race | support: {0, 1} | support: {0, 1} | support: {0, 1} |
| sex | support: {0, 1} | support: {0, 1} | support: {0, 1} |
| smokeintensity | support: [1.0, 40.0] | support: [3.0, 45.0] | support: [3.0, 40.0] |
| smokeyrs | support: [6.0, 52.0] | support: [5.0, 48.0] | support: [6.0, 48.0] |
| wt71 | support: [47.0655, 106.7915] | support: [46.72, 104.522] | support: [47.0655, 104.522] |
We can use the subtract operation on the support objects to check the regions where overlap is lacking:
uvbb.supports_table_["treatment"] - uvbb.supports_table_["control"]
active                        {}
age                   [1.0, 2.0]
education                     {}
exercise                      {}
race                          {}
sex                           {}
smokeintensity      [-2.0, -5.0]
smokeyrs              [1.0, 4.0]
wt71            [0.3455, 2.2695]
dtype: object
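To read this output: the subtraction is endpoint-wise, so for `age` the treatment support [26.0, 69.0] minus the control support [25.0, 67.0] gives [26.0 - 25.0, 69.0 - 67.0] = [1.0, 2.0], i.e. each group's support extends slightly past the other's; categorical supports compare as set differences, which are all empty here.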
This, in turn, defines a kind of score: how large the support mismatch is, in standardized units; we return to this score below.
We'll want a nice way to visualize the results, so we'll make some plotting helper functions.
def plot_support(supports, ax):
    heights = [2, 1, 0]
    xmin = [s.support[0] for s in supports]
    xmax = [s.support[1] for s in supports]
    # Dotted baseline spanning the widest support, then one bar per group
    ax.hlines(y=heights, xmin=min(xmin), xmax=max(xmax), ls=":")
    ax.hlines(y=heights, xmin=xmin, xmax=xmax,
              colors=["white"], lw=10, alpha=1)
    ax.hlines(y=heights, xmin=xmin, xmax=xmax, colors=[
              "blue", "black", "orange"], lw=10, alpha=0.5)
    for h, label in zip(heights, ["treatment", "joint", "control"]):
        midpoint = (min(xmin) + max(xmax)) / 2
        ax.text(midpoint, h, label, horizontalalignment='center',
                verticalalignment='center', color="white", fontweight="bold", fontsize="large")
    ax.set_yticks([])
    ax.set_ylim(-0.5, 2.5)
    for spine in ax.spines.values():
        spine.set_visible(False)
def plot_supports(uvbb, columns):
    fig, axes = plt.subplots(len(columns), 1, figsize=(5, 5))
    for ax, c in zip(axes, columns):
        support_list = [uvbb.treatment_support_[c],
                        uvbb.joint_support_[c], uvbb.control_support_[c]]
        plot_support(support_list, ax)
        ax.set_ylabel(c)
    plt.tight_layout()
# Raw min/max supports (no quantile trimming)
plot_supports(
    UnivariateBoundingBox(quantile_alpha=None,
                          continuous_columns=[
                              "age", "smokeintensity", "smokeyrs", "wt71"]
                          ).fit(X, a),
    ["wt71", "age", "smokeintensity", "smokeyrs"]
)
# Quantile-based supports (alpha = 0.05)
plot_supports(
    UnivariateBoundingBox(quantile_alpha=0.05,
                          continuous_columns=[
                              "age", "smokeintensity", "smokeyrs", "wt71"]
                          ).fit(X, a),
    ["wt71", "age", "smokeintensity", "smokeyrs"]
)
import seaborn as sb
from causallib.evaluation.weight_evaluator import calculate_covariate_balance
covbalance = calculate_covariate_balance(X, a, UnivariateBoundingBox(quantile_alpha=0.05).fit_predict(X, a))
sb.heatmap(covbalance, annot=True);
I am not happy with the results of this heatmap. It shows that the trimming does not help the `abs_smd` even for variables like `smokeintensity`. I don't really see how that makes sense.
The LaLonde dataset is a great positivity test case because we know it has problems. Let's see how our simple system handles it:
columns = ["training",   # Treatment assignment indicator
           "age",        # Age of participant
           "education",  # Years of education
           "black",      # Indicates whether individual is black
           "hispanic",   # Indicates whether individual is hispanic
           "married",    # Indicates whether individual is married
           "no_degree",  # Indicates whether individual has no high-school diploma
           "re74",       # Real earnings in 1974, prior to study participation
           "re75",       # Real earnings in 1975, prior to study participation
           "re78"]       # Real earnings in 1978, after study end
file_names = ["http://www.nber.org/~rdehejia/data/nswre74_treated.txt",
              "http://www.nber.org/~rdehejia/data/nswre74_control.txt",
              "http://www.nber.org/~rdehejia/data/psid_controls.txt",
              "http://www.nber.org/~rdehejia/data/psid2_controls.txt",
              "http://www.nber.org/~rdehejia/data/psid3_controls.txt",
              "http://www.nber.org/~rdehejia/data/cps_controls.txt",
              "http://www.nber.org/~rdehejia/data/cps2_controls.txt",
              "http://www.nber.org/~rdehejia/data/cps3_controls.txt"]
files = [pd.read_csv(file_name, delim_whitespace=True,
                     header=None, names=columns) for file_name in file_names]
lalonde = pd.concat(files, ignore_index=True)
lalonde = lalonde.sample(frac=1.0, random_state=42) # Shuffle
lalonde = lalonde.join(
    (lalonde[["re74", "re75"]] == 0).astype(int), rsuffix="=0")
print(lalonde.shape)
a_ll = lalonde.pop("training")
y_ll = lalonde.pop("re78")
X_ll = lalonde
(22106, 12)
We want the categoricals to be correctly identified as such. For our purposes today we'll treat years of education as a continuous number.
uvbbll = UnivariateBoundingBox(0.0, continuous_columns=["age", "re74", "re75", "education"]).fit(X_ll, a_ll)
plot_supports(uvbbll, ["age", "re74", "re75", "education"])
uvbbll_q005 = UnivariateBoundingBox(0.05, continuous_columns=["age", "re74", "re75", "education"]).fit(X_ll, a_ll)
plot_supports(uvbbll_q005, ["age", "re74", "re75", "education"])
uvbbll.supports_table_
| | treatment | control | joint |
|---|---|---|---|
| age | support: [17.0, 48.0] | support: [16.0, 55.0] | support: [17.0, 48.0] |
| education | support: [4.0, 16.0] | support: [0.0, 18.0] | support: [4.0, 16.0] |
| black | support: [0.0, 1.0] | support: [0.0, 1.0] | support: [0.0, 1.0] |
| hispanic | support: [0.0, 1.0] | support: [0.0, 1.0] | support: [0.0, 1.0] |
| married | support: [0.0, 1.0] | support: [0.0, 1.0] | support: [0.0, 1.0] |
| no_degree | support: [0.0, 1.0] | support: [0.0, 1.0] | support: [0.0, 1.0] |
| re74 | support: [0.0, 35040.07] | support: [0.0, 137148.68] | support: [0.0, 35040.07] |
| re75 | support: [0.0, 25142.24] | support: [0.0, 156653.23] | support: [0.0, 25142.24] |
| re74=0 | support: {0, 1} | support: {0, 1} | support: {0, 1} |
| re75=0 | support: {0, 1} | support: {0, 1} | support: {0, 1} |
uvbbll.supports_table_["treatment"] - uvbbll.supports_table_["control"]
age                  [1.0, -7.0]
education            [4.0, -2.0]
black                 [0.0, 0.0]
hispanic              [0.0, 0.0]
married               [0.0, 0.0]
no_degree             [0.0, 0.0]
re74          [0.0, -102108.61]
re75          [0.0, -131510.99]
re74=0                        {}
re75=0                        {}
dtype: object
Here is an experimental idea: we can score the fit by reporting the largest overlap violation in standardized units, so a larger value means a worse support mismatch (compare the NHEFS score below with the much larger LaLonde score). It is not included in the main module because it is still experimental.
def score(uvbb, full_output=False):
    support_diff = (uvbb.scaled_supports_table_["treatment"]
                    - uvbb.scaled_supports_table_["control"])
    rescaled_delta_df = support_diff[support_diff.apply(bool)]
    if not full_output:
        return abs(np.vstack(rescaled_delta_df)).max()
    return rescaled_delta_df
score(uvbb, False)
0.41880441363664245
score(uvbbll, False)
19.54002226126555
We assume that every patient received exactly one treatment.
When considering multiple treatments we need to decide on a positivity estimation scheme. In the binary case there is an experimental group and a control group, and we want to check each treatment group's overlap against the control. Here we provide a scheme that enables positivity estimation even without a designated control group. In this notebook we present two meta-schemes for multiple-treatment positivity that use a basic binary positivity algorithm, such as Trimming, as a building block. For further theoretical discussion we refer the reader to the work on the theoretical basis of multiple treatments and multiple versions of treatment by Miguel A. Hernán.
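To make the meta-schemes concrete, here is a rough sketch of the one-vs-rest decomposition, assuming only that the base estimator exposes `fit` and `predict` like Trimming does (the `one_vs_rest_profile` helper is hypothetical; the actual classes are imported below):
import pandas as pd
from causallib.positivity import Trimming

# A minimal sketch of the one-vs-rest meta-scheme, not causallib's implementation:
# for each treatment value t, binarize the assignment into "t vs. rest" and run
# a binary positivity estimator; a patient's profile is one overlap flag per t.
def one_vs_rest_profile(X, a):
    profile = {}
    for t in sorted(a.unique()):
        binary_a = (a == t).astype(int)  # treatment t vs. everyone else
        estimator = Trimming()
        estimator.fit(X, binary_a)
        profile[t] = estimator.predict(X, binary_a)
    return pd.DataFrame(profile, index=X.index)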
from sklearn.datasets import make_classification
import pandas as pd
from causallib.positivity import Trimming
from causallib.positivity.multiple_treatment_positivity import OneVersusRestPositivity, OneVersusAnotherPositivity
import numpy as np
import seaborn as sns
def create_simulated_data():
    X, y = make_classification(
        n_samples=500,
        n_features=3,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        flip_y=0.01,
        class_sep=1.0,
        random_state=899,
    )
    # Keep the first two features as covariates and bin the last one into a
    # 5-level treatment (the right-hand side is evaluated before X is
    # reassigned, so X[:, -1] is the original third feature)
    X, a = X[:, :-1], pd.Series(
        pd.qcut(X[:, -1], 5, labels=[1, 2, 3, 4, 5], retbins=True)[0].to_numpy()
    )
    index_ = ['patient_' + str(i) for i in range(500)]
    return {
        "X": pd.DataFrame(X, index=index_),
        "a": pd.Series(data=a.values, index=index_),
        "y": pd.Series(data=y, index=index_),
    }
data = create_simulated_data()
pos_one_vs_control = OneVersusAnotherPositivity(
    Trimming(), verbose=False, treatment_pairs_list=5,
    treatment_a_name='Treatment', treatment_b_name='Control')
pos_one_vs_control.fit(data['X'], data['a'])
OneVersusAnotherPositivity(base_positivity_estimator=Trimming(threshold='crump'), treatment_a_name='Treatment', treatment_b_name='Control', treatment_pairs_list=[(4, 5), (2, 5), (3, 5), (1, 5)])
We can view the positivity profile of each patient for the treatment pair they belong to. Notice that the estimation population here is 'Both', i.e. the treated and control groups together.
The NaN values are for patients who were not included in a given single positivity estimation.
profile = pos_one_vs_control.positivity_profile(data['X'], data['a'], estimation_population='Both')
profile['a'] = data['a']
profile.head()
| Treatment | 4 | 2 | 3 | 1 | a |
|---|---|---|---|---|---|
| Control | 5 | 5 | 5 | 5 | |
| patient_0 | True | NaN | NaN | NaN | 4 |
| patient_1 | True | NaN | NaN | NaN | 4 |
| patient_2 | NaN | True | NaN | NaN | 2 |
| patient_3 | NaN | NaN | True | NaN | 3 |
| patient_4 | True | True | True | True | 5 |
We can also produce the report for every patient in the data, even those who were not part of the treated or control group of a given single estimator. Treated and control modes are available as well.
profile = pos_one_vs_control.positivity_profile(data['X'], data['a'], estimation_population='Every')
profile['a'] = data['a']
profile.head()
| Treatment | 4 | 2 | 3 | 1 | a |
|---|---|---|---|---|---|
| Control | 5 | 5 | 5 | 5 | |
| patient_0 | True | True | True | False | 4 |
| patient_1 | True | True | True | True | 4 |
| patient_2 | True | True | True | True | 2 |
| patient_3 | True | False | True | False | 3 |
| patient_4 | True | True | True | True | 5 |
We can also view a report per treatment; the same modes are available as before.
pos_one_vs_control.treatment_positivity_summary(data['X'], a=data['a'])
| Treatment | Control = 5 |
|---|---|
| 1 | 0.760 |
| 2 | 0.885 |
| 3 | 1.000 |
| 4 | 1.000 |
We can see the rate of overlapping cases per treated group. Next, instead of comparing every treatment to a fixed control, we compare every pair of treatments:
pos_one_vs_another = OneVersusAnotherPositivity(Trimming(), verbose=True)
pos_one_vs_another.fit(data['X'], data['a'])
Fitting treatment 10
100%|██████████| 10/10 [00:06<00:00, 1.59it/s]
OneVersusAnotherPositivity(base_positivity_estimator=Trimming(threshold='crump'), treatment_pairs_list=[(4, 2), (4, 3), (4, 5), (4, 1), (2, 3), (2, 5), (2, 1), (3, 5), (3, 1), (5, 1)], verbose=True)
Let's view the report:
pos_one_vs_another.treatment_positivity_summary(data['X'],a=data['a'],estimation_population='Both')
treatmentB | 1 | 2 | 3 | 5 |
---|---|---|---|---|
treatmentA | ||||
2 | 1.00 | NaN | 1.0 | 0.885 |
3 | 1.00 | NaN | NaN | 1.000 |
4 | 0.80 | 0.945 | 1.0 | 1.000 |
5 | 0.76 | NaN | NaN | NaN |
The rows correspond to the first treatment in each pair and the columns to the second. The report is not symmetric because each pair appears only once; treatment 1, for example, appears only as the second treatment in a pair. This report has the same modes as the previous one. If we want a symmetric report we can use the contingency table.
con = pos_one_vs_another.treatment_positivity_summary(
    data['X'], a=data['a'], estimation_population='Both', as_contingency_table=True)
sns.heatmap(con, annot=True)
<AxesSubplot:>
Now we can do a cool trick: we will create a contingency table where cell (i, j) is the rate of i-treated cases that overlap between treatments i and j, and cell (j, i) is the rate of j-treated cases that overlap between i and j.
tr = pos_one_vs_another.treatment_positivity_summary(
    data['X'], a=data['a'], estimation_population='treated', as_contingency_table=True)
cr = pos_one_vs_another.treatment_positivity_summary(
    data['X'], a=data['a'], estimation_population='control', as_contingency_table=True)
# Transposing the control-side rates puts them at (j, i), opposite the treated-side rates at (i, j)
con = tr.fillna(0) + cr.fillna(0).T
sns.heatmap(con, annot=True)
<AxesSubplot:>
This gives an exhaustive report; the diagonal is not relevant. Finally, the one-vs-rest scheme fits each treatment against all the others:
pos_one_vs_rest = OneVersusRestPositivity(Trimming(), verbose=True)
pos_one_vs_rest.fit(data['X'], data['a'])
pos_one_vs_rest.positivity_profile(data['X'], data['a']).head()
Fitting 5 treatment
| | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|
| patient_0 | False | False | True | True | True |
| patient_1 | True | True | True | True | True |
| patient_2 | True | True | True | True | True |
| patient_3 | False | False | True | True | True |
| patient_4 | True | True | True | True | True |
Note that with one-vs-rest, every patient is included in every positivity estimation population.
pos_one_vs_rest.treatment_positivity_summary(data['X'], a=data['a'])
1    0.882
2    0.880
3    1.000
4    0.942
5    0.878
dtype: float64
And here we can see the contingency table as well:
con = pos_one_vs_rest.treatment_positivity_summary(data['X'], a=data['a'], as_contingency_table=True)
sns.heatmap(con, annot=True)
<AxesSubplot:>