In [1]:
import numpy as np
import statsmodels.api as sm
from statsmodels.tools import add_constant
from matplotlib import pyplot

In [2]:
def logistic_map(x):
return np.exp(x)/(1+np.exp(x))

def logistic_inv(x):
return np.log(x/(1-x))

def print_population_log_odds(P_T,P_C):
pop_log_odds = logistic_inv(P_T)-logistic_inv(P_C)
print("P O P U L A T I O N")
print("------------------------------------------")
print("Population log odds = {:.2f}".format(pop_log_odds))
print("Population odds ratio = {:.2f}\n\n".format(np.exp(pop_log_odds)))
return pop_log_odds

def logistic_vs_plugin(sampler,pop_log_odds,ntrials=1000,nsamples=10000):
'''
Run ntrials experiments where we sample n individuals from the population, fit a
logistic model predicting Y from (X,Z), and then using the coefficient of X as an
estimate of the log odds. Display the average point estimate and the frequency in
which the true log odds falls in the 95% confidence interval. Also compute a
plug-in estimate of the log odds for comparison.

sampler: method that produces n samples from the generative model of (X,Z,Y)
pop_log_odds: true population log odds
ntrials : number of times to run the experiment
nsamples:  number of samples from the population per experiment
'''

success_count=np.zeros(ntrials)
beta_est = np.empty(ntrials)
logistic_odds_ratio_est = np.empty(ntrials)
plug_in_log_odds_est= np.empty(ntrials)
for i in range(ntrials):

# sample nsamples from the distribution
covariate,treatment,outcome = sampler(nsamples)

# run logistic regression and compute confidence intervals
model = sm.Logit(outcome, features).fit(disp=0)

beta_est[i] = model.params[1]
logistic_odds_ratio_est[i] = np.exp(model.params[1])
lower_int = model.conf_int(alpha=0.05, cols=None)[1,0]
if lower_int<pop_log_odds:
success_count[i]=1

# plugin estimator
P_T_est = np.sum(treatment*outcome)/np.sum(treatment)
P_C_est = np.sum((1-treatment)*outcome)/np.sum(1-treatment)
plug_in_log_odds_est[i]= np.log(P_T_est/(1-P_T_est))-np.log(P_C_est/(1-P_C_est))

print("L O G I S T I C   R E G R E S S I O N")
print("------------------------------------------")
print("Median estimate of treatment coefficient = {:.2f}".format(np.median(beta_est)))
print("Median estimated odds ratio from logistic = {:.2f}".format(np.median(logistic_odds_ratio_est)))
print("Confidence interval contains population log-odds {:.0f}% of the time\n\n".format(100*np.mean(success_count)))

print("P L U G I N   E S T I M A T O R")
print("------------------------------------------")
print("Median plugin estimate of log odds = {:.2f}".format(np.median(plug_in_log_odds_est)))
print("Median estimated odds ratio from plugin = {:.2f}\n\n".format(np.median(np.exp(plug_in_log_odds_est))))

# plot a histogram of the estimates
bins = np.linspace(0, 1.1*np.amax(beta_est), 50)

pyplot.hist(beta_est, bins, alpha=0.5, label='logistic estimate')
pyplot.hist(plug_in_log_odds_est, bins, alpha=0.5, label='plugin estimate')
pyplot.plot([pop_log_odds,pop_log_odds],[0,100],color=[0,0,0])
pyplot.legend(loc='upper right')
pyplot.show()


# Example 1: An additive model¶

True model is

$$\mathbb{E}[Y|X,Z] = \delta_p X + p_{\mathrm{hi}}Z+(1-Z)p_{\mathrm{lo}}$$

$X$ is the treatment variable, $Z$ the covariate. Both $X$ and $Z$ are independent Bernoulli variables with probability of $1$ equal to $\tfrac{1}{2}$.

In [3]:
p_hi = 0.8 # probability of success in the high probability subpopulation
p_lo = 0.2 # probability of success in the low probability subpopulation
delta_p = 0.05 # effect size

# probability of success under treatment
P_T_additive = delta_p + 0.5*p_hi+0.5*p_lo
# probability of success under control

# generate covariate and treatment variables and compute conditional outcome probabilities
covariate = np.random.binomial(1,0.5,n)
treatment = np.random.binomial(1,0.5,n)
prob = delta_p*treatment + (p_hi-p_lo)*covariate + p_lo
outcome = np.random.binomial(1, prob)
return covariate,treatment,outcome

In [4]:
pop_log_odds_additive=print_population_log_odds(P_T_additive,P_C_additive)

P O P U L A T I O N
------------------------------------------
Population log odds = 0.20
Population odds ratio = 1.22

L O G I S T I C   R E G R E S S I O N
------------------------------------------
Median estimate of treatment coefficient = 0.32
Median estimated odds ratio from logistic = 1.37
Confidence interval contains population log-odds 38% of the time

P L U G I N   E S T I M A T O R
------------------------------------------
Median plugin estimate of log odds = 0.20
Median estimated odds ratio from plugin = 1.22



# Example 2: An actual logistic model¶

True model is

$$\log \frac {\mathbb{P}[Y=1|X,Z]}{1-\mathbb{P}[Y=1|X,Z]} = \alpha + \beta X + \gamma Z$$

$X$ is the treatment variable, $Z$ the covariate. Both $X$ and $Z$ are independent Bernoulli variables with probability of $1$ equal to $\tfrac{1}{2}$.

In [5]:
# logistic model parameters
alpha_true = -1.5
beta_true =  0.4
gamma_true =  3.0

# probability of success under treatment
p_hi_T = logistic_map(alpha_true+beta_true+gamma_true)
p_lo_T = logistic_map(alpha_true+beta_true)
P_T_logistic = 0.5*p_hi_T+0.5*p_lo_T

# probability of success under control
p_hi_C = logistic_map(alpha_true+gamma_true)
p_lo_C = logistic_map(alpha_true)
P_C_logistic = 0.5*p_hi_C+0.5*p_lo_C

def sample_logistic_model(n):
# generate covariate and treatment variables and compute conditional outcome probabilities
covariate = np.random.binomial(1,0.5,n)
treatment = np.random.binomial(1,0.5,n)
prob = logistic_map(alpha_true + beta_true*treatment + gamma_true*covariate)
outcome = np.random.binomial(1, prob)
return covariate,treatment,outcome

In [6]:
pop_log_odds_logistic=print_population_log_odds(P_T_logistic,P_C_logistic)
logistic_vs_plugin(sample_logistic_model,pop_log_odds_logistic)

P O P U L A T I O N
------------------------------------------
Population log odds = 0.24
Population odds ratio = 1.27

L O G I S T I C   R E G R E S S I O N
------------------------------------------
Median estimate of treatment coefficient = 0.40
Median estimated odds ratio from logistic = 1.49
Confidence interval contains population log-odds 13% of the time

P L U G I N   E S T I M A T O R
------------------------------------------
Median plugin estimate of log odds = 0.24
Median estimated odds ratio from plugin = 1.27


In [ ]: