Our company XYZ has an initiative to improve email signup rates when users visit our website. The current design is clunky, and we want to test a more efficient flow. Marketing works with data science to design an experiment in which users who reach the website are randomly assigned 50/50: the control group sees the existing flow, while the treatment group sees the new flow.
In this example, we have a relatively small sample size and treatment effect. Users have, on average, a baseline 30% probability of signing up, and the treatment increases that probability by a relative 7%. The experiment sample size is just 5,000 users.
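Before bringing in any covariates, it's worth checking how underpowered a plain two-proportion comparison is at this sample size. The sketch below is a back-of-envelope power calculation under the stated design parameters (30% baseline, 7% relative lift, 2,500 users per arm); the variable names are illustrative and not part of the analysis code that follows.

from statsmodels.stats.proportion import proportion_effectsize
from statsmodels.stats.power import NormalIndPower

# design parameters described above
baseline_rate = 0.30
relative_lift = 0.07
users_per_arm = 2500

# Cohen's h effect size for comparing the two proportions
effect_size = proportion_effectsize(baseline_rate * (1 + relative_lift), baseline_rate)

# power of a two-sided two-proportion z-test at alpha = 0.05;
# with these inputs it comes out well below the conventional 80% target
power = NormalIndPower().solve_power(effect_size=effect_size,
                                     nobs1=users_per_arm,
                                     alpha=0.05,
                                     alternative='two-sided')
print("Approximate power: {:.0%}".format(power))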
Importantly, there is a wide range of baseline signup probabilities across users, and we have access to an accurate user-level estimate of that probability.
For example, suppose the data science team built a propensity score model that predicts each user's likelihood of signing up based on observed characteristics. Including this predicted probability as a covariate reduces the standard error of the estimated treatment effect by more than 20%. In this example, that drove the p-value from 0.08 to 0.03, moving the result from marginally not significant to statistically significant (based on a 0.05 threshold).
import numpy as np
import pandas as pd
np.random.seed(12345)
# experiment sample size
num_users = 5000
# randomly assign treatment and control groups
df = pd.DataFrame(np.random.choice([0, 1], size=num_users, p=[0.5, 0.5]), columns=['treated'])
# probability of signup for treatment and control
control_p = 0.3
treatment_effect = 0.07
# each user's baseline signup probability: wide dispersion around the 30% average
df['random_p'] = np.random.normal(control_p, 0.4, size=num_users)
# uniform draw used to simulate each user's binary signup decision
df['randnum'] = np.random.rand(num_users)
# signup = 1 when the uniform draw falls below the user's (treatment-adjusted) probability
df['signup'] = 0
df['signup'] = np.where((df['treated']==0) & (df['randnum'] < df['random_p']), 1, df['signup'])
df['signup'] = np.where((df['treated']==1) & (df['randnum'] < df['random_p']*(1+treatment_effect)), 1, df['signup'])
print(df.groupby(['treated'])['signup'].agg(['count','mean']).round(2))
         count  mean
treated
0         2505  0.34
1         2495  0.36
import statsmodels.formula.api as smf
formula = 'signup ~ treated'
model = smf.ols(formula,df).fit()
print(model.summary())
# standard error of the treatment estimate from the unadjusted model
se1 = model.bse['treated']
                            OLS Regression Results
==============================================================================
Dep. Variable:                 signup   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     3.105
Date:                Mon, 01 Jan 2024   Prob (F-statistic):             0.0781
Time:                        16:27:32   Log-Likelihood:                -3400.8
No. Observations:                5000   AIC:                             6806.
Df Residuals:                    4998   BIC:                             6819.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.3409      0.010     35.712      0.000       0.322       0.360
treated        0.0238      0.014      1.762      0.078      -0.003       0.050
==============================================================================
Omnibus:                    24719.136   Durbin-Watson:                   1.990
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              861.403
Skew:                           0.616   Prob(JB):                    8.88e-188
Kurtosis:                       1.382   Cond. No.                         2.62
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
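As a quick sanity check (not part of the original analysis), the same unadjusted comparison can be run as a two-sample t-test; with a binary outcome and pooled variance it should give essentially the same t-statistic and p-value as the regression of signup on the treatment indicator alone.

from scipy import stats

# two-sample t-test on the raw signup indicator; this mirrors the
# unadjusted OLS regression of signup on treated shown above
treated_signups = df.loc[df['treated'] == 1, 'signup']
control_signups = df.loc[df['treated'] == 0, 'signup']
t_stat, p_value = stats.ttest_ind(treated_signups, control_signups)
print("t = {:.2f}, p = {:.3f}".format(t_stat, p_value))

Next, we add the user-level signup probability as a covariate.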
formula = 'signup ~ treated + random_p'
model = smf.ols(formula,df).fit()
print(model.summary())
# standard error of the treatment estimate from the covariate-adjusted model
se2 = model.bse['treated']
print("\nStandard error reduction: {:.2f}%".format(100*(se1-se2)/se1))
                            OLS Regression Results
==============================================================================
Dep. Variable:                 signup   R-squared:                       0.391
Model:                            OLS   Adj. R-squared:                  0.391
Method:                 Least Squares   F-statistic:                     1604.
Date:                Mon, 01 Jan 2024   Prob (F-statistic):               0.00
Time:                        16:45:25   Log-Likelihood:                -2162.8
No. Observations:                5000   AIC:                             4332.
Df Residuals:                    4997   BIC:                             4351.
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1252      0.008     14.957      0.000       0.109       0.142
treated        0.0234      0.011      2.216      0.027       0.003       0.044
random_p       0.7434      0.013     56.588      0.000       0.718       0.769
==============================================================================
Omnibus:                      740.320   Durbin-Watson:                   1.983
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              200.539
Skew:                           0.184   Prob(JB):                     2.84e-44
Kurtosis:                       2.091   Cond. No.                         3.16
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Standard error reduction: 21.92%
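The size of the reduction lines up with a back-of-envelope rule: with a randomized treatment, adding a covariate shrinks the standard error of the treatment estimate by roughly a factor of sqrt((1 - R²_adjusted) / (1 - R²_unadjusted)). The quick check below plugs in the R-squared values from the two summaries above; it is an approximation, not an exact identity.

import numpy as np

# R-squared from the unadjusted and covariate-adjusted regressions above
r2_unadjusted = 0.001
r2_adjusted = 0.391

# approximate ratio of adjusted to unadjusted standard errors
se_ratio = np.sqrt((1 - r2_adjusted) / (1 - r2_unadjusted))
print("Implied standard error reduction: {:.1f}%".format(100 * (1 - se_ratio)))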
print("The estimated absolute impact is: {0:.2f}% \
\nThe estimated relative impact is {1:.2f}% \
\nThe t-statistic is {2:.1f} \
\nThe p-value is {3:.2f}%".format(
100*model.params['treated'],
100*model.params['treated']/df[df['treated']==0]['signup'].mean(),
model.tvalues['treated'],
100*model.pvalues['treated']
))
The estimated absolute impact is: 2.34%
The estimated relative impact is 6.86%
The t-statistic is 2.2
The p-value is 2.67%
We simulated data for an experiment in which the true treatment effect increased signup rates by a relative 7% from a baseline signup rate of 30%. Naturally, there is sampling error since we only observe the users in the experiment; in this case, the estimated treatment effect was a relative increase of just under 7%. With the relatively small sample size, the regression without any controls (equivalent to a two-sample t-test) yielded an estimated treatment effect that was marginally not statistically significant (p = 0.08).
However, when we include the propensity score in the regression, we reduce the standard error of the treatment effect estimate by more than 20%, leading to a statistically significant result (p ≈ 0.03) and a meaningful reduction in the likelihood that an effect of this size would be observed by chance alone.
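One caveat, flagged in the regression notes above, is that the default OLS standard errors assume the error covariance is correctly specified, which a binary outcome does not strictly satisfy. A quick robustness check (not part of the analysis above) is to refit the adjusted model with heteroskedasticity-robust standard errors; the conclusion should not change materially.

import statsmodels.formula.api as smf

# refit the covariate-adjusted model with heteroskedasticity-robust (HC1) standard errors
robust_model = smf.ols('signup ~ treated + random_p', df).fit(cov_type='HC1')
print(robust_model.summary().tables[1])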