#!/usr/bin/env python
# coding: utf-8

# Synthetic-dataset generators exported from a notebook.
#
# Each section below seeds NumPy's global RNG, draws its variables and writes
# one CSV file into the current working directory. The generated values depend
# on the order of the np.random calls, so statements inside a section must not
# be reordered.

import numpy as np
import pandas as pd

# ## collections_email

# In[1]:

np.random.seed(24)

n = 5000

# 50/50 Bernoulli flag.
email = np.random.binomial(1, 0.5, n)

credit_limit = np.random.gamma(6, 200, n)
# Beta draw: first shape parameter is each row's own credit limit, second is
# the sample mean of the credit limits.
risk_score = np.random.beta(credit_limit, credit_limit.mean(), n)

# Multiplying by `email` forces opened == 0 wherever email == 0.
opened = np.random.normal(5 + 0.001 * credit_limit - 4 * risk_score, 2)
opened = (opened > 4).astype(float) * email

# Multiplying by `opened` zeroes the latent value wherever opened == 0, so
# agreement can only be 1 on rows with opened == 1.
agreement = (
    np.random.normal(30 + (-0.003 * credit_limit - 10 * risk_score), 7) * 2 * opened
)
agreement = (agreement > 40).astype(float)

# Truncate to int, then floor to a multiple of 10.
payments = (
    np.random.normal(
        500 + 0.16 * credit_limit - 40 * risk_score + 11 * agreement + email, 75
    ).astype(int)
    // 10
) * 10

data = pd.DataFrame(
    dict(
        payments=payments,
        email=email,
        opened=opened,
        agreement=agreement,
        credit_limit=credit_limit,
        risk_score=risk_score,
    )
)

data.to_csv("collections_email.csv", index=False)


# ## hospital_treatment

# In[2]:

np.random.seed(24)

n = 80

hospital = np.random.binomial(1, 0.5, n)

# Both candidate arrays are drawn in full (p=0.9 first, then p=0.1) and
# np.where picks per row according to the hospital flag.
treatment = np.where(
    hospital.astype(bool),
    np.random.binomial(1, 0.9, n),
    np.random.binomial(1, 0.1, n),
)

# Same pattern: mean 20 where hospital == 1, mean 10 otherwise.
severity = np.where(
    hospital.astype(bool),
    np.random.normal(20, 5, n),
    np.random.normal(10, 5, n),
)

days = np.random.normal(15 + -5 * treatment + 2 * severity, 7).astype(int)

# NOTE: `hospital` is rebound here from the 0/1 array to the final DataFrame.
hospital = pd.DataFrame(
    dict(hospital=hospital, treatment=treatment, severity=severity, days=days)
)

hospital.to_csv("hospital_treatment.csv", index=False)


# ## app engagement push

# In[3]:

np.random.seed(24)

n = 10000

push_assigned = np.random.binomial(1, 0.5, n)

income = np.random.gamma(6, 200, n)

# NOTE(review): `0.3+income` adds a constant instead of scaling income; a
# `0.3*income` term may have been intended. Left unchanged because altering it
# would change the generated CSV -- confirm before touching.
push_delivered = np.random.normal(5 + 0.3+income, 500)
# Delivery requires both the latent value above 800 and push_assigned == 1.
push_delivered = ((push_delivered > 800) & (push_assigned == 1)).astype(int)

in_app_purchase = (
    np.random.normal(100 + 20 * push_delivered + 0.5 * income, 75).astype(int) // 10
)

# `income` is deliberately not part of the exported columns.
data = pd.DataFrame(
    dict(
        in_app_purchase=in_app_purchase,
        push_assigned=push_assigned,
        push_delivered=push_delivered,
    )
)

data.to_csv("app_engagement_push.csv", index=False)
# ## Drug Impact

# In[17]:

import numpy as np
import pandas as pd


def make_confounded_data(N, seed=1111):
    """Generate three aligned synthetic data sets for a medication study.

    The global NumPy RNG is reseeded with ``seed`` on entry, so repeated calls
    with the same arguments return identical frames. As a side effect the
    caller's global RNG state is overwritten.

    Args:
        N: Number of rows to generate.
        seed: Value passed to ``np.random.seed``. The default (1111) is the
            value that was previously hard-coded.

    Returns:
        A tuple ``(df_rnd, df_obs, df_ctf)`` of DataFrames with columns
        ``sex, age, severity, medication, recovery``:

        * ``df_rnd`` -- ``medication`` is drawn from Beta(1, 1) independently
          of the other columns.
        * ``df_obs`` -- same covariates, but ``medication`` is the 0/1 value
          produced by ``get_treatment`` (a function of ``sex`` and
          ``severity``) and ``recovery`` is recomputed.
        * ``df_ctf`` -- ``df_obs`` with ``medication`` flipped (0 <-> 1) and
          ``recovery`` recomputed.
    """

    def get_severity(df):
        # Beta(1, 3) draw for rows with age < 30, Beta(3, 1.5) draw otherwise;
        # the boolean masks zero out the draw that does not apply.
        return ((np.random.beta(1, 3, size=df.shape[0]) * (df["age"] < 30)) +
                (np.random.beta(3, 1.5, size=df.shape[0]) * (df["age"] >= 30)))

    def get_treatment(df):
        # 1 when a noisy score built from sex and severity exceeds 1.5.
        return ((.33 * df["sex"] +
                 1.5 * df["severity"] + df["severity"] ** 2 +
                 0.15 * np.random.normal(size=df.shape[0])) > 1.5).astype(int)

    def get_recovery(df):
        # Deterministic function of the row (no random draw), scaled by 10
        # and truncated to int; medication enters with a negative sign.
        return ((2 +
                 0.5 * df["sex"] +
                 0.03 * df["age"] + 0.03 * ((df["age"] * 0.1) ** 2) +
                 df["severity"] + np.log(df["severity"]) +
                 df["sex"] * df["severity"] -
                 df["medication"]) * 10).astype(int)

    np.random.seed(seed)
    sexes = np.random.randint(0, 2, size=N)
    ages = np.random.gamma(8, scale=4, size=N)
    meds = np.random.beta(1, 1, size=N)

    # data with random assignment
    df_rnd = pd.DataFrame(dict(sex=sexes, age=ages, medication=meds))
    df_rnd['severity'] = get_severity(df_rnd)
    df_rnd['recovery'] = get_recovery(df_rnd)

    features = ['sex', 'age', 'severity', 'medication', 'recovery']
    df_rnd = df_rnd[features]  # to enforce column order

    # observational data
    df_obs = df_rnd.copy()
    df_obs['medication'] = get_treatment(df_obs)
    df_obs['recovery'] = get_recovery(df_obs)

    # counterfactual data
    df_ctf = df_obs.copy()
    df_ctf['medication'] = ((df_ctf['medication'] == 1) ^ 1).astype(float)
    df_ctf['recovery'] = get_recovery(df_ctf)

    return df_rnd, df_obs, df_ctf


# NOTE: this seed does not influence the result below, because
# make_confounded_data reseeds the global RNG itself; kept as in the original.
np.random.seed(1234)
df_rnd, df_obs, df_ctf = make_confounded_data(20000)

# Only the observational frame is exported.
df_obs.to_csv("medicine_impact_recovery.csv", index=False)


# ## Billboard Mkt

# In[5]:

np.random.seed(123)

# Each group is a gamma draw multiplied by a Bernoulli mask, so a share of the
# rows is exactly zero. Group sizes: 500, 800, 1300 and 2000 rows.
POAMay = np.random.gamma(7, 10, 500) * np.random.binomial(1, .7, 500)
POAJul = np.random.gamma(7, 15, 800) * np.random.binomial(1, .8, 800)

FLMay = np.random.gamma(10, 20, 1300) * np.random.binomial(1, .85, 1300)
FLJul = np.random.gamma(11, 21, 2000) * np.random.binomial(1, .9, 2000)

# Stack the four groups; `poa` and `jul` are 0/1 indicator columns that mark
# which group a row came from. Deposits are truncated to int.
data = pd.concat([
    pd.DataFrame(dict(deposits=POAMay.astype(int), poa=1, jul=0)),
    pd.DataFrame(dict(deposits=POAJul.astype(int), poa=1, jul=1)),
    pd.DataFrame(dict(deposits=FLMay.astype(int), poa=0, jul=0)),
    pd.DataFrame(dict(deposits=FLJul.astype(int), poa=0, jul=1)),
])

data.to_csv("billboard_impact.csv", index=False)


# In[ ]: