#!/usr/bin/env python
# coding: utf-8

# # MegaFon Uplift Competition Dataset

# The dataset is provided by MegaFon at the [MegaFon Uplift Competition](https://ods.ai/competitions/megafon-df21-comp) hosted in May 2021.
# 
# Here is a description of the fields:
# 
# - **id** (int): client id
# 
# - **treatment_group** (str): treatment/control group flag
# 
# - **X_1...X_50** (float): feature values
# 
# - **conversion** (binary): target

# In[ ]:


import sys

# install the uplift library scikit-uplift and the other required libraries
get_ipython().system('{sys.executable} -m pip install scikit-uplift dill lightgbm')


# In[1]:


from sklift.datasets import fetch_megafon

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')

from sklearn.model_selection import train_test_split

from sklift.models import TwoModels
import lightgbm as lgb

from sklift.metrics import qini_auc_score
from sklift.viz import plot_qini_curve

seed = 31


# ## 📝 Load data
# 
# The dataset can be loaded from the `sklift.datasets` module using the `fetch_megafon` function.

# In[2]:


# returns an sklearn Bunch object
# with data, target, treatment keys
# data features (pd.DataFrame), target (pd.Series), treatment (pd.Series) values

dataset = fetch_megafon()


# In[3]:


print(f"Dataset type: {type(dataset)}\n")
print(f"Dataset features shape: {dataset.data.shape}")
print(f"Dataset target shape: {dataset.target.shape}")
print(f"Dataset treatment shape: {dataset.treatment.shape}")


# ## 📝 EDA

# Let's have a look at the data features.

# In[4]:


# DataFrame.append was removed in pandas 2.0, so use pd.concat instead
pd.concat([dataset.data.head(), dataset.data.tail()])


# In[5]:


dataset.data.describe()


# In[9]:


dataset.data.dtypes.value_counts()


# In[6]:


print('Number NA:', dataset.data.isna().sum().sum())


# There are 51 columns in the dataset: the 50 feature columns are float and `id` is integer. The dataset has no missing values.

# ### Target and treatment

# Let's also take a look at the target and the treatment.

# In[10]:


sns.countplot(x=dataset.treatment)


# In[11]:


dataset.treatment.value_counts()


# In[12]:


sns.countplot(x=dataset.target)


# In[13]:


dataset.target.value_counts()


# In[14]:


pd.crosstab(dataset.treatment, dataset.target, normalize='index')


# As we can see, the target is imbalanced.

# ### Simple baseline

# Convert the **treatment** flag to binary values.

# In[16]:


# map the string flags to 0/1 (avoids object-dtype assignment warnings in recent pandas)
dataset.treatment = dataset.treatment.map({'treatment': 1, 'control': 0})


# In[17]:


dataset.treatment.value_counts()


# In a binary classification problem we usually stratify the train/validation split by the `0/1` target column. In uplift modeling there are two columns to balance instead of one: treatment and target.
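# As a small aside (a sketch added here, not part of the original notebook): passing the two
# columns as a DataFrame to `train_test_split(..., stratify=...)` effectively stratifies on the
# combined treatment/target key, so each of the four (treatment_group, conversion) cells keeps
# its share in both splits. The `strat_key` below is built only to inspect those shares.

# In[ ]:


# illustrative only: combined stratification key with four classes (0_0, 0_1, 1_0, 1_1)
strat_key = dataset.treatment.astype(str) + "_" + dataset.target.astype(str)
strat_key.value_counts(normalize=True)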
# In[18]:


# stratify by two columns: treatment and target
stratify_cols = pd.concat([dataset.treatment, dataset.target], axis=1)

X_train, X_val, trmnt_train, trmnt_val, y_train, y_val = train_test_split(
    dataset.data,
    dataset.treatment,
    dataset.target,
    stratify=stratify_cols,
    test_size=0.3,
    random_state=seed
)

print(f"Train shape: {X_train.shape}")
print(f"Validation shape: {X_val.shape}")


# In[19]:


treatment_model = lgb.LGBMClassifier(random_state=seed)
control_model = lgb.LGBMClassifier(random_state=seed)

tm = TwoModels(estimator_trmnt=treatment_model,
               estimator_ctrl=control_model,
               method='vanilla')


# In[20]:


tm = tm.fit(X_train, y_train, trmnt_train)


# In[21]:


uplift_tm = tm.predict(X_val)


# In[22]:


# AUQC = area under the Qini curve = Qini coefficient
auqc = qini_auc_score(y_val, uplift_tm, trmnt_val)
print(f"Qini coefficient on validation data: {auqc:.4f}")


# In[23]:


# plot the Qini curve together with the ideal Qini curve (red line)
# perfect=True
plot_qini_curve(y_val, uplift_tm, trmnt_val, perfect=True);
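# As an optional extra check (a sketch added here, not part of the original notebook),
# `sklift.metrics.uplift_at_k` estimates the uplift among the top share of clients ranked by
# the predicted uplift; k=0.3 means the top 30%, and `strategy='overall'` ranks all clients
# together rather than within the treatment and control groups separately.

# In[ ]:


from sklift.metrics import uplift_at_k

uplift_at_30 = uplift_at_k(y_val, uplift_tm, trmnt_val, strategy='overall', k=0.3)
print(f"uplift@30%: {uplift_at_30:.4f}")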