#!/usr/bin/env python
# coding: utf-8

# # MegaFon Uplift Competition Dataset

# The dataset is provided by MegaFon at the [MegaFon Uplift Competition](https://ods.ai/competitions/megafon-df21-comp) hosted in May 2021.
# 
# Here is a description of the fields:
# 
# - **id** (int): client id
# 
# - **treatment_group** (str): treatment/control group flag
# 
# - **X_1...X_50** (float): feature values
# 
# - **conversion** (binary): target

# In[ ]:


import sys

# install the uplift library scikit-uplift and the other required libraries
get_ipython().system('{sys.executable} -m pip install scikit-uplift dill lightgbm')


# In[1]:


from sklift.datasets import fetch_megafon

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')

from sklearn.model_selection import train_test_split

from sklift.models import TwoModels
import lightgbm as lgb

from sklift.metrics import qini_auc_score
from sklift.viz import plot_qini_curve

seed = 31


# ## 📝 Load data
# 
# The dataset can be loaded from the `sklift.datasets` module using the `fetch_megafon` function.

# In[2]:


# returns an sklearn Bunch object
# with data, target, treatment keys
# data features (pd.DataFrame), target (pd.Series), treatment (pd.Series) values

dataset = fetch_megafon()


# In[3]:


print(f"Dataset type: {type(dataset)}\n")
print(f"Dataset features shape: {dataset.data.shape}")
print(f"Dataset target shape: {dataset.target.shape}")
print(f"Dataset treatment shape: {dataset.treatment.shape}")


# ## 📝 EDA

# Let's have a look at the data features.

# In[4]:


# DataFrame.append was removed in pandas 2.0, so use pd.concat instead
pd.concat([dataset.data.head(), dataset.data.tail()])


# In[5]:


dataset.data.describe()


# In[9]:


dataset.data.dtypes.value_counts()


# In[6]:


print('Number NA:', dataset.data.isna().sum().sum())


# There are 51 columns in the dataset: the 50 feature columns are float and `id` is integer. The dataset has no missing values.

# ### Target and treatment

# Let's also take a look at the target and the treatment.

# In[10]:


sns.countplot(x=dataset.treatment)


# In[11]:


dataset.treatment.value_counts()


# In[12]:


sns.countplot(x=dataset.target)


# In[13]:


dataset.target.value_counts()


# In[14]:


pd.crosstab(dataset.treatment, dataset.target, normalize='index')


# As we can see, the target is imbalanced.

# ### Simple baseline

# Convert the **treatment** flag to binary values.

# In[16]:


# map the string flags to 0/1 (avoids object-dtype assignment warnings in recent pandas)
dataset.treatment = dataset.treatment.map({'treatment': 1, 'control': 0})


# In[17]:


dataset.treatment.value_counts()


# In a binary classification problem we usually stratify the train/validation split by the `0/1` target column. In uplift modeling there are two columns to balance instead of one: treatment and target.
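# As a small aside (a sketch added here, not part of the original notebook): passing the two
# columns as a DataFrame to `train_test_split(..., stratify=...)` effectively stratifies on the
# combined treatment/target key, so each of the four (treatment_group, conversion) cells keeps
# its share in both splits. The `strat_key` below is built only to inspect those shares.

# In[ ]:


# illustrative only: combined stratification key with four classes (0_0, 0_1, 1_0, 1_1)
strat_key = dataset.treatment.astype(str) + "_" + dataset.target.astype(str)
strat_key.value_counts(normalize=True)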
# In[18]:


# stratify by two columns: treatment and target
stratify_cols = pd.concat([dataset.treatment, dataset.target], axis=1)

X_train, X_val, trmnt_train, trmnt_val, y_train, y_val = train_test_split(
    dataset.data,
    dataset.treatment,
    dataset.target,
    stratify=stratify_cols,
    test_size=0.3,
    random_state=seed
)

print(f"Train shape: {X_train.shape}")
print(f"Validation shape: {X_val.shape}")


# In[19]:


treatment_model = lgb.LGBMClassifier(random_state=seed)
control_model = lgb.LGBMClassifier(random_state=seed)

tm = TwoModels(estimator_trmnt=treatment_model,
               estimator_ctrl=control_model,
               method='vanilla')


# In[20]:


tm = tm.fit(X_train, y_train, trmnt_train)


# In[21]:


uplift_tm = tm.predict(X_val)


# In[22]:


# AUQC = area under the Qini curve = Qini coefficient
auqc = qini_auc_score(y_val, uplift_tm, trmnt_val)
print(f"Qini coefficient on validation data: {auqc:.4f}")


# In[23]:


# plot the Qini curve together with the ideal Qini curve (red line)
# perfect=True
plot_qini_curve(y_val, uplift_tm, trmnt_val, perfect=True);
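# As an optional extra check (a sketch added here, not part of the original notebook),
# `sklift.metrics.uplift_at_k` estimates the uplift among the top share of clients ranked by
# the predicted uplift; k=0.3 means the top 30%, and `strategy='overall'` ranks all clients
# together rather than within the treatment and control groups separately.

# In[ ]:


from sklift.metrics import uplift_at_k

uplift_at_30 = uplift_at_k(y_val, uplift_tm, trmnt_val, strategy='overall', k=0.3)
print(f"uplift@30%: {uplift_at_30:.4f}")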