#!/usr/bin/env python
# coding: utf-8
# # 🎯 Uplift modeling `metrics` advanced
#
#
#
#
#
#
#
# SCIKIT-UPLIFT REPO |
# SCIKIT-UPLIFT DOCS |
# USER GUIDE
#
#
# In[1]:
import sys
# install uplift library scikit-uplift and other libraries
# NOTE: get_ipython() exists only when running inside IPython/Jupyter; this
# shell-escape installs the packages into the environment of the active kernel
# (using sys.executable guarantees the same interpreter as the kernel).
get_ipython().system('{sys.executable} -m pip install scikit-uplift dill catboost')
from IPython.display import clear_output
# Hide the lengthy pip install log from the notebook output.
clear_output()
# # 📝 Load data
#
# We are going to use a `Lenta dataset` from the BigTarget Hackathon hosted in summer 2020 by Lenta and Microsoft.
#
# Lenta is a Russian food retailer.
#
# ### Data description
#
# ✏️ Dataset can be loaded from `sklift.datasets` module using `fetch_lenta` function.
#
# Read more about dataset in the api docs.
#
# This is an uplift modeling dataset containing data about Lenta's customers' grocery shopping, marketing campaign communications as `treatment` and store visits as `target`.
#
# #### ✏️ Major columns:
#
# - `group` - treatment / control flag
# - `response_att` - binary target
# - `CardHolder` - customer id
# - `gender` - customer gender
# - `age` - customer age
# In[2]:
from sklift.datasets import fetch_lenta
# returns sklearn Bunch object
# with data, target, treatment keys
# data features (pd.DataFrame), target (pd.Series), treatment (pd.Series) values
# NOTE(review): fetch_lenta presumably downloads the data on first call —
# requires network access; verify the cache location in the sklift docs.
dataset = fetch_lenta()
# In[3]:
# Quick sanity check: container type and the shapes of all three parts.
print(f"Dataset type: {type(dataset)}\n")
parts = (
    ("features", dataset.data),
    ("target", dataset.target),
    ("treatment", dataset.treatment),
)
for label, part in parts:
    print(f"Dataset {label} shape: {part.shape}")
# # 📝 EDA
# In[4]:
dataset.data.head().append(dataset.data.tail())
# ### 🤔 target share for `treatment / control`
# In[5]:
import pandas as pd
# Cross-tabulate treatment vs. target, normalized within each row
# (normalize='index') — i.e. the response rate inside each treatment group.
pd.crosstab(dataset.treatment, dataset.target, normalize='index')
# In[6]:
# make treatment binary
# Encode the treatment flag as binary integers: 'test' -> 1, 'control' -> 0.
treat_dict = dict(test=1, control=0)
dataset.treatment = dataset.treatment.map(treat_dict)
# In[7]:
# fill NaNs in the categorical feature `gender`
# for CatBoostClassifier
# CatBoostClassifier cannot handle NaN in categorical features, so replace
# missing genders with an explicit "undefined" category (Russian label kept
# as in the original data).
gender_filled = dataset.data['gender'].fillna('Не определен')
dataset.data['gender'] = gender_filled
print(dataset.data['gender'].value_counts(dropna=False))
# ### ✂️ train test split
#
# - stratify by two columns: treatment and target.
#
# `Intuition:` In a binary classification problem definition we stratify train set by splitting target `0/1` column. In uplift modeling we have two columns instead of one.
# In[8]:
from sklearn.model_selection import train_test_split

# Stratify on the (treatment, target) pair so both splits preserve the joint
# distribution of communication flag and response.
stratify_cols = pd.concat([dataset.treatment, dataset.target], axis=1)

splits = train_test_split(
    dataset.data,
    dataset.treatment,
    dataset.target,
    stratify=stratify_cols,
    test_size=0.3,
    random_state=42,
)
X_train, X_val, trmnt_train, trmnt_val, y_train, y_val = splits

print(f"Train shape: {X_train.shape}")
print(f"Validation shape: {X_val.shape}")
# # 👾 Class Transformation uplift model and Two Models
#
# ### For example, let's take the models [ Class Transformation ](https://github.com/maks-sh/scikit-uplift/blob/c9dd56aa0277e81ef7c4be62bf2fd33432e46f36/sklift/models/models.py#L181) and [Two Models](https://github.com/maks-sh/scikit-uplift/blob/c9dd56aa0277e81ef7c4be62bf2fd33432e46f36/sklift/models/models.py#L271). Let's display their uplift scores on one graph
# In[9]:
from catboost import CatBoostClassifier
from sklearn.base import clone
from sklift.models import TwoModels
from sklift.models import ClassTransformation

# Base CatBoost configuration shared (via clone) by every uplift model.
first_estimator = CatBoostClassifier(verbose=100,
                                     task_type="GPU",
                                     devices='0:1',
                                     cat_features=['gender'],
                                     random_state=42,
                                     thread_count=1)
second_estimator = clone(first_estimator)

# BUG FIX: previously the SAME `first_estimator` instance was passed to both
# ClassTransformation and TwoModels(estimator_trmnt=...). Fitting two_model
# would then refit the very instance held inside transform_model, silently
# corrupting transform_model's later predictions. Each model now receives its
# own unfitted clone, so the two fits are fully independent.
transform_model = ClassTransformation(estimator=clone(first_estimator))
two_model = TwoModels(estimator_trmnt=clone(first_estimator),
                      estimator_ctrl=second_estimator)
# In[10]:
# Both models are trained on the same fold with the same keyword arguments.
fit_kwargs = dict(X=X_train, y=y_train, treatment=trmnt_train)
transform_model = transform_model.fit(**fit_kwargs)
two_model = two_model.fit(**fit_kwargs)
# ### Uplift prediction
# In[11]:
# Per-observation uplift scores (estimated treatment effect) on validation data.
uplift_transform_model_val = transform_model.predict(X_val)
# Train-set scores are kept for the ASD stability metric computed further below.
uplift_transform_model_train = transform_model.predict(X_train)
uplift_two_model = two_model.predict(X_val)
# # 🚀🚀🚀 Uplift metrics
# ### 🚀 `uplift@k`
#
# - uplift at first k%
# - usually falls between [0; 1] depending on k, model quality and data
#
#
# ### `uplift@k` = `target mean at k% in the treatment group` - `target mean at k% in the control group`
#
# ___
#
# How to count `uplift@k`:
#
# 1. sort by predicted uplift
# 2. select first k%
# 3. count target mean in the treatment group
# 4. count target mean in the control group
# 5. subtract the mean in the control group from the mean in the treatment group
#
# ---
#
# Code parameter options:
#
# - `strategy='overall'` - sort by uplift treatment and control together
# - `strategy='by_group'` - sort by uplift treatment and control separately
# ## `🚀uplift@k with a small step of the k parameter`
#
#
# In[12]:
import matplotlib.pyplot as plt
import numpy as np
from sklift.metrics import uplift_at_k
values_uplift_k_transform = []
values_uplift_k_two = []
values_k = []
for k in np.arange(0.01,1,0.01):
values_uplift_k_transform.append(uplift_at_k(y_val, uplift_transform_model_val, trmnt_val, strategy='overall', k=k))
values_uplift_k_two.append(uplift_at_k(y_val, uplift_two_model, trmnt_val, strategy='overall', k=k))
values_k.append(k)
# ### `For ClassTransformation model`
# In[13]:
# uplift@k as a function of k for the ClassTransformation model.
fig, ax = plt.subplots()
ax.plot(values_k, values_uplift_k_transform)
ax.set_title('Dependence of uplift@k on k')
ax.set_xlabel('The value of k')
ax.set_ylabel('The value of uplift@k')
plt.show()
# ### `For TwoModels`
# In[14]:
# uplift@k as a function of k for the TwoModels approach.
fig, ax = plt.subplots()
ax.plot(values_k, values_uplift_k_two)
ax.set_title('Dependence of uplift@k on k')
ax.set_xlabel('The value of k')
ax.set_ylabel('The value of uplift@k')
plt.show()
# # 🚀 `ASD metric`
# ### `The average squared deviation (ASD) is a model stability metric that shows how much the model overfits the training data. Larger values of ASD mean greater overfit.`
#
# ## Code parameter options:
#
# - `strategy='overall'` - The first step is taking the first k observations of all test data ordered by uplift prediction (overall both groups - control and treatment) and conversions in treatment and control groups calculated only on them. Then the difference between these conversions is calculated.
# - `strategy='by_group'` - Separately calculates conversions in top k observations in each group (control and treatment) sorted by uplift predictions. Then the difference between these conversions is calculated
# - `bins=10` - Determines the number of bins (and the relative percentile) in the data.
# In[15]:
from sklift.metrics import average_squared_deviation

# Both calls share the same positional arguments: train fold first
# (target, predicted uplift, treatment), then the validation fold.
asd_args = (y_train, uplift_transform_model_train, trmnt_train,
            y_val, uplift_transform_model_val, trmnt_val)
asd_overall = average_squared_deviation(*asd_args, strategy='overall')
asd_by_group = average_squared_deviation(*asd_args, strategy='by_group')

print(f"average squared deviation by overall strategy for the ClassTransformation model: {asd_overall:.6f}")
print(f"average squared deviation by group strategy for the ClassTransformation model: {asd_by_group:.6f}")
# # `↗️Display 2 different model uplift scores on one qini plot`
#
# ### `Only qini curves`
# In[16]:
from sklift.viz import plot_qini_curve

# Draw both models' qini curves on a single shared axis (no baselines).
fig, ax_roc = plt.subplots(1, 1)
model_scores = (
    ('Transform model', uplift_transform_model_val),
    ('Two models', uplift_two_model),
)
for model_name, scores in model_scores:
    plot_qini_curve(y_val, scores, trmnt_val, name=model_name,
                    random=False, perfect=False, ax=ax_roc)
# ### `Qini curves with a random curve and with a perfect curve`
# In[17]:
# Same comparison, this time with the random and perfect baseline curves.
fig, ax_roc = plt.subplots(1, 1)
model_scores = (
    ('Transform model', uplift_transform_model_val),
    ('Two models', uplift_two_model),
)
for model_name, scores in model_scores:
    plot_qini_curve(y_val, scores, trmnt_val, name=model_name,
                    random=True, perfect=True, ax=ax_roc)
# In[ ]: