#!/usr/bin/env python
# coding: utf-8

# # Example of using a model from sklift.models in sklearn.pipeline
#
#
# # # #
# SCIKIT-UPLIFT REPO | # SCIKIT-UPLIFT DOCS | # USER GUIDE #
# RUSSIAN VERSION # #
# This is a simple example of how to use [sklift.models](https://scikit-uplift.readthedocs.io/en/latest/api/models.html) with [sklearn.pipeline](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.pipeline).
#
# The data is taken from [MineThatData E-Mail Analytics And Data Mining Challenge dataset by Kevin Hillstrom](https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html).
#
# This dataset contains 64,000 customers who last purchased within twelve months. The customers were involved in an e-mail test:
# * 1/3 were randomly chosen to receive an e-mail campaign featuring Mens merchandise.
# * 1/3 were randomly chosen to receive an e-mail campaign featuring Womens merchandise.
# * 1/3 were randomly chosen to not receive an e-mail campaign.
#
# During a period of two weeks following the e-mail campaign, results were tracked. The task is to tell the world if the Mens or Womens e-mail campaign was successful.
#
# The full description of the dataset can be found at the [link](https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html).
#
# First, install the necessary libraries:

# In[1]:


# NOTE(review): get_ipython is not imported anywhere in this file; it is
# presumably injected by the IPython/Jupyter runtime, so this cell is expected
# to fail under a plain `python` interpreter -- confirm how the file is run.
pip_command = (
    'pip install scikit-uplift xgboost==1.0.2 category_encoders==2.1.0 -U'
)
get_ipython().system(pip_command)


# For simplicity of the example, we will leave only two user segments:
# * those who were sent an e-mail advertising campaign with women's products;
# * those who were not sent the ad campaign.
#
# We will use the `visit` variable as the target variable.
# In[2]:


import pandas as pd

from sklift.datasets import fetch_hillstrom

get_ipython().run_line_magic('matplotlib', 'inline')

# Load the Hillstrom data with `visit` as the target column.
bunch = fetch_hillstrom(target_col='visit')
dataset = bunch['data']
target = bunch['target']
treatment = bunch['treatment']

print(f'Shape of the dataset before processing: {dataset.shape}')

# Selecting two segments: drop every row of the 'Mens E-Mail' group.
# The mask is computed once from the original treatment labels and applied to
# all three objects so that they stay row-aligned.
keep_rows = treatment != 'Mens E-Mail'
segment_codes = {
    'Womens E-Mail': 1,
    'No E-Mail': 0,
}
dataset = dataset[keep_rows]
target = target[keep_rows]
treatment = treatment[keep_rows].map(segment_codes)

print(f'Shape of the dataset after processing: {dataset.shape}')
dataset.head()


# Divide all the data into a training and validation sample:

# In[3]:


from sklearn.model_selection import train_test_split

# A single call splits features, target and treatment together, so the three
# pieces share the same 50/50 partition (fixed by random_state).
X_tr, X_val, y_tr, y_val, treat_tr, treat_val = train_test_split(
    dataset,
    target,
    treatment,
    test_size=0.5,
    random_state=42,
)


# Select categorical features:

# In[4]:


# Columns with dtype `object` are treated as the categorical features.
object_columns = X_tr.select_dtypes(include='object').columns
cat_cols = object_columns.tolist()
print(cat_cols)


# Create the necessary objects and combine them into a pipeline:

# In[5]:


from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklift.models import ClassTransformation
from xgboost import XGBClassifier

encoder = CatBoostEncoder(cols=cat_cols)
estimator = XGBClassifier(max_depth=2, random_state=42)
ct = ClassTransformation(estimator=estimator)

# Step names matter: the 'model' name is what the `model__treatment` fit
# parameter below is routed by.
pipeline_steps = [
    ('encoder', encoder),
    ('model', ct),
]
my_pipeline = Pipeline(pipeline_steps)


# Train the pipeline as usual, but pass the treatment column to the `model` step
# through the `model__treatment` parameter.

# In[6]:


my_pipeline = my_pipeline.fit(
    X_tr,
    y_tr,
    model__treatment=treat_tr,
)


# Predict the uplift and calculate the uplift@30%

# In[7]:


from sklift.metrics import uplift_at_k

uplift_predictions = my_pipeline.predict(X_val)

# NOTE(review): `k` is not passed, so the library default is used; the
# "uplift@30%" label below presumably relies on that default being 0.3 --
# confirm against the installed scikit-uplift version.
uplift_30 = uplift_at_k(
    y_true=y_val,
    uplift=uplift_predictions,
    treatment=treat_val,
    strategy='overall',
)

print(f'uplift@30%: {uplift_30:.4f}')