import sys
from matplotlib import pyplot as plt
from sklift.metrics import uplift_at_k
import seaborn as sns
import numpy as np
import pandas as pd
# install uplift library scikit-uplift and other libraries
#!{sys.executable} -m pip install scikit-uplift dill catboost
%config InlineBackend.figure_format = 'svg'
%matplotlib inline
We are going to use the Hillstrom dataset from the MineThatData E-Mail Analytics and Data Mining Challenge, hosted in March 2008 by the company's president, Kevin Hillstrom.
MineThatData is a consulting company that helps CEOs understand the complex relationship between Customers, Advertising, Products, Brands, and Channels.
The dataset can be loaded from the sklift.datasets module using the fetch_hillstrom function.
Read more about the dataset in the API docs.
This dataset contains 64,000 customers who last purchased within twelve months. The customers were involved in an e-mail test.
Major columns
visit (binary): target. 1/0 indicator, 1 = Customer visited website in the following two weeks.
conversion (binary): target. 1/0 indicator, 1 = Customer purchased merchandise in the following two weeks.
spend (float): target. Actual dollars spent in the following two weeks.
segment (str): treatment. The e-mail campaign the customer received
Read more in the docs
from sklift.datasets import fetch_hillstrom

# fetch_hillstrom() returns an sklearn Bunch object exposing `data`
# (features, pd.DataFrame), `target` (pd.Series) and `treatment` (pd.Series).
dataset = fetch_hillstrom()

print(f"Dataset type: {type(dataset)}\n")
print(f"Dataset features shape: {dataset.data.shape}")
print(f"Dataset target shape: {dataset.target.shape}")
print(f"Dataset treatment shape: {dataset.treatment.shape}")

# Show the first and last five rows in a single frame.
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same frame.
pd.concat([dataset.data.head(), dataset.data.tail()])

# Column dtypes and non-null counts of the feature frame.
dataset.data.info()
There is no missing data in the cells!
# Candidate categorical feature columns.
cat_features = ['channel', 'zip_code', 'history_segment', 'newbie']

# Distinct values found in three of those columns.
dataset.data['channel'].unique()
dataset.data['zip_code'].unique()
dataset.data['history_segment'].unique()
Zip code
# Bar chart of how many rows fall into each zip_code value.
dataset.data['zip_code'].value_counts().plot.bar(grid=True)
Channel
# Bar chart of how many rows fall into each channel value.
dataset.data['channel'].value_counts().plot.bar(grid=True)
History segment
# Bar chart of how many rows fall into each history_segment value.
dataset.data['history_segment'].value_counts().plot.bar(grid=True)
# As an option, I propose applying the following function to transform the data in the "history_segment" column
def historic_segment_transform(dataset):
    """Replace the ``history_segment`` labels with a representative number.

    Each spend-range label is swapped for the midpoint of its range, so the
    column can be used as a numeric feature. Any other non-missing value
    (for example the open-ended top segment) becomes 1000. Missing values
    are left untouched.

    The column is overwritten in place on ``dataset.data``. Apply the
    function once, to the raw labels: values that are already numbers are
    not keys of the lookup table and would therefore all turn into 1000.

    Parameters
    ----------
    dataset : object
        Any object with a ``data`` attribute holding a pandas DataFrame
        that contains a ``history_segment`` column.

    Returns
    -------
    pandas.Series
        The transformed ``history_segment`` column of ``dataset.data``.
    """
    fallback = 1000
    midpoints = {
        '1) $0 - $100': 50,
        '2) $100 - $200': 150,
        '3) $200 - $350': 275,
        '4) $350 - $500': 425,
        '5) $500 - $750': 625,
        '6) $750 - $1,000': 875,
        # Spelling used by the previous version of this function; it does not
        # match the '6) ...' label above, kept so that input is still recognised.
        '5) $750 - $1000': 875,
    }

    segments = dataset.data['history_segment']
    # na_action='ignore' keeps missing cells missing instead of sending them
    # to the fallback value.
    dataset.data['history_segment'] = segments.map(
        lambda label: midpoints.get(label, fallback),
        na_action='ignore',
    )
    return dataset.data.history_segment
# Frequency of each history_segment value, as a table and as a bar chart.
dataset.data['history_segment'].value_counts()
dataset.data['history_segment'].value_counts().plot.bar(grid=True)
# It is better to convert the categorical data to numeric data.
# For example, we could replace the 0-100 range with its average value, 50.
# Bar charts of the value frequencies in the `womens` and `mens` columns.
dataset.data['womens'].value_counts().plot.bar(grid=True)
dataset.data['mens'].value_counts().plot.bar(grid=True)

# `womens`: absolute counts, then the percentage of rows per value.
dataset.data['womens'].value_counts()
dataset.data.groupby('womens').size().div(dataset.data['womens'].count()).mul(100)
# Author's note: about 55% of rows are womens purchases.
# NOTE(review): the remaining ~44% are the rows with the other `womens` value,
# which is not necessarily the share of mens purchases -- `mens` is a
# separate column.

# `mens`: absolute counts, then the percentage of rows per value.
dataset.data['mens'].value_counts()
dataset.data.groupby('mens').size().div(dataset.data['mens'].count()).mul(100)
# Author's note: about 55% of rows are mens purchases.
# NOTE(review): likewise, the remaining ~44% are the rows with the other
# `mens` value, not necessarily womens purchases.
# Heat map of the pairwise correlations between the numeric features.
plt.figure(figsize=(14, 8))
sns.set(font_scale=0.75)

# Select the numeric columns explicitly: since pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently and raises on string columns
# such as those listed in `cat_features`.
numeric_corr = dataset.data.select_dtypes(include=['number', 'bool']).corr().round(3)

sns.heatmap(
    numeric_corr,
    annot=True,
    square=True,
    linewidths=.75,
    cmap='RdPu',
    fmt='.2f',
    annot_kws={"size": 10},
)
plt.title('Correlation matrix')
plt.show()
# `womens` and `mens` are inversely correlated. I propose merging them into a single "gender" column.
# As we can see, there is a high correlation between 'history_segment' and 'history'. Could we merge them as well and convert the columns to numeric data?
# Histogram of `recency` (12 bins).
dataset.data['recency'].hist(figsize=(8, 4), bins=12, grid=True)

# Value frequencies of `history`, followed by its histogram (20 bins).
dataset.data['history'].value_counts()
dataset.data['history'].hist(figsize=(8, 4), bins=20, grid=True);
# Treatment column (segment): first rows, distinct values and their frequencies.
dataset.treatment.head()
dataset.treatment.unique()
# `grid` expects a boolean; the string 'True' only worked because any
# non-empty string is truthy.
dataset.treatment.value_counts().plot(kind='bar', grid=True)

# Target column (visit): first rows and value frequencies.
dataset.target.head()
dataset.target.value_counts().plot(kind='bar')
# The target is imbalanced
import pandas as pd

# Share of each target value within every treatment group (rows sum to 1).
pd.crosstab(index=dataset.treatment, columns=dataset.target, normalize='index')
dataset.target.unique()

# The same row-normalised table, drawn as an annotated heat map.
crosstab = pd.crosstab(index=dataset.treatment, columns=dataset.target, normalize='index')
sns.heatmap(data=crosstab, annot=True, fmt=".2f", linewidths=1, square=True, cmap='RdPu')
plt.xlabel('Target')
plt.title("Treatment & Target")
#Let's consider two cases:
#1) Womens E-mail - No E-mail
#2) Mens E-mail - No E-mail
# Make the treatment binary for the "Womens E-Mail" campaign:
# 1 = received the womens e-mail, 0 = everything else.
# NOTE(review): 'Mens E-Mail' is mapped to 0 as well, so the control group
# contains customers who received the mens e-mail, not only 'No E-Mail'
# customers -- confirm this is intended.
treat_dict_womens = {
    'Womens E-Mail': 1,
    'No E-Mail': 0,
    'Mens E-Mail': 0,
}
dataset.treatment_womens = dataset.treatment.map(treat_dict_womens)

# `grid` expects a boolean; the string 'True' only worked because any
# non-empty string is truthy.
dataset.treatment_womens.value_counts().plot(kind='bar', grid=True)

# Treatment flag and target side by side; used to stratify the split.
stratify_cols = pd.concat([dataset.treatment_womens, dataset.target], axis=1)
stratify_cols.head(5)
from sklearn.model_selection import train_test_split

# Stratify on the (treatment, target) pair so that both splits keep the same
# mix of the four combinations.
stratify_cols = pd.concat(
    [dataset.treatment_womens, dataset.target],
    axis=1,
)

# Hold out 30% of the rows for validation; the seed is fixed for
# reproducibility.
X_train, X_val, trmnt_train, trmnt_val, y_train, y_val = train_test_split(
    dataset.data, dataset.treatment_womens, dataset.target,
    test_size=0.3,
    random_state=42,
    stratify=stratify_cols,
)

print(f"Train shape: {X_train.shape}")
print(f"Validation shape: {X_val.shape}")
from catboost import CatBoostClassifier
from sklift.models import ClassTransformation

# CatBoost base classifier; the listed columns are passed as categorical
# features.
estimator = CatBoostClassifier(
    cat_features=['womens', 'mens', 'channel', 'zip_code', 'history_segment', 'newbie'],
    thread_count=1,
    random_state=42,
    verbose=100,
)

# Wrap the classifier in the class-transformation uplift model and fit it on
# the training split of the womens-campaign treatment.
ct_model = ClassTransformation(estimator=estimator)
ct_model.fit(X=X_train, y=y_train, treatment=trmnt_train)
# Work in progress: multiclass classifier for CatBoost.
from sklift.metrics import uplift_at_k

# Uplift predictions of the womens-campaign model on the validation split.
uplift_predictions = ct_model.predict(X_val)

# k = 10%
k = 0.1

# strategy='overall': sort treatment and control together by predicted uplift
uplift_overall = uplift_at_k(y_val, uplift_predictions, trmnt_val, strategy='overall', k=k)
# strategy='by_group': sort treatment and control separately by predicted uplift
uplift_bygroup = uplift_at_k(y_val, uplift_predictions, trmnt_val, strategy='by_group', k=k)

# The label previously read "[email protected]", an e-mail-obfuscation artifact
# of "uplift@"; the intended metric name is restored.
print(f"uplift@{k * 100:.0f}%: {uplift_overall:.4f} (sort groups by uplift together)")
print(f"uplift@{k * 100:.0f}%: {uplift_bygroup:.4f} (sort groups by uplift separately)")
# Make the treatment binary for the "Mens E-Mail" campaign:
# 1 = received the mens e-mail, 0 = everything else.
# NOTE(review): 'Womens E-Mail' is mapped to 0 as well, so the control group
# contains customers who received the womens e-mail, not only 'No E-Mail'
# customers -- confirm this is intended.
treat_dict_mens = {
    'Mens E-Mail': 1,
    'No E-Mail': 0,
    'Womens E-Mail': 0,
}
# The mapping used to be applied twice in a row; once is enough.
dataset.treatment_mens = dataset.treatment.map(treat_dict_mens)

# `grid` expects a boolean; the string 'True' only worked because any
# non-empty string is truthy.
dataset.treatment_mens.value_counts().plot(kind='bar', grid=True)
from sklearn.model_selection import train_test_split

# Stratify on the (treatment, target) pair so that both splits keep the same
# mix of the four combinations. (This frame used to be built twice.)
stratify_cols = pd.concat([dataset.treatment_mens, dataset.target], axis=1)

# Hold out 30% of the rows for validation; the seed is fixed for
# reproducibility. This overwrites the split variables of the womens run.
X_train, X_val, trmnt_train, trmnt_val, y_train, y_val = train_test_split(
    dataset.data,
    dataset.treatment_mens,
    dataset.target,
    stratify=stratify_cols,
    test_size=0.3,
    random_state=42,
)

print(f"Train shape: {X_train.shape}")
print(f"Validation shape: {X_val.shape}")
from catboost import CatBoostClassifier
from sklift.models import ClassTransformation

# Fresh CatBoost base classifier with the same settings as the womens run;
# the listed columns are passed as categorical features.
estimator = CatBoostClassifier(
    cat_features=['womens', 'mens', 'channel', 'zip_code', 'history_segment', 'newbie'],
    thread_count=1,
    random_state=42,
    verbose=100,
)

# Class-transformation uplift model fitted on the training split of the
# mens-campaign treatment.
ct_model_mens = ClassTransformation(estimator=estimator)
ct_model_mens.fit(X=X_train, y=y_train, treatment=trmnt_train)
# Uplift predictions of the mens-campaign model on the validation split.
uplift_predictions_mens = ct_model_mens.predict(X_val)

# k = 10%
k = 0.1

# strategy='overall': sort treatment and control together by predicted uplift
uplift_overall_mens = uplift_at_k(y_val, uplift_predictions_mens, trmnt_val, strategy='overall', k=k)
# strategy='by_group': sort treatment and control separately by predicted uplift
uplift_bygroup_mens = uplift_at_k(y_val, uplift_predictions_mens, trmnt_val, strategy='by_group', k=k)

# The label previously read "[email protected]", an e-mail-obfuscation artifact
# of "uplift@"; the intended metric name is restored.
print(f"uplift@{k * 100:.0f}%: {uplift_overall_mens:.4f} (sort groups by uplift_mens together)")
print(f"uplift@{k * 100:.0f}%: {uplift_bygroup_mens:.4f} (sort groups by uplift_mens separately)")