# Standard libraries
import os
# Data manipulation
import numpy as np
import pandas as pd
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
# Machine learning - scikit-learn
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
classification_report,
roc_auc_score,
confusion_matrix,
precision_recall_curve,
auc,
f1_score,
precision_score,
recall_score
)
# Machine learning - other frameworks
import xgboost as xgb
import lightgbm as lgb
import shap
# Imbalanced learning
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
/kaggle/input/dspp1/product_info.csv /kaggle/input/dspp1/customer_product.csv /kaggle/input/dspp1/customer_info.csv /kaggle/input/dspp1/customer_cases.csv
product_info = pd.read_csv("/kaggle/input/dspp1/product_info.csv")
customer_product = pd.read_csv("/kaggle/input/dspp1/customer_product.csv")
customer_info = pd.read_csv("/kaggle/input/dspp1/customer_info.csv")
customer_cases = pd.read_csv("/kaggle/input/dspp1/customer_cases.csv")
def style_df(df, caption=''):
return df.style.set_caption(caption).set_table_styles([
{'selector': 'th', 'props':[('background-color', '#f0f0f0'),
('color', 'black'),
('font-weight', 'bold')]},
{'selector': 'tr:nth-of-type(odd)','props':[('background-color', '#f9f9f9')]},
])
style_df(customer_product.head(), 'Customer Sign-up and Cancellation Dates')
Unnamed: 0 | customer_id | product | signup_date_time | cancel_date_time | |
---|---|---|---|---|---|
0 | 1 | C2448 | prd_1 | 2017-01-01 10:35:09 | nan |
1 | 2 | C2449 | prd_1 | 2017-01-01 11:39:29 | 2021-09-05 10:00:02 |
2 | 3 | C2450 | prd_1 | 2017-01-01 11:42:00 | 2019-01-13 16:24:55 |
3 | 4 | C2451 | prd_2 | 2017-01-01 13:32:08 | nan |
4 | 5 | C2452 | prd_1 | 2017-01-01 13:57:30 | 2021-06-28 18:06:01 |
style_df(customer_info.head(), 'Customer Demographics')
Unnamed: 0 | customer_id | age | gender | |
---|---|---|---|---|
0 | 1 | C2448 | 76 | female |
1 | 2 | C2449 | 61 | male |
2 | 3 | C2450 | 58 | female |
3 | 4 | C2451 | 62 | female |
4 | 5 | C2452 | 71 | male |
style_df(customer_cases.head(), 'Call Center Activity')
Unnamed: 0 | case_id | date_time | customer_id | channel | reason | |
---|---|---|---|---|---|---|
0 | 1 | CC101 | 2017-01-01 10:32:03 | C2448 | phone | signup |
1 | 2 | CC102 | 2017-01-01 11:35:47 | C2449 | phone | signup |
2 | 3 | CC103 | 2017-01-01 11:37:09 | C2450 | phone | signup |
3 | 4 | CC104 | 2017-01-01 13:28:14 | C2451 | phone | signup |
4 | 5 | CC105 | 2017-01-01 13:52:22 | C2452 | phone | signup |
style_df(product_info.head(), 'Product Information')
product_id | name | price | billing_cycle | |
---|---|---|---|---|
0 | prd_1 | annual_subscription | 1200 | 12 |
1 | prd_2 | monthly_subscription | 125 | 1 |
Customer Product Table will be the centre of the star schema. customer_id will be the primary key linking to the customer_centre and customer_info. product_id will be the primary key for the product_info table
for df_name, df in [("product_info", product_info),
("customer_product", customer_product),
("customer_info", customer_info),
("customer_cases", customer_cases)]:
print(f"\n{df_name} missing values:")
print(df.isnull().sum())
print(f"\n{df_name} data types:")
print(df.dtypes)
product_info missing values: product_id 0 name 0 price 0 billing_cycle 0 dtype: int64 product_info data types: product_id object name object price int64 billing_cycle int64 dtype: object customer_product missing values: Unnamed: 0 0 customer_id 0 product 0 signup_date_time 0 cancel_date_time 396447 dtype: int64 customer_product data types: Unnamed: 0 int64 customer_id object product object signup_date_time object cancel_date_time object dtype: object customer_info missing values: Unnamed: 0 0 customer_id 0 age 0 gender 0 dtype: int64 customer_info data types: Unnamed: 0 int64 customer_id object age int64 gender object dtype: object customer_cases missing values: Unnamed: 0 0 case_id 0 date_time 0 customer_id 0 channel 0 reason 0 dtype: int64 customer_cases data types: Unnamed: 0 int64 case_id object date_time object customer_id object channel object reason object dtype: object
customer_product.isnull().sum()
Unnamed: 0 0 customer_id 0 product 0 signup_date_time 0 cancel_date_time 396447 dtype: int64
This suggests that the missing values in cancel date_time are not actual missing values but rather customers who have not cancelled their subscription
customer_product['signup_date_time'] = pd.to_datetime(customer_product['signup_date_time'])
customer_product['cancel_date_time'] = pd.to_datetime(customer_product['cancel_date_time'])
customer_cases['date_time'] = pd.to_datetime(customer_cases['date_time'])
today = pd.to_datetime('2022-01-01')
customer_product['end_date'] = customer_product['cancel_date_time'].fillna(today)
customer_product['tenure_days'] = (customer_product['end_date'] - customer_product['signup_date_time']).dt.days
joined_df = pd.merge(customer_product,customer_info, how='left', left_on='customer_id', right_on='customer_id')
joined_df = pd.merge(joined_df,product_info, how='left', left_on='product',right_on='product_id')
Joining customer_cases
with the rest of the DataFrame has a few complications. customer_cases
exhibits one-to-many relationship problem, where a customer can have multiple support cases. Combining all the data into one table by directly joining results in duplicate customer records.
To solve this, use aggregation, groupby
and .agg
operations transform this one-to-many relationship into a one-to-one relationship by summarizing all the customer_cases
into one row
case_summary = customer_cases.groupby('customer_id').agg({
'case_id': 'count',
'date_time':['min','max']
})
case_summary.columns = [
'total_cases',
'first_case_date',
'last_case_date'
]
case_summary['days_between_cases'] = (case_summary['last_case_date'] - case_summary['first_case_date']).dt.days
# Create separate columns for each reason and count the frequency of each reason
reason_counts = customer_cases.groupby(['customer_id', 'reason']).size().unstack(fill_value=0)
reason_counts.columns = [f"reason{col}" for col in reason_counts.columns]
case_summary = case_summary.join(reason_counts)
churn_df = pd.merge(joined_df, case_summary, on='customer_id', how='left')
churn_df['total_cases'] = churn_df['total_cases'].fillna(0)
churn_df = churn_df.drop('Unnamed: 0_x', axis=1)
churn_df = churn_df.drop('Unnamed: 0_y', axis=1)
print(churn_df.isnull().sum())
customer_id 0 product 0 signup_date_time 0 cancel_date_time 396447 end_date 0 tenure_days 0 age 0 gender 0 product_id 0 name 0 price 0 billing_cycle 0 total_cases 0 first_case_date 250272 last_case_date 250272 days_between_cases 250272 reasonsignup 250272 reasonsupport 250272 dtype: int64
churn_df['total_cases'] = churn_df['total_cases'].fillna(0)
churn_df['first_case_date'] = churn_df['first_case_date'].fillna(pd.NaT)
churn_df['last_case_date'] = churn_df['last_case_date'].fillna(pd.NaT)
churn_df['days_between_cases'] = churn_df['days_between_cases'].fillna(0)
churn_df['ever_contacted_support'] = churn_df['total_cases'].gt(0).astype(int)
reason_columns = [col for col in churn_df.columns if col.startswith('reason')]
churn_df[reason_columns] = churn_df[reason_columns].fillna(0)
print(churn_df.isnull().sum())
customer_id 0 product 0 signup_date_time 0 cancel_date_time 396447 end_date 0 tenure_days 0 age 0 gender 0 product_id 0 name 0 price 0 billing_cycle 0 total_cases 0 first_case_date 250272 last_case_date 250272 days_between_cases 0 reasonsignup 0 reasonsupport 0 ever_contacted_support 0 dtype: int64
Earlier defined churn as has cancellation_date = Churned. This is good retrospectively, but it's not predictive. Using a time-window approach here (will churn in Next X Days). This is where the business value is. The purpose of churn prediction is preventing future churn, not explaining past churn
cutoff_date = pd.to_datetime('2021-10-03')
churn_df['will_churn_next_90d'] = (
(churn_df['cancel_date_time'] > cutoff_date) &
(churn_df['cancel_date_time'] <= cutoff_date + pd.Timedelta(days = 90))
).astype(int)
Tenure is one of the strongest predictors of churn in SaaS businesses. Subtracting the reference data (today
) with the signup_date_time
, gives tenure (days_since_signup
)
churn_df['days_since_signup'] = (today - churn_df['signup_date_time']).dt.days
Measures how long after signing up a customer first reached out to support (if they ever did). For the customers who did not contact support at all, fill the rows with -1 instead of 0. 0 would indicated the customer contacted support the exact same day they signed up. -1 or any negative value is impossible in real data, since a customer can't contact support before they signup
churn_df['age_group'] = pd.cut(churn_df['age'],
bins = [0, 30, 45, 60, 75, 100],
labels = ['<30', '30-45', '46-60','61-75','75+'])
churn_df['tenure_group'] = pd.cut(churn_df['tenure_days'],
bins = [0, 90, 180, 365, 730, float('inf')],
labels = ['0-3mo', '3-6mo', '6-12mo','1-2yr', '2yr+'])
Filter the dataset to keep only the customers who either churned after the cutoff date or who never churned at all.
This approach ensures:
modeling_df = churn_df[churn_df['signup_date_time'] < cutoff_date]
modeling_df = modeling_df[
(modeling_df['cancel_date_time'] > cutoff_date) |
(modeling_df['cancel_date_time'].isna())
]
churn_df.head()
customer_id | product | signup_date_time | cancel_date_time | end_date | tenure_days | age | gender | product_id | name | ... | first_case_date | last_case_date | days_between_cases | reasonsignup | reasonsupport | ever_contacted_support | will_churn_next_90d | days_since_signup | age_group | tenure_group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C2448 | prd_1 | 2017-01-01 10:35:09 | NaT | 2022-01-01 00:00:00 | 1825 | 76 | female | prd_1 | annual_subscription | ... | 2017-01-01 10:32:03 | 2017-01-01 10:32:03 | 0.0 | 1.0 | 0.0 | 1 | 0 | 1825 | 75+ | 2yr+ |
1 | C2449 | prd_1 | 2017-01-01 11:39:29 | 2021-09-05 10:00:02 | 2021-09-05 10:00:02 | 1707 | 61 | male | prd_1 | annual_subscription | ... | 2017-01-01 11:35:47 | 2017-01-01 11:35:47 | 0.0 | 1.0 | 0.0 | 1 | 0 | 1825 | 61-75 | 2yr+ |
2 | C2450 | prd_1 | 2017-01-01 11:42:00 | 2019-01-13 16:24:55 | 2019-01-13 16:24:55 | 742 | 58 | female | prd_1 | annual_subscription | ... | 2017-01-01 11:37:09 | 2017-01-01 11:37:09 | 0.0 | 1.0 | 0.0 | 1 | 0 | 1825 | 46-60 | 2yr+ |
3 | C2451 | prd_2 | 2017-01-01 13:32:08 | NaT | 2022-01-01 00:00:00 | 1825 | 62 | female | prd_2 | monthly_subscription | ... | 2017-01-01 13:28:14 | 2017-03-31 12:06:58 | 88.0 | 1.0 | 1.0 | 1 | 0 | 1825 | 61-75 | 2yr+ |
4 | C2452 | prd_1 | 2017-01-01 13:57:30 | 2021-06-28 18:06:01 | 2021-06-28 18:06:01 | 1639 | 71 | male | prd_1 | annual_subscription | ... | 2017-01-01 13:52:22 | 2017-01-01 13:52:22 | 0.0 | 1.0 | 0.0 | 1 | 0 | 1825 | 61-75 | 2yr+ |
5 rows × 23 columns
Normalize billing calculations as monthly_spend. Currently, there are two main product types; a 125 monthly subscription (billing_cycles = 1) and a 1,200 annual subscription (billing_cycles = 12). Without normalization, the price difference makes it hard to compare these customers. The annual customer appears to spend much more, but their monthly commitment is similar. Monthly subscribers pay 25% more every month, which might influence their churn behavior
modeling_df['monthly_spend'] = modeling_df['price'] / modeling_df['billing_cycle']
modeling_df['tenure_at_cutoff'] = (cutoff_date - modeling_df['signup_date_time']).dt.days
modeling_df['tenure_months'] = (modeling_df['tenure_at_cutoff'] / 30).round()
Measure how long after signing up a customer first reached out to support (if they ever did). For the customers who did not contact support at all, fill the rows with -1 instead of 0. 0 would indicated the customer contacted support the exact same day they signed up. -1 or any negative value is impossible in real data, since a customer can't contact support before they signup
mask = churn_df['ever_contacted_support'] == 1
churn_df.loc[mask,'days_to_first_contact'] = (churn_df.loc[mask, 'first_case_date'] - churn_df.loc[mask, 'signup_date_time']).dt.days
churn_df['days_to_first_contact'] = churn_df['days_to_first_contact'].fillna(-1)
Indicate whether a customer's most recent support interaction occurred before the defined cutoff_date
mask = modeling_df['ever_contacted_support'] == 1
modeling_df.loc[mask, 'support_before_cutoff'] = (
modeling_df.loc[mask, 'last_case_date'] < cutoff_date
).astype(int)
modeling_df['support_before_cutoff'] = modeling_df['support_before_cutoff'].fillna(0)
Calculate how many days passed between the cutoff_date
and the customer's last support contact, but only if that last contact happened before the cutoff.
mask = modeling_df['support_before_cutoff'] == 1
modeling_df.loc[mask, 'days_since_last_contact'] = (
cutoff_date - modeling_df.loc[mask, 'last_case_date']
).dt.days
modeling_df['days_since_last_contact'] = modeling_df['days_since_last_contact'].fillna(999)
Calculate the average frequency of support contacts per month over the customer's tenure
mask = modeling_df['ever_contacted_support'] == 1
modeling_df.loc[mask, 'monthly_contact_rate'] = (
modeling_df.loc[mask, 'total_cases'] / (modeling_df.loc[mask, 'tenure_months'] + 1)
)
modeling_df['monthly_contact_rate'] = modeling_df['monthly_contact_rate'].fillna(0)
modeling_df['support_level'] = pd.cut(
modeling_df['monthly_contact_rate'],
bins = [-0.001, 0 , 0.2, 0.5, 1, float('inf')],
labels = ['None', 'Low', 'Medium', 'High', 'Very High']
)
target = 'will_churn_next_90d'
segments = ['age_group', 'tenure_group', 'support_level', 'product']
for segment in segments:
churn_by_segment = modeling_df.groupby(segment, observed=True)[target].agg(['mean', 'count'])
churn_by_segment['mean'] *= 100
print(f"\nChurn rate by {segment}:")
print(churn_by_segment.sort_values('mean', ascending=False))
Churn rate by age_group: mean count age_group 75+ 4.382609 2875 30-45 4.260450 24880 46-60 4.008111 195279 61-75 3.995778 163898 <30 3.880597 1005 Churn rate by tenure_group: mean count tenure_group 0-3mo 75.802998 934 3-6mo 4.752750 37452 1-2yr 3.925560 119881 6-12mo 3.703624 93260 2yr+ 3.630965 136410 Churn rate by support_level: mean count support_level Medium 4.955527 31480 Very High 4.945055 546 High 4.672024 5351 Low 4.182353 162827 None 3.704197 187733 Churn rate by product: mean count product prd_2 5.389810 135348 prd_1 3.288346 252589
Extremely high churn rate (76%) for customers with a tenure of 0-3 months. Might be dealing with 'onboarding cliff' in subscription businesses, could be a free trial effect, but the data does not indicate that the service offers a free trial, so I am making an assumption here.
modeling_df['tenure_weeks'] = (modeling_df['tenure_at_cutoff'] / 7).astype(int)
new_churn = modeling_df[modeling_df['tenure_at_cutoff'] < 90]
weekly_churn = new_churn.groupby('tenure_weeks')['will_churn_next_90d'].agg(['mean','count'])
weekly_churn['mean'] *= 100
print(weekly_churn)
new_product_churn = new_churn.groupby(['product', 'tenure_weeks'])['will_churn_next_90d'].mean() * 100
print(new_product_churn)
mean count tenure_weeks 0 4.268293 2460 1 4.101951 2511 2 3.953749 2681 3 3.834472 2634 4 4.235727 2715 5 3.463961 2858 6 5.005325 2817 7 3.766542 2947 8 3.993344 3005 9 4.386808 3123 10 4.700991 3127 11 3.773585 3180 12 4.515522 3189 product tenure_weeks prd_1 0 3.568465 1 2.510121 2 3.403933 3 3.448276 4 3.419453 5 2.620087 6 2.964570 7 2.746845 8 2.709848 9 2.927478 10 3.715992 11 2.754644 12 3.388747 prd_2 0 4.940239 1 5.642633 2 4.488595 3 4.181687 4 5.003574 5 4.245283 6 6.973501 7 4.625000 8 5.294906 9 5.740741 10 5.617284 11 4.756022 12 5.600000 Name: will_churn_next_90d, dtype: float64
tenure_check = modeling_df[modeling_df['tenure_group'] == '0-3mo']
print(f"Count of customers in 0-3mo group: {len(tenure_check)}")
print(tenure_check['tenure_at_cutoff'].describe())
print(f"Churn rate in 0-3mo group: {tenure_check['will_churn_next_90d'].mean()*100:.2f}%")
print(tenure_check.groupby('product')['will_churn_next_90d'].mean()*100)
Count of customers in 0-3mo group: 934 count 934.000000 mean 23.361884 std 23.019626 min 0.000000 25% 0.000000 50% 18.000000 75% 39.000000 max 89.000000 Name: tenure_at_cutoff, dtype: float64 Churn rate in 0-3mo group: 75.80% product prd_1 71.111111 prd_2 78.745645 Name: will_churn_next_90d, dtype: float64
We have 934 customers who joined less than 3 months before the cutoff date. On average, these customers are only subscribed for 23 days. Half of them (median) have been customers for 18 days or less. 75% of the customers will cancel within the next 90 days. This is much higher than other tenure groups' 3-5% churn rate. There is also a difference in monthly subscribers (78.7%) who are more likely to churn compared to annual customers (71.1%)
If we were to acquire 100 customers, ~75 would leave within the first 3 months. A large amount of leakage happening in the initial stages of the subscription signup
The customer journey pattern seems to be Signup -> [Critical 3-month decision period] -> Stable Relationship
Given these dramatic differences, the best approach is segment-based modeling, particularly separating new customers from established ones
modeling_df['new_customer'] = modeling_df['tenure_at_cutoff'] < 90 # 0-3 months
modeling_df.loc[modeling_df['new_customer'], 'signup_day_of_week'] = modeling_df.loc[modeling_df['new_customer'], 'signup_date_time'].dt.dayofweek
modeling_df.loc[modeling_df['new_customer'], 'signup_month'] = modeling_df.loc[modeling_df['new_customer'], 'signup_date_time'].dt.month
new_customer_df = modeling_df[modeling_df['new_customer']].copy()
established_customer_df = modeling_df[~modeling_df['new_customer']].copy()
print(f"New customers: {len(new_customer_df)} ({len(new_customer_df)/len(modeling_df):.1%})")
print(f"Established customers: {len(established_customer_df)} ({len(established_customer_df)/len(modeling_df):.1%}) ")
New customers: 37247 (9.6%) Established customers: 350690 (90.4%)
Dealing with a significant class imbalance here - Will need feature engineering for each segment and then build separate models
new_features = [
'product',
'days_since_signup',
'ever_contacted_support',
'total_cases',
'age',
'gender',
'signup_month',
'signup_day_of_week'
]
new_customer_df['signup_day_of_week'] = new_customer_df['signup_date_time'].dt.dayofweek
new_customer_df['signup_month'] = new_customer_df['signup_date_time'].dt.month
new_customer_df['days_to_first_contact'] = new_customer_df.apply(
lambda x: (x['first_case_date'] - x['signup_date_time']).days
if x['ever_contacted_support'] == 1 else -1, axis = 1
)
X_new = new_customer_df[new_features]
y_new = new_customer_df['will_churn_next_90d']
X_new = pd.get_dummies(X_new, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(
X_new, y_new, test_size = 0.20, random_state = 0, stratify = y_new
)
new_churn_rate = y_new.mean()
print(f"New customer churn rate: {new_churn_rate: .4%}")
weights_new = {0:1, 1:3} # Non-churners get 3x the weight
print(f"Using class weights for new model: {weights_new}")
new_model = RandomForestClassifier(n_estimators = 100, class_weight = weights_new, random_state = 0)
new_model.fit(X_train, y_train)
New customer churn rate: 4.1587% Using class weights for new model: {0: 1, 1: 3}
RandomForestClassifier(class_weight={0: 1, 1: 3}, random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(class_weight={0: 1, 1: 3}, random_state=0)
established_customer_df['billing_cycles_completed'] = (
established_customer_df['tenure_days'] / (established_customer_df['billing_cycle'] * 30).astype(int)
)
established_customer_df['recent_support'] = (
established_customer_df['days_since_last_contact'] < 30).astype(int)
established_features = [
'tenure_group',
'product',
'support_level',
'days_since_last_contact',
'monthly_contact_rate',
'price',
'billing_cycle',
'age_group',
'gender',
'billing_cycles_completed',
'recent_support'
]
X_established = established_customer_df[established_features]
y_established = established_customer_df['will_churn_next_90d']
X_established = pd.get_dummies(X_established, drop_first = True, dummy_na = False)
X_train_est, X_test_est, y_train_est, y_test_est = train_test_split(
X_established, y_established,
test_size = 0.20,
random_state = 0,
stratify = y_established
)
final_feature_names = X_established.columns.tolist()
established_churn_rate = y_established.mean()
print(f"Established customer churn rate: {established_churn_rate: .2%}")
weight_est = {0:1,1:5}
print(f"Using class weights for established model: {weight_est}")
established_model = RandomForestClassifier(n_estimators=100,
class_weight = weight_est,
random_state = 0)
established_model.fit(X_train_est, y_train_est)
Established customer churn rate: 4.01% Using class weights for established model: {0: 1, 1: 5}
RandomForestClassifier(class_weight={0: 1, 1: 5}, random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(class_weight={0: 1, 1: 5}, random_state=0)
def evaluate_model(model, X_test, y_test, model_name = "Model"):
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]
print(f'Evaluation Report for {model_name}')
print(classification_report(y_test, y_pred, target_names = ['No Churn (0)', 'Churn (1)']))
roc_auc = roc_auc_score(y_test, y_prob)
print(f'ROC AUC Score: {roc_auc:.4f}')
precision, recall, _ = precision_recall_curve(y_test, y_prob)
prc_auc = auc(recall, precision)
print(f'Precision-Recall Curve AUC (PRC AUC): {prc_auc:.4f}')
cm = confusion_matrix(y_test, y_pred)
print("\n Confusion Matrix:")
plt.figure(figsize = (6,4))
sns.heatmap(cm, annot = True, fmt='d', cmap='Blues',
xticklabels = ['Predicted No Churn', 'Predicted Churn'],
yticklabels = ['Actual No Churn', 'Actual Churn'])
plt.title(f'Confusion Matrix - {model_name}')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
print(""* (len(model_name) +24))
return {
'roc_auc': roc_auc,
'prc_auc': prc_auc,
'f1_churn': f1_score(y_test, y_pred, pos_label=1),
'recall_churn': recall_score(y_test, y_pred, pos_label=1),
'precision_churn': precision_score(y_test, y_pred, pos_label=1)
}
print("Evaluating New Customer Model (Random Forest Baseline)...")
new_model_metrics = evaluate_model(new_model, X_test, y_test, 'New Customer RF')
print("\nEvaluating Established Customer Model (Random Forest Baseline) ...")
established_model_metrics = evaluate_model(established_model, X_test_est, y_test_est, "Established Customer RF")
baseline_metrics = {
'New RF': new_model_metrics,
'Established RF': established_model_metrics
}
Evaluating New Customer Model (Random Forest Baseline)... Evaluation Report for New Customer RF precision recall f1-score support No Churn (0) 0.96 0.97 0.97 7140 Churn (1) 0.06 0.04 0.05 310 accuracy 0.93 7450 macro avg 0.51 0.51 0.51 7450 weighted avg 0.92 0.93 0.93 7450 ROC AUC Score: 0.5414 Precision-Recall Curve AUC (PRC AUC): 0.0481 Confusion Matrix:
Evaluating Established Customer Model (Random Forest Baseline) ... Evaluation Report for Established Customer RF precision recall f1-score support No Churn (0) 0.97 0.99 0.98 67328 Churn (1) 0.59 0.32 0.42 2810 accuracy 0.96 70138 macro avg 0.78 0.66 0.70 70138 weighted avg 0.96 0.96 0.96 70138 ROC AUC Score: 0.7615 Precision-Recall Curve AUC (PRC AUC): 0.4083 Confusion Matrix:
New Customer Random Forest Baseline
Precision: 0.06
- Extremely poor, when this model predicts a new customer will churn, it is right 6% of the timeRecall: 0.04
- Extremely poor, model only identifies 4% of the new customers who actually churned, it missed 96% of themF1-score: 0.05
- Extremely poor, confirms the model is failing badly on the churn classOverall metrics
Accuracy: 0.93
- Looks high but is very misleading: Since only ~4% of churn (310/7450), a model predicting ‘No Churn’ for everyone would get a ~96% accuracy, this model isn’t much betterROC AUC: 0.5414
- Very close to 0.5 (random chance) - indicates the model has almost no ability to distinguish between churning and non-churning new customersPRC AUC: 0.0481
- Extremely low. For an imbalanced dataset like this (~4% positive), a random model would have a PRC AUC of around 0.04, this model is barely above random guessingThe baseline Random Forest for new customers is performing terribly. The class weights were not enough, It has no predictive power for identifying churn in this segment. Will need to try different features and algorithm. Will need to put this segment on hold for now
Established Customer Random Forest Baseline
Performance on Churn (Class ‘1’)
Precision: 0.59
- Decent. When this model predicts an established customer will churn, it’s correct 59% of the time - useable but still means 41% are false alarmsRecall: 0.32
- Poor. The model only identifies 32% (less than a 3rd) of the established customer who churned. It misses the majority (68%)F1-score: 0.42
- Mediocre. Balanced score reflecting decent precision but poor recallOverall metrics:
Accuracy: 0.96
- High, but again, influenced by the imbalance (~4% churn rate)ROC AUC: 0.7615
- Fair/Okay, significantly better than the random 0.5. Shows some ability to distinguish between classesPRC AUC: 0.4093
- Fair/Okay. Considerably better than random (~0.04) for this imbalance level. Indicates reasonable predictive signalThe baseline Random Forest for established customers shows potential but is flawed. It is significantly better than the new customer model. Its main weakness is low recall - failing to identify most customers who will churn. The 5x weight class helped precision but wasn’t enough to catch more churners
# XGBoost
neg_count_est = (y_train_est == 0).sum()
pos_count_est = (y_train_est == 1).sum()
scale_pos_weight_est = neg_count_est / pos_count_est
print(f'\nCalculated scale_pos_weight for Established XGBoost: {scale_pos_weight_est:.2f}')
xgb_model_est = xgb.XGBClassifier(
objective = 'binary:logistic',
eval_metric = 'logloss',
use_label_encoder = False,
scale_pos_weight = scale_pos_weight_est,
random_state = 0
)
print('\nTraining default XGBoost model for Established Customers...')
xgb_model_est.fit(X_train_est, y_train_est)
print('\nEvaluating Established Customer Model (XGBoost Default)...')
xgb_est_metrics = evaluate_model(xgb_model_est, X_test_est, y_test_est, 'Established Customer XGB')
Calculated scale_pos_weight for Established XGBoost: 23.96 Training default XGBoost model for Established Customers... Evaluating Established Customer Model (XGBoost Default)... Evaluation Report for Established Customer XGB precision recall f1-score support No Churn (0) 0.98 0.86 0.92 67328 Churn (1) 0.15 0.59 0.24 2810 accuracy 0.85 70138 macro avg 0.57 0.73 0.58 70138 weighted avg 0.95 0.85 0.89 70138 ROC AUC Score: 0.8201 Precision-Recall Curve AUC (PRC AUC): 0.4760 Confusion Matrix:
Established Customer Model:
The RF baseline missed too many churners (low recall).
The default XGBoost found more churners (good recall) but was very inaccurate in its churn predictions (low precision).
XGBoost shows more promise due to higher AUC scores, indicating better potential.
xgb_base = xgb.XGBClassifier(
objective = 'binary:logistic',
eval_metric = 'logloss',
use_label_encoder = False,
random_state = 0
)
param_distributions = {
'n_estimators': [int(x) for x in np.linspace(start = 100, stop=1000, num=10)],
'max_depth': [3,4,5,6,7,8],
'learning_rate': [0.01, 0.05, 0.1, 0.2],
'scale_pos_weight': [5,10,15,20,24],
'subsample': [0.7,0.8,0.9,1.0],
'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}
n_iterations = 50
cv_folds = 3
scoring_metric = 'f1'
print(f"Starting Randomized Search with {n_iterations} iterations, {cv_folds}-fold CV, optimizing for '{scoring_metric}'...")
random_search = RandomizedSearchCV(
estimator = xgb_base,
param_distributions = param_distributions,
n_iter = n_iterations,
n_jobs = -1,
cv = cv_folds,
verbose = 1,
random_state = 0
)
random_search.fit(X_train_est, y_train_est)
print('\nRandomized Search Complete')
print(f"Best Score ({scoring_metric}) found: {random_search.best_score_:.4f}")
print("Best Parameters found:")
print(random_search.best_params_)
Starting Randomized Search with 50 iterations, 3-fold CV, optimizing for 'f1'... Fitting 3 folds for each of 50 candidates, totalling 150 fits Randomized Search Complete Best Score (f1) found: 0.9743 Best Parameters found: {'subsample': 1.0, 'scale_pos_weight': 5, 'n_estimators': 900, 'max_depth': 7, 'learning_rate': 0.05, 'colsample_bytree': 0.7}
best_xgb_model = random_search.best_estimator_
print("\nEvaluating the Best Tuned XGBoost Model on the Test Set...")
tuned_xgb_metrics = evaluate_model(best_xgb_model, X_test_est, y_test_est, 'Established Customer Tuned XGB')
print("\nComparison (Recall / Precision / F1 / PRC AUC / ROC AUC):")
print(f"Baseline RF: {baseline_metrics['Established RF']['recall_churn']:.2f} / {baseline_metrics['Established RF']['precision_churn']:.2f} / {baseline_metrics['Established RF']['f1_churn']:.2f} / {baseline_metrics['Established RF']['prc_auc']:.4f} / {baseline_metrics['Established RF']['roc_auc']:.4f}")
print(f"Tuned XGB: {tuned_xgb_metrics['recall_churn']:.2f} / {tuned_xgb_metrics['precision_churn']:.2f} / {tuned_xgb_metrics['f1_churn']:.2f} / {tuned_xgb_metrics['prc_auc']:.4f} / {tuned_xgb_metrics['roc_auc']:.4f}")
Evaluating the Best Tuned XGBoost Model on the Test Set... Evaluation Report for Established Customer Tuned XGB precision recall f1-score support No Churn (0) 0.98 1.00 0.99 67328 Churn (1) 0.95 0.40 0.56 2810 accuracy 0.98 70138 macro avg 0.96 0.70 0.77 70138 weighted avg 0.97 0.98 0.97 70138 ROC AUC Score: 0.8292 Precision-Recall Curve AUC (PRC AUC): 0.5149 Confusion Matrix:
Comparison (Recall / Precision / F1 / PRC AUC / ROC AUC): Baseline RF: 0.32 / 0.59 / 0.42 / 0.4083 / 0.7615 Tuned XGB: 0.40 / 0.95 / 0.56 / 0.5149 / 0.8292
importance_scores = best_xgb_model.feature_importances_
feature_importance_df = pd.DataFrame({
'Feature': final_feature_names,
'Importance': importance_scores
})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
top_n = 20
print(f"\nTop {top_n} Features by Importance (XGBoost Built-in):")
print(feature_importance_df.head(top_n))
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(top_n), palette='viridis')
plt.title(f'Top {top_n} Feature Importances - Tuned XGBoost (Established Customers)')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
Top 20 Features by Importance (XGBoost Built-in): Feature Importance 6 tenure_group_3-6mo 0.535142 1 monthly_contact_rate 0.075205 4 billing_cycles_completed 0.059344 3 billing_cycle 0.053441 8 tenure_group_1-2yr 0.051230 2 price 0.046264 7 tenure_group_6-12mo 0.037301 0 days_since_last_contact 0.026869 9 tenure_group_2yr+ 0.017336 11 support_level_Low 0.017140 10 product_prd_2 0.015845 12 support_level_Medium 0.014056 18 age_group_75+ 0.007627 13 support_level_High 0.007614 16 age_group_46-60 0.007538 15 age_group_30-45 0.007509 17 age_group_61-75 0.007499 19 gender_male 0.007222 5 recent_support 0.005816 14 support_level_Very High 0.000000
shap.initjs()
print("Creating SHAP explainer...")
explainer = shap.TreeExplainer(best_xgb_model)
try:
if not isinstance(X_test_est, pd.DataFrame):
X_test_est_df = pd.DataFrame(X_test_est, columns=final_feature_names)
else:
X_test_est_df = X_test_est
shap_values = explainer.shap_values(X_test_est_df)
print("SHAP values calculated.")
except Exception as e:
print(f"Error calculating SHAP values: {e}")
print("Ensure X_test_est is a pandas DataFrame or can be converted with final_feature_names.")
shap_values = None
Creating SHAP explainer... SHAP values calculated.
if shap_values is not None:
print("\n--- SHAP Summary Plots ---")
print("Generating SHAP Bar Plot (Mean Absolute SHAP Value)...")
plt.figure() # Create a new figure context
shap.summary_plot(shap_values, X_test_est_df, plot_type="bar", max_display=top_n, show=False)
plt.title('SHAP Global Feature Importance (Bar Plot)')
plt.tight_layout()
plt.show()
print("\nGenerating SHAP Beeswarm Plot...")
plt.figure()
shap.summary_plot(shap_values, X_test_est_df, max_display=top_n, show=False)
plt.title('SHAP Feature Impact on Model Output (Beeswarm)')
plt.tight_layout()
plt.show()
print("\n--- SHAP Dependence Plots (Examples) ---")
top_feature_1 = 'monthly_contact_rate'
top_feature_2 = 'billing_cycles_completed'
try:
print(f"Generating SHAP Dependence Plot for '{top_feature_1}'...")
plt.figure()
shap.dependence_plot(top_feature_1, shap_values, X_test_est_df, interaction_index=None, show=False)
plt.title(f'SHAP Dependence Plot - {top_feature_1}')
plt.tight_layout()
plt.show()
print(f"Generating SHAP Dependence Plot for '{top_feature_2}'...")
plt.figure() # Create a new figure context
shap.dependence_plot(top_feature_2, shap_values, X_test_est_df, interaction_index="auto", show=False) # 'auto' tries to find interaction
plt.title(f'SHAP Dependence Plot - {top_feature_2} (Interaction colored)')
plt.tight_layout()
plt.show()
except Exception as e:
print(f"Could not generate one or more dependence plots: {e}")
print("Check if feature names exist exactly in X_test_est_df.columns")
else:
print("Skipping SHAP plots due to calculation error.")
--- SHAP Summary Plots --- Generating SHAP Bar Plot (Mean Absolute SHAP Value)...
Generating SHAP Beeswarm Plot...
--- SHAP Dependence Plots (Examples) --- Generating SHAP Dependence Plot for 'monthly_contact_rate'...
<Figure size 640x480 with 0 Axes>
Generating SHAP Dependence Plot for 'billing_cycles_completed'...
<Figure size 640x480 with 0 Axes>
churn_df['churned_within_90d'] = (
(churn_df['cancel_date_time'].notna()) &
(churn_df['tenure_days'] < 90)
).astype(int)
new_churn_actual_rate = churn_df['churned_within_90d'].mean()
print(f'Overall rate of churn within first 90 days: {new_churn_actual_rate:.2%}')
print(churn_df['churned_within_90d'].value_counts())
Overall rate of churn within first 90 days: 3.06% churned_within_90d 0 493351 1 15581 Name: count, dtype: int64
try:
idx_first_case_per_customer = customer_cases.loc[customer_cases.groupby('customer_id')['date_time'].idxmin()]
first_case_details = idx_first_case_per_customer[['customer_id', 'reason','channel']].rename(columns={
'reason':'first_case_reason',
'channel':'first_case_channel'
})
if 'first_case_reason' in churn_df.columns:
churn_df = churn_df.drop(columns=['first_case_reaon'])
if 'first_case_channel' in churn_df.columns:
churn_df = churn_df.drop(columns=['first_case_channel'])
churn_df = pd.merge(churn_df, first_case_details, on = 'customer_id', how = 'left')
churn_df['first_case_reason'] = churn_df['first_case_reason'].fillna('No_Contact')
churn_df['first_case_channel'] = churn_df['first_case_channel'].fillna('No_Contact')
print(churn_df['first_case_reason'].value_counts())
print(churn_df['first_case_channel'].value_counts())
except KeyError as e:
print(f"Error: A required column is missing. Make sure 'customer_cases' DataFrame is loaded correctly. Error: {e}")
except Exception as e:
print(f"An unexpected error occurred during first case feature engineering: {e}")
first_case_reason No_Contact 250272 signup 129527 support 129133 Name: count, dtype: int64 first_case_channel No_Contact 250272 phone 230416 email 28244 Name: count, dtype: int64
churn_df['signup_month'] = churn_df['signup_date_time'].dt.month
churn_df['signup_day_of_week'] = churn_df['signup_date_time'].dt.dayofweek
churn_df['contacted_within_7d'] = (churn_df['days_to_first_contact'] >= 0 & (churn_df['days_to_first_contact'] <= 7 )).astype(int)
new_churn_features = [
'product',
'price',
'age_group',
'gender',
'signup_month',
'signup_day_of_week',
'contacted_within_7d',
'days_to_first_contact',
'first_case_reason',
'first_case_channel'
]
print('\nColumns available for new churn features:', churn_df.columns)
X_new = churn_df[new_churn_features].copy()
y_new = churn_df['churned_within_90d']
print('\nFeatures selected for new Churn model (Revised):')
print(X_new.info())
Columns available for new churn features: Index(['customer_id', 'product', 'signup_date_time', 'cancel_date_time', 'end_date', 'tenure_days', 'age', 'gender', 'product_id', 'name', 'price', 'billing_cycle', 'total_cases', 'first_case_date', 'last_case_date', 'days_between_cases', 'reasonsignup', 'reasonsupport', 'ever_contacted_support', 'will_churn_next_90d', 'days_since_signup', 'age_group', 'tenure_group', 'days_to_first_contact', 'churned_within_90d', 'first_case_reason', 'first_case_channel', 'signup_month', 'signup_day_of_week', 'contacted_within_7d'], dtype='object') Features selected for new Churn model (Revised): <class 'pandas.core.frame.DataFrame'> RangeIndex: 508932 entries, 0 to 508931 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 product 508932 non-null object 1 price 508932 non-null int64 2 age_group 508932 non-null category 3 gender 508932 non-null object 4 signup_month 508932 non-null int32 5 signup_day_of_week 508932 non-null int32 6 contacted_within_7d 508932 non-null int64 7 days_to_first_contact 508932 non-null float64 8 first_case_reason 508932 non-null object 9 first_case_channel 508932 non-null object dtypes: category(1), float64(1), int32(2), int64(2), object(4) memory usage: 31.5+ MB None
X_new_encoded = pd.get_dummies(X_new, drop_first = True, dummy_na = False)
final_new_feature_names = X_new_encoded.columns.tolist()
print(f'Number of features after dummification: {len(final_new_feature_names)}')
print('First few new feature names:', final_new_feature_names[:10])
Number of features after dummification: 15 First few new feature names: ['price', 'signup_month', 'signup_day_of_week', 'contacted_within_7d', 'days_to_first_contact', 'product_prd_2', 'age_group_30-45', 'age_group_46-60', 'age_group_61-75', 'age_group_75+']
X_train_new, X_test_new, y_train_new, y_test_new, = train_test_split(
X_new_encoded,
y_new,
test_size = 0.20,
random_state = 0,
stratify = y_new
)
print(f'Training set shape: {X_train_new.shape}')
print(f'Testing set shape: {X_test_new.shape}')
print(f'Training target distribution:\n{y_train_new.value_counts(normalize=True)}')
print(f'Testing target distribution:\n{y_test_new.value_counts(normalize=True)}')
Training set shape: (407145, 15) Testing set shape: (101787, 15) Training target distribution: churned_within_90d 0 0.969384 1 0.030616 Name: proportion, dtype: float64 Testing target distribution: churned_within_90d 0 0.969387 1 0.030613 Name: proportion, dtype: float64
neg_count_new = (y_train_new == 0).sum()
pos_count_new = (y_train_new == 1).sum()
if pos_count_new > 0:
scale_pos_weight_new = neg_count_new / pos_count_new
else:
scale_pos_weight_new = 1
print(f'\nCalculated scale_pos_weight for new Churn XGBoost: {scale_pos_weight_new:.2f}')
xgb_model_new = xgb.XGBClassifier(
objective = 'binary:logistic',
eval_metric = 'logloss',
use_label_encoder = False,
scale_pos_weight = scale_pos_weight_new,
random_state = 0
)
xgb_model_new.fit(X_train_new, y_train_new)
Calculated scale_pos_weight for new Churn XGBoost: 31.66
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=None, n_jobs=None, num_parallel_tree=None, random_state=0, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=None, n_jobs=None, num_parallel_tree=None, random_state=0, ...)
new_xgb_metrics = evaluate_model(xgb_model_new, X_test_new, y_test_new, 'new Churn XGB Baseline')
try:
new_importance_scores = xgb_model_new.feature_importances_
new_feature_importance_df = pd.DataFrame({
'Feature': final_new_feature_names,
'Importance': new_importance_scores
})
new_feature_importance_df.sort_values(by='Importance', ascending=False)
top_n_new = 15
print(f'\nTop {top_n_new} Features by Importance')
print(new_feature_importance_df.head(top_n_new))
plt.figure(figsize=(10,6))
sns.barplot(x='Importance', y='Feature', data=new_feature_importance_df.head(top_n_new), palette='rocket')
plt.title(f'Top {top_n_new} Feature Importances')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show
except Exception as e:
print(f'Could not generate feature importance plot {e}')
Evaluation Report for new Churn XGB Baseline precision recall f1-score support No Churn (0) 0.97 0.67 0.79 98671 Churn (1) 0.04 0.42 0.07 3116 accuracy 0.66 101787 macro avg 0.51 0.55 0.43 101787 weighted avg 0.94 0.66 0.77 101787 ROC AUC Score: 0.5433 Precision-Recall Curve AUC (PRC AUC): 0.0357 Confusion Matrix:
Top 15 Features by Importance Feature Importance 0 price 0.270229 1 signup_month 0.066864 2 signup_day_of_week 0.063876 3 contacted_within_7d 0.061183 4 days_to_first_contact 0.059339 5 product_prd_2 0.000000 6 age_group_30-45 0.055126 7 age_group_46-60 0.061580 8 age_group_61-75 0.065243 9 age_group_75+ 0.051873 10 gender_male 0.061767 11 first_case_reason_signup 0.057988 12 first_case_reason_support 0.000000 13 first_case_channel_email 0.055668 14 first_case_channel_phone 0.069265
Abysmal performance on the current model. Will try tuning it
xgb_base_new = xgb.XGBClassifier(
objective = 'binary:logistic',
eval_metric='logloss',
use_label_encoder = False,
random_state = 0
)
param_distributions = {
'n_estimators': [int(x) for x in np.linspace(start = 100, stop=1000, num=10)],
'max_depth': [3,4,5,6,7,8],
'learning_rate': [0.01, 0.05, 0.1, 0.2],
'scale_pos_weight': [5,10,15,20,24],
'subsample': [0.7,0.8,0.9,1.0],
'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}
n_iterations = 50
cv_folds = 3
scoring_metric = 'average_precision'
random_search_new = RandomizedSearchCV(
estimator = xgb_base_new,
param_distributions = param_distributions,
n_iter = n_iterations,
scoring = scoring_metric,
n_jobs = -1,
cv = cv_folds,
verbose = 1,
random_state = 0
)
random_search_new.fit(X_train_new, y_train_new)
Fitting 3 folds for each of 50 candidates, totalling 150 fits
RandomizedSearchCV(cv=3, estimator=XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning... num_parallel_tree=None, random_state=0, ...), n_iter=50, n_jobs=-1, param_distributions={'colsample_bytree': [0.7, 0.8, 0.9, 1.0], 'learning_rate': [0.01, 0.05, 0.1, 0.2], 'max_depth': [3, 4, 5, 6, 7, 8], 'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'scale_pos_weight': [5, 10, 15, 20, 24], 'subsample': [0.7, 0.8, 0.9, 1.0]}, random_state=0, scoring='average_precision', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomizedSearchCV(cv=3, estimator=XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning... num_parallel_tree=None, random_state=0, ...), n_iter=50, n_jobs=-1, param_distributions={'colsample_bytree': [0.7, 0.8, 0.9, 1.0], 'learning_rate': [0.01, 0.05, 0.1, 0.2], 'max_depth': [3, 4, 5, 6, 7, 8], 'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'scale_pos_weight': [5, 10, 15, 20, 24], 'subsample': [0.7, 0.8, 0.9, 1.0]}, random_state=0, scoring='average_precision', verbose=1)
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=None, n_jobs=None, num_parallel_tree=None, random_state=0, ...)
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=None, n_jobs=None, num_parallel_tree=None, random_state=0, ...)
print(f'Best Score ({scoring_metric}) found: {random_search_new.best_score_:.4f}')
print(random_search_new.best_params_)
best_xgb_model_new = random_search_new.best_estimator_
Best Score (average_precision) found: 0.0370 {'subsample': 0.7, 'scale_pos_weight': 24, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
tuned_xgb_new_metrics = evaluate_model(
best_xgb_model_new,
X_test_new,
y_test_new,
'new Churn Tuned XGB'
)
Evaluation Report for new Churn Tuned XGB precision recall f1-score support No Churn (0) 0.97 0.76 0.86 98671 Churn (1) 0.04 0.32 0.07 3116 accuracy 0.75 101787 macro avg 0.51 0.54 0.46 101787 weighted avg 0.94 0.75 0.83 101787 ROC AUC Score: 0.5609 Precision-Recall Curve AUC (PRC AUC): 0.0373 Confusion Matrix:
The tuning process confirms the findings from the baseline model: The features currently available and engineered for predicting churn within the first 90 days lack sufficient predictive power. Even with optimized hyperparameters, the XGBoost model cannot reliably identify customers who will churn new based only on product info, demographics, signup timing, and basic first contact information.
test_probabilities = best_xgb_model.predict_proba(X_test_est)[:,1]
results_df = pd.DataFrame({
'Actual_Churn': y_test_est,
'Predicted_Probability': test_probabilities
}, index = X_test_est.index)
results_df_sorted = results_df.sort_values(by='Predicted_Probability', ascending=False)
display(results_df_sorted.head(10))
risk_threshold = 0.5
high_risk_list = results_df_sorted[results_df_sorted['Predicted_Probability'] >= risk_threshold]
print(f"\nIdentified {len(high_risk_list)} customers above {risk_threshold:.2f} probability threshold.")
Actual_Churn | Predicted_Probability | |
---|---|---|
106273 | 1 | 0.999996 |
113177 | 1 | 0.999993 |
109789 | 1 | 0.999990 |
439767 | 1 | 0.999986 |
436756 | 1 | 0.999986 |
453122 | 1 | 0.999983 |
441013 | 1 | 0.999982 |
108998 | 1 | 0.999982 |
440203 | 1 | 0.999979 |
446190 | 1 | 0.999979 |
Identified 1176 customers above 0.50 probability threshold.
if 'explainer' in locals() and shap_values is not None:
try:
idx_high_risk = results_df_sorted.index[0]
idx_low_risk = results_df_sorted.index[-1]
print('idx_high_risk')
display(shap.force_plot(explainer.expected_value,
shap_values[results_df_sorted.index.get_loc(idx_high_risk),:],
X_test_est_df.loc[idx_high_risk,:]))
print('idx_low_risk')
display(shap.force_plot(explainer.expected_value,
shap_values[results_df_sorted.index.get_loc(idx_low_risk),:],
X_test_est_df.loc[idx_low_risk,:]))
except IndexError:
print("Could not retrieve example indices. Ensure results_df_sorted is populated.")
except Exception as e:
print(f"Could not generate force plots: {e}")
else:
print("SHAP explainer or values not available for force plots.")
idx_high_risk
idx_low_risk