# Importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
%matplotlib inline
# Reading the data
data = pd.read_csv('Customer_Segmentation.csv')
data.head()
CUSTOMER_ID | LAST_DATE | MONTH_RECENCY | TRX_COUNT | VISIT_COUNT | REVENUE | CAT_1_TRX_PEN | CAT_2_TRX_PEN | CAT_3_TRX_PEN | CAT_4_TRX_PEN | ... | CAT_10_DIST_SUBCAT_PEN | CAT_11_DIST_SUBCAT_PEN | CAT_12_DIST_SUBCAT_PEN | CAT_13_DIST_SUBCAT_PEN | CAT_14_DIST_SUBCAT_PEN | CAT_15_DIST_SUBCAT_PEN | CAT_16_DIST_SUBCAT_PEN | CAT_17_DIST_SUBCAT_PEN | CAT_18_DIST_SUBCAT_PEN | CAT_19_DIST_SUBCAT_PEN | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -2.322982e+09 | 06-Jan-21 | 4 | 1 | 1 | 110.95 | 0.000000 | 0.000000 | 0.00 | 0.000000 | ... | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.500000 | 0.000000 | 0.0 | 0.0 | 0.000000 |
1 | -2.323002e+09 | 28-Nov-20 | 6 | 4 | 4 | 1998.02 | 0.000000 | 0.250000 | 0.00 | 0.250000 | ... | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.333333 | 0.0 | 0.0 | 0.000000 |
2 | -2.323123e+09 | 26-Jan-21 | 4 | 4 | 3 | 232.92 | 0.000000 | 0.250000 | 0.25 | 0.000000 | ... | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 |
3 | -2.323131e+09 | 28-Mar-21 | 2 | 3 | 2 | 3351.00 | 0.333333 | 0.000000 | 0.00 | 0.333333 | ... | 0.066667 | 0.0 | 0.133333 | 0.0 | 0.0 | 0.266667 | 0.000000 | 0.0 | 0.0 | 0.066667 |
4 | -2.323236e+09 | 15-Apr-21 | 1 | 3 | 3 | 60.97 | 0.000000 | 0.333333 | 0.00 | 0.000000 | ... | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 |
5 rows × 101 columns
Penetrations in the data are normalized for customer level at data preparation step.
# Summary of the data
data.describe()
CUSTOMER_ID | MONTH_RECENCY | TRX_COUNT | VISIT_COUNT | REVENUE | CAT_1_TRX_PEN | CAT_2_TRX_PEN | CAT_3_TRX_PEN | CAT_4_TRX_PEN | CAT_5_TRX_PEN | ... | CAT_10_DIST_SUBCAT_PEN | CAT_11_DIST_SUBCAT_PEN | CAT_12_DIST_SUBCAT_PEN | CAT_13_DIST_SUBCAT_PEN | CAT_14_DIST_SUBCAT_PEN | CAT_15_DIST_SUBCAT_PEN | CAT_16_DIST_SUBCAT_PEN | CAT_17_DIST_SUBCAT_PEN | CAT_18_DIST_SUBCAT_PEN | CAT_19_DIST_SUBCAT_PEN | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 4.725400e+04 | 47254.000000 | 47254.000000 | 47254.000000 | 47254.000000 | 47254.000000 | 47254.000000 | 47254.000000 | 47254.000000 | 47254.000000 | ... | 47254.000000 | 47254.000000 | 47254.000000 | 47254.000000 | 47254.000000 | 47254.000000 | 47254.000000 | 47254.000000 | 47254.000000 | 47254.000000 |
mean | 6.717942e+10 | 4.567000 | 4.255703 | 3.593093 | 3059.220853 | 0.073767 | 0.098022 | 0.109329 | 0.079775 | 0.199718 | ... | 0.026370 | 0.088049 | 0.051917 | 0.037335 | 0.012352 | 0.188552 | 0.016608 | 0.025543 | 0.012244 | 0.019302 |
std | 8.454424e+10 | 3.145063 | 7.968327 | 5.712813 | 9719.466656 | 0.193579 | 0.222411 | 0.232107 | 0.200762 | 0.309438 | ... | 0.100083 | 0.207963 | 0.135909 | 0.115612 | 0.074432 | 0.312839 | 0.095122 | 0.104242 | 0.077570 | 0.098275 |
min | -2.324080e+09 | 1.000000 | 1.000000 | 1.000000 | 0.740000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 2.357210e+10 | 2.000000 | 1.000000 | 1.000000 | 379.825000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 2.366641e+10 | 4.000000 | 2.000000 | 2.000000 | 1235.080000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 2.394085e+10 | 7.000000 | 4.000000 | 4.000000 | 2984.285000 | 0.000000 | 0.044895 | 0.111111 | 0.000000 | 0.333333 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
max | 2.310000e+11 | 12.000000 | 353.000000 | 181.000000 | 814984.410000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 100 columns
# Null variables show up in revenue penetrations, because summation of empty categories
null_vars = data.isna().sum()
null_vars[null_vars>0]
CAT_1_REVENUE_PEN 37563 CAT_2_REVENUE_PEN 35236 CAT_3_REVENUE_PEN 34076 CAT_4_REVENUE_PEN 36880 CAT_5_REVENUE_PEN 27528 CAT_6_REVENUE_PEN 31089 CAT_7_REVENUE_PEN 37849 CAT_8_REVENUE_PEN 38898 CAT_9_REVENUE_PEN 34176 CAT_10_REVENUE_PEN 40236 CAT_11_REVENUE_PEN 35489 CAT_12_REVENUE_PEN 36114 CAT_13_REVENUE_PEN 38613 CAT_14_REVENUE_PEN 44010 CAT_15_REVENUE_PEN 27064 CAT_16_REVENUE_PEN 43582 CAT_17_REVENUE_PEN 41820 CAT_18_REVENUE_PEN 44246 CAT_19_REVENUE_PEN 43692 dtype: int64
# Setting customer id as index and filling the missing values
data_f = data.set_index("CUSTOMER_ID")
data_f.fillna(0,inplace=True)
# Selecting customers with more than 2 visits
kmeans_data_visit = data_f[data_f.VISIT_COUNT >= 2]
Customers should not be categorized if their visit is less than 2.
# Sliding the data with respect to variables
kmeans_data_visit_vp = kmeans_data_visit.iloc[:,24:43]
kmeans_data_visit_rp = kmeans_data_visit.iloc[:,43:62]
kmeans_data_visit_dist_art = kmeans_data_visit.iloc[:,62:81]
kmeans_data_visit_dist_subcat = kmeans_data_visit.iloc[:,81:100]
# Renaming the categories
columns = ['Category 1','Category 2', 'Category 3', 'Category 4', 'Category 5', 'Category 6', 'Category 7','Category 8', 'Category 9', 'Category 10',
'Category 11', 'Category 12', 'Category 13','Category 14', 'Category 15', 'Category 16','Category 17', 'Category 18', 'Category 19']
kmeans_data_visit_vp.columns = columns
kmeans_data_visit_rp.columns = columns
kmeans_data_visit_dist_art.columns = columns
kmeans_data_visit_dist_subcat.columns = columns
# Subpressing the future warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
# Freeing up memory
del data
del null_vars
del kmeans_data_visit
# Calculating different clusters with different metrics for revenue penetrations
sum_of_squared_distances_rp = []
silhouette_scores_rp = []
ch_scores_rp = []
K = range(2,12)
for k in K:
k_means = KMeans(n_clusters=k, init='k-means++', max_iter = 500, n_init=100, random_state=247)
model = k_means.fit(kmeans_data_visit_rp)
sum_of_squared_distances_rp.append(k_means.inertia_)
labels = k_means.labels_
silhouette_scores_rp.append(silhouette_score(kmeans_data_visit_rp, labels, metric = 'euclidean'))
ch_scores_rp.append(calinski_harabasz_score(kmeans_data_visit_rp, labels))
# Visualizing the previous computation
fig = plt.figure(figsize=(12, 10), dpi=150)
ax1 = fig.add_subplot(311)
ax1.plot(K, sum_of_squared_distances_rp, 'bx-')
ax1.set_title('elbow method for optimal clusters')
ax1.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax2 = fig.add_subplot(312)
ax2.plot(K, silhouette_scores_rp, 'rx-')
ax2.set_title('silhouette method for optimal clusters')
ax2.set(xlabel='Clusters', ylabel='Silhouette Score')
ax3 = fig.add_subplot(313)
ax3.plot(K, ch_scores_rp, 'rx-')
ax3.set_title('calinski harabasz method for optimal clusters')
ax3.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('Revenue Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()
sum_of_squared_distances_vp = []
silhouette_scores_vp = []
ch_scores_vp = []
K = range(2,12)
for k in K:
k_means = KMeans(n_clusters=k, init='k-means++', max_iter = 500, n_init=100, random_state=247)
model = k_means.fit(kmeans_data_visit_vp)
sum_of_squared_distances_vp.append(k_means.inertia_)
labels = k_means.labels_
silhouette_scores_vp.append(silhouette_score(kmeans_data_visit_vp, labels, metric = 'euclidean'))
ch_scores_vp.append(calinski_harabasz_score(kmeans_data_visit_vp, labels))
fig = plt.figure(figsize=(12, 10), dpi=150)
ax1 = fig.add_subplot(311)
ax1.plot(K, sum_of_squared_distances_vp, 'bx-')
ax1.set_title('elbow method for optimal clusters')
ax1.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax2 = fig.add_subplot(312)
ax2.plot(K, silhouette_scores_vp, 'rx-')
ax2.set_title('silhouette method for optimal clusters')
ax2.set(xlabel='Clusters', ylabel='Silhouette Score')
ax3 = fig.add_subplot(313)
ax3.plot(K, ch_scores_vp, 'rx-')
ax3.set_title('calinski harabasz method for optimal clusters')
ax3.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('Visit Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()
Silhouette Score of Visit Penetration Clusters fluctuates. Because customers tend to shop from different categories, however, in Revenue Penetrations, a customer is appointed to the most spent category. Therefore, we need to unify this variables with different weights. If we want to find more related categories we can give weight to Visit Penetration, or we can determine more spent category by giving weight to Revenue Penetration.
X64 = (kmeans_data_visit_vp.multiply(0.6) + kmeans_data_visit_rp.multiply(0.4))
sum_of_squared_distances_X64 = []
silhouette_scores_X64 = []
ch_scores_X64 = []
K = range(2,12)
for k in K:
k_means = KMeans(n_clusters=k, init='k-means++', max_iter = 500, n_init=100, random_state=247)
model = k_means.fit(X64)
sum_of_squared_distances_X64.append(k_means.inertia_)
labels = k_means.labels_
silhouette_scores_X64.append(silhouette_score(X64, labels, metric = 'euclidean'))
ch_scores_X64.append(calinski_harabasz_score(X64, labels))
fig = plt.figure(figsize=(12, 10), dpi=150)
ax1 = fig.add_subplot(311)
ax1.plot(K, sum_of_squared_distances_X64, 'bx-')
ax1.set_title('elbow method for optimal clusters')
ax1.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax2 = fig.add_subplot(312)
ax2.plot(K, silhouette_scores_X64, 'rx-')
ax2.set_title('silhouette method for optimal clusters')
ax2.set(xlabel='Clusters', ylabel='Silhouette Score')
ax3 = fig.add_subplot(313)
ax3.plot(K, ch_scores_X64, 'rx-')
ax3.set_title('calinski harabasz method for optimal clusters')
ax3.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('0.6 Visit Penetration + 0.4 Revenue Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()