#!/usr/bin/env python
# coding: utf-8
# # Data & Library Import
# In[58]:
# Importing the libraries
# NOTE: this file is a notebook export — the "# In[...]:" markers below are
# cell boundaries, and the get_ipython() magic call only works when the
# script is run inside IPython/Jupyter, not as a plain Python script.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
get_ipython().run_line_magic('matplotlib', 'inline')
# In[2]:
# Reading the data
data = pd.read_csv('Customer_Segmentation.csv')
data.head()
# Penetrations in the data are normalized for customer level at data preparation step.
# In[3]:
# Summary of the data
data.describe()
# In[4]:
# Null variables show up in revenue penetrations, because summation of empty categories
null_vars = data.isna().sum()
null_vars[null_vars > 0]
# In[5]:
# Set customer id as index and fill the missing penetrations with 0.
# Chained non-inplace call instead of fillna(..., inplace=True): inplace on a
# derived frame is warn-prone and the inplace keyword is being deprecated.
data_f = data.set_index("CUSTOMER_ID").fillna(0)
# In[6]:
# Selecting customers with at least 2 visits
kmeans_data_visit = data_f[data_f.VISIT_COUNT >= 2]
# Customers should not be categorized if their visit count is less than 2.
# In[7]:
# Slice the four feature groups by column position. .copy() makes each slice
# an independent frame so assigning .columns below cannot trigger
# SettingWithCopy warnings on a view.
kmeans_data_visit_vp = kmeans_data_visit.iloc[:, 24:43].copy()            # visit penetration
kmeans_data_visit_rp = kmeans_data_visit.iloc[:, 43:62].copy()            # revenue penetration
kmeans_data_visit_dist_art = kmeans_data_visit.iloc[:, 62:81].copy()      # distinct-article penetration
kmeans_data_visit_dist_subcat = kmeans_data_visit.iloc[:, 81:100].copy()  # distinct-sub-category penetration
# In[8]:
# Renaming the categories: generate 'Category 1'..'Category 19' instead of a
# hand-written 19-element list.
columns = [f'Category {i}' for i in range(1, 20)]
kmeans_data_visit_vp.columns = columns
kmeans_data_visit_rp.columns = columns
kmeans_data_visit_dist_art.columns = columns
kmeans_data_visit_dist_subcat.columns = columns
# In[9]:
# Silence FutureWarning noise from pandas / scikit-learn in the notebook output.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
# In[10]:
# Free memory: these intermediates are not referenced again below.
del data, null_vars, kmeans_data_visit
# # Revenue Penetration
# In[11]:
# Scan k = 2..11 on the revenue-penetration features and record three
# model-selection metrics: inertia (elbow), silhouette and Calinski-Harabasz.
sum_of_squared_distances_rp = []
silhouette_scores_rp = []
ch_scores_rp = []
K = range(2, 12)
for k in K:
    k_means = KMeans(n_clusters=k, init='k-means++', max_iter=500, n_init=100, random_state=247)
    # fit_predict replaces the unused `model = k_means.fit(...)` + `.labels_` pair.
    labels = k_means.fit_predict(kmeans_data_visit_rp)
    sum_of_squared_distances_rp.append(k_means.inertia_)
    silhouette_scores_rp.append(silhouette_score(kmeans_data_visit_rp, labels, metric='euclidean'))
    ch_scores_rp.append(calinski_harabasz_score(kmeans_data_visit_rp, labels))
# In[12]:
# Plot the three model-selection curves for the revenue-penetration features.
fig, (ax_elbow, ax_sil, ax_ch) = plt.subplots(3, 1, figsize=(12, 10), dpi=150)
ax_elbow.plot(K, sum_of_squared_distances_rp, 'bx-')
ax_elbow.set_title('elbow method for optimal clusters')
ax_elbow.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax_sil.plot(K, silhouette_scores_rp, 'rx-')
ax_sil.set_title('silhouette method for optimal clusters')
ax_sil.set(xlabel='Clusters', ylabel='Silhouette Score')
ax_ch.plot(K, ch_scores_rp, 'rx-')
ax_ch.set_title('calinski harabasz method for optimal clusters')
ax_ch.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('Revenue Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()
# - Here 10 different K values are evaluated to determine the best parameter for the K-Means algorithm. According to the *elbow method*, 4 clusters would be optimal: the within-cluster distance keeps shrinking as clusters are added, but the **momentum shifts** at the 4th cluster. Since 4 clusters alone would not be conclusive, the Silhouette and Calinski-Harabasz scores are also calculated to demonstrate the subtle differences.
# - The *Silhouette Score* ranges between -1 and 1. If the clusters are well separated it is close to 1, and if not, close to -1. In this example it increases **almost linearly** from the 4th cluster up to the 8th, so the cluster count can be chosen between 4 and 8.
# - The *Calinski-Harabasz Score* resembles the elbow method; for this example K can be chosen between 5 and 7, since **the curve bends** twice around the 5th point.
# # Visit Penetration
# In[13]:
# Same k = 2..11 scan as above, on the visit-penetration features.
sum_of_squared_distances_vp = []
silhouette_scores_vp = []
ch_scores_vp = []
K = range(2, 12)
for k in K:
    k_means = KMeans(n_clusters=k, init='k-means++', max_iter=500, n_init=100, random_state=247)
    # fit_predict replaces the unused `model = k_means.fit(...)` + `.labels_` pair.
    labels = k_means.fit_predict(kmeans_data_visit_vp)
    sum_of_squared_distances_vp.append(k_means.inertia_)
    silhouette_scores_vp.append(silhouette_score(kmeans_data_visit_vp, labels, metric='euclidean'))
    ch_scores_vp.append(calinski_harabasz_score(kmeans_data_visit_vp, labels))
# In[14]:
# Plot the three model-selection curves for the visit-penetration features.
fig, (ax_elbow, ax_sil, ax_ch) = plt.subplots(3, 1, figsize=(12, 10), dpi=150)
ax_elbow.plot(K, sum_of_squared_distances_vp, 'bx-')
ax_elbow.set_title('elbow method for optimal clusters')
ax_elbow.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax_sil.plot(K, silhouette_scores_vp, 'rx-')
ax_sil.set_title('silhouette method for optimal clusters')
ax_sil.set(xlabel='Clusters', ylabel='Silhouette Score')
ax_ch.plot(K, ch_scores_vp, 'rx-')
ax_ch.set_title('calinski harabasz method for optimal clusters')
ax_ch.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('Visit Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()
# The Silhouette Score of the **Visit Penetration** clusters *fluctuates*, because customers tend to shop from different categories, whereas in **Revenue Penetration** a customer is dominated by *the category they spend the most on*. Therefore, we need to unify these variables with different weights: weighting Visit Penetration higher surfaces more *related categories*, while weighting Revenue Penetration higher emphasizes the categories with more spend.
# # 0.6 VP + 0.4 RP
# In[15]:
# Weighted blend: 60% visit penetration + 40% revenue penetration.
X64 = (kmeans_data_visit_vp.multiply(0.6) + kmeans_data_visit_rp.multiply(0.4))
# In[16]:
# Same k = 2..11 scan on the blended features.
sum_of_squared_distances_X64 = []
silhouette_scores_X64 = []
ch_scores_X64 = []
K = range(2, 12)
for k in K:
    k_means = KMeans(n_clusters=k, init='k-means++', max_iter=500, n_init=100, random_state=247)
    # fit_predict replaces the unused `model = k_means.fit(...)` + `.labels_` pair.
    labels = k_means.fit_predict(X64)
    sum_of_squared_distances_X64.append(k_means.inertia_)
    silhouette_scores_X64.append(silhouette_score(X64, labels, metric='euclidean'))
    ch_scores_X64.append(calinski_harabasz_score(X64, labels))
# In[17]:
# Plot the three model-selection curves for the 0.6 VP + 0.4 RP blend.
fig, (ax_elbow, ax_sil, ax_ch) = plt.subplots(3, 1, figsize=(12, 10), dpi=150)
ax_elbow.plot(K, sum_of_squared_distances_X64, 'bx-')
ax_elbow.set_title('elbow method for optimal clusters')
ax_elbow.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax_sil.plot(K, silhouette_scores_X64, 'rx-')
ax_sil.set_title('silhouette method for optimal clusters')
ax_sil.set(xlabel='Clusters', ylabel='Silhouette Score')
ax_ch.plot(K, ch_scores_X64, 'rx-')
ax_ch.set_title('calinski harabasz method for optimal clusters')
ax_ch.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('0.6 Visit Penetration + 0.4 Revenue Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()
# # Distinct Article Penetration
# Distinct Article & Distinct Sub-Category tell how many articles or sub-categories a customer buys in a category. For example, if a customer buys from two different categories, the **Visit Penetrations** would be the same; however, if a customer buys different products from one category, then the **Distinct Article Penetration** would be higher for that category.
# In[18]:
# Same k = 2..11 scan on the distinct-article-penetration features.
sum_of_squared_distances_dmp = []
silhouette_scores_dmp = []
ch_scores_dmp = []
K = range(2, 12)
for k in K:
    k_means = KMeans(n_clusters=k, init='k-means++', max_iter=500, n_init=100, random_state=247)
    # fit_predict replaces the unused `model = k_means.fit(...)` + `.labels_` pair.
    labels = k_means.fit_predict(kmeans_data_visit_dist_art)
    sum_of_squared_distances_dmp.append(k_means.inertia_)
    silhouette_scores_dmp.append(silhouette_score(kmeans_data_visit_dist_art, labels, metric='euclidean'))
    ch_scores_dmp.append(calinski_harabasz_score(kmeans_data_visit_dist_art, labels))
# In[19]:
# Plot the three model-selection curves for the distinct-article features.
fig, (ax_elbow, ax_sil, ax_ch) = plt.subplots(3, 1, figsize=(12, 10), dpi=150)
ax_elbow.plot(K, sum_of_squared_distances_dmp, 'bx-')
ax_elbow.set_title('elbow method for optimal clusters')
ax_elbow.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax_sil.plot(K, silhouette_scores_dmp, 'rx-')
ax_sil.set_title('silhouette method for optimal clusters')
ax_sil.set(xlabel='Clusters', ylabel='Silhouette Score')
ax_ch.plot(K, ch_scores_dmp, 'rx-')
ax_ch.set_title('calinski harabasz method for optimal clusters')
ax_ch.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('Distinct Article Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()
# # Distinct Sub-Category Penetration
# In[20]:
# Same k = 2..11 scan on the distinct-sub-category-penetration features.
sum_of_squared_distances_dmgp = []
silhouette_scores_dmgp = []
ch_scores_dmgp = []
K = range(2, 12)
for k in K:
    k_means = KMeans(n_clusters=k, init='k-means++', max_iter=500, n_init=100, random_state=247)
    # fit_predict replaces the unused `model = k_means.fit(...)` + `.labels_` pair.
    labels = k_means.fit_predict(kmeans_data_visit_dist_subcat)
    sum_of_squared_distances_dmgp.append(k_means.inertia_)
    silhouette_scores_dmgp.append(silhouette_score(kmeans_data_visit_dist_subcat, labels, metric='euclidean'))
    ch_scores_dmgp.append(calinski_harabasz_score(kmeans_data_visit_dist_subcat, labels))
# In[21]:
# Plot the three model-selection curves for the distinct-sub-category features.
fig, (ax_elbow, ax_sil, ax_ch) = plt.subplots(3, 1, figsize=(12, 10), dpi=150)
ax_elbow.plot(K, sum_of_squared_distances_dmgp, 'bx-')
ax_elbow.set_title('elbow method for optimal clusters')
ax_elbow.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax_sil.plot(K, silhouette_scores_dmgp, 'rx-')
ax_sil.set_title('silhouette method for optimal clusters')
ax_sil.set(xlabel='Clusters', ylabel='Silhouette Score')
ax_ch.plot(K, ch_scores_dmgp, 'rx-')
ax_ch.set_title('calinski harabasz method for optimal clusters')
ax_ch.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('Distinct Sub-Category Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()
# In[22]:
# Overlay the metric curves of all five feature sets for side-by-side comparison.
# FIX: the original attached the third legend via plt.legend() *after* the
# layout calls, relying on ax3 still being the implicit "current axes"; attach
# each legend to its own axes explicitly, and define the label list once.
series_labels = ['Revenue Pen', 'Visit Pen', 'Dist. Art. Pen', 'Dist. Sub-Cat. Pen', '0.6 VP + 0.4 RP']
fig = plt.figure(figsize=(12, 10), dpi=150)
ax1 = fig.add_subplot(311)
ax1.plot(K, sum_of_squared_distances_rp, 'kx-', K, sum_of_squared_distances_vp, 'gx-',
         K, sum_of_squared_distances_dmp, 'bx-', K, sum_of_squared_distances_dmgp, 'rx-',
         K, sum_of_squared_distances_X64, 'mx-')
ax1.set_title('elbow method for optimal clusters')
ax1.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax1.legend(series_labels, loc='upper right')
ax2 = fig.add_subplot(312)
ax2.plot(K, silhouette_scores_rp, 'kx-', K, silhouette_scores_vp, 'gx-',
         K, silhouette_scores_dmp, 'bx-', K, silhouette_scores_dmgp, 'rx-',
         K, silhouette_scores_X64, 'mx-')
ax2.set_title('silhouette method for optimal clusters')
ax2.set(xlabel='Clusters', ylabel='Silhouette Score')
ax2.legend(series_labels, loc='upper right')
ax3 = fig.add_subplot(313)
ax3.plot(K, ch_scores_rp, 'kx-', K, ch_scores_vp, 'gx-', K, ch_scores_dmp, 'bx-',
         K, ch_scores_dmgp, 'rx-', K, ch_scores_X64, 'mx-')
ax3.set_title('calinski harabasz method for optimal clusters')
ax3.set(xlabel='Clusters', ylabel='CH Score')
ax3.legend(series_labels, loc='upper right')
fig.suptitle('Clusters Differences', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()
# From the graphs, we can see that *Revenue Penetration* is the **easiest way** to distinguish the clusters, and *Visit Penetration* is the **hardest**. The unified variable which is *0.6 VP + 0.4 RP* works fine.
# # 0.4 VP + 0.6 RP
# In[23]:
# Weighted blend: 40% visit penetration + 60% revenue penetration.
X46 = (kmeans_data_visit_vp.multiply(0.4) + kmeans_data_visit_rp.multiply(0.6))
# In[24]:
# Same k = 2..11 scan on this blend.
sum_of_squared_distances_X46 = []
silhouette_scores_X46 = []
ch_scores_X46 = []
K = range(2, 12)
for k in K:
    k_means = KMeans(n_clusters=k, init='k-means++', max_iter=500, n_init=100, random_state=247)
    # fit_predict replaces the unused `model = k_means.fit(...)` + `.labels_` pair.
    labels = k_means.fit_predict(X46)
    sum_of_squared_distances_X46.append(k_means.inertia_)
    silhouette_scores_X46.append(silhouette_score(X46, labels, metric='euclidean'))
    ch_scores_X46.append(calinski_harabasz_score(X46, labels))
# In[25]:
# Plot the three model-selection curves for the 0.4 VP + 0.6 RP blend.
fig, (ax_elbow, ax_sil, ax_ch) = plt.subplots(3, 1, figsize=(12, 10), dpi=150)
ax_elbow.plot(K, sum_of_squared_distances_X46, 'bx-')
ax_elbow.set_title('elbow method for optimal clusters')
ax_elbow.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax_sil.plot(K, silhouette_scores_X46, 'rx-')
ax_sil.set_title('silhouette method for optimal clusters')
ax_sil.set(xlabel='Clusters', ylabel='Silhouette Score')
ax_ch.plot(K, ch_scores_X46, 'rx-')
ax_ch.set_title('calinski harabasz method for optimal clusters')
ax_ch.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('0.4 Visit Penetration + 0.6 Revenue Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()
# In[26]:
# Overlay the metric curves of all five feature sets (with the 0.4/0.6 blend).
# FIX: same as the previous comparison figure — the third legend was attached
# via plt.legend() after the layout calls, relying on the implicit current
# axes; attach legends explicitly and define the label list once.
series_labels = ['Revenue Pen', 'Visit Pen', 'Dist. Art. Pen', 'Dist. Sub-Cat. Pen', '0.4 VP + 0.6 RP']
fig = plt.figure(figsize=(12, 10), dpi=150)
ax1 = fig.add_subplot(311)
ax1.plot(K, sum_of_squared_distances_rp, 'kx-', K, sum_of_squared_distances_vp, 'gx-',
         K, sum_of_squared_distances_dmp, 'bx-', K, sum_of_squared_distances_dmgp, 'rx-',
         K, sum_of_squared_distances_X46, 'mx-')
ax1.set_title('elbow method for optimal clusters')
ax1.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax1.legend(series_labels, loc='upper right')
ax2 = fig.add_subplot(312)
ax2.plot(K, silhouette_scores_rp, 'kx-', K, silhouette_scores_vp, 'gx-',
         K, silhouette_scores_dmp, 'bx-', K, silhouette_scores_dmgp, 'rx-',
         K, silhouette_scores_X46, 'mx-')
ax2.set_title('silhouette method for optimal clusters')
ax2.set(xlabel='Clusters', ylabel='Silhouette Score')
ax2.legend(series_labels, loc='upper right')
ax3 = fig.add_subplot(313)
ax3.plot(K, ch_scores_rp, 'kx-', K, ch_scores_vp, 'gx-', K, ch_scores_dmp, 'bx-',
         K, ch_scores_dmgp, 'rx-', K, ch_scores_X46, 'mx-')
ax3.set_title('calinski harabasz method for optimal clusters')
ax3.set(xlabel='Clusters', ylabel='CH Score')
ax3.legend(series_labels, loc='upper right')
fig.suptitle('Clusters Differences', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()
# *0.4 VP + 0.6 RP* shows better performance in these graphs. However, in this project, we would like to find **relative categories**, therefore, *0.6 VP + 0.4 RP* is chosen.
# ### 0.4 VP + 0.6 RP - 5 CL
# In[27]:
# Final feature blend for the 5-cluster model (0.4 visit / 0.6 revenue).
kmeans_data_visit_x46_cl5 = (kmeans_data_visit_vp.multiply(0.4) + kmeans_data_visit_rp.multiply(0.6))
# In[28]:
# Final model: 5 clusters, more restarts/iterations than the k scan above.
kmeans_visit_x46_cl5 = KMeans(n_clusters=5, init='k-means++', max_iter=1000, n_init=500, random_state=247)
pred_y = kmeans_visit_x46_cl5.fit_predict(kmeans_data_visit_x46_cl5)
# In[29]:
# Quality metrics of the final 5-cluster solution.
labels = kmeans_visit_x46_cl5.labels_
s_score = silhouette_score(kmeans_data_visit_x46_cl5, labels, metric='euclidean')
ch_score = calinski_harabasz_score(kmeans_data_visit_x46_cl5, labels)
print(f'silhouette_score :{s_score:.3f}')
print(f'calinski_harabasz_score :{ch_score:,.2f}')
# In[30]:
# Attach the cluster label to each customer row.
kmeans_data_visit_x46_cl5 = kmeans_data_visit_x46_cl5.reset_index()
kmeans_data_visit_x46_cl5["cluster_1"] = pred_y
# In[31]:
# Cluster sizes, absolute and relative. FIX: the original computed the same
# groupby(...).nunique() twice; compute it once and reuse it.
cluster_sizes = kmeans_data_visit_x46_cl5.groupby("cluster_1")["CUSTOMER_ID"].nunique()
df = pd.concat([cluster_sizes,
                cluster_sizes / kmeans_data_visit_x46_cl5["CUSTOMER_ID"].nunique()], axis=1)
df.columns = ['Customer (#)', 'Customer (%)']
df.style.background_gradient(axis=0, cmap='YlOrRd').format('{:,.2f}')
# Cluster 3 is the **biggest** one, since it is the *mixed group*. We would like to *diminish* that group.
# In[32]:
# Bring back visit count and revenue for per-cluster totals.
kmeans_data_visit_x46_cl5 = kmeans_data_visit_x46_cl5.merge(
    data_f[["VISIT_COUNT", "REVENUE"]].reset_index(), how='left', on='CUSTOMER_ID')
# In[33]:
# Per-cluster customer counts and visit/revenue totals.
df_1 = kmeans_data_visit_x46_cl5.groupby("cluster_1").agg(
    {"CUSTOMER_ID": 'nunique', "VISIT_COUNT": 'sum', "REVENUE": 'sum'})
df_1.columns = ['Customer (#)', 'Visit (#)', 'Revenue']
df_1.style.background_gradient(axis=0, cmap='YlOrRd').format('{:,.0f}')
# In[34]:
# Mean penetration profile per cluster. FIX: groupby().mean() already returns
# a DataFrame, so the redundant pd.DataFrame(...) wrapper is dropped.
df_2 = kmeans_data_visit_x46_cl5.drop(columns=['CUSTOMER_ID', 'VISIT_COUNT', 'REVENUE'])
df_2 = df_2.groupby("cluster_1").mean()
# In[35]:
import seaborn as sns
cm = sns.light_palette("green", as_cmap=True)
df_2.style.background_gradient(axis=0, cmap=cm).format('{:,.2f}')
# These are the percentages of shopping rate of the customers that are appointed to that category. Here we can see that:
# - Cluster 0 is mainly **Category 15** and with a small percentage **Category 5**.
# - Cluster 1 is highly **Category 9** and with small percentages **Category 12 and 13**.
# - Cluster 2 is **Category 11**
# - Cluster 3 is **the mixed one**.
# - Cluster 4 is pretty much **Category 17** and with a small percentage **Category 6**.
# In[36]:
#kmeans_data_visit_x46_cl5[["CUSTOMER_ID","cluster_1"]].to_csv('kmeans_data_visit_x46_cl8.csv', index = False)
# In[37]:
#kmeans_visit_x46_cl5.cluster_centers_
# ### 0.4 VP + 0.6 RP - 7 CL
# In[38]:
# Same blend, now fitted with 7 clusters for comparison.
kmeans_data_visit_x46_cl7 = (kmeans_data_visit_vp.multiply(0.4) + kmeans_data_visit_rp.multiply(0.6))
# In[39]:
kmeans_visit_x46_cl7 = KMeans(n_clusters=7, init='k-means++', max_iter=1000, n_init=500, random_state=247)
pred_y = kmeans_visit_x46_cl7.fit_predict(kmeans_data_visit_x46_cl7)
# In[40]:
# Quality metrics of the 7-cluster solution.
labels = kmeans_visit_x46_cl7.labels_
s_score = silhouette_score(kmeans_data_visit_x46_cl7, labels, metric='euclidean')
ch_score = calinski_harabasz_score(kmeans_data_visit_x46_cl7, labels)
print(f'silhouette_score :{s_score:.3f}')
print(f'calinski_harabasz_score :{ch_score:,.2f}')
# In[41]:
# Attach the cluster label to each customer row.
kmeans_data_visit_x46_cl7 = kmeans_data_visit_x46_cl7.reset_index()
kmeans_data_visit_x46_cl7["cluster_1"] = pred_y
# In[42]:
# Cluster sizes, absolute and relative. FIX: compute the groupby only once
# instead of twice as in the original.
cluster_sizes = kmeans_data_visit_x46_cl7.groupby("cluster_1")["CUSTOMER_ID"].nunique()
df = pd.concat([cluster_sizes,
                cluster_sizes / kmeans_data_visit_x46_cl7["CUSTOMER_ID"].nunique()], axis=1)
df.columns = ['Customer (#)', 'Customer (%)']
df.style.background_gradient(axis=0, cmap='YlOrRd').format('{:,.2f}')
# In[43]:
# Bring back visit count and revenue for per-cluster totals.
kmeans_data_visit_x46_cl7 = kmeans_data_visit_x46_cl7.merge(
    data_f[["VISIT_COUNT", "REVENUE"]].reset_index(), how='left', on='CUSTOMER_ID')
# In[44]:
# Per-cluster customer counts and visit/revenue totals.
df_1 = kmeans_data_visit_x46_cl7.groupby("cluster_1").agg(
    {"CUSTOMER_ID": 'nunique', "VISIT_COUNT": 'sum', "REVENUE": 'sum'})
df_1.columns = ['Customer (#)', 'Visit (#)', 'Revenue']
df_1.style.background_gradient(axis=0, cmap='YlOrRd').format('{:,.0f}')
# In[45]:
# Mean penetration profile per cluster; redundant pd.DataFrame wrapper dropped.
df_2 = kmeans_data_visit_x46_cl7.drop(columns=['CUSTOMER_ID', 'VISIT_COUNT', 'REVENUE'])
df_2 = df_2.groupby("cluster_1").mean()
df_2.style.background_gradient(axis=0, cmap=cm).format('{:,.2f}')
# In[46]:
#kmeans_data_visit_x46_cl7[["CUSTOMER_ID","cluster_1"]].to_csv('kmeans_data_visit_x46_cl7.csv', index = False)
# In[47]:
#kmeans_visit_x46_cl7.cluster_centers_
# ### 0.6 VP + 0.4 RP - 7 CL
# In[48]:
# The chosen blend (0.6 visit / 0.4 revenue, see discussion above), 7 clusters.
kmeans_data_visit_x64_cl7 = (kmeans_data_visit_vp.multiply(0.6) + kmeans_data_visit_rp.multiply(0.4))
# In[49]:
kmeans_visit_x64_cl7 = KMeans(n_clusters=7, init='k-means++', max_iter=1000, n_init=500, random_state=247)
pred_y = kmeans_visit_x64_cl7.fit_predict(kmeans_data_visit_x64_cl7)
# In[50]:
# Quality metrics of the 7-cluster solution.
labels = kmeans_visit_x64_cl7.labels_
s_score = silhouette_score(kmeans_data_visit_x64_cl7, labels, metric='euclidean')
ch_score = calinski_harabasz_score(kmeans_data_visit_x64_cl7, labels)
print(f'silhouette_score :{s_score:.3f}')
print(f'calinski_harabasz_score :{ch_score:,.2f}')
# In[51]:
# Attach the cluster label to each customer row.
kmeans_data_visit_x64_cl7 = kmeans_data_visit_x64_cl7.reset_index()
kmeans_data_visit_x64_cl7["cluster_1"] = pred_y
# In[52]:
# Cluster sizes, absolute and relative. FIX: compute the groupby only once
# instead of twice as in the original.
cluster_sizes = kmeans_data_visit_x64_cl7.groupby("cluster_1")["CUSTOMER_ID"].nunique()
df = pd.concat([cluster_sizes,
                cluster_sizes / kmeans_data_visit_x64_cl7["CUSTOMER_ID"].nunique()], axis=1)
df.columns = ['Customer (#)', 'Customer (%)']
df.style.background_gradient(axis=0, cmap='YlOrRd').format('{:,.2f}')
# In[53]:
# Bring back visit count and revenue for per-cluster totals.
kmeans_data_visit_x64_cl7 = kmeans_data_visit_x64_cl7.merge(
    data_f[["VISIT_COUNT", "REVENUE"]].reset_index(), how='left', on='CUSTOMER_ID')
# In[54]:
# Per-cluster customer counts and visit/revenue totals.
df_1 = kmeans_data_visit_x64_cl7.groupby("cluster_1").agg(
    {"CUSTOMER_ID": 'nunique', "VISIT_COUNT": 'sum', "REVENUE": 'sum'})
df_1.columns = ['Customer (#)', 'Visit (#)', 'Revenue']
df_1.style.background_gradient(axis=0, cmap='YlOrRd').format('{:,.0f}')
# In[55]:
# Mean penetration profile per cluster; redundant pd.DataFrame wrapper dropped.
df_2 = kmeans_data_visit_x64_cl7.drop(columns=['CUSTOMER_ID', 'VISIT_COUNT', 'REVENUE'])
df_2 = df_2.groupby("cluster_1").mean()
df_2.style.background_gradient(axis=0, cmap=cm).format('{:,.2f}')
# In[56]:
#kmeans_data_visit_x64_cl7[["CUSTOMER_ID","cluster_1"]].to_csv('kmeans_data_visit_x64_cl7.csv', index = False)
# In[57]:
#kmeans_visit_x64_cl7.cluster_centers_