#!/usr/bin/env python
# coding: utf-8

# # Data & Library Import

# In[58]:

# Importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

# Reading the data
data = pd.read_csv('Customer_Segmentation.csv')
data.head()

# Penetrations in the data were normalized at the customer level during the data preparation step.

# In[3]:

# Summary of the data
data.describe()

# In[4]:

# Nulls show up in the revenue penetrations because summing over empty categories yields missing values
null_vars = data.isna().sum()
null_vars[null_vars > 0]

# In[5]:

# Setting the customer id as the index and filling the missing values
data_f = data.set_index("CUSTOMER_ID")
data_f.fillna(0, inplace=True)

# In[6]:

# Selecting customers with at least 2 visits
kmeans_data_visit = data_f[data_f.VISIT_COUNT >= 2]

# Customers with fewer than 2 visits should not be categorized.

# In[7]:

# Slicing the data into the variable blocks
kmeans_data_visit_vp = kmeans_data_visit.iloc[:, 24:43]
kmeans_data_visit_rp = kmeans_data_visit.iloc[:, 43:62]
kmeans_data_visit_dist_art = kmeans_data_visit.iloc[:, 62:81]
kmeans_data_visit_dist_subcat = kmeans_data_visit.iloc[:, 81:100]

# In[8]:

# Renaming the categories
columns = [f'Category {i}' for i in range(1, 20)]
kmeans_data_visit_vp.columns = columns
kmeans_data_visit_rp.columns = columns
kmeans_data_visit_dist_art.columns = columns
kmeans_data_visit_dist_subcat.columns = columns

# In[9]:

# Suppressing future warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# In[10]:

# Freeing up memory
del data
del null_vars
del kmeans_data_visit

# # Revenue Penetration

# In[11]:

# Sweeping the cluster count with three metrics for the revenue penetrations
sum_of_squared_distances_rp = []
silhouette_scores_rp = []
ch_scores_rp = []
K = range(2, 12)
for k in K:
    k_means = KMeans(n_clusters=k, init='k-means++', max_iter=500, n_init=100, random_state=247)
    k_means.fit(kmeans_data_visit_rp)
    sum_of_squared_distances_rp.append(k_means.inertia_)
    labels = k_means.labels_
    silhouette_scores_rp.append(silhouette_score(kmeans_data_visit_rp, labels, metric='euclidean'))
    ch_scores_rp.append(calinski_harabasz_score(kmeans_data_visit_rp, labels))

# In[12]:

# Visualizing the previous computation
fig = plt.figure(figsize=(12, 10), dpi=150)
ax1 = fig.add_subplot(311)
ax1.plot(K, sum_of_squared_distances_rp, 'bx-')
ax1.set_title('elbow method for optimal clusters')
ax1.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax2 = fig.add_subplot(312)
ax2.plot(K, silhouette_scores_rp, 'rx-')
ax2.set_title('silhouette method for optimal clusters')
ax2.set(xlabel='Clusters', ylabel='Silhouette Score')
ax3 = fig.add_subplot(313)
ax3.plot(K, ch_scores_rp, 'rx-')
ax3.set_title('calinski harabasz method for optimal clusters')
ax3.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('Revenue Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()

# - Here 10 different K values are evaluated to determine the best parameter for the K-Means algorithm.
# - According to the *elbow method*, 4 clusters would be optimal: the sum of squared distances keeps shrinking as cluster centers are added, but the **momentum shifts** at the 4th cluster. Since 4 clusters would not be feasible in this case, the Silhouette and Calinski-Harabasz scores are also calculated to expose the subtler differences.
# - The *Silhouette Score* ranges from -1 to 1; it is close to 1 when the clusters are well separated and close to -1 when they are not. In this example it increases **almost linearly** from the 4th cluster up to the 8th, so the cluster count can be chosen between 4 and 8.
# - The *Calinski-Harabasz Score* resembles the elbow method; for this example K can be chosen between 5 and 7, since **the line kinks** twice starting at the 5th point.

# # Visit Penetration

# In[13]:

sum_of_squared_distances_vp = []
silhouette_scores_vp = []
ch_scores_vp = []
K = range(2, 12)
for k in K:
    k_means = KMeans(n_clusters=k, init='k-means++', max_iter=500, n_init=100, random_state=247)
    k_means.fit(kmeans_data_visit_vp)
    sum_of_squared_distances_vp.append(k_means.inertia_)
    labels = k_means.labels_
    silhouette_scores_vp.append(silhouette_score(kmeans_data_visit_vp, labels, metric='euclidean'))
    ch_scores_vp.append(calinski_harabasz_score(kmeans_data_visit_vp, labels))

# In[14]:

fig = plt.figure(figsize=(12, 10), dpi=150)
ax1 = fig.add_subplot(311)
ax1.plot(K, sum_of_squared_distances_vp, 'bx-')
ax1.set_title('elbow method for optimal clusters')
ax1.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax2 = fig.add_subplot(312)
ax2.plot(K, silhouette_scores_vp, 'rx-')
ax2.set_title('silhouette method for optimal clusters')
ax2.set(xlabel='Clusters', ylabel='Silhouette Score')
ax3 = fig.add_subplot(313)
ax3.plot(K, ch_scores_vp, 'rx-')
ax3.set_title('calinski harabasz method for optimal clusters')
ax3.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('Visit Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()

# The Silhouette Score of the **Visit Penetration** clusters *fluctuates* because customers tend to shop across several categories, whereas **Revenue Penetration** assigns each customer to *the category they spend the most on*. We therefore need to unify these variables with different weights: weighting Visit Penetration higher favors finding *related categories*, while weighting Revenue Penetration higher favors identifying the top-spend category.
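# The same K sweep is repeated below for every variable set. As a minimal sketch (a hypothetical refactor, not part of the original notebook), the loop could be factored into one reusable helper with the same settings:

# In[ ]:

def evaluate_k_range(X, k_values=range(2, 12), random_state=247):
    """Sweep K and return the inertia, silhouette and Calinski-Harabasz lists."""
    inertias, sil_scores, ch_scores = [], [], []
    for k in k_values:
        km = KMeans(n_clusters=k, init='k-means++', max_iter=500,
                    n_init=100, random_state=random_state)
        labels = km.fit_predict(X)  # fit and grab the cluster labels in one call
        inertias.append(km.inertia_)
        sil_scores.append(silhouette_score(X, labels, metric='euclidean'))
        ch_scores.append(calinski_harabasz_score(X, labels))
    return inertias, sil_scores, ch_scores

# Equivalent to the revenue-penetration sweep above:
# ssd_rp, sil_rp, ch_rp = evaluate_k_range(kmeans_data_visit_rp)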
# # 0.6 VP + 0.4 RP

# In[15]:

X64 = kmeans_data_visit_vp.multiply(0.6) + kmeans_data_visit_rp.multiply(0.4)

# In[16]:

sum_of_squared_distances_X64 = []
silhouette_scores_X64 = []
ch_scores_X64 = []
K = range(2, 12)
for k in K:
    k_means = KMeans(n_clusters=k, init='k-means++', max_iter=500, n_init=100, random_state=247)
    k_means.fit(X64)
    sum_of_squared_distances_X64.append(k_means.inertia_)
    labels = k_means.labels_
    silhouette_scores_X64.append(silhouette_score(X64, labels, metric='euclidean'))
    ch_scores_X64.append(calinski_harabasz_score(X64, labels))

# In[17]:

fig = plt.figure(figsize=(12, 10), dpi=150)
ax1 = fig.add_subplot(311)
ax1.plot(K, sum_of_squared_distances_X64, 'bx-')
ax1.set_title('elbow method for optimal clusters')
ax1.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax2 = fig.add_subplot(312)
ax2.plot(K, silhouette_scores_X64, 'rx-')
ax2.set_title('silhouette method for optimal clusters')
ax2.set(xlabel='Clusters', ylabel='Silhouette Score')
ax3 = fig.add_subplot(313)
ax3.plot(K, ch_scores_X64, 'rx-')
ax3.set_title('calinski harabasz method for optimal clusters')
ax3.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('0.6 Visit Penetration + 0.4 Revenue Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()

# # Distinct Article Penetration

# Distinct Article & Distinct Sub-Category penetration tell how many articles or sub-categories a customer buys within each category. For example, a customer who buys from two different categories has the same **Visit Penetration** for both, whereas a customer who buys several different products from one category has a higher **Distinct Article Penetration** for that category.

# In[18]:

sum_of_squared_distances_dmp = []
silhouette_scores_dmp = []
ch_scores_dmp = []
K = range(2, 12)
for k in K:
    k_means = KMeans(n_clusters=k, init='k-means++', max_iter=500, n_init=100, random_state=247)
    k_means.fit(kmeans_data_visit_dist_art)
    sum_of_squared_distances_dmp.append(k_means.inertia_)
    labels = k_means.labels_
    silhouette_scores_dmp.append(silhouette_score(kmeans_data_visit_dist_art, labels, metric='euclidean'))
    ch_scores_dmp.append(calinski_harabasz_score(kmeans_data_visit_dist_art, labels))

# In[19]:

fig = plt.figure(figsize=(12, 10), dpi=150)
ax1 = fig.add_subplot(311)
ax1.plot(K, sum_of_squared_distances_dmp, 'bx-')
ax1.set_title('elbow method for optimal clusters')
ax1.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax2 = fig.add_subplot(312)
ax2.plot(K, silhouette_scores_dmp, 'rx-')
ax2.set_title('silhouette method for optimal clusters')
ax2.set(xlabel='Clusters', ylabel='Silhouette Score')
ax3 = fig.add_subplot(313)
ax3.plot(K, ch_scores_dmp, 'rx-')
ax3.set_title('calinski harabasz method for optimal clusters')
ax3.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('Distinct Article Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()

# # Distinct Sub-Category Penetration

# In[20]:

sum_of_squared_distances_dmgp = []
silhouette_scores_dmgp = []
ch_scores_dmgp = []
K = range(2, 12)
for k in K:
    k_means = KMeans(n_clusters=k, init='k-means++', max_iter=500, n_init=100, random_state=247)
    k_means.fit(kmeans_data_visit_dist_subcat)
    sum_of_squared_distances_dmgp.append(k_means.inertia_)
    labels = k_means.labels_
    silhouette_scores_dmgp.append(silhouette_score(kmeans_data_visit_dist_subcat, labels, metric='euclidean'))
    ch_scores_dmgp.append(calinski_harabasz_score(kmeans_data_visit_dist_subcat, labels))
# In[21]:

fig = plt.figure(figsize=(12, 10), dpi=150)
ax1 = fig.add_subplot(311)
ax1.plot(K, sum_of_squared_distances_dmgp, 'bx-')
ax1.set_title('elbow method for optimal clusters')
ax1.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax2 = fig.add_subplot(312)
ax2.plot(K, silhouette_scores_dmgp, 'rx-')
ax2.set_title('silhouette method for optimal clusters')
ax2.set(xlabel='Clusters', ylabel='Silhouette Score')
ax3 = fig.add_subplot(313)
ax3.plot(K, ch_scores_dmgp, 'rx-')
ax3.set_title('calinski harabasz method for optimal clusters')
ax3.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('Distinct Sub-Category Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()

# In[22]:

fig = plt.figure(figsize=(12, 10), dpi=150)
legend_labels = ['Revenue Pen', 'Visit Pen', 'Dist. Art. Pen', 'Dist. Sub-Cat. Pen', '0.6 VP + 0.4 RP']
ax1 = fig.add_subplot(311)
ax1.plot(K, sum_of_squared_distances_rp, 'kx-', K, sum_of_squared_distances_vp, 'gx-',
         K, sum_of_squared_distances_dmp, 'bx-', K, sum_of_squared_distances_dmgp, 'rx-',
         K, sum_of_squared_distances_X64, 'mx-')
ax1.set_title('elbow method for optimal clusters')
ax1.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax1.legend(legend_labels, loc='upper right')
ax2 = fig.add_subplot(312)
ax2.plot(K, silhouette_scores_rp, 'kx-', K, silhouette_scores_vp, 'gx-',
         K, silhouette_scores_dmp, 'bx-', K, silhouette_scores_dmgp, 'rx-',
         K, silhouette_scores_X64, 'mx-')
ax2.set_title('silhouette method for optimal clusters')
ax2.set(xlabel='Clusters', ylabel='Silhouette Score')
ax2.legend(legend_labels, loc='upper right')
ax3 = fig.add_subplot(313)
ax3.plot(K, ch_scores_rp, 'kx-', K, ch_scores_vp, 'gx-', K, ch_scores_dmp, 'bx-',
         K, ch_scores_dmgp, 'rx-', K, ch_scores_X64, 'mx-')
ax3.set_title('calinski harabasz method for optimal clusters')
ax3.set(xlabel='Clusters', ylabel='CH Score')
ax3.legend(legend_labels, loc='upper right')
fig.suptitle('Clusters Differences', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()

# From the graphs we can see that *Revenue Penetration* is the **easiest** variable set for separating the clusters and *Visit Penetration* the **hardest**; the unified variable *0.6 VP + 0.4 RP* performs reasonably well.
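# To back the "easiest vs. hardest" reading with numbers, here is a minimal sketch (an addition, not in the original notebook) ranking the variable sets by their best silhouette score across the sweep:

# In[ ]:

import numpy as np

score_sets = {
    'Revenue Pen': silhouette_scores_rp,
    'Visit Pen': silhouette_scores_vp,
    'Dist. Art. Pen': silhouette_scores_dmp,
    'Dist. Sub-Cat. Pen': silhouette_scores_dmgp,
    '0.6 VP + 0.4 RP': silhouette_scores_X64,
}
# Sort the variable sets from best to worst peak silhouette
for name, scores in sorted(score_sets.items(), key=lambda kv: -max(kv[1])):
    best_k = K[int(np.argmax(scores))]
    print(f'{name:<20} best silhouette {max(scores):.3f} at K={best_k}')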
# # 0.4 VP + 0.6 RP

# In[23]:

X46 = kmeans_data_visit_vp.multiply(0.4) + kmeans_data_visit_rp.multiply(0.6)

# In[24]:

sum_of_squared_distances_X46 = []
silhouette_scores_X46 = []
ch_scores_X46 = []
K = range(2, 12)
for k in K:
    k_means = KMeans(n_clusters=k, init='k-means++', max_iter=500, n_init=100, random_state=247)
    k_means.fit(X46)
    sum_of_squared_distances_X46.append(k_means.inertia_)
    labels = k_means.labels_
    silhouette_scores_X46.append(silhouette_score(X46, labels, metric='euclidean'))
    ch_scores_X46.append(calinski_harabasz_score(X46, labels))

# In[25]:

fig = plt.figure(figsize=(12, 10), dpi=150)
ax1 = fig.add_subplot(311)
ax1.plot(K, sum_of_squared_distances_X46, 'bx-')
ax1.set_title('elbow method for optimal clusters')
ax1.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax2 = fig.add_subplot(312)
ax2.plot(K, silhouette_scores_X46, 'rx-')
ax2.set_title('silhouette method for optimal clusters')
ax2.set(xlabel='Clusters', ylabel='Silhouette Score')
ax3 = fig.add_subplot(313)
ax3.plot(K, ch_scores_X46, 'rx-')
ax3.set_title('calinski harabasz method for optimal clusters')
ax3.set(xlabel='Clusters', ylabel='CH Score')
fig.suptitle('0.4 Visit Penetration + 0.6 Revenue Penetration Clusters', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()

# In[26]:

fig = plt.figure(figsize=(12, 10), dpi=150)
legend_labels = ['Revenue Pen', 'Visit Pen', 'Dist. Art. Pen', 'Dist. Sub-Cat. Pen', '0.4 VP + 0.6 RP']
ax1 = fig.add_subplot(311)
ax1.plot(K, sum_of_squared_distances_rp, 'kx-', K, sum_of_squared_distances_vp, 'gx-',
         K, sum_of_squared_distances_dmp, 'bx-', K, sum_of_squared_distances_dmgp, 'rx-',
         K, sum_of_squared_distances_X46, 'mx-')
ax1.set_title('elbow method for optimal clusters')
ax1.set(xlabel='Clusters', ylabel='Sum of Squared Distances')
ax1.legend(legend_labels, loc='upper right')
ax2 = fig.add_subplot(312)
ax2.plot(K, silhouette_scores_rp, 'kx-', K, silhouette_scores_vp, 'gx-',
         K, silhouette_scores_dmp, 'bx-', K, silhouette_scores_dmgp, 'rx-',
         K, silhouette_scores_X46, 'mx-')
ax2.set_title('silhouette method for optimal clusters')
ax2.set(xlabel='Clusters', ylabel='Silhouette Score')
ax2.legend(legend_labels, loc='upper right')
ax3 = fig.add_subplot(313)
ax3.plot(K, ch_scores_rp, 'kx-', K, ch_scores_vp, 'gx-', K, ch_scores_dmp, 'bx-',
         K, ch_scores_dmgp, 'rx-', K, ch_scores_X46, 'mx-')
ax3.set_title('calinski harabasz method for optimal clusters')
ax3.set(xlabel='Clusters', ylabel='CH Score')
ax3.legend(legend_labels, loc='upper right')
fig.suptitle('Clusters Differences', size=16)
fig.tight_layout()
fig.subplots_adjust(top=0.92)
plt.show()

# *0.4 VP + 0.6 RP* shows better performance in these graphs. However, since this project aims to find **related categories**, *0.6 VP + 0.4 RP* is chosen.
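# One sanity check worth running on the blend (a small addition, assuming each penetration block sums to roughly 1 per customer after the fillna(0) step): a convex combination such as 0.4 * VP + 0.6 * RP should then stay on the same scale, so no re-normalization is needed before clustering.

# In[ ]:

# Row sums should sit near 1.0 if the assumption holds; the describe()
# output makes any deviation visible rather than taking it on faith.
print(X46.sum(axis=1).describe())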
# ### 0.4 VP + 0.6 RP - 5 CL

# In[27]:

kmeans_data_visit_x46_cl5 = kmeans_data_visit_vp.multiply(0.4) + kmeans_data_visit_rp.multiply(0.6)

# In[28]:

kmeans_visit_x46_cl5 = KMeans(n_clusters=5, init='k-means++', max_iter=1000, n_init=500, random_state=247)
pred_y = kmeans_visit_x46_cl5.fit_predict(kmeans_data_visit_x46_cl5)

# In[29]:

labels = kmeans_visit_x46_cl5.labels_
s_score = silhouette_score(kmeans_data_visit_x46_cl5, labels, metric='euclidean')
ch_score = calinski_harabasz_score(kmeans_data_visit_x46_cl5, labels)
print(f'silhouette_score: {s_score:.3f}')
print(f'calinski_harabasz_score: {ch_score:,.2f}')

# In[30]:

kmeans_data_visit_x46_cl5 = kmeans_data_visit_x46_cl5.reset_index()
kmeans_data_visit_x46_cl5["cluster_1"] = pred_y

# In[31]:

# Customer count and share per cluster
df = pd.concat([kmeans_data_visit_x46_cl5.groupby(["cluster_1"])["CUSTOMER_ID"].nunique(),
                kmeans_data_visit_x46_cl5.groupby(["cluster_1"])["CUSTOMER_ID"].nunique()
                / kmeans_data_visit_x46_cl5["CUSTOMER_ID"].nunique()], axis=1)
df.columns = ['Customer (#)', 'Customer (%)']
df.style.background_gradient(axis=0, cmap='YlOrRd').format('{:,.2f}')

# Cluster 3 is the **biggest** one, since it is the *mixed group*; we would like to *shrink* that group.

# In[32]:

kmeans_data_visit_x46_cl5 = kmeans_data_visit_x46_cl5.merge(
    data_f[["VISIT_COUNT", "REVENUE"]].reset_index(), how='left', on='CUSTOMER_ID')

# In[33]:

df_1 = kmeans_data_visit_x46_cl5.groupby("cluster_1") \
    .agg({"CUSTOMER_ID": 'nunique', "VISIT_COUNT": 'sum', "REVENUE": 'sum'})
df_1.columns = ['Customer (#)', 'Visit (#)', 'Revenue']
df_1.style.background_gradient(axis=0, cmap='YlOrRd').format('{:,.0f}')

# Even though cluster 0 holds *roughly half* as many customers as cluster 3, **most of the visits and revenue** belong to cluster 0.

# In[34]:

df_2 = kmeans_data_visit_x46_cl5.drop(columns=['CUSTOMER_ID', 'VISIT_COUNT', 'REVENUE'])
df_2 = df_2.groupby(["cluster_1"]).aggregate("mean")

# In[35]:

import seaborn as sns
cm = sns.light_palette("green", as_cmap=True)
df_2.style.background_gradient(axis=0, cmap=cm).format('{:,.2f}')

# These are the average shopping shares of the customers assigned to each cluster. Here we can see that:
# - Cluster 0 is mainly **Category 15**, with a small share of **Category 5**.
# - Cluster 1 is largely **Category 9**, with small shares of **Categories 12 and 13**.
# - Cluster 2 is **Category 11**.
# - Cluster 3 is **the mixed one**.
# - Cluster 4 is mostly **Category 17**, with a small share of **Category 6**.
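# Instead of reading the dominant categories off the heatmap by eye, they can be derived from the cluster-mean table — a minimal sketch (an addition beyond the original notebook) using the df_2 computed above:

# In[ ]:

# Top two categories per cluster by average shopping share
top_categories = df_2.apply(lambda row: row.nlargest(2).index.tolist(), axis=1)
print(top_categories)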
# In[36]:

#kmeans_data_visit_x46_cl5[["CUSTOMER_ID","cluster_1"]].to_csv('kmeans_data_visit_x46_cl5.csv', index=False)

# In[37]:

#kmeans_visit_x46_cl5.cluster_centers_

# ### 0.4 VP + 0.6 RP - 7 CL

# In[38]:

kmeans_data_visit_x46_cl7 = kmeans_data_visit_vp.multiply(0.4) + kmeans_data_visit_rp.multiply(0.6)

# In[39]:

kmeans_visit_x46_cl7 = KMeans(n_clusters=7, init='k-means++', max_iter=1000, n_init=500, random_state=247)
pred_y = kmeans_visit_x46_cl7.fit_predict(kmeans_data_visit_x46_cl7)

# In[40]:

labels = kmeans_visit_x46_cl7.labels_
s_score = silhouette_score(kmeans_data_visit_x46_cl7, labels, metric='euclidean')
ch_score = calinski_harabasz_score(kmeans_data_visit_x46_cl7, labels)
print(f'silhouette_score: {s_score:.3f}')
print(f'calinski_harabasz_score: {ch_score:,.2f}')

# In[41]:

kmeans_data_visit_x46_cl7 = kmeans_data_visit_x46_cl7.reset_index()
kmeans_data_visit_x46_cl7["cluster_1"] = pred_y

# In[42]:

df = pd.concat([kmeans_data_visit_x46_cl7.groupby(["cluster_1"])["CUSTOMER_ID"].nunique(),
                kmeans_data_visit_x46_cl7.groupby(["cluster_1"])["CUSTOMER_ID"].nunique()
                / kmeans_data_visit_x46_cl7["CUSTOMER_ID"].nunique()], axis=1)
df.columns = ['Customer (#)', 'Customer (%)']
df.style.background_gradient(axis=0, cmap='YlOrRd').format('{:,.2f}')

# In[43]:

kmeans_data_visit_x46_cl7 = kmeans_data_visit_x46_cl7.merge(
    data_f[["VISIT_COUNT", "REVENUE"]].reset_index(), how='left', on='CUSTOMER_ID')

# In[44]:

df_1 = kmeans_data_visit_x46_cl7.groupby("cluster_1") \
    .agg({"CUSTOMER_ID": 'nunique', "VISIT_COUNT": 'sum', "REVENUE": 'sum'})
df_1.columns = ['Customer (#)', 'Visit (#)', 'Revenue']
df_1.style.background_gradient(axis=0, cmap='YlOrRd').format('{:,.0f}')

# In[45]:

df_2 = kmeans_data_visit_x46_cl7.drop(columns=['CUSTOMER_ID', 'VISIT_COUNT', 'REVENUE'])
df_2 = df_2.groupby(["cluster_1"]).aggregate("mean")
df_2.style.background_gradient(axis=0, cmap=cm).format('{:,.2f}')

# In[46]:

#kmeans_data_visit_x46_cl7[["CUSTOMER_ID","cluster_1"]].to_csv('kmeans_data_visit_x46_cl7.csv', index=False)

# In[47]:

#kmeans_visit_x46_cl7.cluster_centers_

# ### 0.6 VP + 0.4 RP - 7 CL

# In[48]:

kmeans_data_visit_x64_cl7 = kmeans_data_visit_vp.multiply(0.6) + kmeans_data_visit_rp.multiply(0.4)

# In[49]:

kmeans_visit_x64_cl7 = KMeans(n_clusters=7, init='k-means++', max_iter=1000, n_init=500, random_state=247)
pred_y = kmeans_visit_x64_cl7.fit_predict(kmeans_data_visit_x64_cl7)

# In[50]:

labels = kmeans_visit_x64_cl7.labels_
s_score = silhouette_score(kmeans_data_visit_x64_cl7, labels, metric='euclidean')
ch_score = calinski_harabasz_score(kmeans_data_visit_x64_cl7, labels)
print(f'silhouette_score: {s_score:.3f}')
print(f'calinski_harabasz_score: {ch_score:,.2f}')

# In[51]:

kmeans_data_visit_x64_cl7 = kmeans_data_visit_x64_cl7.reset_index()
kmeans_data_visit_x64_cl7["cluster_1"] = pred_y

# In[52]:

df = pd.concat([kmeans_data_visit_x64_cl7.groupby(["cluster_1"])["CUSTOMER_ID"].nunique(),
                kmeans_data_visit_x64_cl7.groupby(["cluster_1"])["CUSTOMER_ID"].nunique()
                / kmeans_data_visit_x64_cl7["CUSTOMER_ID"].nunique()], axis=1)
df.columns = ['Customer (#)', 'Customer (%)']
df.style.background_gradient(axis=0, cmap='YlOrRd').format('{:,.2f}')

# In[53]:

kmeans_data_visit_x64_cl7 = kmeans_data_visit_x64_cl7.merge(
    data_f[["VISIT_COUNT", "REVENUE"]].reset_index(), how='left', on='CUSTOMER_ID')
# In[54]:

df_1 = kmeans_data_visit_x64_cl7.groupby("cluster_1") \
    .agg({"CUSTOMER_ID": 'nunique', "VISIT_COUNT": 'sum', "REVENUE": 'sum'})
df_1.columns = ['Customer (#)', 'Visit (#)', 'Revenue']
df_1.style.background_gradient(axis=0, cmap='YlOrRd').format('{:,.0f}')

# In[55]:

df_2 = kmeans_data_visit_x64_cl7.drop(columns=['CUSTOMER_ID', 'VISIT_COUNT', 'REVENUE'])
df_2 = df_2.groupby(["cluster_1"]).aggregate("mean")
df_2.style.background_gradient(axis=0, cmap=cm).format('{:,.2f}')

# In[56]:

#kmeans_data_visit_x64_cl7[["CUSTOMER_ID","cluster_1"]].to_csv('kmeans_data_visit_x64_cl7.csv', index=False)

# In[57]:

#kmeans_visit_x64_cl7.cluster_centers_
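# As a final check (an addition beyond the original notebook), the two 7-cluster solutions can be compared directly: the Adjusted Rand Index is 1.0 for identical partitions and near 0.0 for chance-level agreement, so it quantifies how much the weighting actually changes the segmentation.

# In[ ]:

from sklearn.metrics import adjusted_rand_score

# Both label vectors cover the same customers in the same row order,
# since both blends are built from kmeans_data_visit_vp / _rp.
ari = adjusted_rand_score(kmeans_visit_x46_cl7.labels_, kmeans_visit_x64_cl7.labels_)
print(f'ARI, 0.4 VP + 0.6 RP vs 0.6 VP + 0.4 RP (7 clusters): {ari:.3f}')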