#!/usr/bin/env python
# coding: utf-8
"""Mall customer segmentation with K-Means.

Notebook-style analysis script: loads the Mall_Customers dataset, runs
univariate / bivariate EDA, then performs univariate, bivariate and
multivariate K-Means clustering on Age, Annual Income and Spending Score,
using the elbow method (and KneeLocator) to choose the cluster count.
"""

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import warnings

warnings.filterwarnings('ignore')

# Pin the historical KMeans default; sklearn >= 1.4 changed n_init to 'auto'.
N_INIT = 10
NUMERIC_FEATURES = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

# ---------------------------------------------------------------------------
# DATA LOADING & CLEANING
# ---------------------------------------------------------------------------

mall_df = pd.read_excel("C:/Users/gertr/Desktop/DATA ANALYTICS/PERSONAL PROJECTS/CLUSTERING(WINE)/Mall_Customers.xlsx")

mall_df.head()
mall_df.shape
mall_df.isnull().sum()      # check for missing values
mall_df.duplicated().sum()  # check for duplicate rows

# Rename the mislabelled 'Genre' column to 'Gender'
mall_df.rename(columns={'Genre': 'Gender'}, inplace=True)

# ---------------------------------------------------------------------------
# UNIVARIATE ANALYSIS
# ---------------------------------------------------------------------------

mall_df.describe()
# Annual income and spending score look roughly normal: mean and median are
# very close for both.

sns.displot(mall_df['Annual Income (k$)'], kde=True)

# Distribution of every numeric feature.  displot() is figure-level and makes
# its own figure, so no plt.figure() is needed (the original created an empty
# extra figure on every iteration).
for col in NUMERIC_FEATURES:
    sns.displot(mall_df[col], kde=True)

# Stand-alone kdeplots add the gender dimension.
# NOTE: seaborn's `shade=` keyword is deprecated/removed; `fill=` replaces it.
sns.kdeplot(data=mall_df, x='Annual Income (k$)', hue='Gender', fill=True)

# kdeplot is axes-level, so here a new figure per feature IS required.
for col in NUMERIC_FEATURES:
    plt.figure()
    sns.kdeplot(data=mall_df, x=col, hue='Gender', fill=True)

# Females have a higher frequency for each feature.
mall_df['Gender'].value_counts(normalize=True)

# Mean of each feature per gender.  Column selection after groupby must be a
# list (tuple-style selection was removed in modern pandas).
mall_df.groupby('Gender')[NUMERIC_FEATURES].mean()

# Boxplots: outliers and the percentile range the data concentrates in.
for col in NUMERIC_FEATURES:
    plt.figure()
    sns.boxplot(data=mall_df, x='Gender', y=col)

# ---------------------------------------------------------------------------
# BIVARIATE ANALYSIS
# ---------------------------------------------------------------------------

sns.scatterplot(data=mall_df, x='Annual Income (k$)', y='Spending Score (1-100)')

# Pairwise relationships.  CustomerID is an identifier, not a feature, so it
# is excluded here (the original called drop() without using the result,
# which left CustomerID in the plot).
sns.pairplot(mall_df.drop('CustomerID', axis=1), hue='Gender')

# Degree of correlation among the variables.  Gender is categorical, and
# pandas >= 2.0 raises on non-numeric columns without numeric_only=True.
mall_df.corr(numeric_only=True)

# Heatmap to visualize the degree of correlation.
sns.heatmap(mall_df.corr(numeric_only=True), annot=True, cmap='viridis')

# ---------------------------------------------------------------------------
# UNIVARIATE CLUSTERING (Annual Income)
# ---------------------------------------------------------------------------
# sklearn workflow: (1) initialize the estimator, (2) fit the data so the
# algorithm learns it, (3) read predictions/labels off the fitted model.

clustering1 = KMeans(n_init=N_INIT)  # default n_clusters is 8
clustering1.fit(mall_df[['Annual Income (k$)']])
clustering1.labels_                  # one of 8 labels per customer

# Attach the labels to the original data so they can be interpreted.
mall_df['Income Cluster'] = clustering1.labels_

# Number of customers in each cluster.
mall_df['Income Cluster'].value_counts()

# ELBOW METHOD: rather than the default cluster count, pick the k where
# inertia (WCSS) stops dropping sharply.
inertia_scores = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, n_init=N_INIT)
    km.fit(mall_df[['Annual Income (k$)']])
    inertia_scores.append(km.inertia_)

plt.plot(range(1, 11), inertia_scores, marker='o')
plt.title('The Elbow Method')
plt.ylabel('WCSS')
plt.xlabel('Number of Clusters')
plt.show()

# KNEELOCATOR: programmatic elbow detection when the plot is ambiguous.
# Requires the third-party 'kneed' package -- install once with
# `pip install kneed` from a shell.  (The original had a bare
# `pip install kneed` statement, which is a SyntaxError in a .py script.)
from kneed import KneeLocator

k1 = KneeLocator(range(1, 11), inertia_scores, curve='convex', direction='decreasing')
k1.elbow

# Refit with the chosen number of clusters.
kmeans1 = KMeans(n_clusters=3, n_init=N_INIT)
kmeans1.fit(mall_df[['Annual Income (k$)']])
mall_df['Income Cluster'] = kmeans1.labels_
mall_df['Income Cluster'].value_counts()

# Mean of each feature within each income cluster.
mall_df.groupby('Income Cluster')[NUMERIC_FEATURES].mean()

# ---------------------------------------------------------------------------
# BIVARIATE CLUSTERING (Annual Income & Spending Score)
# ---------------------------------------------------------------------------

bivariate_cols = ['Annual Income (k$)', 'Spending Score (1-100)']

inertia_scores2 = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, n_init=N_INIT)
    km.fit(mall_df[bivariate_cols])
    inertia_scores2.append(km.inertia_)

plt.plot(range(1, 11), inertia_scores2, marker='o')
plt.title('The Elbow Method')
plt.ylabel('WCSS')
plt.xlabel('Number of Clusters')
plt.show()

k2 = KneeLocator(range(1, 11), inertia_scores2, curve='convex', direction='decreasing')
k2.elbow

kmeans2 = KMeans(n_clusters=5, n_init=N_INIT)
kmeans2.fit(mall_df[bivariate_cols])
mall_df['Income & Spending Cluster'] = kmeans2.labels_

# Centroids of each cluster: (x, y) coordinates in feature space.
kmeans2.cluster_centers_

# Wrap the centroids in a DataFrame so they can be overlaid on a scatter plot.
centers = pd.DataFrame(kmeans2.cluster_centers_, columns=['x', 'y'])

# Visualize the clustering result with the centroids marked.
plt.figure(figsize=(10, 8))
plt.scatter(x=centers['x'], y=centers['y'], s=100, c='black', marker='*')
sns.scatterplot(data=mall_df, x='Annual Income (k$)', y='Spending Score (1-100)',
                hue='Income & Spending Cluster', palette='tab10')

# Gender breakdown per cluster.
pd.crosstab(mall_df['Income & Spending Cluster'], mall_df['Gender'], normalize='index')

mall_df.groupby('Income & Spending Cluster')[NUMERIC_FEATURES].mean()
# Target group: the high-income / high-spending cluster -- ~53% female,
# average age ~32, average income ~86k, average spending score ~82.

# ---------------------------------------------------------------------------
# MULTIVARIATE CLUSTERING
# ---------------------------------------------------------------------------

mall_df.head()

# One-hot encode Gender so it can be fed to the algorithm; drop_first=True
# keeps a single dummy (Gender_Male) since the other value is implied.
mall_df2 = pd.get_dummies(mall_df, drop_first=True)
mall_df2.columns
mall_df2 = mall_df2[['Age', 'Annual Income (k$)', 'Spending Score (1-100)', 'Gender_Male']]

# Standardize so the algorithm weighs every feature on the same scale.
# Passing columns= preserves the feature names (the original DataFrame()
# call silently replaced them with 0..3).
scale = StandardScaler()
mall_df2 = pd.DataFrame(scale.fit_transform(mall_df2), columns=mall_df2.columns)
mall_df2.head()

inertia_scores3 = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, n_init=N_INIT)
    km.fit(mall_df2)
    inertia_scores3.append(km.inertia_)

plt.plot(range(1, 11), inertia_scores3, marker='o')
plt.title('The Elbow Method')
plt.ylabel('WCSS')
plt.xlabel('Number of Clusters')
plt.show()

kmeans3 = KMeans(n_clusters=4, n_init=N_INIT)
kmeans3.fit(mall_df2)
mall_df2['Cluster Label'] = kmeans3.labels_
mall_df2