#!/usr/bin/env python
# coding: utf-8
"""Mall customer segmentation with K-Means.

Notebook-style analysis script: loads the Mall_Customers dataset, runs
univariate / bivariate EDA, then performs univariate, bivariate and
multivariate K-Means clustering on Age, Annual Income and Spending Score,
using the elbow method (and KneeLocator) to choose the cluster count.
"""

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import warnings

warnings.filterwarnings('ignore')

# Pin the historical KMeans default; sklearn >= 1.4 changed n_init to 'auto'.
N_INIT = 10
NUMERIC_FEATURES = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

# ---------------------------------------------------------------------------
# DATA LOADING & CLEANING
# ---------------------------------------------------------------------------

mall_df = pd.read_excel("C:/Users/gertr/Desktop/DATA ANALYTICS/PERSONAL PROJECTS/CLUSTERING(WINE)/Mall_Customers.xlsx")

mall_df.head()
mall_df.shape
mall_df.isnull().sum()      # check for missing values
mall_df.duplicated().sum()  # check for duplicate rows

# Rename the mislabelled 'Genre' column to 'Gender'
mall_df.rename(columns={'Genre': 'Gender'}, inplace=True)

# ---------------------------------------------------------------------------
# UNIVARIATE ANALYSIS
# ---------------------------------------------------------------------------

mall_df.describe()
# Annual income and spending score look roughly normal: mean and median are
# very close for both.

sns.displot(mall_df['Annual Income (k$)'], kde=True)

# Distribution of every numeric feature.  displot() is figure-level and makes
# its own figure, so no plt.figure() is needed (the original created an empty
# extra figure on every iteration).
for col in NUMERIC_FEATURES:
    sns.displot(mall_df[col], kde=True)

# Stand-alone kdeplots add the gender dimension.
# NOTE: seaborn's `shade=` keyword is deprecated/removed; `fill=` replaces it.
sns.kdeplot(data=mall_df, x='Annual Income (k$)', hue='Gender', fill=True)

# kdeplot is axes-level, so here a new figure per feature IS required.
for col in NUMERIC_FEATURES:
    plt.figure()
    sns.kdeplot(data=mall_df, x=col, hue='Gender', fill=True)

# Females have a higher frequency for each feature.
mall_df['Gender'].value_counts(normalize=True)

# Mean of each feature per gender.  Column selection after groupby must be a
# list (tuple-style selection was removed in modern pandas).
mall_df.groupby('Gender')[NUMERIC_FEATURES].mean()

# Boxplots: outliers and the percentile range the data concentrates in.
for col in NUMERIC_FEATURES:
    plt.figure()
    sns.boxplot(data=mall_df, x='Gender', y=col)

# ---------------------------------------------------------------------------
# BIVARIATE ANALYSIS
# ---------------------------------------------------------------------------

sns.scatterplot(data=mall_df, x='Annual Income (k$)', y='Spending Score (1-100)')

# Pairwise relationships.  CustomerID is an identifier, not a feature, so it
# is excluded here (the original called drop() without using the result,
# which left CustomerID in the plot).
sns.pairplot(mall_df.drop('CustomerID', axis=1), hue='Gender')

# Degree of correlation among the variables.  Gender is categorical, and
# pandas >= 2.0 raises on non-numeric columns without numeric_only=True.
mall_df.corr(numeric_only=True)

# Heatmap to visualize the degree of correlation.
sns.heatmap(mall_df.corr(numeric_only=True), annot=True, cmap='viridis')

# ---------------------------------------------------------------------------
# UNIVARIATE CLUSTERING (Annual Income)
# ---------------------------------------------------------------------------
# sklearn workflow: (1) initialize the estimator, (2) fit the data so the
# algorithm learns it, (3) read predictions/labels off the fitted model.

clustering1 = KMeans(n_init=N_INIT)  # default n_clusters is 8
clustering1.fit(mall_df[['Annual Income (k$)']])
clustering1.labels_                  # one of 8 labels per customer

# Attach the labels to the original data so they can be interpreted.
mall_df['Income Cluster'] = clustering1.labels_

# Number of customers in each cluster.
mall_df['Income Cluster'].value_counts()

# ELBOW METHOD: rather than the default cluster count, pick the k where
# inertia (WCSS) stops dropping sharply.
inertia_scores = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, n_init=N_INIT)
    km.fit(mall_df[['Annual Income (k$)']])
    inertia_scores.append(km.inertia_)

plt.plot(range(1, 11), inertia_scores, marker='o')
plt.title('The Elbow Method')
plt.ylabel('WCSS')
plt.xlabel('Number of Clusters')
plt.show()

# KNEELOCATOR: programmatic elbow detection when the plot is ambiguous.
# Requires the third-party 'kneed' package -- install once with
# `pip install kneed` from a shell.  (The original had a bare
# `pip install kneed` statement, which is a SyntaxError in a .py script.)
from kneed import KneeLocator

k1 = KneeLocator(range(1, 11), inertia_scores, curve='convex', direction='decreasing')
k1.elbow

# Refit with the chosen number of clusters.
kmeans1 = KMeans(n_clusters=3, n_init=N_INIT)
kmeans1.fit(mall_df[['Annual Income (k$)']])
mall_df['Income Cluster'] = kmeans1.labels_
mall_df['Income Cluster'].value_counts()

# Mean of each feature within each income cluster.
mall_df.groupby('Income Cluster')[NUMERIC_FEATURES].mean()

# ---------------------------------------------------------------------------
# BIVARIATE CLUSTERING (Annual Income & Spending Score)
# ---------------------------------------------------------------------------

bivariate_cols = ['Annual Income (k$)', 'Spending Score (1-100)']

inertia_scores2 = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, n_init=N_INIT)
    km.fit(mall_df[bivariate_cols])
    inertia_scores2.append(km.inertia_)

plt.plot(range(1, 11), inertia_scores2, marker='o')
plt.title('The Elbow Method')
plt.ylabel('WCSS')
plt.xlabel('Number of Clusters')
plt.show()

k2 = KneeLocator(range(1, 11), inertia_scores2, curve='convex', direction='decreasing')
k2.elbow

kmeans2 = KMeans(n_clusters=5, n_init=N_INIT)
kmeans2.fit(mall_df[bivariate_cols])
mall_df['Income & Spending Cluster'] = kmeans2.labels_

# Centroids of each cluster: (x, y) coordinates in feature space.
kmeans2.cluster_centers_

# Wrap the centroids in a DataFrame so they can be overlaid on a scatter plot.
centers = pd.DataFrame(kmeans2.cluster_centers_, columns=['x', 'y'])

# Visualize the clustering result with the centroids marked.
plt.figure(figsize=(10, 8))
plt.scatter(x=centers['x'], y=centers['y'], s=100, c='black', marker='*')
sns.scatterplot(data=mall_df, x='Annual Income (k$)', y='Spending Score (1-100)',
                hue='Income & Spending Cluster', palette='tab10')

# Gender breakdown per cluster.
pd.crosstab(mall_df['Income & Spending Cluster'], mall_df['Gender'], normalize='index')

mall_df.groupby('Income & Spending Cluster')[NUMERIC_FEATURES].mean()
# Target group: the high-income / high-spending cluster -- ~53% female,
# average age ~32, average income ~86k, average spending score ~82.

# ---------------------------------------------------------------------------
# MULTIVARIATE CLUSTERING
# ---------------------------------------------------------------------------

mall_df.head()

# One-hot encode Gender so it can be fed to the algorithm; drop_first=True
# keeps a single dummy (Gender_Male) since the other value is implied.
mall_df2 = pd.get_dummies(mall_df, drop_first=True)
mall_df2.columns
mall_df2 = mall_df2[['Age', 'Annual Income (k$)', 'Spending Score (1-100)', 'Gender_Male']]

# Standardize so the algorithm weighs every feature on the same scale.
# Passing columns= preserves the feature names (the original DataFrame()
# call silently replaced them with 0..3).
scale = StandardScaler()
mall_df2 = pd.DataFrame(scale.fit_transform(mall_df2), columns=mall_df2.columns)
mall_df2.head()

inertia_scores3 = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, n_init=N_INIT)
    km.fit(mall_df2)
    inertia_scores3.append(km.inertia_)

plt.plot(range(1, 11), inertia_scores3, marker='o')
plt.title('The Elbow Method')
plt.ylabel('WCSS')
plt.xlabel('Number of Clusters')
plt.show()

kmeans3 = KMeans(n_clusters=4, n_init=N_INIT)
kmeans3.fit(mall_df2)
mall_df2['Cluster Label'] = kmeans3.labels_
mall_df2