#!/usr/bin/env python
# coding: utf-8

# # 1. DATA CLEANING - DATA WRANGLING

# In[11]:


# Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

pd.set_option('display.max_columns', None)
get_ipython().run_line_magic('matplotlib', 'inline')


# In[12]:


# Loading the dataset
df = pd.read_csv('marketing_data.csv')


# In[13]:


# Review of the data fields
df


# ### Understanding the data
# - ID = Customer's unique identifier
# - Year_Birth = Customer's birth year
# - Education = Customer's education level
# - Marital_Status = Customer's marital status
# - Income = Customer's yearly household income
# - Kidhome = Number of children in customer's household
# - Teenhome = Number of teenagers in customer's household
# - Dt_Customer = Date of customer's enrollment with the company
# - Recency = Number of days since customer's last purchase
# - MntWines = Amount spent on wine in the last 2 years
# - MntFruits = Amount spent on fruits in the last 2 years
# - MntMeatProducts = Amount spent on meat in the last 2 years
# - MntFishProducts = Amount spent on fish in the last 2 years
# - MntSweetProducts = Amount spent on sweets in the last 2 years
# - MntGoldProds = Amount spent on gold in the last 2 years
# - NumDealsPurchases = Number of purchases made with a discount
# - NumWebPurchases = Number of purchases made through the company's web site
# - NumCatalogPurchases = Number of purchases made using a catalogue
# - NumStorePurchases = Number of purchases made directly in stores
# - NumWebVisitsMonth = Number of visits to the company's web site in the last month
# - AcceptedCmp3 = 1 if customer accepted the offer in the 3rd campaign, 0 otherwise
# - AcceptedCmp4 = 1 if customer accepted the offer in the 4th campaign, 0 otherwise
# - AcceptedCmp5 = 1 if customer accepted the offer in the 5th campaign, 0 otherwise
# - AcceptedCmp1 = 1 if customer accepted the offer in the 1st campaign, 0 otherwise
# - AcceptedCmp2 = 1 if customer accepted the offer in the 2nd campaign, 0 otherwise
# - Response = 1 if customer accepted the offer in the last campaign, 0 otherwise
# - Complain = 1 if customer complained in the last 2 years, 0 otherwise
# - Country = Customer's location

# In[14]:


# Show the DataFrame's shape
df.shape


# In[15]:


# Standardize column names
df.columns = [e.lower().replace(' ', '') for e in df.columns]
df.columns


# In[16]:


df.info()


# We have 18 numerical variables and 10 categorical variables: Education, Marital_Status, AcceptedCmp1-AcceptedCmp5, Response, Country and Complain.

# In[17]:


# Transform income to numeric (strip the currency symbol and the thousands separator)
df['income'] = df['income'].str.replace('$', '')
df['income'] = df['income'].str.replace(',', '').astype('float')


# In[18]:


# Transform AcceptedCmp1-5, Response and Complain to object (categorical flags)
df['acceptedcmp1'] = df['acceptedcmp1'].astype(str)
df['acceptedcmp2'] = df['acceptedcmp2'].astype(str)
df['acceptedcmp3'] = df['acceptedcmp3'].astype(str)
df['acceptedcmp4'] = df['acceptedcmp4'].astype(str)
df['acceptedcmp5'] = df['acceptedcmp5'].astype(str)
df['response'] = df['response'].astype(str)
df['complain'] = df['complain'].astype(str)


# In[19]:


# Checking the unique values of each column
for column in df.columns:
    unique_values = df[column].unique()
    print(f"Unique values of column '{column}':")
    print(unique_values)
    print()


# In[20]:


# Checking for duplicate rows to remove, if any
df.duplicated().sum()


# In[21]:


# Checking null values; I will remove any column with more than 75% missing values
df.isnull().sum().sort_values(ascending=False)
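# In[ ]:


# The 75% threshold mentioned above is never applied explicitly later on, so this is a
# minimal sketch of how it could be enforced (in this dataset only income has missing
# values, far below 75%, so no column is expected to be dropped):
null_share = df.isnull().mean()                      # fraction of missing values per column
cols_to_drop = null_share[null_share > 0.75].index   # columns above the 75% threshold
df = df.drop(columns=cols_to_drop)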
# In[22]:


# Plot the income variable to identify the best imputation strategy
plt.figure(figsize=(8,4))
sns.distplot(df['income'], kde=False, hist=True)
plt.title('Distribution', size=16)
plt.ylabel('count');


# Findings:
#
# - The plot shows that most incomes lie between 0 and 100,000$, with a few outliers.
#
# Choosing the median as the imputation strategy is a common approach when dealing with skewed distributions or the presence of outliers. The median is robust to outliers because it is not affected by extreme values, unlike the mean. By using the median, the imputed values will be representative of the central tendency of the non-null income values and provide a more accurate estimation than the mean. This gives a better approximation of the actual values and helps maintain the integrity and quality of the data.

# In[23]:


df['income'].plot(kind='box', figsize=(3,4), patch_artist=True)


# In[24]:


# Impute null values in income with the median (to avoid the skewing of the mean caused by outliers)
df['income'] = df['income'].fillna(df['income'].median())


# In[25]:


plt.figure(figsize=(8,4))
sns.distplot(df['income'], kde=False, hist=True)
plt.title('Distribution', size=16)
plt.ylabel('count');


# ## DEALING WITH OUTLIERS

# In[26]:


# Plot outliers of the numerical data
# Select the columns to plot
df_to_plot = df.drop(columns=['id', 'acceptedcmp1', 'acceptedcmp2', 'acceptedcmp3', 'acceptedcmp4',
                              'acceptedcmp5', 'response', 'complain']).select_dtypes(include=np.number)
# Subplots
df_to_plot.plot(subplots=True, layout=(4,4), kind='box', figsize=(12,14), patch_artist=True)
plt.subplots_adjust(wspace=0.5);


# Multiple features contain outliers (see the boxplots), but the only values that likely indicate data-entry errors are Year_Birth <= 1900.

# In[27]:


# Remove rows where Year_Birth <= 1900
df = df[df['year_birth'] > 1900].reset_index(drop=True)
plt.figure(figsize=(3,4))
df['year_birth'].plot(kind='box', patch_artist=True);


# ## Outliers in the Income column
# We identify the income outliers and then exclude them from the data frame.

# In[28]:


Q1 = df.income.quantile(0.25)
Q3 = df.income.quantile(0.75)
IQR = Q3 - Q1
income_outliers = df[df.income > Q3 + 1.5*IQR]
df = df[df.income < Q3 + 1.5*IQR]
income_outliers.head()


# In[29]:


plt.figure(figsize=(8,4))
sns.distplot(df['income'], kde=False, hist=True)
plt.title('Distribution', size=16)
plt.ylabel('count');


# In[30]:


plt.figure(figsize=(3,4))
df['income'].plot(kind='box', patch_artist=True);


# In[31]:


# Counting outliers for each column
for column in df_to_plot:
    q1 = np.percentile(df_to_plot[column], 25)
    q3 = np.percentile(df_to_plot[column], 75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    num_outliers = ((df_to_plot[column] < lower_bound) | (df_to_plot[column] > upper_bound)).sum()
    print("Column '{}': {}".format(column, num_outliers))


# In[32]:


# Counting the percentage of outliers
for column in df_to_plot:
    q1 = np.percentile(df_to_plot[column], 25)
    q3 = np.percentile(df_to_plot[column], 75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = ((df_to_plot[column] < lower_bound) | (df_to_plot[column] > upper_bound))
    num_outliers = outliers.sum()
    percentage = (num_outliers / len(df_to_plot[column])) * 100
    print("Column '{}': {:.2f}%".format(column, percentage))


# We will keep the remaining outliers for the predictions: once we segment the clients, removing them could cost us a lot of data and invalidate the model. Once we have the first results, we can re-test the model without the outliers.
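# In[ ]:


# Hypothetical helper (not used in the rest of this notebook): a minimal sketch of how the
# remaining outliers could be dropped with the same 1.5*IQR rule when we later re-test the
# model without them.
def drop_iqr_outliers(data, columns):
    mask = pd.Series(True, index=data.index)
    for col in columns:
        q1, q3 = data[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        mask &= data[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    return data[mask]

# Example usage (commented out so the current analysis is unchanged):
# df_no_outliers = drop_iqr_outliers(df, df_to_plot.columns)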
# In[33]:


# Transform year_birth and dt_customer into date format
df['year_birth'] = pd.to_datetime(df['year_birth'], format='%Y')
df['dt_customer'] = pd.to_datetime(df['dt_customer'])


# ## Feature engineering
#
# Having a first look at the raw data enables us to start thinking about some useful variables we could create in order to better understand our dataset and reveal valuable information.
#
# We will create several variables:
#
# - A variable Age to replace the variable Year_Birth
# - A variable Spending as the sum of the amounts spent on the product categories
# - A variable Marital_Situation grouping the different marital statuses into only 2 comprehensive categories: Couple (Married, Together) vs. Single (Divorced, Single, Widow, YOLO, Alone, Absurd)
# - The number of dependents in the home ('Dependents') can be engineered from the sum of 'Kidhome' and 'Teenhome'
# - The year of becoming a customer ('Year_Customer') can be engineered from 'Dt_Customer'
# - The total purchases ('TotalPurchases') can be engineered from the sum of all features containing the keyword 'Purchases'
# - The total number of campaigns accepted ('TotalCampaignsAcc') can be engineered from the sum of all features containing the keyword 'Cmp' plus 'Response' (the latest campaign)
#
# We will remove the variables not used in this analysis.

# In[35]:


counts = df['marital_status'].value_counts()
print(counts)


# In[36]:


# Age
df['age'] = datetime.now().year - df['year_birth'].dt.year

# Spending
mnt_cols = [col for col in df.columns if 'mnt' in col]
df['spending'] = df[mnt_cols].sum(axis=1)

# Marital status
# Create a mapping dictionary for the marital statuses
estado_mapeo = {
    'Divorced': 'single',
    'Single': 'single',
    'Married': 'couple',
    'Together': 'couple',
    'Widow': 'single',
    'YOLO': 'single',
    'Alone': 'single',
    'Absurd': 'single',
}
# Apply the mapping to the 'marital_status' column
df['marital_status'] = df['marital_status'].map(estado_mapeo)

# Dependents
df['dependents'] = df['kidhome'] + df['teenhome']

# Year of becoming a customer
df['year_customer'] = pd.DatetimeIndex(df['dt_customer']).year

# Total purchases
purchases_cols = [col for col in df.columns if 'purchases' in col]
df['total_purchases'] = df[purchases_cols].sum(axis=1)

# Total campaigns accepted ('response' is the latest campaign); these columns were cast to
# strings earlier, so they are converted back to int before summing
campaigns_cols = [col for col in df.columns if 'cmp' in col] + ['response']
df['total_campaigns_acc'] = df[campaigns_cols].astype(int).sum(axis=1)


# In[37]:


# Drop the columns we don't need
df = df.drop(['year_birth', 'teenhome', 'kidhome', 'dt_customer'], axis=1)


# In[38]:


df.head(3)


# In[39]:


# Saving the clean df
df.to_csv('marketing_analysis_clean.csv', index=False)


# In[40]:


suma_spending = df['spending'].sum()
suma_spending


# In[41]:


fila_cliente_1386 = df[df['id'] == 9365]

# Print the selected row
print(fila_cliente_1386)


# In[42]:


df.shape


# ### Statistical summary

# In[43]:


df.describe()


# The describe function generates a summary, particularly useful as a first step in our preliminary investigation. Analyzing the statistical summary gives us insightful information at a glance:
#
# - **Average income** is 52,227$ while the median income is 51,381$. The maximum value is 666,666$ and the 3rd quartile is 68,281$, so the distribution is skewed to the right with the possible presence of outliers.
# - **Average spending** in the last 2 years is 605$ while the median spending is 396$.
# - **Average age** is 54 years and the oldest customer is 83 years old.
# - **Average store purchases** are the highest, at about 5.79 per customer, followed by web purchases at about 4.09.
# - The products with the highest average spending over the last two years are wine (about 304$), meat (about 167$) and gold products (about 44$).
# - The campaign with the highest mean acceptance is Response, which was the last campaign launched.

# # EDA

# In[44]:


# Split the data into categorical and numerical subsets for exploration

# In[45]:


df_cat = df.select_dtypes(include=['object'])
df_cat.head(5)


# In[46]:


# Save the numerical data
df_num = df.select_dtypes(exclude=["object"])
df_num.head(3)


# ## CUSTOMER PROFILE ANALYSIS
# The columns relevant for the customer profile are:
# Education (ordinal), Income (continuous), Marital status (nominal).

# In[47]:


# Displaying the categorical data
fig, ax = plt.subplots(1, 4, figsize=(15, 4))
sns.countplot(x=df_cat['marital_status'], ax=ax[0])
sns.countplot(x=df_cat['education'], ax=ax[1])
sns.countplot(x=df_cat['country'], ax=ax[2])
df_plot = df_cat.groupby(['marital_status', 'education']).size().reset_index().pivot(columns='marital_status', index='education', values=0)
df_plot.apply(lambda x: x / x.sum(), axis=1).plot(kind='bar', stacked=True, ax=ax[3], colormap='Paired')
ax[3].legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.tight_layout()
plt.show()


# In[48]:


fig, ax = plt.subplots(1, 7, figsize=(15, 4))
sns.countplot(x=df_cat['acceptedcmp1'], ax=ax[0])
sns.countplot(x=df_cat['acceptedcmp2'], ax=ax[1])
sns.countplot(x=df_cat['acceptedcmp3'], ax=ax[2])
sns.countplot(x=df_cat['acceptedcmp4'], ax=ax[3])
sns.countplot(x=df_cat['acceptedcmp5'], ax=ax[4])
sns.countplot(x=df_cat['response'], ax=ax[5])
sns.countplot(x=df_cat['complain'], ax=ax[6])
ax[3].legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.tight_layout()
plt.show()


# ## Association between marital status and education

# In[49]:


df_cat_con = df_cat[['marital_status', 'education']]
df_cat_con


# In[50]:


from scipy.stats import chi2_contingency

# Build the contingency table
contingency_table = pd.crosstab(df_cat_con['marital_status'], df_cat_con['education'])

# Run the chi-square test
chi2, p_value, _, _ = chi2_contingency(contingency_table)

# Show the contingency table
print(contingency_table)

# Show the chi-square statistic and the p-value
print(f"Chi-square: {chi2}")
print(f"p-value: {p_value}")


# The result indicates that the chi-square value is approximately 2.461 and the p-value is approximately 0.6516.
#
# The chi-square value is a measure of the association between the variables "marital_status" and "education" in the data set. A larger chi-square value indicates a stronger association between the variables, while a value close to zero indicates that the variables are independent.
#
# The p-value is the probability of obtaining a chi-square value equal to or more extreme than the one observed if the variables were completely independent. In other words, it represents the probability that the association between the variables is simply the result of chance, given the sample size.
#
# Here, the p-value is 0.6516, which means that the probability of getting a chi-square value equal to or more extreme than 2.461 under the null hypothesis (that the variables are independent) is about 0.6516.
Generally, if the p-value is greater than a predefined significance level (for example, 0.05), we do not have sufficient evidence to reject the null hypothesis and we can consider the variables to be independent.
#
# Therefore, in this case, the result suggests that there is no significant association between marital status and education level in this data set.

# In[51]:


# Displaying the numerical data
df_num.columns


# In[52]:


# Create the figure and the axes
fig, axes = plt.subplots(6, 4, figsize=(15, 20))

# Flatten the axes array for easier access
axes = axes.flatten()

# Plot the histograms of the numerical columns
for i, col in enumerate(df_num.columns):
    sns.histplot(data=df_num, x=col, ax=axes[i], kde=True)

# Adjust the spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()


# ## Features Correlation Analysis

# In[53]:


corr = df_num.corr()
ones = np.ones_like(corr, dtype="bool")
mask = np.triu(ones)
mask = mask[1:, :-1]
corr = corr.iloc[1:, :-1].copy()
fig, ax = plt.subplots(figsize=(40,30))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap="Greens", linewidths=.6, cbar_kws={"shrink": .9})
ax.xaxis.tick_bottom()
plt.title("Correlations heatmap for numerical data".upper(), fontdict={"fontsize": 18}, loc="left")


# In[67]:


sns.set(rc={'figure.figsize': (8,8)}, font_scale=0.5)
sns.heatmap(corr[corr > 0.8], annot=True, fmt=".3f")


# In[54]:


# High correlation: .80 to 1
# Moderate correlation: .60 to .80
# Low correlation: .40 to .60
# We see that our prediction target (spending) has a moderate correlation with total purchases

# In[55]:


corr = df_num.corr()
high_corr = corr[corr > 0.80]
high_corr = high_corr.unstack().dropna()
high_corr = high_corr[high_corr.index.get_level_values(0) != high_corr.index.get_level_values(1)]
for index, value in high_corr.items():
    variable_1, variable_2 = index
    correlation_value = value
    print("Variables:", variable_1, "and", variable_2)
    print("Correlation:", correlation_value)
    print("------------------------")


# ## Correlation Results
# - Income is a proxy for several other features, such as spending capacity or total purchases. It shows a positive relationship with expenses on meat and wine, indicating that individuals with higher income tend to spend more on these items. On the other hand, income has a negative correlation with the number of children at home and the frequency of website visits, suggesting that individuals with higher income tend to have fewer children at home and visit the website less frequently.
#
# - Furthermore, the amount spent on wine is not only influenced by high income but also by the amount spent on meat. These two variables are interconnected, meaning that individuals who spend more on meat are also likely to spend more on wine. Moreover, individuals can purchase wine either from catalogs or from physical stores.
#
# - The number of children at home is negatively associated with spending and income, and consequently with wine consumption. This implies that individuals with more children tend to have lower income and spend less overall, including on wine. Conversely, higher income levels are also linked to a greater willingness to accept advertising campaigns.
#
# - In summary, income serves as a proxy for several factors, with positive correlations to spending and to meat and wine expenses, while showing negative associations with the number of children at home and website visits.
The relationship between income, spending habits, and preferences for specific products such as wine can provide insights into consumer behavior and marketing strategies.

# In[88]:


# VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LinearRegression

features = ['income', 'mntmeatproducts', 'mntwines', 'total_purchases']

def calculate_vif(df, features):
    vif, tolerance, r2sq = {}, {}, {}
    # all the features that you want to examine
    for feature in features:
        # extract all the other features you will regress against
        X = [f for f in features if f != feature]
        X, y = df[X], df[feature]
        # extract r-squared from the fit
        r2 = LinearRegression().fit(X, y).score(X, y)
        # calculate tolerance
        r2sq[feature] = r2
        tolerance[feature] = 1 - r2
        # calculate VIF
        vif[feature] = 1 / (tolerance[feature])
    # return the VIF DataFrame
    return pd.DataFrame({'VIF': vif, 'Tolerance': tolerance, 'R2': r2sq})


# In[95]:


calculate_vif(df_num, df_num.columns)


# In[192]:


# Recompute the VIF after removing spending and income
features_2 = ['mntmeatproducts', 'mntwines', 'total_purchases', 'mntfishproducts', 'mntsweetproducts',
              'mntgoldprods', 'numwebpurchases', 'numwebvisitsmonth', 'age', 'dependents']


# In[193]:


calculate_vif(df, features_2)


# **Interpreting $R^2_k$**: if $R^2_k$ equals zero, variable `k` is not correlated with any other independent variable. Usually, multicollinearity is a potential problem when $R^2_k$ is greater than `0.75` and a serious problem when $R^2_k$ is greater than `0.9`.
#
# **Interpreting the variance inflation factor**: if $VIF_k = 1$, variable `k` is not correlated with any other independent variable. Multicollinearity is a potential problem when $VIF_k$ is greater than 5 and a serious problem when it is greater than 10.
#
# **Interpreting tolerance**: a tolerance of less than 0.20 or 0.10 indicates a multicollinearity problem.
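# In[ ]:


# variance_inflation_factor and add_constant were imported above but never used; as a
# cross-check of the manual calculate_vif() results, this is a minimal sketch using the
# statsmodels implementation on the same features_2 list.
X_vif = add_constant(df[features_2])
vif_sm = pd.Series(
    [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
    index=X_vif.columns,
    name='VIF'
)
print(vif_sm.drop('const'))  # drop the intercept term added by add_constant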
# In[96]:


# Plot illustrating the effect of income on spending
plt.figure(figsize=(8, 8))
plt.scatter(df['income'], df['spending'])
plt.title('Income vs. Spending')
plt.xlabel('Income')
plt.ylabel('Spending')
plt.show()


# In[97]:


# Plot illustrating the negative effect of having dependents (kids & teens) on spending
plt.figure(figsize=(4,4))
sns.boxplot(x='dependents', y='spending', data=df)


# In[98]:


# Plot illustrating the positive effect of having dependents (kids & teens) on the number of deals purchased
plt.figure(figsize=(4,4))
sns.boxplot(x='dependents', y='numdealspurchases', data=df);


# In[99]:


# REMOVING SPENDING AND TOTAL_PURCHASES TO CHECK MULTICOLLINEARITY

# In[100]:


corr_de = df_num.drop(['spending', 'total_purchases'], axis=1)


# In[101]:


fig, ax = plt.subplots(figsize=(40,30))
heatmap = sns.heatmap(corr_de.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')


# ## Segmentation of Customers
# ### Choosing columns for segmentation
# I choose the income and spending variables because they provide the most interesting information for the segmentation.

# In[136]:


df_num.head(2)


# In[137]:


# Select income (column 1) and spending (column 15); other candidate columns: recency, mntwines,
# mntfruits, mntmeatproducts, mntfishproducts, mntsweetproducts, mntgoldprods, numdealspurchases,
# numwebpurchases, numcatalogpurchases, numstorepurchases, numwebvisitsmonth, age, total_purchases
cluster = df_num.iloc[:, [1, 15]]


# In[135]:


cluster


# Scaling the numerical data (StandardScaler was tried first; MinMaxScaler is used instead)

# In[105]:


#scaler = StandardScaler()
#cluster_encoded = scaler.fit_transform(cluster)


# In[106]:


#cluster_encoded


# In[138]:


scaler = MinMaxScaler()
cluster_encoded = scaler.fit_transform(cluster)
cluster_encoded


# In[108]:


cluster_encoded.shape


# Choosing the number of clusters
# WCSS -> Within-Cluster Sum of Squares

# In[109]:


# Finding the WCSS value for different numbers of clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(cluster_encoded)
    wcss.append(kmeans.inertia_)


# In[110]:


# Plot an elbow graph
sns.set()
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Point Graph')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()


# Optimum number of clusters = 4
#
# Training the k-means clustering model

# In[111]:


from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)

# Return a label for each data point based on its cluster
Y = kmeans.fit_predict(cluster_encoded)
print(Y)


# In[112]:


# Visualization
plt.figure(figsize=(8,8))
plt.scatter(cluster_encoded[Y==0, 0], cluster_encoded[Y==0, 1], s=50, c='green', label='Cluster 1')
plt.scatter(cluster_encoded[Y==1, 0], cluster_encoded[Y==1, 1], s=50, c='red', label='Cluster 2')
plt.scatter(cluster_encoded[Y==2, 0], cluster_encoded[Y==2, 1], s=50, c='yellow', label='Cluster 3')
plt.scatter(cluster_encoded[Y==3, 0], cluster_encoded[Y==3, 1], s=50, c='blue', label='Cluster 4')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=100, c='cyan', label='Centroids')

# Set the axis labels and the title
plt.ylabel('Spending')
plt.xlabel('Income')
plt.title('Customers Groups')

# Show the plot
plt.show()


# In[113]:


kmeans.labels_


# In[114]:


cluster_1_data = cluster_encoded[Y == 0]
cluster_1_data


# In[115]:


# Apply k-means to obtain the clusters
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(cluster_encoded)

# Get the label assigned to each customer
labels = kmeans.labels_

# Count the number of customers in each cluster
conteo_clientes = pd.Series(labels).value_counts()

# Plot the customer groups by income and spending
plt.figure(figsize=(8, 8))
plt.scatter(cluster_encoded[:, 0], cluster_encoded[:, 1], c=labels, cmap='viridis')

# Plot the cluster centroids
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=100, c='red', label='Centroids')

# Set the axis labels and the title
plt.xlabel('Income')
plt.ylabel('Spending')
plt.title('Customers Groups')

# Show the number of customers in each cluster (the loop variable is named cluster_id so it
# does not shadow the cluster dataframe defined above)
for cluster_id, count in conteo_clientes.items():
    centroid = kmeans.cluster_centers_[cluster_id]
    plt.text(centroid[0], centroid[1], f'{count}', fontsize=12, color='black', ha='center')

# Show the plot
plt.show()


# I create a copy of the original dataframe and assign the labels to it, so that I can filter the rows for each label and analyze the customer groups.

# In[139]:


cluster2 = cluster.copy()


# In[140]:


cluster2['cluster_labels'] = kmeans.labels_
cluster2
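# In[ ]:


# Optional compact view (a sketch, equivalent in spirit to the separate describe() calls
# below): per-cluster size and income/spending statistics in a single table.
cluster_profile = cluster2.groupby('cluster_labels').agg(['count', 'mean', 'median', 'min', 'max'])
print(cluster_profile)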
# In[141]:


# Filter the rows by cluster label
cluster_0 = cluster2.loc[cluster2['cluster_labels'] == 0]
cluster_1 = cluster2.loc[cluster2['cluster_labels'] == 1]
cluster_2 = cluster2.loc[cluster2['cluster_labels'] == 2]
cluster_3 = cluster2.loc[cluster2['cluster_labels'] == 3]

# Print the results
print("Cluster 0:")
print(cluster_0)
print()
print("Cluster 1:")
print(cluster_1)
print()
print("Cluster 2:")
print(cluster_2)
print()
print("Cluster 3:")
print(cluster_3)


# In[142]:


# Converting the clusters to dataframes
cluster_0_df = pd.DataFrame(cluster_0)
cluster_1_df = pd.DataFrame(cluster_1)
cluster_2_df = pd.DataFrame(cluster_2)
cluster_3_df = pd.DataFrame(cluster_3)


# In[143]:


cluster_0_df


# In[144]:


cluster_0_df.shape


# In[145]:


cluster_0_df.describe()


# ## CLUSTER 1 CONCLUSIONS
# Looking at Cluster 1 of the customer segmentation (cluster_0_df), we can draw the following conclusions:
#
# - Cluster size: 736 clients.
#
# - Average income: the average income of clients in this cluster is approximately 28,541$.
#
# - Average expenses: clients in this cluster have an average expense of around 72.88$.
#
# - Variability: there is considerable variability in the income and expenses of clients in this cluster, as indicated by the standard deviation. This suggests that there are clients with different levels of income and expenses within the cluster.
#
# - Range of income and expenses: the range of income goes from 1,730$ to 43,269$, while the range of expenses goes from 5$ to 570$.
#
# - Distribution of income and expenses: the median (central value) of income is approximately 29,968$, which means that half of the clients have incomes above this value and the other half below it. The median of the expenses is approximately 53$.
#
# - Low-spend segment: this cluster can be considered a segment of customers with low spending levels, since the average spend is relatively low compared to the other clusters.
#
# - From these conclusions, we can infer that this group of clients has relatively low income and expenses compared to the other clusters. This may indicate a customer profile with lower spending capacity or more conservative consumption preferences. These observations can help the marketing team target their strategies and campaigns more effectively towards this customer segment, for example by offering affordable products or services and promotions tailored to their level of spending.

# ## CLUSTER 2 CONCLUSIONS

# In[146]:


cluster_1_df.shape


# In[147]:


cluster_1_df.describe()


# - Cluster size: 334 clients.
# - Average income: the average income of clients in this cluster is approximately 78,690$. This suggests that these customers have relatively higher income levels compared to Cluster 1.
# - Average expenses: the average expenses of clients in this cluster amount to around 1,688$. This indicates that the customers in this segment tend to have higher spending levels compared to Cluster 1.
# - Variability: the standard deviation for both income and expenses within this cluster suggests a moderate degree of variability. This implies that there are clients with varying income and spending patterns within this cluster.
# - Range of income and expenses: the range of income spans from 51,381$ to 105,471$ while the range of expenses goes from 1,112$ to 2,525$. This wide range indicates the presence of customers with different income and spending levels within this cluster.
# - Distribution of income and expenses: the median income of clients in this cluster is approximately 79,414$. This suggests that half of the clients have an income below this value, while the other half has an income above it.
#
# - Overall, Cluster 2 represents a segment of customers with relatively higher income and spending levels compared to Cluster 1. These customers may have a higher purchasing power and exhibit more liberal spending behavior. The marketing department can leverage these insights to develop strategies and campaigns that cater to this customer segment. For example, they can focus on offering premium products or services, creating promotions that align with the customers' higher spending capacity, and delivering messaging that resonates with their more indulgent consumption preferences.

# ## CLUSTER 3 CONCLUSIONS

# In[148]:


cluster_2_df.shape


# In[149]:


cluster_2_df.describe()


# - Cluster size: 612 clients.
# - Average income: the average income of clients in this cluster is approximately 50,399$.
# - Average expenses: clients in this cluster have an average expense of around 315$, indicating moderate spending levels (above Cluster 1 but well below Cluster 2).
# - Variability: the standard deviation for both income and expenses within this cluster suggests a moderate degree of variability. This implies that there are clients with different income and spending patterns within this cluster.
# - Range of income and expenses: the range of income spans from 32,632$ to 73,395$ while the range of expenses goes from 16$ to 839$. This wide range indicates the presence of customers with varying income and spending levels within this cluster.
# - Distribution of income and expenses: the median income of clients in this cluster is approximately 49,973$.
#
# From this analysis, we can conclude that Cluster 3 represents a segment of customers with a moderate income level and moderate spending behavior. These customers may have a balanced approach to their finances and exhibit stable spending patterns. The marketing department can consider targeting this customer segment with products or services that align with their moderate income and spending capacity. They can also focus on providing value-based offerings and promotions that cater to their stable spending behavior.

# ## CLUSTER 4 CONCLUSIONS

# In[150]:


cluster_3_df.shape


# In[151]:


cluster_3_df.describe()


# - Cluster size: 547 clients.
# - Average income: 67,507.23$.
# - Average expenses: clients in this cluster have an average expense of around 985$.
# - Variability: the standard deviation for both income and expenses within this cluster indicates a moderate degree of variability, suggesting that there are clients with different income and spending patterns within this cluster.
# - Range of income and expenses: the range of income spans from 2,447$ to 113,734$ while the range of expenses goes from 277$ to 1,730$.
# - Distribution of income and expenses: the median income of clients in this cluster is approximately 66,886$.
#
# Based on this analysis, we can conclude that Cluster 4 represents a segment of clients with a relatively higher average income and higher spending levels compared to most other clusters. These clients may have a higher spending capacity and may be more inclined towards making larger purchases.
The marketing department can consider targeting this customer segment with higher-end products or services, luxury offerings, or personalized marketing campaigns that cater to their higher spending power.

# # Answering the questions of the CMO

# ## What does the Average customer look like for our company?

# In[152]:


selected_columns = ['age', 'year_customer', 'income', 'dependents', 'recency']
selected_data = df[selected_columns]
mean_values = selected_data.mean().round(1)
mean_df = pd.DataFrame(mean_values, columns=['Mean'])
mean_df.index.name = 'Variable'
mean_df.columns = ['Average Customer']
mean_df.index = mean_df.index.rename('Characteristics')
mean_df


# ## What Products are best performing?
# Wines, followed by meat products.

# In[153]:


# Compute the total sales of each product
sales = df[['mntwines', 'mntfruits', 'mntmeatproducts', 'mntfishproducts', 'mntsweetproducts', 'mntgoldprods']].sum()

# Create the pie chart
plt.figure(figsize=(8, 6))
plt.pie(sales, labels=sales.index, autopct='%1.1f%%')

# Set the title
plt.title('Product Performance by Sales')

# Show the chart
plt.show()


# ## Which channels are underperforming?

# In[154]:


# Select the columns of interest (note: numwebvisitsmonth counts visits, not purchases)
channels = df[['numwebvisitsmonth', 'numstorepurchases', 'numcatalogpurchases', 'numwebpurchases', 'numdealspurchases']]

# Compute the total for each channel
channel_purchases = channels.sum()

# Compute the percentage of purchases for each channel
total_purchases = df['total_purchases'].sum()
channel_performance = (channel_purchases / total_purchases) * 100

# Create the pie chart
plt.figure(figsize=(8, 6))
plt.pie(channel_performance, labels=channel_performance.index, autopct='%1.1f%%')

# Set the title
plt.title('Channels Performance by Total Purchases')

# Show the chart
plt.show()


# ## Which Marketing Campaigns were most successful?
#
# The most successful campaign is the most recent one (column name: Response).

# In[155]:


fig, ax = plt.subplots(1, 7, figsize=(15, 4))
sns.countplot(x=df_cat['acceptedcmp1'], ax=ax[0])
sns.countplot(x=df_cat['acceptedcmp2'], ax=ax[1])
sns.countplot(x=df_cat['acceptedcmp3'], ax=ax[2])
sns.countplot(x=df_cat['acceptedcmp4'], ax=ax[3])
sns.countplot(x=df_cat['acceptedcmp5'], ax=ax[4])
sns.countplot(x=df_cat['response'], ax=ax[5])
ax[3].legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.tight_layout()
plt.show()
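# In[ ]:


# The countplots above show absolute counts; a minimal sketch that makes the comparison
# explicit by computing the acceptance rate of each campaign (the campaign columns were cast
# to strings earlier, so they are converted back to int here).
campaign_cols = ['acceptedcmp1', 'acceptedcmp2', 'acceptedcmp3', 'acceptedcmp4', 'acceptedcmp5', 'response']
acceptance_rates = df_cat[campaign_cols].astype(int).mean().sort_values(ascending=False)
print(acceptance_rates)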
# ## Purchase amount spending prediction: Spending/Income

# In[156]:


cluster


# In[157]:


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# X-y split
Y = cluster['spending']
X = cluster.drop('spending', axis=1)


# In[158]:


X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


# In[159]:


# Scale X_train and X_test using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# In[160]:


# Create and fit the linear regression model
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train)


# In[161]:


# Make predictions on the training and test sets
y_train_pred = regression_model.predict(X_train_scaled)
y_test_pred = regression_model.predict(X_test_scaled)


# In[162]:


# Compute the metrics on the training set
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = sqrt(mse_train)


# In[163]:


# Compute the metrics on the test set
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = sqrt(mse_test)


# In[164]:


# Print the metrics
print("Training set metrics:")
print("R^2:", r2_train)
print("MSE:", mse_train)
print("MAE:", mae_train)
print("RMSE:", rmse_train)
print("------------------------------")
print("Test set metrics:")
print("R^2:", r2_test)
print("MSE:", mse_test)
print("MAE:", mae_test)
print("RMSE:", rmse_test)


# ## Purchase amount spending prediction with other variables

# In[165]:


df_cat


# In[166]:


df_cat_l = df_cat[['education', 'marital_status', 'country']]
df_cat_l = df_cat_l.reset_index()
df_cat_l = df_cat_l.drop(columns=['index'])
df_cat_l


# In[167]:


# Encoding the categorical data
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first').fit(df_cat_l)
encoder = encoder.transform(df_cat_l).toarray()
encoder


# In[168]:


df_num


# In[169]:


df_num_l = df_num[['income', 'numdealspurchases', 'numwebpurchases', 'numcatalogpurchases', 'numstorepurchases',
                   'numwebvisitsmonth', 'age', 'dependents', 'spending']]
df_num_l = df_num_l.reset_index()
df_num_l = df_num_l.drop(columns=['index'])
df_num_l


# In[170]:


# X-y split
y = df_num_l['spending']
X = df_num_l.drop(['spending'], axis=1)
X


# In[171]:


# Scale the numerical data using StandardScaler
scaler = StandardScaler()
X_es = scaler.fit_transform(X)
X_es


# In[172]:


X = np.concatenate((X_es, encoder), axis=1)
X


# In[173]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# In[174]:


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled


# In[175]:


# Create and fit the linear regression model
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train)


# In[176]:


# Make predictions on the training and test sets
y_train_pred = regression_model.predict(X_train_scaled)
y_test_pred = regression_model.predict(X_test_scaled)


# In[177]:


# Compute the metrics
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = sqrt(mse_train)
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = sqrt(mse_test)


# In[178]:


# Print the metrics
print("Training set metrics:")
print("R^2:", r2_train)
print("MSE:", mse_train)
print("MAE:", mae_train)
print("RMSE:", rmse_train)
print("---------------------------")
print("Test set metrics:")
print("R^2:", r2_test)
print("MSE:", mse_test)
print("MAE:", mae_test)
print("RMSE:", rmse_test)


# ## Prediction removing income from the last prediction

# In[179]:


df_num_l = df_num[['numdealspurchases', 'numwebpurchases', 'numcatalogpurchases', 'numstorepurchases',
                   'numwebvisitsmonth', 'age', 'dependents', 'spending']]
df_num_l = df_num_l.reset_index()
df_num_l = df_num_l.drop(columns=['index'])
df_num_l


# In[180]:


# X-y split
y = df_num_l['spending']
X = df_num_l.drop(['spending'], axis=1)
X


# In[181]:


# Scale the numerical data using StandardScaler
scaler = StandardScaler()
X_es = scaler.fit_transform(X)
X_es


# In[182]:


X = np.concatenate((X_es, encoder), axis=1)
X


# In[183]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# In[184]:


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled


# In[185]:


# Create and fit the linear regression model
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train)


# In[186]:


# Make predictions on the training and test sets
y_train_pred = regression_model.predict(X_train_scaled)
y_test_pred = regression_model.predict(X_test_scaled)


# In[187]:


# Compute the metrics
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = sqrt(mse_train)
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = sqrt(mse_test)


# In[188]:


# Print the metrics
print("Training set metrics:")
print("R^2:", r2_train)
print("MSE:", mse_train)
print("MAE:", mae_train)
print("RMSE:", rmse_train)
print("---------------------------")
print("Test set metrics:")
print("R^2:", r2_test)
print("MSE:", mse_test)
print("MAE:", mae_test)
print("RMSE:", rmse_test)
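# In[ ]:


# Optional robustness check (a sketch, not part of the original analysis): a single
# train/test split can be sensitive to the random seed, so 5-fold cross-validated R^2 on the
# last feature set gives a more stable basis for comparing the three spending models.
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(LinearRegression(), X, y, cv=5, scoring='r2')
print("Cross-validated R^2 scores:", cv_scores)
print("Mean R^2:", cv_scores.mean())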