#!/usr/bin/env python
# coding: utf-8

# # 1. DATA CLEANING - DATA WRANGLING

# In[11]:


# Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

pd.set_option('display.max_columns', None)
get_ipython().run_line_magic('matplotlib', 'inline')


# In[12]:


# Loading the dataset
df = pd.read_csv('marketing_data.csv')


# In[13]:


# Review of the data fields
df


# ### Understanding the data
# - ID = Customer's unique identifier
# - Year_Birth = Customer's birth year
# - Education = Customer's education level
# - Marital_Status = Customer's marital status
# - Income = Customer's yearly household income
# - Kidhome = Number of children in customer's household
# - Teenhome = Number of teenagers in customer's household
# - Dt_Customer = Date of customer's enrollment with the company
# - Recency = Number of days since customer's last purchase
# - MntWines = Amount spent on wine in the last 2 years
# - MntFruits = Amount spent on fruits in the last 2 years
# - MntMeatProducts = Amount spent on meat in the last 2 years
# - MntFishProducts = Amount spent on fish in the last 2 years
# - MntSweetProducts = Amount spent on sweets in the last 2 years
# - MntGoldProds = Amount spent on gold in the last 2 years
# - NumDealsPurchases = Number of purchases made with a discount
# - NumWebPurchases = Number of purchases made through the company's web site
# - NumCatalogPurchases = Number of purchases made using a catalogue
# - NumStorePurchases = Number of purchases made directly in stores
# - NumWebVisitsMonth = Number of visits to the company's web site in the last month
# - AcceptedCmp3 = 1 if customer accepted the offer in the 3rd campaign, 0 otherwise
# - AcceptedCmp4 = 1 if customer accepted the offer in the 4th campaign, 0 otherwise
# - AcceptedCmp5 = 1 if customer accepted the offer in the 5th campaign, 0 otherwise
# - AcceptedCmp1 = 1 if customer accepted the offer in the 1st campaign, 0 otherwise
# - AcceptedCmp2 = 1 if customer accepted the offer in the 2nd campaign, 0 otherwise
# - Response = 1 if customer accepted the offer in the last campaign, 0 otherwise
# - Complain = 1 if customer complained in the last 2 years, 0 otherwise
# - Country = Customer's location

# In[14]:


# Show the DataFrame's shape
df.shape


# In[15]:


# Standardize column names
df.columns = [e.lower().replace(' ', '') for e in df.columns]
df.columns


# In[16]:


df.info()


# We have 18 numerical variables and 10 categorical variables: Education, Marital_Status, AcceptedCmp1-AcceptedCmp5, Response, Country and Complain.

# In[17]:


# Transform income to numeric (strip the currency symbol and the thousands separator)
df['income'] = df['income'].str.replace('$', '')
df['income'] = df['income'].str.replace(',', '').astype('float')


# In[18]:


# Transform AcceptedCmp1-5, Response and Complain to object (categorical flags)
df['acceptedcmp1'] = df['acceptedcmp1'].astype(str)
df['acceptedcmp2'] = df['acceptedcmp2'].astype(str)
df['acceptedcmp3'] = df['acceptedcmp3'].astype(str)
df['acceptedcmp4'] = df['acceptedcmp4'].astype(str)
df['acceptedcmp5'] = df['acceptedcmp5'].astype(str)
df['response'] = df['response'].astype(str)
df['complain'] = df['complain'].astype(str)


# In[19]:


# Checking the unique values of each column
for column in df.columns:
    unique_values = df[column].unique()
    print(f"Unique values of column '{column}':")
    print(unique_values)
    print()


# In[20]:


# Checking for duplicate rows to remove, if any
df.duplicated().sum()


# In[21]:


# Checking null values; I will remove any column with more than 75% missing values
df.isnull().sum().sort_values(ascending=False)
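# In[ ]:


# The 75% threshold mentioned above is never applied explicitly later on, so this is a
# minimal sketch of how it could be enforced (in this dataset only income has missing
# values, far below 75%, so no column is expected to be dropped):
null_share = df.isnull().mean()                      # fraction of missing values per column
cols_to_drop = null_share[null_share > 0.75].index   # columns above the 75% threshold
df = df.drop(columns=cols_to_drop)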
# In[22]:


# Plot the income variable to identify the best imputation strategy
plt.figure(figsize=(8,4))
sns.distplot(df['income'], kde=False, hist=True)
plt.title('Distribution', size=16)
plt.ylabel('count');


# Findings:
#
# - The plot shows that most incomes lie between 0 and 100,000$, with a few outliers.
#
# Choosing the median as the imputation strategy is a common approach when dealing with skewed distributions or the presence of outliers. The median is robust to outliers because it is not affected by extreme values, unlike the mean. By using the median, the imputed values will be representative of the central tendency of the non-null income values and provide a more accurate estimation than the mean. This gives a better approximation of the actual values and helps maintain the integrity and quality of the data.

# In[23]:


df['income'].plot(kind='box', figsize=(3,4), patch_artist=True)


# In[24]:


# Impute null values in income with the median (to avoid the skewing of the mean caused by outliers)
df['income'] = df['income'].fillna(df['income'].median())


# In[25]:


plt.figure(figsize=(8,4))
sns.distplot(df['income'], kde=False, hist=True)
plt.title('Distribution', size=16)
plt.ylabel('count');


# ## DEALING WITH OUTLIERS

# In[26]:


# Plot outliers of the numerical data
# Select the columns to plot
df_to_plot = df.drop(columns=['id', 'acceptedcmp1', 'acceptedcmp2', 'acceptedcmp3', 'acceptedcmp4',
                              'acceptedcmp5', 'response', 'complain']).select_dtypes(include=np.number)
# Subplots
df_to_plot.plot(subplots=True, layout=(4,4), kind='box', figsize=(12,14), patch_artist=True)
plt.subplots_adjust(wspace=0.5);


# Multiple features contain outliers (see the boxplots), but the only values that likely indicate data-entry errors are Year_Birth <= 1900.

# In[27]:


# Remove rows where Year_Birth <= 1900
df = df[df['year_birth'] > 1900].reset_index(drop=True)
plt.figure(figsize=(3,4))
df['year_birth'].plot(kind='box', patch_artist=True);


# ## Outliers in the Income column
# We identify the income outliers and then exclude them from the data frame.

# In[28]:


Q1 = df.income.quantile(0.25)
Q3 = df.income.quantile(0.75)
IQR = Q3 - Q1
income_outliers = df[df.income > Q3 + 1.5*IQR]
df = df[df.income < Q3 + 1.5*IQR]
income_outliers.head()


# In[29]:


plt.figure(figsize=(8,4))
sns.distplot(df['income'], kde=False, hist=True)
plt.title('Distribution', size=16)
plt.ylabel('count');


# In[30]:


plt.figure(figsize=(3,4))
df['income'].plot(kind='box', patch_artist=True);


# In[31]:


# Counting outliers for each column
for column in df_to_plot:
    q1 = np.percentile(df_to_plot[column], 25)
    q3 = np.percentile(df_to_plot[column], 75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    num_outliers = ((df_to_plot[column] < lower_bound) | (df_to_plot[column] > upper_bound)).sum()
    print("Column '{}': {}".format(column, num_outliers))


# In[32]:


# Counting the percentage of outliers
for column in df_to_plot:
    q1 = np.percentile(df_to_plot[column], 25)
    q3 = np.percentile(df_to_plot[column], 75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = ((df_to_plot[column] < lower_bound) | (df_to_plot[column] > upper_bound))
    num_outliers = outliers.sum()
    percentage = (num_outliers / len(df_to_plot[column])) * 100
    print("Column '{}': {:.2f}%".format(column, percentage))


# We will keep the remaining outliers for the predictions: once we segment the clients, removing them could cost us a lot of data and invalidate the model. Once we have the first results, we can re-test the model without the outliers.
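# In[ ]:


# Hypothetical helper (not used in the rest of this notebook): a minimal sketch of how the
# remaining outliers could be dropped with the same 1.5*IQR rule when we later re-test the
# model without them.
def drop_iqr_outliers(data, columns):
    mask = pd.Series(True, index=data.index)
    for col in columns:
        q1, q3 = data[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        mask &= data[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    return data[mask]

# Example usage (commented out so the current analysis is unchanged):
# df_no_outliers = drop_iqr_outliers(df, df_to_plot.columns)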
# In[33]:


# Transform year_birth and dt_customer into date format
df['year_birth'] = pd.to_datetime(df['year_birth'], format='%Y')
df['dt_customer'] = pd.to_datetime(df['dt_customer'])


# ## Feature engineering
#
# Having a first look at the raw data enables us to start thinking about some useful variables we could create in order to better understand our dataset and reveal valuable information.
#
# We will create several variables:
#
# - A variable Age to replace the variable Year_Birth
# - A variable Spending as the sum of the amounts spent on the product categories
# - A variable Marital_Situation grouping the different marital statuses into only 2 comprehensive categories: Couple (Married, Together) vs. Single (Divorced, Single, Widow, YOLO, Alone, Absurd)
# - The number of dependents in the home ('Dependents') can be engineered from the sum of 'Kidhome' and 'Teenhome'
# - The year of becoming a customer ('Year_Customer') can be engineered from 'Dt_Customer'
# - The total purchases ('TotalPurchases') can be engineered from the sum of all features containing the keyword 'Purchases'
# - The total number of campaigns accepted ('TotalCampaignsAcc') can be engineered from the sum of all features containing the keyword 'Cmp' plus 'Response' (the latest campaign)
#
# We will remove the variables not used in this analysis.

# In[35]:


counts = df['marital_status'].value_counts()
print(counts)


# In[36]:


# Age
df['age'] = datetime.now().year - df['year_birth'].dt.year

# Spending
mnt_cols = [col for col in df.columns if 'mnt' in col]
df['spending'] = df[mnt_cols].sum(axis=1)

# Marital status
# Create a mapping dictionary for the marital statuses
estado_mapeo = {
    'Divorced': 'single',
    'Single': 'single',
    'Married': 'couple',
    'Together': 'couple',
    'Widow': 'single',
    'YOLO': 'single',
    'Alone': 'single',
    'Absurd': 'single',
}
# Apply the mapping to the 'marital_status' column
df['marital_status'] = df['marital_status'].map(estado_mapeo)

# Dependents
df['dependents'] = df['kidhome'] + df['teenhome']

# Year of becoming a customer
df['year_customer'] = pd.DatetimeIndex(df['dt_customer']).year

# Total purchases
purchases_cols = [col for col in df.columns if 'purchases' in col]
df['total_purchases'] = df[purchases_cols].sum(axis=1)

# Total campaigns accepted ('response' is the latest campaign); these columns were cast to
# strings earlier, so they are converted back to int before summing
campaigns_cols = [col for col in df.columns if 'cmp' in col] + ['response']
df['total_campaigns_acc'] = df[campaigns_cols].astype(int).sum(axis=1)


# In[37]:


# Drop the columns we don't need
df = df.drop(['year_birth', 'teenhome', 'kidhome', 'dt_customer'], axis=1)


# In[38]:


df.head(3)


# In[39]:


# Saving the clean df
df.to_csv('marketing_analysis_clean.csv', index=False)


# In[40]:


suma_spending = df['spending'].sum()
suma_spending


# In[41]:


fila_cliente_1386 = df[df['id'] == 9365]

# Print the selected row
print(fila_cliente_1386)


# In[42]:


df.shape


# ### Statistical summary

# In[43]:


df.describe()


# The describe function generates a summary, particularly useful as a first step in our preliminary investigation. Analyzing the statistical summary gives us insightful information at a glance:
#
# - **Average income** is 52,227$ while the median income is 51,381$. The maximum value is 666,666$ and the 3rd quartile is 68,281$, so the distribution is skewed to the right with the possible presence of outliers.
# - **Average spending** in the last 2 years is 605$ while the median spending is 396$.
# - **Average age** is 54 years and the oldest customer is 83 years old.
# - **Average store purchases** are the highest, at about 5.79 per customer, followed by web purchases at about 4.09.
# - The products with the highest average spending over the last two years are wine (about 304$), meat (about 167$) and gold products (about 44$).
# - The campaign with the highest mean acceptance is Response, which was the last campaign launched.

# # EDA

# In[44]:


# Split the data into categorical and numerical subsets for exploration

# In[45]:


df_cat = df.select_dtypes(include=['object'])
df_cat.head(5)


# In[46]:


# Save the numerical data
df_num = df.select_dtypes(exclude=["object"])
df_num.head(3)


# ## CUSTOMER PROFILE ANALYSIS
# The columns relevant for the customer profile are:
# Education (ordinal), Income (continuous), Marital status (nominal).

# In[47]:


# Displaying the categorical data
fig, ax = plt.subplots(1, 4, figsize=(15, 4))
sns.countplot(x=df_cat['marital_status'], ax=ax[0])
sns.countplot(x=df_cat['education'], ax=ax[1])
sns.countplot(x=df_cat['country'], ax=ax[2])
df_plot = df_cat.groupby(['marital_status', 'education']).size().reset_index().pivot(columns='marital_status', index='education', values=0)
df_plot.apply(lambda x: x / x.sum(), axis=1).plot(kind='bar', stacked=True, ax=ax[3], colormap='Paired')
ax[3].legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.tight_layout()
plt.show()


# In[48]:


fig, ax = plt.subplots(1, 7, figsize=(15, 4))
sns.countplot(x=df_cat['acceptedcmp1'], ax=ax[0])
sns.countplot(x=df_cat['acceptedcmp2'], ax=ax[1])
sns.countplot(x=df_cat['acceptedcmp3'], ax=ax[2])
sns.countplot(x=df_cat['acceptedcmp4'], ax=ax[3])
sns.countplot(x=df_cat['acceptedcmp5'], ax=ax[4])
sns.countplot(x=df_cat['response'], ax=ax[5])
sns.countplot(x=df_cat['complain'], ax=ax[6])
ax[3].legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.tight_layout()
plt.show()


# ## Association between marital status and education

# In[49]:


df_cat_con = df_cat[['marital_status', 'education']]
df_cat_con


# In[50]:


from scipy.stats import chi2_contingency

# Build the contingency table
contingency_table = pd.crosstab(df_cat_con['marital_status'], df_cat_con['education'])

# Run the chi-square test
chi2, p_value, _, _ = chi2_contingency(contingency_table)

# Show the contingency table
print(contingency_table)

# Show the chi-square statistic and the p-value
print(f"Chi-square: {chi2}")
print(f"p-value: {p_value}")


# The result indicates that the chi-square value is approximately 2.461 and the p-value is approximately 0.6516.
#
# The chi-square value is a measure of the association between the variables "marital_status" and "education" in the data set. A larger chi-square value indicates a stronger association between the variables, while a value close to zero indicates that the variables are independent.
#
# The p-value is the probability of obtaining a chi-square value equal to or more extreme than the one observed if the variables were completely independent. In other words, it represents the probability that the association between the variables is simply the result of chance, given the sample size.
#
# Here, the p-value is 0.6516, which means that the probability of getting a chi-square value equal to or more extreme than 2.461 under the null hypothesis (that the variables are independent) is about 0.6516.
Generally, if the p-value is greater than a predefined significance level (for example, 0.05), we do not have sufficient evidence to reject the null hypothesis and we can consider the variables to be independent.
#
# Therefore, in this case, the result suggests that there is no significant association between marital status and education level in this data set.

# In[51]:


# Displaying the numerical data
df_num.columns


# In[52]:


# Create the figure and the axes
fig, axes = plt.subplots(6, 4, figsize=(15, 20))

# Flatten the axes array for easier access
axes = axes.flatten()

# Plot the histograms of the numerical columns
for i, col in enumerate(df_num.columns):
    sns.histplot(data=df_num, x=col, ax=axes[i], kde=True)

# Adjust the spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()


# ## Features Correlation Analysis

# In[53]:


corr = df_num.corr()
ones = np.ones_like(corr, dtype="bool")
mask = np.triu(ones)
mask = mask[1:, :-1]
corr = corr.iloc[1:, :-1].copy()
fig, ax = plt.subplots(figsize=(40,30))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap="Greens", linewidths=.6, cbar_kws={"shrink": .9})
ax.xaxis.tick_bottom()
plt.title("Correlations heatmap for numerical data".upper(), fontdict={"fontsize": 18}, loc="left")


# In[67]:


sns.set(rc={'figure.figsize': (8,8)}, font_scale=0.5)
sns.heatmap(corr[corr > 0.8], annot=True, fmt=".3f")


# In[54]:


# High correlation: .80 to 1
# Moderate correlation: .60 to .80
# Low correlation: .40 to .60
# We see that our prediction target (spending) has a moderate correlation with total purchases

# In[55]:


corr = df_num.corr()
high_corr = corr[corr > 0.80]
high_corr = high_corr.unstack().dropna()
high_corr = high_corr[high_corr.index.get_level_values(0) != high_corr.index.get_level_values(1)]
for index, value in high_corr.items():
    variable_1, variable_2 = index
    correlation_value = value
    print("Variables:", variable_1, "and", variable_2)
    print("Correlation:", correlation_value)
    print("------------------------")


# ## Correlation Results
# - Income is a proxy for several other features, such as spending capacity or total purchases. It shows a positive relationship with expenses on meat and wine, indicating that individuals with higher income tend to spend more on these items. On the other hand, income has a negative correlation with the number of children at home and the frequency of website visits, suggesting that individuals with higher income tend to have fewer children at home and visit the website less frequently.
#
# - Furthermore, the amount spent on wine is not only influenced by high income but also by the amount spent on meat. These two variables are interconnected, meaning that individuals who spend more on meat are also likely to spend more on wine. Moreover, individuals can purchase wine either from catalogs or from physical stores.
#
# - The number of children at home is negatively associated with spending and income, and consequently with wine consumption. This implies that individuals with more children tend to have lower income and spend less overall, including on wine. Conversely, higher income levels are also linked to a greater willingness to accept advertising campaigns.
#
# - In summary, income serves as a proxy for several factors, with positive correlations to spending and to meat and wine expenses, while showing negative associations with the number of children at home and website visits.
The relationship between income, spending habits, and preferences for specific products such as wine can provide insights into consumer behavior and marketing strategies.

# In[88]:


# VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LinearRegression

features = ['income', 'mntmeatproducts', 'mntwines', 'total_purchases']

def calculate_vif(df, features):
    vif, tolerance, r2sq = {}, {}, {}
    # all the features that you want to examine
    for feature in features:
        # extract all the other features you will regress against
        X = [f for f in features if f != feature]
        X, y = df[X], df[feature]
        # extract r-squared from the fit
        r2 = LinearRegression().fit(X, y).score(X, y)
        # calculate tolerance
        r2sq[feature] = r2
        tolerance[feature] = 1 - r2
        # calculate VIF
        vif[feature] = 1 / (tolerance[feature])
    # return the VIF DataFrame
    return pd.DataFrame({'VIF': vif, 'Tolerance': tolerance, 'R2': r2sq})


# In[95]:


calculate_vif(df_num, df_num.columns)


# In[192]:


# Recompute the VIF after removing spending and income
features_2 = ['mntmeatproducts', 'mntwines', 'total_purchases', 'mntfishproducts', 'mntsweetproducts',
              'mntgoldprods', 'numwebpurchases', 'numwebvisitsmonth', 'age', 'dependents']


# In[193]:


calculate_vif(df, features_2)


# **Interpreting $R^2_k$**: if $R^2_k$ equals zero, variable `k` is not correlated with any other independent variable. Usually, multicollinearity is a potential problem when $R^2_k$ is greater than `0.75` and a serious problem when $R^2_k$ is greater than `0.9`.
#
# **Interpreting the variance inflation factor**: if $VIF_k = 1$, variable `k` is not correlated with any other independent variable. Multicollinearity is a potential problem when $VIF_k$ is greater than 5 and a serious problem when it is greater than 10.
#
# **Interpreting tolerance**: a tolerance of less than 0.20 or 0.10 indicates a multicollinearity problem.
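# In[ ]:


# variance_inflation_factor and add_constant were imported above but never used; as a
# cross-check of the manual calculate_vif() results, this is a minimal sketch using the
# statsmodels implementation on the same features_2 list.
X_vif = add_constant(df[features_2])
vif_sm = pd.Series(
    [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
    index=X_vif.columns,
    name='VIF'
)
print(vif_sm.drop('const'))  # drop the intercept term added by add_constant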
# In[96]:


# Plot illustrating the effect of income on spending
plt.figure(figsize=(8, 8))
plt.scatter(df['income'], df['spending'])
plt.title('Income vs. Spending')
plt.xlabel('Income')
plt.ylabel('Spending')
plt.show()


# In[97]:


# Plot illustrating the negative effect of having dependents (kids & teens) on spending
plt.figure(figsize=(4,4))
sns.boxplot(x='dependents', y='spending', data=df)


# In[98]:


# Plot illustrating the positive effect of having dependents (kids & teens) on the number of deals purchased
plt.figure(figsize=(4,4))
sns.boxplot(x='dependents', y='numdealspurchases', data=df);


# In[99]:


# REMOVING SPENDING AND TOTAL_PURCHASES TO CHECK MULTICOLLINEARITY

# In[100]:


corr_de = df_num.drop(['spending', 'total_purchases'], axis=1)


# In[101]:


fig, ax = plt.subplots(figsize=(40,30))
heatmap = sns.heatmap(corr_de.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')


# ## Segmentation of Customers
# ### Choosing columns for segmentation
# I choose the income and spending variables because they provide the most interesting information for the segmentation.

# In[136]:


df_num.head(2)


# In[137]:


# Select income (column 1) and spending (column 15); other candidate columns: recency, mntwines,
# mntfruits, mntmeatproducts, mntfishproducts, mntsweetproducts, mntgoldprods, numdealspurchases,
# numwebpurchases, numcatalogpurchases, numstorepurchases, numwebvisitsmonth, age, total_purchases
cluster = df_num.iloc[:, [1, 15]]


# In[135]:


cluster


# Scaling the numerical data (StandardScaler was tried first; MinMaxScaler is used instead)

# In[105]:


#scaler = StandardScaler()
#cluster_encoded = scaler.fit_transform(cluster)


# In[106]:


#cluster_encoded


# In[138]:


scaler = MinMaxScaler()
cluster_encoded = scaler.fit_transform(cluster)
cluster_encoded


# In[108]:


cluster_encoded.shape


# Choosing the number of clusters
# WCSS -> Within-Cluster Sum of Squares

# In[109]:


# Finding the WCSS value for different numbers of clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(cluster_encoded)
    wcss.append(kmeans.inertia_)


# In[110]:


# Plot an elbow graph
sns.set()
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Point Graph')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()


# Optimum number of clusters = 4
#
# Training the k-means clustering model

# In[111]:


from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)

# Return a label for each data point based on its cluster
Y = kmeans.fit_predict(cluster_encoded)
print(Y)


# In[112]:


# Visualization
plt.figure(figsize=(8,8))
plt.scatter(cluster_encoded[Y==0, 0], cluster_encoded[Y==0, 1], s=50, c='green', label='Cluster 1')
plt.scatter(cluster_encoded[Y==1, 0], cluster_encoded[Y==1, 1], s=50, c='red', label='Cluster 2')
plt.scatter(cluster_encoded[Y==2, 0], cluster_encoded[Y==2, 1], s=50, c='yellow', label='Cluster 3')
plt.scatter(cluster_encoded[Y==3, 0], cluster_encoded[Y==3, 1], s=50, c='blue', label='Cluster 4')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=100, c='cyan', label='Centroids')

# Set the axis labels and the title
plt.ylabel('Spending')
plt.xlabel('Income')
plt.title('Customers Groups')

# Show the plot
plt.show()


# In[113]:


kmeans.labels_


# In[114]:


cluster_1_data = cluster_encoded[Y == 0]
cluster_1_data


# In[115]:


# Apply k-means to obtain the clusters
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(cluster_encoded)

# Get the label assigned to each customer
labels = kmeans.labels_

# Count the number of customers in each cluster
conteo_clientes = pd.Series(labels).value_counts()

# Plot the customer groups by income and spending
plt.figure(figsize=(8, 8))
plt.scatter(cluster_encoded[:, 0], cluster_encoded[:, 1], c=labels, cmap='viridis')

# Plot the cluster centroids
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=100, c='red', label='Centroids')

# Set the axis labels and the title
plt.xlabel('Income')
plt.ylabel('Spending')
plt.title('Customers Groups')

# Show the number of customers in each cluster (the loop variable is named cluster_id so it
# does not shadow the cluster dataframe defined above)
for cluster_id, count in conteo_clientes.items():
    centroid = kmeans.cluster_centers_[cluster_id]
    plt.text(centroid[0], centroid[1], f'{count}', fontsize=12, color='black', ha='center')

# Show the plot
plt.show()


# I create a copy of the original dataframe and assign the labels to it, so that I can filter the rows for each label and analyze the customer groups.

# In[139]:


cluster2 = cluster.copy()


# In[140]:


cluster2['cluster_labels'] = kmeans.labels_
cluster2
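# In[ ]:


# Optional compact view (a sketch, equivalent in spirit to the separate describe() calls
# below): per-cluster size and income/spending statistics in a single table.
cluster_profile = cluster2.groupby('cluster_labels').agg(['count', 'mean', 'median', 'min', 'max'])
print(cluster_profile)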
# In[141]:


# Filter the rows by cluster label
cluster_0 = cluster2.loc[cluster2['cluster_labels'] == 0]
cluster_1 = cluster2.loc[cluster2['cluster_labels'] == 1]
cluster_2 = cluster2.loc[cluster2['cluster_labels'] == 2]
cluster_3 = cluster2.loc[cluster2['cluster_labels'] == 3]

# Print the results
print("Cluster 0:")
print(cluster_0)
print()
print("Cluster 1:")
print(cluster_1)
print()
print("Cluster 2:")
print(cluster_2)
print()
print("Cluster 3:")
print(cluster_3)


# In[142]:


# Converting the clusters to dataframes
cluster_0_df = pd.DataFrame(cluster_0)
cluster_1_df = pd.DataFrame(cluster_1)
cluster_2_df = pd.DataFrame(cluster_2)
cluster_3_df = pd.DataFrame(cluster_3)


# In[143]:


cluster_0_df


# In[144]:


cluster_0_df.shape


# In[145]:


cluster_0_df.describe()


# ## CLUSTER 1 CONCLUSIONS
# Looking at Cluster 1 of the customer segmentation (cluster_0_df), we can draw the following conclusions:
#
# - Cluster size: 736 clients.
#
# - Average income: the average income of clients in this cluster is approximately 28,541$.
#
# - Average expenses: clients in this cluster have an average expense of around 72.88$.
#
# - Variability: there is considerable variability in the income and expenses of clients in this cluster, as indicated by the standard deviation. This suggests that there are clients with different levels of income and expenses within the cluster.
#
# - Range of income and expenses: the range of income goes from 1,730$ to 43,269$, while the range of expenses goes from 5$ to 570$.
#
# - Distribution of income and expenses: the median (central value) of income is approximately 29,968$, which means that half of the clients have incomes above this value and the other half below it. The median of the expenses is approximately 53$.
#
# - Low-spend segment: this cluster can be considered a segment of customers with low spending levels, since the average spend is relatively low compared to the other clusters.
#
# - From these conclusions, we can infer that this group of clients has relatively low income and expenses compared to the other clusters. This may indicate a customer profile with lower spending capacity or more conservative consumption preferences. These observations can help the marketing team target their strategies and campaigns more effectively towards this customer segment, for example by offering affordable products or services and promotions tailored to their level of spending.

# ## CLUSTER 2 CONCLUSIONS

# In[146]:


cluster_1_df.shape


# In[147]:


cluster_1_df.describe()


# - Cluster size: 334 clients.
# - Average income: the average income of clients in this cluster is approximately 78,690$. This suggests that these customers have relatively higher income levels compared to Cluster 1.
# - Average expenses: the average expenses of clients in this cluster amount to around 1,688$. This indicates that the customers in this segment tend to have higher spending levels compared to Cluster 1.
# - Variability: the standard deviation for both income and expenses within this cluster suggests a moderate degree of variability. This implies that there are clients with varying income and spending patterns within this cluster.
# - Range of income and expenses: the range of income spans from 51,381$ to 105,471$ while the range of expenses goes from 1,112$ to 2,525$. This wide range indicates the presence of customers with different income and spending levels within this cluster.
# - Distribution of income and expenses: the median income of clients in this cluster is approximately 79,414$. This suggests that half of the clients have an income below this value, while the other half has an income above it.
#
# - Overall, Cluster 2 represents a segment of customers with relatively higher income and spending levels compared to Cluster 1. These customers may have a higher purchasing power and exhibit more liberal spending behavior. The marketing department can leverage these insights to develop strategies and campaigns that cater to this customer segment. For example, they can focus on offering premium products or services, creating promotions that align with the customers' higher spending capacity, and delivering messaging that resonates with their more indulgent consumption preferences.

# ## CLUSTER 3 CONCLUSIONS

# In[148]:


cluster_2_df.shape


# In[149]:


cluster_2_df.describe()


# - Cluster size: 612 clients.
# - Average income: the average income of clients in this cluster is approximately 50,399$.
# - Average expenses: clients in this cluster have an average expense of around 315$, indicating moderate spending levels (above Cluster 1 but well below Cluster 2).
# - Variability: the standard deviation for both income and expenses within this cluster suggests a moderate degree of variability. This implies that there are clients with different income and spending patterns within this cluster.
# - Range of income and expenses: the range of income spans from 32,632$ to 73,395$ while the range of expenses goes from 16$ to 839$. This wide range indicates the presence of customers with varying income and spending levels within this cluster.
# - Distribution of income and expenses: the median income of clients in this cluster is approximately 49,973$.
#
# From this analysis, we can conclude that Cluster 3 represents a segment of customers with a moderate income level and moderate spending behavior. These customers may have a balanced approach to their finances and exhibit stable spending patterns. The marketing department can consider targeting this customer segment with products or services that align with their moderate income and spending capacity. They can also focus on providing value-based offerings and promotions that cater to their stable spending behavior.

# ## CLUSTER 4 CONCLUSIONS

# In[150]:


cluster_3_df.shape


# In[151]:


cluster_3_df.describe()


# - Cluster size: 547 clients.
# - Average income: 67,507.23$.
# - Average expenses: clients in this cluster have an average expense of around 985$.
# - Variability: the standard deviation for both income and expenses within this cluster indicates a moderate degree of variability, suggesting that there are clients with different income and spending patterns within this cluster.
# - Range of income and expenses: the range of income spans from 2,447$ to 113,734$ while the range of expenses goes from 277$ to 1,730$.
# - Distribution of income and expenses: the median income of clients in this cluster is approximately 66,886$.
#
# Based on this analysis, we can conclude that Cluster 4 represents a segment of clients with a relatively higher average income and higher spending levels compared to most other clusters. These clients may have a higher spending capacity and may be more inclined towards making larger purchases.
The marketing department can consider targeting this customer segment with higher-end products or services, luxury offerings, or personalized marketing campaigns that cater to their higher spending power.

# # Answering the questions of the CMO

# ## What does the Average customer look like for our company?

# In[152]:


selected_columns = ['age', 'year_customer', 'income', 'dependents', 'recency']
selected_data = df[selected_columns]
mean_values = selected_data.mean().round(1)
mean_df = pd.DataFrame(mean_values, columns=['Mean'])
mean_df.index.name = 'Variable'
mean_df.columns = ['Average Customer']
mean_df.index = mean_df.index.rename('Characteristics')
mean_df


# ## What Products are best performing?
# Wines, followed by meat products.

# In[153]:


# Compute the total sales of each product
sales = df[['mntwines', 'mntfruits', 'mntmeatproducts', 'mntfishproducts', 'mntsweetproducts', 'mntgoldprods']].sum()

# Create the pie chart
plt.figure(figsize=(8, 6))
plt.pie(sales, labels=sales.index, autopct='%1.1f%%')

# Set the title
plt.title('Product Performance by Sales')

# Show the chart
plt.show()


# ## Which channels are underperforming?

# In[154]:


# Select the columns of interest (note: numwebvisitsmonth counts visits, not purchases)
channels = df[['numwebvisitsmonth', 'numstorepurchases', 'numcatalogpurchases', 'numwebpurchases', 'numdealspurchases']]

# Compute the total for each channel
channel_purchases = channels.sum()

# Compute the percentage of purchases for each channel
total_purchases = df['total_purchases'].sum()
channel_performance = (channel_purchases / total_purchases) * 100

# Create the pie chart
plt.figure(figsize=(8, 6))
plt.pie(channel_performance, labels=channel_performance.index, autopct='%1.1f%%')

# Set the title
plt.title('Channels Performance by Total Purchases')

# Show the chart
plt.show()


# ## Which Marketing Campaigns were most successful?
#
# The most successful campaign is the most recent one (column name: Response).

# In[155]:


fig, ax = plt.subplots(1, 7, figsize=(15, 4))
sns.countplot(x=df_cat['acceptedcmp1'], ax=ax[0])
sns.countplot(x=df_cat['acceptedcmp2'], ax=ax[1])
sns.countplot(x=df_cat['acceptedcmp3'], ax=ax[2])
sns.countplot(x=df_cat['acceptedcmp4'], ax=ax[3])
sns.countplot(x=df_cat['acceptedcmp5'], ax=ax[4])
sns.countplot(x=df_cat['response'], ax=ax[5])
ax[3].legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.tight_layout()
plt.show()
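# In[ ]:


# The countplots above show absolute counts; a minimal sketch that makes the comparison
# explicit by computing the acceptance rate of each campaign (the campaign columns were cast
# to strings earlier, so they are converted back to int here).
campaign_cols = ['acceptedcmp1', 'acceptedcmp2', 'acceptedcmp3', 'acceptedcmp4', 'acceptedcmp5', 'response']
acceptance_rates = df_cat[campaign_cols].astype(int).mean().sort_values(ascending=False)
print(acceptance_rates)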
# ## Purchase amount spending prediction: Spending/Income

# In[156]:


cluster


# In[157]:


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# X-y split
Y = cluster['spending']
X = cluster.drop('spending', axis=1)


# In[158]:


X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


# In[159]:


# Scale X_train and X_test using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# In[160]:


# Create and fit the linear regression model
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train)


# In[161]:


# Make predictions on the training and test sets
y_train_pred = regression_model.predict(X_train_scaled)
y_test_pred = regression_model.predict(X_test_scaled)


# In[162]:


# Compute the metrics on the training set
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = sqrt(mse_train)


# In[163]:


# Compute the metrics on the test set
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = sqrt(mse_test)


# In[164]:


# Print the metrics
print("Training set metrics:")
print("R^2:", r2_train)
print("MSE:", mse_train)
print("MAE:", mae_train)
print("RMSE:", rmse_train)
print("------------------------------")
print("Test set metrics:")
print("R^2:", r2_test)
print("MSE:", mse_test)
print("MAE:", mae_test)
print("RMSE:", rmse_test)


# ## Purchase amount spending prediction with other variables

# In[165]:


df_cat


# In[166]:


df_cat_l = df_cat[['education', 'marital_status', 'country']]
df_cat_l = df_cat_l.reset_index()
df_cat_l = df_cat_l.drop(columns=['index'])
df_cat_l


# In[167]:


# Encoding the categorical data
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first').fit(df_cat_l)
encoder = encoder.transform(df_cat_l).toarray()
encoder


# In[168]:


df_num


# In[169]:


df_num_l = df_num[['income', 'numdealspurchases', 'numwebpurchases', 'numcatalogpurchases', 'numstorepurchases',
                   'numwebvisitsmonth', 'age', 'dependents', 'spending']]
df_num_l = df_num_l.reset_index()
df_num_l = df_num_l.drop(columns=['index'])
df_num_l


# In[170]:


# X-y split
y = df_num_l['spending']
X = df_num_l.drop(['spending'], axis=1)
X


# In[171]:


# Scale the numerical data using StandardScaler
scaler = StandardScaler()
X_es = scaler.fit_transform(X)
X_es


# In[172]:


X = np.concatenate((X_es, encoder), axis=1)
X


# In[173]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# In[174]:


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled


# In[175]:


# Create and fit the linear regression model
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train)


# In[176]:


# Make predictions on the training and test sets
y_train_pred = regression_model.predict(X_train_scaled)
y_test_pred = regression_model.predict(X_test_scaled)


# In[177]:


# Compute the metrics
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = sqrt(mse_train)
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = sqrt(mse_test)


# In[178]:


# Print the metrics
print("Training set metrics:")
print("R^2:", r2_train)
print("MSE:", mse_train)
print("MAE:", mae_train)
print("RMSE:", rmse_train)
print("---------------------------")
print("Test set metrics:")
print("R^2:", r2_test)
print("MSE:", mse_test)
print("MAE:", mae_test)
print("RMSE:", rmse_test)


# ## Prediction removing income from the last prediction

# In[179]:


df_num_l = df_num[['numdealspurchases', 'numwebpurchases', 'numcatalogpurchases', 'numstorepurchases',
                   'numwebvisitsmonth', 'age', 'dependents', 'spending']]
df_num_l = df_num_l.reset_index()
df_num_l = df_num_l.drop(columns=['index'])
df_num_l


# In[180]:


# X-y split
y = df_num_l['spending']
X = df_num_l.drop(['spending'], axis=1)
X


# In[181]:


# Scale the numerical data using StandardScaler
scaler = StandardScaler()
X_es = scaler.fit_transform(X)
X_es


# In[182]:


X = np.concatenate((X_es, encoder), axis=1)
X


# In[183]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# In[184]:


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled


# In[185]:


# Create and fit the linear regression model
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train)


# In[186]:


# Make predictions on the training and test sets
y_train_pred = regression_model.predict(X_train_scaled)
y_test_pred = regression_model.predict(X_test_scaled)


# In[187]:


# Compute the metrics
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = sqrt(mse_train)
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = sqrt(mse_test)


# In[188]:


# Print the metrics
print("Training set metrics:")
print("R^2:", r2_train)
print("MSE:", mse_train)
print("MAE:", mae_train)
print("RMSE:", rmse_train)
print("---------------------------")
print("Test set metrics:")
print("R^2:", r2_test)
print("MSE:", mse_test)
print("MAE:", mae_test)
print("RMSE:", rmse_test)
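# In[ ]:


# Optional robustness check (a sketch, not part of the original analysis): a single
# train/test split can be sensitive to the random seed, so 5-fold cross-validated R^2 on the
# last feature set gives a more stable basis for comparing the three spending models.
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(LinearRegression(), X, y, cv=5, scoring='r2')
print("Cross-validated R^2 scores:", cv_scores)
print("Mean R^2:", cv_scores.mean())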