#Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
pd.set_option('display.max_columns', None)
%matplotlib inline
#Loading,reading and storing dataset
df = pd.read_csv('marketing_data.csv')
#Review of data fields
df
ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1826 | 1970 | Graduation | Divorced | $84,835.00 | 0 | 0 | 6/16/14 | 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1 | 4 | 4 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | SP |
1 | 1 | 1961 | Graduation | Single | $57,091.00 | 0 | 0 | 6/15/14 | 0 | 464 | 5 | 64 | 7 | 0 | 37 | 1 | 7 | 3 | 7 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | CA |
2 | 10476 | 1958 | Graduation | Married | $67,267.00 | 0 | 1 | 5/13/14 | 0 | 134 | 11 | 59 | 15 | 2 | 30 | 1 | 3 | 2 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US |
3 | 1386 | 1967 | Graduation | Together | $32,474.00 | 1 | 1 | 5/11/14 | 0 | 10 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
4 | 5371 | 1989 | Graduation | Single | $21,474.00 | 1 | 0 | 4/8/14 | 0 | 6 | 16 | 24 | 11 | 0 | 34 | 2 | 3 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | SP |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2235 | 10142 | 1976 | PhD | Divorced | $66,476.00 | 0 | 1 | 3/7/13 | 99 | 372 | 18 | 126 | 47 | 48 | 78 | 2 | 5 | 2 | 11 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US |
2236 | 5263 | 1977 | 2n Cycle | Married | $31,056.00 | 1 | 0 | 1/22/13 | 99 | 5 | 10 | 13 | 3 | 8 | 16 | 1 | 1 | 0 | 3 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | SP |
2237 | 22 | 1976 | Graduation | Divorced | $46,310.00 | 1 | 0 | 12/3/12 | 99 | 185 | 2 | 88 | 15 | 5 | 14 | 2 | 6 | 1 | 5 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | SP |
2238 | 528 | 1978 | Graduation | Married | $65,819.00 | 0 | 0 | 11/29/12 | 99 | 267 | 38 | 701 | 149 | 165 | 63 | 1 | 5 | 4 | 10 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | IND |
2239 | 4070 | 1969 | PhD | Married | $94,871.00 | 0 | 2 | 9/1/12 | 99 | 169 | 24 | 553 | 188 | 0 | 144 | 1 | 8 | 5 | 4 | 7 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | CA |
2240 rows × 28 columns
ID=Customer's unique identifier
Year_Birth=Customer's birth year
Education=Customer's education level
Marital_Status=Customer's marital status
Income=Customer's yearly household income
Kidhome=Number of children in customer's household
Teenhome=Number of teenagers in customer's household
Dt_Customer=Date of customer's enrollment with the company
Recency=Number of days since customer's last purchase
MntWines=Amount spent on wine in the last 2 years
MntFruits=Amount spent on fruits in the last 2 years
MntMeatProducts=Amount spent on meat in the last 2 years
MntFishProducts=Amount spent on fish in the last 2 years
MntSweetProducts=Amount spent on sweets in the last 2 years
MntGoldProds=Amount spent on gold in the last 2 years
NumDealsPurchases=Number of purchases made with a discount
NumWebPurchases=Number of purchases made through the company's web site
NumCatalogPurchases=Number of purchases made using a catalogue
NumStorePurchases=Number of purchases made directly in stores
NumWebVisitsMonth=Number of visits to company's web site in the last month
AcceptedCmp3=1 if customer accepted the offer in the 3rd campaign, 0 otherwise
AcceptedCmp4=1 if customer accepted the offer in the 4th campaign, 0 otherwise
AcceptedCmp5=1 if customer accepted the offer in the 5th campaign, 0 otherwise
AcceptedCmp1=1 if customer accepted the offer in the 1st campaign, 0 otherwise
AcceptedCmp2=1 if customer accepted the offer in the 2nd campaign, 0 otherwise
Response=1 if customer accepted the offer in the last campaign, 0 otherwise
Complain=1 if customer complained in the last 2 years, 0 otherwise
Country=Customer's location
#Show the DataFrame's shape
df.shape
(2240, 28)
# Standardize column names
df.columns=[e.lower().replace(' ', '') for e in df.columns]
df.columns
Index(['id', 'year_birth', 'education', 'marital_status', 'income', 'kidhome', 'teenhome', 'dt_customer', 'recency', 'mntwines', 'mntfruits', 'mntmeatproducts', 'mntfishproducts', 'mntsweetproducts', 'mntgoldprods', 'numdealspurchases', 'numwebpurchases', 'numcatalogpurchases', 'numstorepurchases', 'numwebvisitsmonth', 'acceptedcmp3', 'acceptedcmp4', 'acceptedcmp5', 'acceptedcmp1', 'acceptedcmp2', 'response', 'complain', 'country'], dtype='object')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2240 entries, 0 to 2239 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 2240 non-null int64 1 year_birth 2240 non-null int64 2 education 2240 non-null object 3 marital_status 2240 non-null object 4 income 2216 non-null object 5 kidhome 2240 non-null int64 6 teenhome 2240 non-null int64 7 dt_customer 2240 non-null object 8 recency 2240 non-null int64 9 mntwines 2240 non-null int64 10 mntfruits 2240 non-null int64 11 mntmeatproducts 2240 non-null int64 12 mntfishproducts 2240 non-null int64 13 mntsweetproducts 2240 non-null int64 14 mntgoldprods 2240 non-null int64 15 numdealspurchases 2240 non-null int64 16 numwebpurchases 2240 non-null int64 17 numcatalogpurchases 2240 non-null int64 18 numstorepurchases 2240 non-null int64 19 numwebvisitsmonth 2240 non-null int64 20 acceptedcmp3 2240 non-null int64 21 acceptedcmp4 2240 non-null int64 22 acceptedcmp5 2240 non-null int64 23 acceptedcmp1 2240 non-null int64 24 acceptedcmp2 2240 non-null int64 25 response 2240 non-null int64 26 complain 2240 non-null int64 27 country 2240 non-null object dtypes: int64(23), object(5) memory usage: 490.1+ KB
We have 18 numerical variables and 10 categorical variables: Education, Marital_Status, AcceptedCmp1-AcceptedCmp5,Response, Country and Complain
#Transformon income to numerical
df['income'] = df['income'].str.replace('$', '')
df['income'] = df['income'].str.replace(',', '').astype('float')
C:\Users\danie\AppData\Local\Temp\ipykernel_26540\1407881261.py:2: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. df['income'] = df['income'].str.replace('$', '')
#Transformon AcceptedCmp1-5, Respomnse, Complain to object
df['acceptedcmp1'] = df['acceptedcmp1'].astype(str)
df['acceptedcmp2'] = df['acceptedcmp2'].astype(str)
df['acceptedcmp3'] = df['acceptedcmp3'].astype(str)
df['acceptedcmp4'] = df['acceptedcmp3'].astype(str)
df['acceptedcmp5'] = df['acceptedcmp5'].astype(str)
df['response'] = df['response'].astype(str)
df['complain'] = df['complain'].astype(str)
#Checking unique values of dataset
for column in df.columns:
unique_values = df[column].unique()
print(f"Valores únicos de la columna '{column}':")
print(unique_values)
print()
Valores únicos de la columna 'id': [ 1826 1 10476 ... 22 528 4070] Valores únicos de la columna 'year_birth': [1970 1961 1958 1967 1989 1954 1947 1979 1959 1981 1969 1977 1960 1966 1976 1965 1956 1975 1971 1986 1972 1974 1990 1987 1984 1968 1955 1983 1973 1978 1952 1962 1964 1982 1963 1957 1980 1945 1949 1948 1953 1946 1985 1992 1944 1951 1988 1950 1994 1993 1991 1893 1996 1995 1899 1943 1941 1940 1900] Valores únicos de la columna 'education': ['Graduation' 'PhD' '2n Cycle' 'Master' 'Basic'] Valores únicos de la columna 'marital_status': ['Divorced' 'Single' 'Married' 'Together' 'Widow' 'YOLO' 'Alone' 'Absurd'] Valores únicos de la columna 'income': [84835. 57091. 67267. ... 46310. 65819. 94871.] Valores únicos de la columna 'kidhome': [0 1 2] Valores únicos de la columna 'teenhome': [0 1 2] Valores únicos de la columna 'dt_customer': ['6/16/14' '6/15/14' '5/13/14' '5/11/14' '4/8/14' '3/17/14' '1/29/14' '1/18/14' '1/11/14' '12/27/13' '12/9/13' '12/7/13' '10/16/13' '10/5/13' '9/11/13' '8/1/13' '7/23/13' '7/1/13' '5/28/13' '3/26/13' '3/15/13' '2/12/13' '11/23/12' '10/13/12' '9/14/12' '6/29/14' '5/31/14' '5/30/14' '4/27/14' '4/11/14' '10/29/13' '10/9/13' '5/10/13' '5/9/13' '4/25/13' '4/20/13' '3/30/13' '3/1/13' '2/14/13' '1/11/13' '1/3/13' '12/19/12' '12/15/12' '12/2/12' '9/17/12' '9/11/12' '5/12/14' '4/28/14' '3/29/14' '3/6/14' '3/4/14' '2/4/14' '2/3/14' '1/1/14' '12/12/13' '11/15/13' '9/20/13' '9/5/13' '8/31/13' '7/30/13' '7/27/13' '6/22/13' '1/5/13' '11/21/12' '11/11/12' '9/28/12' '9/27/12' '9/7/12' '8/13/12' '8/11/12' '8/2/12' '6/25/14' '5/28/14' '4/14/14' '3/10/14' '2/27/14' '2/7/14' '1/28/14' '11/17/13' '11/7/13' '10/17/13' '10/13/13' '10/12/13' '9/30/13' '7/3/13' '6/10/13' '5/29/13' '4/29/13' '3/10/13' '1/2/13' '11/2/12' '10/18/12' '10/1/12' '9/3/12' '8/26/12' '5/23/14' '5/17/14' '4/21/14' '3/23/14' '12/16/13' '11/26/13' '11/14/13' '11/6/13' '10/6/13' '9/27/13' '9/18/13' '9/9/13' '7/18/13' '7/8/13' '5/27/13' '3/5/13' '2/20/13' '1/12/13' '12/24/12' '11/19/12' '3/28/14' '2/24/14' '9/2/13' '8/20/13' '6/23/13' '5/5/13' '4/5/13' '1/4/13' '12/27/12' '11/10/12' '10/29/12' '9/22/12' '3/31/14' '3/21/14' '2/9/14' '9/23/13' '6/27/13' '3/28/13' '3/12/13' '1/16/13' '1/8/13' '12/29/12' '12/12/12' '11/25/12' '9/21/12' '9/9/12' '9/5/12' '8/17/12' '6/22/14' '5/1/14' '1/3/14' '10/11/13' '8/13/13' '6/9/13' '5/7/13' '10/2/12' '9/12/12' '3/19/14' '3/3/14' '2/22/14' '1/24/14' '12/4/13' '11/28/13' '11/5/13' '10/3/13' '8/9/13' '8/7/13' '7/17/13' '7/9/13' '6/11/13' '5/17/13' '3/23/13' '2/19/13' '1/19/13' '1/10/13' '1/1/13' '11/12/12' '5/18/14' '3/30/14' '1/30/14' '1/26/14' '1/22/14' '1/15/14' '12/13/13' '8/4/13' '5/1/13' '4/24/13' '4/3/13' '2/3/13' '11/16/12' '8/3/12' '4/18/14' '4/1/14' '3/18/14' '2/10/14' '11/23/13' '11/21/13' '10/2/13' '7/21/13' '6/18/13' '3/24/13' '12/6/12' '11/9/12' '2/14/14' '10/22/13' '10/4/13' '9/21/13' '8/5/13' '7/14/13' '7/4/13' '4/12/13' '4/10/13' '4/8/13' '3/31/13' '3/17/13' '1/21/13' '12/10/12' '9/24/12' '8/6/12' '6/18/14' '4/5/14' '12/21/13' '10/27/13' '10/21/13' '9/19/13' '9/4/13' '6/25/13' '4/27/13' '4/18/13' '12/30/12' '8/22/12' '8/8/12' '6/19/14' '4/20/14' '2/28/14' '12/17/13' '11/25/13' '10/28/13' '8/15/13' '7/5/13' '6/19/13' '6/16/13' '4/22/13' '3/19/13' '2/23/13' '2/15/13' '10/31/12' '10/7/12' '8/9/12' '5/6/14' '4/15/14' '3/5/14' '2/19/14' '9/7/13' '8/6/13' '7/25/13' '4/30/13' '9/10/12' '3/20/14' '9/28/13' '9/24/13' '2/16/13' '11/22/12' '9/18/12' '8/16/12' '6/5/14' '4/13/14' '4/10/14' '4/3/14' '2/12/14' '12/15/13' '10/30/13' '8/26/13' '2/2/13' '1/25/13' '11/17/12' '11/13/12' '11/7/12' '11/1/12' '10/16/12' '5/8/14' '3/2/14' '6/24/13' '6/13/13' '4/23/13' '4/15/13' '1/29/13' '10/30/12' '10/23/12' '4/17/14' '2/25/14' '12/11/13' '10/10/13' '5/20/13' '5/18/13' '4/7/13' '3/3/13' '12/7/12' '11/28/12' '10/27/12' '9/15/12' '6/17/14' '5/29/14' '3/1/14' '2/15/14' '12/23/13' '11/29/13' '10/25/13' '8/17/13' '6/6/13' '3/29/13' '9/23/12' '8/30/12' '8/1/12' '2/8/14' '1/25/14' '11/27/13' '10/19/13' '3/7/13' '2/28/13' '1/17/13' '11/20/12' '11/5/12' '11/3/12' '8/31/12' '8/12/12' '5/15/14' '4/12/14' '4/6/14' '2/6/14' '7/29/13' '6/29/13' '6/17/13' '6/8/13' '5/26/13' '11/8/12' '8/4/12' '4/30/14' '4/7/14' '3/12/14' '4/13/13' '2/13/13' '6/3/14' '3/25/14' '2/17/14' '2/5/14' '1/27/14' '1/14/14' '7/11/13' '6/2/13' '6/1/13' '5/4/13' '3/18/13' '12/3/12' '11/24/12' '10/26/12' '6/20/14' '1/19/14' '1/9/14' '12/29/13' '12/26/13' '12/8/13' '11/20/13' '8/23/13' '8/19/13' '7/24/13' '10/6/12' '8/18/12' '5/7/14' '11/9/13' '8/25/13' '5/16/13' '4/1/13' '3/27/13' '2/8/13' '9/20/12' '5/22/14' '12/30/13' '11/2/13' '8/21/13' '7/12/13' '6/28/13' '6/4/13' '5/31/13' '3/6/13' '2/18/13' '9/26/12' '8/19/12' '5/2/14' '4/29/14' '2/2/14' '1/5/14' '12/5/13' '11/18/13' '9/10/13' '8/3/13' '2/21/13' '2/10/13' '1/31/13' '12/9/12' '9/29/12' '6/9/14' '4/2/14' '3/24/14' '1/23/14' '9/16/13' '9/12/13' '7/15/13' '3/9/13' '2/9/13' '12/14/12' '10/17/12' '6/23/14' '6/12/14' '6/7/14' '4/9/14' '2/13/14' '12/6/13' '10/20/13' '6/20/13' '5/8/13' '3/11/13' '9/6/12' '3/9/14' '2/11/14' '10/8/13' '8/28/13' '7/6/13' '5/30/13' '5/22/13' '4/2/13' '3/20/13' '3/14/13' '1/22/13' '9/8/12' '8/25/12' '8/14/12' '11/19/13' '6/3/13' '12/21/12' '10/10/12' '8/7/12' '12/24/13' '12/14/13' '5/15/13' '5/6/13' '1/7/13' '11/29/12' '4/24/14' '3/8/14' '7/16/13' '2/22/13' '1/20/13' '1/13/13' '12/25/12' '12/11/12' '6/27/14' '3/16/14' '11/3/13' '9/25/13' '9/15/13' '9/1/13' '8/2/13' '8/27/12' '4/4/14' '9/22/13' '12/22/12' '12/16/12' '8/20/12' '1/7/14' '12/1/13' '9/26/13' '2/25/13' '10/24/12' '10/22/12' '7/31/12' '5/19/14' '5/3/14' '4/16/14' '12/31/13' '12/2/13' '7/22/13' '4/21/13' '4/11/13' '3/22/14' '2/6/13' '12/4/12' '11/6/12' '8/28/12' '7/2/13' '10/12/12' '5/16/14' '4/25/14' '11/13/13' '9/6/13' '11/18/12' '10/15/12' '6/14/14' '1/17/14' '2/7/13' '12/20/13' '9/13/13' '1/6/13' '5/26/14' '1/13/14' '8/8/13' '4/6/13' '2/26/14' '5/14/13' '8/24/12' '5/27/14' '2/23/14' '1/10/14' '7/19/13' '3/25/13' '2/11/13' '1/15/13' '12/5/12' '6/13/14' '6/2/14' '11/1/13' '8/16/13' '2/17/13' '2/4/13' '10/19/12' '6/26/14' '10/23/13' '4/14/13' '10/28/12' '10/1/13' '3/8/13' '11/14/12' '1/12/14' '11/4/13' '8/22/13' '6/21/13' '1/23/13' '10/21/12' '10/4/12' '1/31/14' '1/21/14' '12/28/13' '8/11/13' '5/13/13' '9/2/12' '6/24/14' '6/8/14' '5/24/14' '10/18/13' '9/17/13' '8/14/13' '7/20/13' '6/30/13' '5/11/13' '4/16/13' '5/25/14' '5/10/14' '5/4/14' '8/29/13' '3/22/13' '6/4/14' '5/23/13' '2/1/13' '2/16/14' '10/24/13' '3/2/13' '12/18/12' '11/4/12' '6/11/14' '6/14/13' '6/10/14' '5/5/14' '4/19/14' '8/18/13' '2/26/13' '8/30/13' '6/12/13' '5/12/13' '10/9/12' '11/10/13' '8/24/13' '9/4/12' '2/27/13' '1/6/14' '7/7/13' '11/26/12' '8/29/12' '5/2/13' '3/4/13' '1/27/13' '8/23/12' '10/14/13' '12/23/12' '12/1/12' '8/5/12' '8/27/13' '12/17/12' '6/21/14' '3/26/14' '11/22/13' '8/21/12' '4/22/14' '10/26/13' '5/9/14' '4/17/13' '3/21/13' '1/24/13' '12/28/12' '3/13/14' '2/1/14' '10/15/13' '1/14/13' '10/5/12' '7/13/13' '4/23/14' '2/18/14' '11/12/13' '8/12/13' '12/31/12' '6/28/14' '12/3/13' '12/26/12' '7/30/12' '1/2/14' '4/19/13' '1/26/13' '10/14/12' '9/30/12' '3/11/14' '9/14/13' '7/28/13' '5/19/13' '4/28/13' '1/9/13' '10/20/12' '7/31/13' '5/21/13' '9/25/12' '5/3/13' '12/8/12' '3/27/14' '12/18/13' '11/30/13' '8/10/13' '3/16/13' '11/30/12' '3/7/14' '12/19/13' '10/25/12' '12/25/13' '1/4/14' '11/8/13' '11/27/12' '7/26/13' '12/20/12' '10/11/12' '4/26/14' '12/22/13' '6/26/13' '5/24/13' '8/15/12' '12/10/13' '9/19/12' '8/10/12' '6/6/14' '5/25/13' '4/9/13' '9/1/12'] Valores únicos de la columna 'recency': [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99] Valores únicos de la columna 'mntwines': [ 189 464 134 10 6 336 769 78 384 450 140 431 3 16 63 18 53 5 213 275 40 308 266 80 454 27 184 155 423 7 408 1 1285 71 1248 378 12 1200 709 94 4 539 13 670 158 283 496 292 46 34 167 318 522 67 28 58 2 229 14 622 362 38 1074 983 262 739 610 50 650 9 458 520 20 345 22 42 463 260 180 62 421 154 502 145 322 1099 890 863 448 399 85 97 35 315 738 179 381 247 711 30 288 212 173 604 482 531 230 33 784 600 168 493 29 835 1296 760 70 68 325 303 121 561 462 376 341 595 23 530 594 852 194 216 428 1092 559 606 11 588 316 279 1462 546 277 948 664 268 199 73 96 587 56 45 8 508 234 992 125 174 1478 1001 392 388 65 177 577 460 219 31 117 236 120 200 532 297 151 997 797 823 966 37 416 314 342 629 201 964 72 123 159 209 100 1170 387 357 912 625 420 641 712 465 39 514 565 667 66 129 185 1023 338 647 163 19 182 779 32 298 488 817 459 492 558 383 1043 400 691 783 43 777 402 144 840 88 74 26 15 162 1302 47 0 918 172 931 666 24 25 391 81 674 267 224 1239 412 1205 304 76 1004 584 422 269 613 113 261 856 452 658 451 395 688 1035 365 181 331 256 836 233 881 305 778 69 79 344 1184 490 1349 153 84 1000 52 301 519 238 620 572 380 443 252 796 410 507 55 327 397 105 1126 280 108 102 820 86 217 352 240 124 736 1006 580 112 227 290 284 656 953 626 547 795 895 899 178 265 349 848 962 347 426 48 1076 36 733 523 1083 98 99 138 509 527 340 434 208 438 1308 64 673 1050 787 833 605 800 861 161 359 324 370 245 631 1156 398 122 386 379 243 358 258 846 724 770 815 819 479 938 909 156 940 51 141 160 557 196 295 231 1230 91 1060 17 302 728 110 995 704 415 241 135 254 957 293 21 754 822 373 1171 371 801 244 562 707 356 743 235 393 721 896 879 367 693 816 483 1181 491 755 671 1009 972 598 139 702 239 183 771 83 170 59 1063 789 228 512 355 204 556 317 897 187 826 513 313 329 703 571 1218 478 1332 1032 296 635 294 913 731 281 489 551 471 176 534 768 652 1142 41 207 299 525 676 494 353 1253 521 977 603 967 273 425 1394 171 554 191 131 1073 157 226 186 1276 130 614 1288 563 95 210 306 505 758 824 980 526 690 202 792 545 109 382 132 750 223 57 407 90 61 82 220 517 615 480 752 741 456 143 375 1215 106 871 368 1193 441 87 437 965 510 574 332 642 354 390 476 627 165 806 763 205 1003 790 1148 984 941 799 406 128 377 919 198 215 1493 1090 536 1149 586 77 794 774 328 901 866 753 1048 960 888 654 499 576 44 735 548 206 611 713 593 1311 1166 1486 515 1492 533 575 726 445 1259 248 1111 710 1241 529 447 570 867 757 882 291 333 749 203 619 320 311 1103 404 411 211 446 374 221 1047 1206 274 889 146 1115 54 251 166 75 364 1016 432 270 722 516 350 403 218 560 729 1224 1121 503 1245 1459 621 430 979 116 853 93 127 689 686 630 466 910 389 1379 504 60 1067 264 101 581 952 925 457 1017 249 104 111 618 1039 107 225 440 1168 89 255 424 944 908 597 847 543 742 968 708 152 747 958 330 637 418 986 825 1298 1132 746 812 1315 307 860 1396 583 473 553 675 1324 582 1449 829 335 612 175 197 623 544 714 537 285 366 1252 934 518 734 1045 717 830 1020 680 524 1013 864 164 653 369 1012 133 188 811 639 444 372 928 272 92 555 351 263 982 495 312 737 751 346 684 309 115 136 756 1279 538 920 433 192 453 321 169] Valores únicos de la columna 'mntfruits': [104 5 11 0 16 130 80 26 4 82 10 6 1 9 2 21 174 7 42 12 22 45 169 3 35 36 51 8 50 37 76 17 107 105 81 53 96 86 32 19 193 63 83 28 49 34 69 40 48 13 20 148 73 23 103 64 61 142 97 117 134 60 25 30 153 58 33 57 14 24 18 106 88 133 99 68 72 38 129 93 74 27 185 15 79 162 71 56 168 98 44 172 54 140 194 91 183 151 197 178 189 102 155 115 77 90 114 39 59 199 154 123 108 137 66 31 43 120 84 29 112 46 160 159 65 111 147 143 161 144 47 181 89 62 41 132 67 138 55 184 122 75 70 85 149 152 100 164 101 126 87 92 166 124 190 131 163 127] Valores únicos de la columna 'mntmeatproducts': [ 379 64 59 1 24 411 252 11 102 535 61 441 8 12 57 2 5 3 76 68 23 73 300 37 171 256 80 706 21 9 449 112 6 349 189 17 204 115 33 816 249 179 38 460 4 981 13 7 43 407 257 26 18 140 16 431 22 518 184 309 125 28 653 780 356 154 528 333 559 348 44 20 536 202 132 459 50 45 292 547 30 41 67 322 232 520 215 159 217 69 100 471 469 192 849 560 14 350 444 206 223 380 311 466 751 785 113 291 83 678 786 207 56 273 214 592 503 228 161 88 128 48 86 240 60 96 898 29 99 77 694 81 403 91 218 46 71 422 31 873 111 168 10 89 269 293 282 241 53 19 15 70 172 137 142 40 79 594 278 569 271 170 242 452 456 84 538 732 548 850 259 651 391 90 298 1725 537 697 687 32 622 209 97 35 731 106 804 42 243 0 98 842 253 124 108 25 413 235 230 145 212 238 320 495 319 424 74 109 599 570 410 483 39 565 670 117 103 575 101 501 27 689 58 482 673 167 195 286 733 165 827 590 364 52 1622 104 211 49 711 75 373 216 119 267 279 54 389 177 160 480 812 545 303 144 323 797 194 127 180 134 649 497 352 400 396 107 199 549 399 573 297 558 136 62 522 420 314 186 92 572 63 388 239 915 367 95 157 143 890 72 82 754 372 114 196 36 761 512 835 131 133 499 417 853 555 175 255 625 219 181 164 925 690 509 51 153 120 270 231 562 655 595 141 78 345 151 951 130 514 845 247 288 66 65 47 317 222 601 523 272 193 779 254 376 155 353 152 792 510 860 818 561 631 203 93 929 470 447 201 974 426 753 110 263 617 178 305 708 182 156 250 158 921 234 398 280 85 746 445 613 384 500 385 401 162 118 265 275 34 174 185 274 221 368 123 476 169 704 639 281 1607 550 377 462 597 147 138 432 397 604 375 374 540 505 507 454 224 226 183 815 961 446 546 567 749 553 264 359 287 329 493 607 428 363 843 55 790 899 337 94 614 387 883 294 260 640 530 335 768 554 116 304 685 473 334 129 276 534 425 461 341 568 442 419 332 672 414 935 163 369 266 747 713 740 87 611 591 295 405 724 602 464 756 758 603 248 149 750 188 586 693 490 674 940 606 324 716 408 430 913 717 494 519 735 205 342 237 932 813 654 360 139 409 315 176 135 415 984 612 213 435 122 227 491 629 395 465 742 338 864 968 392 487 121 946 498 421 736 197 936 166 382 450 455 635 233 351 832 801 838 354 1582 757 650 774 208 126 701] Valores únicos de la columna 'mntfishproducts': [111 7 15 0 11 240 21 73 80 3 2 13 4 25 65 8 50 106 138 43 97 6 38 30 20 189 224 16 150 32 10 134 193 180 140 137 28 27 19 98 168 63 76 82 39 205 86 52 46 84 172 119 49 229 42 29 116 114 45 17 259 127 33 78 130 145 218 12 110 62 71 247 1 51 91 26 23 69 34 72 124 99 185 89 47 182 160 136 64 175 162 216 142 207 41 101 108 192 55 59 40 31 24 123 166 201 58 90 169 219 37 125 85 77 151 242 95 234 253 258 36 227 93 188 104 128 94 54 141 250 159 121 232 184 120 179 158 153 35 171 112 202 56 173 81 132 164 75 197 210 60 68 199 181 237 129 156 149 167 231 102 220 212 198 67 208 133 103 254 177 44 246 223 146 48 186 225 147 61 5 194 115] Valores únicos de la columna 'mntsweetproducts': [189 0 2 32 34 98 13 20 16 4 1 3 7 8 19 30 197 14 89 172 29 160 12 5 28 60 23 35 92 138 10 80 42 21 167 50 75 53 9 178 6 26 25 99 101 123 82 96 68 37 48 176 49 73 69 58 44 62 128 151 133 11 134 36 41 148 15 51 22 262 18 97 54 77 76 121 45 64 142 198 83 55 67 149 24 175 162 17 40 137 71 94 114 38 74 46 43 102 65 141 110 152 263 27 33 112 70 47 115 59 85 126 61 163 91 95 31 120 116 125 144 122 57 56 81 106 88 185 130 107 143 66 105 111 108 179 118 93 103 84 161 147 194 72 192 109 150 86 153 165 187 78 132 191 174 87 196 157 169 39 136 139 100 129 166 173 188 182 156 79 63 195 127 145 146 124 113] Valores únicos de la columna 'mntgoldprods': [218 37 30 0 34 43 65 7 5 26 4 102 32 321 22 2 10 23 44 3 197 17 20 29 16 172 14 45 12 6 9 125 27 1 13 8 66 262 11 15 54 129 39 35 21 40 97 67 90 31 145 42 33 62 41 24 143 50 47 109 168 28 150 91 53 128 48 148 80 25 51 191 108 107 69 121 147 57 71 64 63 61 55 181 135 160 56 86 119 19 112 153 130 18 58 133 152 95 83 88 134 38 68 76 140 79 99 52 116 138 166 59 241 157 114 219 231 110 183 205 74 36 177 192 246 127 122 96 49 77 362 72 144 120 141 248 82 196 139 46 93 190 75 174 170 182 78 169 106 60 92 233 146 89 198 176 171 242 111 158 101 124 118 232 227 203 81 142 117 200 84 85 224 207 154 216 70 151 73 132 94 223 137 100 247 163 126 103 149 162 185 204 173 245 195 161 98 131 187 215 159 249 210 180 115 178 229 155 291 199 175 165 123] Valores únicos de la columna 'numdealspurchases': [ 1 2 3 0 4 12 7 5 6 11 9 8 10 15 13] Valores únicos de la columna 'numwebpurchases': [ 4 7 3 1 10 2 6 5 25 8 9 0 11 27 23] Valores únicos de la columna 'numcatalogpurchases': [ 4 3 2 0 1 7 10 6 8 5 9 11 28 22] Valores únicos de la columna 'numstorepurchases': [ 6 7 5 2 3 9 10 0 8 4 13 12 1 11] Valores únicos de la columna 'numwebvisitsmonth': [ 1 5 2 7 6 4 8 3 9 0 17 13 10 14 19 20] Valores únicos de la columna 'acceptedcmp3': ['0' '1'] Valores únicos de la columna 'acceptedcmp4': ['0' '1'] Valores únicos de la columna 'acceptedcmp5': ['0' '1'] Valores únicos de la columna 'acceptedcmp1': ['0' '1'] Valores únicos de la columna 'acceptedcmp2': ['0' '1'] Valores únicos de la columna 'response': ['1' '0'] Valores únicos de la columna 'complain': ['0' '1'] Valores únicos de la columna 'country': ['SP' 'CA' 'US' 'AUS' 'GER' 'IND' 'SA' 'ME']
#Checking for duplicate rows in the data to remove if any
df.duplicated().sum()
0
##hecking null values, I will remove columns have more than 75%
df.isnull().sum().sort_values(ascending=False)
income 24 id 0 numdealspurchases 0 complain 0 response 0 acceptedcmp2 0 acceptedcmp1 0 acceptedcmp5 0 acceptedcmp4 0 acceptedcmp3 0 numwebvisitsmonth 0 numstorepurchases 0 numcatalogpurchases 0 numwebpurchases 0 mntgoldprods 0 year_birth 0 mntsweetproducts 0 mntfishproducts 0 mntmeatproducts 0 mntfruits 0 mntwines 0 recency 0 dt_customer 0 teenhome 0 kidhome 0 marital_status 0 education 0 country 0 dtype: int64
#Plot Income variable to identify best strategy for imputation
plt.figure(figsize=(8,4))
sns.distplot(df['income'], kde=False, hist=True)
plt.title('Distribution', size=16)
plt.ylabel('count');
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
Findings:
Choosing the median as the imputation strategy is a common approach when dealing with skewed distributions or the presence of outliers. The median is robust to outliers because it is not affected by extreme values, unlike the mean. By using the median, the imputed values will be representative of the central tendency of the non-null income values and provide a more accurate estimation compared to the mean. This provides a better approximation of the actual values and helps maintain the integrity and quality of the data.
df['income'].plot(kind='box', figsize=(3,4), patch_artist=True)
<AxesSubplot:>
#Impute null values in Income with median value (to avoid skewing of the mean due to outliers):
df['income'] = df['income'].fillna(df['income'].median())
plt.figure(figsize=(8,4))
sns.distplot(df['income'], kde=False, hist=True)
plt.title('Distribution', size=16)
plt.ylabel('count');
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
#PLot outliers of numerical data
plt.show()# select columns to plot
df_to_plot = df.drop(columns=['id', 'acceptedcmp1', 'acceptedcmp2', 'acceptedcmp3', 'acceptedcmp4', 'acceptedcmp5', 'response', 'complain']).select_dtypes(include=np.number)
# subplots
df_to_plot.plot(subplots=True, layout=(4,4), kind='box', figsize=(12,14), patch_artist=True)
plt.subplots_adjust(wspace=0.5);
Multiple features contain outliers (see boxplots below), but the only that likely indicate data entry errors are Year_Birth <= 1900
#Remove rows where Year_Birth <= 1900
df = df[df['year_birth'] > 1900].reset_index(drop=True)
plt.figure(figsize=(3,4))
df['year_birth'].plot(kind='box', patch_artist=True);
We identify Income outliers and then exclude them from the data frame
Q1 = df.income.quantile(0.25)
Q3 = df.income.quantile(0.75)
IQR = Q3 - Q1
income_outliers = df[df.income > Q3 + 1.5*IQR]
df = df[df.income < Q3 + 1.5*IQR]
income_outliers.head()
id | year_birth | education | marital_status | income | kidhome | teenhome | dt_customer | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | numdealspurchases | numwebpurchases | numcatalogpurchases | numstorepurchases | numwebvisitsmonth | acceptedcmp3 | acceptedcmp4 | acceptedcmp5 | acceptedcmp1 | acceptedcmp2 | response | complain | country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
325 | 4931 | 1977 | Graduation | Together | 157146.0 | 0 | 0 | 4/29/13 | 13 | 1 | 0 | 1725 | 2 | 1 | 1 | 0 | 0 | 28 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | SA |
497 | 1501 | 1982 | PhD | Married | 160803.0 | 0 | 0 | 8/4/12 | 21 | 55 | 16 | 1622 | 17 | 3 | 4 | 15 | 0 | 28 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US |
526 | 9432 | 1977 | Graduation | Together | 666666.0 | 1 | 0 | 6/2/13 | 23 | 9 | 14 | 18 | 8 | 1 | 12 | 4 | 3 | 1 | 3 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | SA |
730 | 1503 | 1976 | PhD | Together | 162397.0 | 1 | 1 | 6/3/13 | 31 | 85 | 1 | 16 | 2 | 1 | 2 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | SP |
851 | 5336 | 1971 | Master | Together | 157733.0 | 1 | 0 | 6/4/13 | 37 | 39 | 1 | 9 | 2 | 0 | 8 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | SP |
plt.figure(figsize=(8,4))
sns.distplot(df['income'], kde=False, hist=True)
plt.title('Distribution', size=16)
plt.ylabel('count');
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
plt.figure(figsize=(3,4))
df['income'].plot(kind='box', patch_artist=True);
#Counting outliers for each colum
for column in df_to_plot:
q1 = np.percentile(df_to_plot[column], 25)
q3 = np.percentile(df_to_plot[column], 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
num_outliers = ((df_to_plot[column] < lower_bound) | (df_to_plot[column] > upper_bound)).sum()
print("Column '{}': {}".format(column, num_outliers))
Column 'year_birth': 3 Column 'income': 8 Column 'kidhome': 0 Column 'teenhome': 0 Column 'recency': 0 Column 'mntwines': 35 Column 'mntfruits': 227 Column 'mntmeatproducts': 175 Column 'mntfishproducts': 223 Column 'mntsweetproducts': 248 Column 'mntgoldprods': 207 Column 'numdealspurchases': 86 Column 'numwebpurchases': 4 Column 'numcatalogpurchases': 23 Column 'numstorepurchases': 0 Column 'numwebvisitsmonth': 8
#Counting percentage of outliers
for column in df_to_plot:
q1 = np.percentile(df_to_plot[column], 25)
q3 = np.percentile(df_to_plot[column], 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = ((df_to_plot[column] < lower_bound) | (df_to_plot[column] > upper_bound))
num_outliers = outliers.sum()
percentage = (num_outliers / len(df_to_plot[column])) * 100
print("Column '{}': {:.2f}%".format(column, percentage))
Column 'year_birth': 0.13% Column 'income': 0.36% Column 'kidhome': 0.00% Column 'teenhome': 0.00% Column 'recency': 0.00% Column 'mntwines': 1.56% Column 'mntfruits': 10.13% Column 'mntmeatproducts': 7.81% Column 'mntfishproducts': 9.96% Column 'mntsweetproducts': 11.07% Column 'mntgoldprods': 9.24% Column 'numdealspurchases': 3.84% Column 'numwebpurchases': 0.18% Column 'numcatalogpurchases': 1.03% Column 'numstorepurchases': 0.00% Column 'numwebvisitsmonth': 0.36%
We are going to work on the predictions with the rest of the ouliers since once we segment the clients to make the predictions we can lose a lot of data and this could invalidate the model. Once we get the first results we can test the model without outliers.
#Transform yaer_birth and dt_customer into date format
df['year_birth'] = pd.to_datetime(df['year_birth'], format='%Y')
df['dt_customer'] = pd.to_datetime(df['dt_customer'])
#Transform yaer_birth and dt_customer into date format
Having a first look at the row data enables us to start thinking at some useful variables we could create in order to better understand our dataset and reveal precious information.
We will create several variables :
We will remove the unused variables for this analysis
counts = df['marital_status'].value_counts()
print(counts)
Married 861 Together 575 Single 479 Divorced 230 Widow 77 Alone 3 YOLO 2 Absurd 2 Name: marital_status, dtype: int64
#Age
df['age'] = datetime.now().year - df['year_birth'].dt.year
# Spending
mnt_cols = [col for col in df.columns if 'mnt' in col]
df['spending'] = df[mnt_cols].sum(axis=1)
#Marital Status
# Create a mapping dictionary for states
estado_mapeo = {
'Divorced': 'single',
'Single': 'single',
'Married': 'couple',
'Together': 'couple',
'Widow':'single',
'YOLO':'single',
'Alone':'single',
'Absurd':'single',
}
# Apply mapping to 'marital_status' column
df['marital_status'] = df['marital_status'].map(estado_mapeo)
# Dependents
df['dependents'] = df['kidhome'] + df['teenhome']
# Year becoming a Customer
df['year_customer'] = pd.DatetimeIndex(df['dt_customer']).year
# Total Purchases
purchases_cols = [col for col in df.columns if 'purchases' in col]
df['total_purchases'] = df[purchases_cols].sum(axis=1)
# Total Campaigns Accepted
campaigns_cols = [col for col in df.columns if 'cmp' in col] + ['response'] # 'Response' is for the latest campaign
df['total_campaigns_acc'] = df[campaigns_cols].sum(axis=1)
#Drop columns we don´t need
df = df.drop(['year_birth', 'teenhome', 'kidhome','dt_customer'], axis=1)
df.head(3)
id | education | marital_status | income | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | numdealspurchases | numwebpurchases | numcatalogpurchases | numstorepurchases | numwebvisitsmonth | acceptedcmp3 | acceptedcmp4 | acceptedcmp5 | acceptedcmp1 | acceptedcmp2 | response | complain | country | age | spending | dependents | year_customer | total_purchases | total_campaigns_acc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1826 | Graduation | single | 84835.0 | 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1 | 4 | 4 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | SP | 53 | 1190 | 0 | 2014 | 15 | 1.0 |
1 | 1 | Graduation | single | 57091.0 | 0 | 464 | 5 | 64 | 7 | 0 | 37 | 1 | 7 | 3 | 7 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | CA | 62 | 577 | 0 | 2014 | 18 | 11.0 |
2 | 10476 | Graduation | couple | 67267.0 | 0 | 134 | 11 | 59 | 15 | 2 | 30 | 1 | 3 | 2 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US | 65 | 251 | 1 | 2014 | 11 | 0.0 |
#Saving clean df
df.to_csv('marketing_analysis_clean.csv', index=False)
suma_spending = df['spending'].sum()
suma_spending
1349751
fila_cliente_1386 = df[df['id'] == 9365]
# Imprimir la fila seleccionada
print(fila_cliente_1386)
id education marital_status income recency mntwines mntfruits \ 1321 9365 PhD single 60000.0 57 1048 0 mntmeatproducts mntfishproducts mntsweetproducts mntgoldprods \ 1321 217 0 0 12 numdealspurchases numwebpurchases numcatalogpurchases \ 1321 5 11 3 numstorepurchases numwebvisitsmonth acceptedcmp3 acceptedcmp4 \ 1321 5 6 0 0 acceptedcmp5 acceptedcmp1 acceptedcmp2 response complain country age \ 1321 0 0 0 0 0 SP 47 spending dependents year_customer total_purchases \ 1321 1277 1 2012 24 total_campaigns_acc 1321 0.0
df.shape
(2229, 30)
df.describe()
id | income | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | numdealspurchases | numwebpurchases | numcatalogpurchases | numstorepurchases | numwebvisitsmonth | age | spending | dependents | year_customer | total_purchases | total_campaigns_acc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 | 2229.000000 |
mean | 5589.296097 | 51619.504262 | 49.106326 | 304.991476 | 26.348587 | 165.283984 | 37.642441 | 27.161507 | 44.113055 | 2.319874 | 4.100045 | 2.636608 | 5.812472 | 5.334231 | 54.111261 | 605.541050 | 0.951996 | 2013.027367 | 14.869000 | 8123.388964 |
std | 3244.840538 | 20601.216248 | 28.946476 | 336.761943 | 39.764060 | 219.336589 | 54.700548 | 41.338945 | 52.091255 | 1.895252 | 2.775461 | 2.796161 | 3.242144 | 2.414981 | 11.707430 | 601.032228 | 0.751976 | 0.684966 | 7.622187 | 28669.066626 |
min | 0.000000 | 1730.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 27.000000 | 5.000000 | 0.000000 | 2012.000000 | 0.000000 | 0.000000 |
25% | 2829.000000 | 35416.000000 | 24.000000 | 24.000000 | 2.000000 | 16.000000 | 3.000000 | 1.000000 | 9.000000 | 1.000000 | 2.000000 | 0.000000 | 3.000000 | 3.000000 | 46.000000 | 69.000000 | 0.000000 | 2013.000000 | 8.000000 | 0.000000 |
50% | 5455.000000 | 51381.500000 | 49.000000 | 176.000000 | 8.000000 | 67.000000 | 12.000000 | 8.000000 | 24.000000 | 2.000000 | 4.000000 | 2.000000 | 5.000000 | 6.000000 | 53.000000 | 397.000000 | 1.000000 | 2013.000000 | 15.000000 | 0.000000 |
75% | 8420.000000 | 68118.000000 | 74.000000 | 505.000000 | 33.000000 | 231.000000 | 50.000000 | 34.000000 | 56.000000 | 3.000000 | 6.000000 | 4.000000 | 8.000000 | 7.000000 | 64.000000 | 1044.000000 | 1.000000 | 2013.000000 | 21.000000 | 0.000000 |
max | 11191.000000 | 113734.000000 | 99.000000 | 1493.000000 | 199.000000 | 1725.000000 | 259.000000 | 263.000000 | 362.000000 | 15.000000 | 27.000000 | 28.000000 | 13.000000 | 20.000000 | 83.000000 | 2525.000000 | 3.000000 | 2014.000000 | 43.000000 | 111111.000000 |
The describe function generates for us asummary, particularly useful as a first step in our preliminary investigation. Analyzing the statistical summary gives us insightful information in one look :
-Average income is 52.227$ while median income is 51.381$. The distribution is right skewed with the possible presence of outliers
The maximum value is 666.666$ and the 3rd quartile 68.281$ dollars so we could deduce that the distribution is skewed to the right with the possible presence of outliers.
-Average spending in the last 2 years is 605$ while median spending is 396$.
-Average age is 54 years old and the oldest customer is 83 years old.
-Average store purchases is the highest with 5.794.397 following by average of web purchases with 4.087.170.
#Split data into categorical and numerical for exploration
df_cat=df.select_dtypes(include=['object'])
df_cat.head(5)
education | marital_status | acceptedcmp3 | acceptedcmp4 | acceptedcmp5 | acceptedcmp1 | acceptedcmp2 | response | complain | country | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Graduation | single | 0 | 0 | 0 | 0 | 0 | 1 | 0 | SP |
1 | Graduation | single | 0 | 0 | 0 | 0 | 1 | 1 | 0 | CA |
2 | Graduation | couple | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US |
3 | Graduation | couple | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
4 | Graduation | single | 1 | 1 | 0 | 0 | 0 | 1 | 0 | SP |
#Save numerical data
df_num= df.select_dtypes(exclude=["object"])
df_num.head(3)
id | income | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | numdealspurchases | numwebpurchases | numcatalogpurchases | numstorepurchases | numwebvisitsmonth | age | spending | dependents | year_customer | total_purchases | total_campaigns_acc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1826 | 84835.0 | 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1 | 4 | 4 | 6 | 1 | 53 | 1190 | 0 | 2014 | 15 | 1.0 |
1 | 1 | 57091.0 | 0 | 464 | 5 | 64 | 7 | 0 | 37 | 1 | 7 | 3 | 7 | 5 | 62 | 577 | 0 | 2014 | 18 | 11.0 |
2 | 10476 | 67267.0 | 0 | 134 | 11 | 59 | 15 | 2 | 30 | 1 | 3 | 2 | 5 | 2 | 65 | 251 | 1 | 2014 | 11 | 0.0 |
There are columsn relevant for customer profile: Education:ordinal Incomo:continuos Marital:nominal
#Displaying categorical data
fig, ax = plt.subplots(1, 4, figsize=(15, 4))
sns.countplot(x=df_cat['marital_status'], ax=ax[0])
sns.countplot(x=df_cat['education'], ax=ax[1])
sns.countplot(x=df_cat['country'], ax=ax[2])
df_plot = df_cat.groupby(['marital_status', 'education']).size().reset_index().pivot(columns='marital_status', index='education', values=0)
df_plot.apply(lambda x: x / x.sum(), axis=1).plot(kind='bar', stacked=True, ax=ax[3], colormap='Paired')
ax[3].legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.tight_layout()
plt.show()
fig, ax = plt.subplots(1, 7, figsize=(15, 4))
sns.countplot(x=df_cat['acceptedcmp1'], ax=ax[0])
sns.countplot(x=df_cat['acceptedcmp2'], ax=ax[1])
sns.countplot(x=df_cat['acceptedcmp3'], ax=ax[2])
sns.countplot(x=df_cat['acceptedcmp4'], ax=ax[3])
sns.countplot(x=df_cat['acceptedcmp5'], ax=ax[4])
sns.countplot(x=df_cat['response'], ax=ax[5])
sns.countplot(x=df_cat['complain'], ax=ax[6])
ax[3].legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.tight_layout()
plt.show()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
df_cat_con= df_cat[['marital_status', 'education']]
df_cat_con
marital_status | education | |
---|---|---|
0 | single | Graduation |
1 | single | Graduation |
2 | couple | Graduation |
3 | couple | Graduation |
4 | single | Graduation |
... | ... | ... |
2232 | single | PhD |
2233 | couple | 2n Cycle |
2234 | single | Graduation |
2235 | couple | Graduation |
2236 | couple | PhD |
2229 rows × 2 columns
from scipy.stats import chi2_contingency
# Crear la tabla de contingencia
contingency_table= pd.crosstab(df_cat_con['marital_status'], df_cat_con['education'])
# Realizar la prueba de chi-cuadrado
chi2, p_value, _, _ = chi2_contingency(contingency_table)
# Mostrar la tabla de contingencia
print(contingency_table)
# Mostrar el valor de chi-cuadrado y el valor p
print(f"Chi-cuadrado: {chi2}")
print(f"Valor p: {p_value}")
education 2n Cycle Basic Graduation Master PhD marital_status couple 138 34 717 243 304 single 63 20 407 126 177 Chi-cuadrado: 2.4612027478482332 Valor p: 0.651596254517518
The result obtained indicates that the chi-square value is approximately 2.461 and the p-value is approximately 0.6516.
The chi-square value is a measure of the association between the variables "marital_status" (marital status) and "education" (education) in your data set. A larger chi-square value indicates a stronger association between the variables, while a value close to zero indicates that the variables are independent.
The p-value is the probability of obtaining a chi-square value equal to or more extreme than that observed if the variables were completely independent. In other words, it represents the probability that the association between the variables is simply the result of chance, given the size of the sample.
In your case, the p-value is 0.6516, which means that the probability of getting a chi-square value equal to or more extreme than 2.461 under the null hypothesis (that the variables are independent) is about 0.6516. Generally, if the p-value is greater than a predefined significance level (for example, 0.05), we do not have sufficient evidence to reject the null hypothesis and we can consider the variables to be independent.
Therefore, in this case, the result suggests that there is no significant association between marital status and education level in the data set you are analyzing.
#Displaying numerical dataç
df_num.columns
Index(['id', 'income', 'recency', 'mntwines', 'mntfruits', 'mntmeatproducts', 'mntfishproducts', 'mntsweetproducts', 'mntgoldprods', 'numdealspurchases', 'numwebpurchases', 'numcatalogpurchases', 'numstorepurchases', 'numwebvisitsmonth', 'age', 'spending', 'dependents', 'year_customer', 'total_purchases', 'total_campaigns_acc'], dtype='object')
# Crear el lienzo y los ejes
fig, axes = plt.subplots(6, 4, figsize=(15, 20))
# Aplanar el arreglo de ejes para facilitar el acceso
axes = axes.flatten()
# Graficar los histogramas de las columnas numéricas
for i, col in enumerate(df_num.columns):
sns.histplot(data=df_num, x=col, ax=axes[i], kde=True)
# Ajustar el espaciado entre subplots
plt.tight_layout()
# Mostrar el gráfico
plt.show()
corr = df_num.corr()
ones = np.ones_like(corr, dtype="bool")
mask = np.triu(ones)
mask = mask[1:, :-1]
corr = corr.iloc[1:,:-1].copy()
fig, ax = plt.subplots(figsize=(40,30))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap="Greens", linewidths=.6, cbar_kws={"shrink":.9})
ax.xaxis.tick_bottom()
plt.title("Correlations heatmap for numerical data".upper(), fontdict={"fontsize": 18}, loc="left")
Text(0.0, 1.0, 'CORRELATIONS HEATMAP FOR NUMERICAL DATA')
sns.set(rc ={'figure.figsize':(8,8)}, font_scale=0.5)
sns.heatmap(corr[corr>0.8], annot=True, fmt=".3f")
<AxesSubplot:>
#High correlation:.80 to 1
#Mean correlation:.60 to .80
#Low correlation: .40 to .60
#We see that our target to predict has mean correlation with total purchases
corr = df_num.corr()
high_corr = corr[corr > 0.80]
high_corr = high_corr.unstack().dropna()
high_corr = high_corr[high_corr.index.get_level_values(0) != high_corr.index.get_level_values(1)]
for index, value in high_corr.items():
variable_1, variable_2 = index
correlation_value = value
print("Variables:", variable_1, "and", variable_2)
print("Correlation:", correlation_value)
print("------------------------")
Variables: income and spending Correlation: 0.8202215056408394 ------------------------ Variables: mntwines and spending Correlation: 0.8959193067149777 ------------------------ Variables: mntmeatproducts and spending Correlation: 0.8534350636911707 ------------------------ Variables: numstorepurchases and total_purchases Correlation: 0.8317509633068462 ------------------------ Variables: spending and income Correlation: 0.8202215056408394 ------------------------ Variables: spending and mntwines Correlation: 0.8959193067149777 ------------------------ Variables: spending and mntmeatproducts Correlation: 0.8534350636911707 ------------------------ Variables: total_purchases and numstorepurchases Correlation: 0.8317509633068462 ------------------------
Income is a proxy for several other features, such as spending capacity or total purchases. It shows a positive relationship with expenses on meat and wine, indicating that individuals with higher income tend to spend more on these items. On the other hand, income has a negative correlation with the number of children at home and the frequency of website visits, suggesting that individuals with higher income tend to have fewer children at home and visit websites less frequently.
Furthermore, the amount spent on wine is not only influenced by high income but also by the amount spent on meat. These two variables are interconnected, meaning that individuals who spend more on meat are also likely to spend more on wine. Moreover, individuals can purchase wine either from catalogs or from physical stores.
The number of children at home is negatively associated with spending and income, and consequently, wine consumption. This implies that individuals with more children tend to have lower income and spend less overall, including on wine. Conversely, higher income levels are also linked to a greater willingness to accept advertising campaigns.
In summary, income serves as a proxy for several factors, with positive correlations to spending, meat and wine expenses, while showing negative associations with the number of children at home and website visits. The relationship between income, spending habits, and preferences for specific products such as wine can provide insights into consumer behavior and marketing strategies.
#VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
features = ['income', 'mntmeatproducts','mntwines','total_purchases']
def calculate_vif(df, features):
vif, tolerance, r2sq = {}, {}, {}
# all the features that you want to examine
for feature in features:
# extract all the other features you will regress against
X = [f for f in features if f != feature]
X, y = df[X], df[feature]
# extract r-squared from the fit
r2 = LinearRegression().fit(X, y).score(X, y)
# calculate tolerance
r2sq[feature] = r2
tolerance[feature] = 1 - r2
# calculate VIF
vif[feature] = 1/(tolerance[feature])
# return VIF DataFrame
return pd.DataFrame({'VIF': vif, 'Tolerance': tolerance, 'R2': r2sq})
calculate_vif(df_num, df_num)
C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars vif[feature] = 1/(tolerance[feature]) C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars vif[feature] = 1/(tolerance[feature]) C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars vif[feature] = 1/(tolerance[feature]) C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars vif[feature] = 1/(tolerance[feature]) C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars vif[feature] = 1/(tolerance[feature]) C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars vif[feature] = 1/(tolerance[feature]) C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars vif[feature] = 1/(tolerance[feature]) C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars vif[feature] = 1/(tolerance[feature]) C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars vif[feature] = 1/(tolerance[feature]) C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars vif[feature] = 1/(tolerance[feature]) C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars vif[feature] = 1/(tolerance[feature]) C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars vif[feature] = 1/(tolerance[feature])
VIF | Tolerance | R2 | |
---|---|---|---|
id | 1.009077 | 0.991004 | 0.008996 |
income | 5.114370 | 0.195528 | 0.804472 |
recency | 1.011846 | 0.988293 | 0.011707 |
mntwines | inf | 0.000000 | 1.000000 |
mntfruits | inf | 0.000000 | 1.000000 |
mntmeatproducts | inf | 0.000000 | 1.000000 |
mntfishproducts | inf | 0.000000 | 1.000000 |
mntsweetproducts | inf | 0.000000 | 1.000000 |
mntgoldprods | inf | 0.000000 | 1.000000 |
numdealspurchases | inf | 0.000000 | 1.000000 |
numwebpurchases | inf | 0.000000 | 1.000000 |
numcatalogpurchases | inf | 0.000000 | 1.000000 |
numstorepurchases | inf | 0.000000 | 1.000000 |
numwebvisitsmonth | 2.776575 | 0.360156 | 0.639844 |
age | 1.121263 | 0.891851 | 0.108149 |
spending | inf | 0.000000 | 1.000000 |
dependents | 2.027792 | 0.493147 | 0.506853 |
year_customer | 1.236980 | 0.808421 | 0.191579 |
total_purchases | inf | 0.000000 | 1.000000 |
total_campaigns_acc | 1.090735 | 0.916813 | 0.083187 |
#remving spending
features_2 = ['mntmeatproducts', 'mntwines', 'total_purchases', 'mntfishproducts', 'mntsweetproducts', 'mntgoldprods', 'numwebpurchases', 'numwebvisitsmonth', 'age', 'dependents']
calculate_vif(df,features_2)
VIF | Tolerance | R2 | |
---|---|---|---|
mntmeatproducts | 2.507714 | 0.398770 | 0.601230 |
mntwines | 2.423295 | 0.412661 | 0.587339 |
total_purchases | 4.861589 | 0.205694 | 0.794306 |
mntfishproducts | 1.964291 | 0.509090 | 0.490910 |
mntsweetproducts | 1.798715 | 0.555952 | 0.444048 |
mntgoldprods | 1.446274 | 0.691432 | 0.308568 |
numwebpurchases | 3.036062 | 0.329374 | 0.670626 |
numwebvisitsmonth | 1.689992 | 0.591719 | 0.408281 |
age | 1.096738 | 0.911794 | 0.088206 |
dependents | 1.551958 | 0.644347 | 0.355653 |
Interpreting R square k - If $R^2_k$ equals zero, variable k
is not correlated with any other independent variable.
Usually, multicollinearity is a potential problem when $R^2_k$ is greater than 0.75
and, a serious problem when $R^2_k$ is greater than 0.9
.
Interpretation of the variance inflation factor: If $VIF_k = 1$, variable k is not correlated with any other independent variable. Multicollinearity is a potential problem when VIFk
is greater than 5 and, a serious problem when it is greater than 10.
Interpretation of tolerance A tolerance of less than 0.20 or 0.10 indicates a multicollinearity problem.
#Plot illustrating the effect of income on spending
plt.figure(figsize=(8, 8))
plt.scatter(df['income'], df['spending'])
plt.title('Income vs. Spending')
plt.xlabel('Income')
plt.ylabel('Spending')
plt.show()
#Plot illustrating negative effect of having dependents (kids & teens) on spending
plt.figure(figsize=(4,4))
sns.boxplot(x='dependents', y='spending', data=df)
<AxesSubplot:xlabel='dependents', ylabel='spending'>
#Plot illustrating positive effect of having dependents (kids & teens) on number of deals purchased
plt.figure(figsize=(4,4))
sns.boxplot(x='dependents', y='numdealspurchases', data=df);
#REMOVING SPENDING AND TOTAL PURCHASE to check Multicolinearity
corr_de = df_num.drop(['spending', 'total_purchases'], axis=1)
fig, ax = plt.subplots(figsize=(40,30) )
heatmap = sns.heatmap(corr_de.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')
I choose the spending and income variables because they provide interesting information for the segmentation.
df_num.head(2)
id | income | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | numdealspurchases | numwebpurchases | numcatalogpurchases | numstorepurchases | numwebvisitsmonth | age | spending | dependents | year_customer | total_purchases | total_campaigns_acc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1826 | 84835.0 | 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1 | 4 | 4 | 6 | 1 | 53 | 1190 | 0 | 2014 | 15 | 1.0 |
1 | 1 | 57091.0 | 0 | 464 | 5 | 64 | 7 | 0 | 37 | 1 | 7 | 3 | 7 | 5 | 62 | 577 | 0 | 2014 | 18 | 11.0 |
cluster = df_num.iloc[:,[1,15,]]
#income
#recency, mntwines mntfruits mntmeatproducts mntfishproducts mntsweetproducts mntgoldprods numdealspurchases numwebpurchases numcatalogpurchases numstorepurchases numwebvisitsmonth age spending,total_purchases
cluster
1
Standardizing numerical data with standard scaler
#scaler = StandardScaler()
#cluster_encoded = scaler.fit_transform(cluster)
#cluster_encoded
scaler = MinMaxScaler()
cluster_encoded= scaler.fit_transform(cluster)
cluster_encoded
array([[0.74198243, 0.4702381 ], [0.49427699, 0.22698413], [0.58513089, 0.09761905], ..., [0.3980215 , 0.12063492], [0.57220278, 0.5468254 ], [0.83158637, 0.42579365]])
cluster_encoded.shape
(2229, 2)
Choosing the number of clusters WCSS -> Within Clusters Sum of Squares
## finding wcss value for different number of clusters
wcss = []
for i in range(1,11):
kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
kmeans.fit(cluster_encoded)
wcss.append(kmeans.inertia_)
# plot an elbow graph
sns.set()
plt.plot(range(1,11), wcss)
plt.title('The Elbow Point Graph')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()
Optimum Number of Clusters = 4
Training the k-Means Clustering Model
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
# return a label for each data point based on their cluster
Y = kmeans.fit_predict(cluster_encoded)
print(Y)
[3 2 2 ... 2 3 3]
#Visualization
plt.figure(figsize=(8,8))
plt.scatter(cluster_encoded[Y==0,0 ], cluster_encoded[Y==0,1 ], s=50, c='green', label='Cluster 1')
plt.scatter(cluster_encoded[Y==1,0 ], cluster_encoded[Y==1,1 ], s=50, c='red', label='Cluster 2')
plt.scatter(cluster_encoded[Y==2,0 ], cluster_encoded[Y==2,1 ], s=50, c='yellow', label='Cluster 3')
plt.scatter(cluster_encoded[Y==3,0 ], cluster_encoded[Y==3,1 ], s=50, c='blue', label='Cluster 4')
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], s=100, c='cyan', label='Centroids')
# Configura las etiquetas de los ejes y el título de la gráfica
plt.ylabel('Spending')
plt.xlabel('Income')
plt.title('Customers Groups')
# Muestra la gráfica
plt.show()
kmeans.labels_
array([3, 2, 2, ..., 2, 3, 3])
cluster_1_data = cluster_encoded[Y == 0]
cluster_1_data
array([[0.2744902 , 0.00238095], [0.17627942, 0.03412698], [0.22447413, 0.0265873 ], ..., [0.16427985, 0.01150794], [0.16427985, 0.01150794], [0.26182993, 0.01984127]])
# Aplica K-means para obtener los clusters
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(cluster_encoded)
# Obtiene las etiquetas asignadas a cada cliente
labels = kmeans.labels_
# Cuenta la cantidad de clientes en cada cluster
conteo_clientes = pd.Series(labels).value_counts()
# Grafica los grupos de clientes por ingresos y gastos
plt.figure(figsize=(8, 8))
plt.scatter(cluster_encoded[:, 0], cluster_encoded[:, 1], c=labels, cmap='viridis')
# Grafica los centroides de los clusters
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=100, c='red', label='Centroids')
# Configura las etiquetas de los ejes y el título de la gráfica
plt.xlabel('Income')
plt.ylabel('Spending')
plt.title('Customers Groups')
# Muestra la cantidad de clientes en cada cluster
for cluster, count in conteo_clientes.items():
centroid = kmeans.cluster_centers_[cluster]
plt.text(centroid[0], centroid[1], f'{count}', fontsize=12, color='black', ha='center')
# Muestra la gráfica
plt.show()
I create a copy of the original dataframe and assign the labels to it to be able to filter the rows for each label and analyze the customer groups
cluster2 = cluster.copy()
cluster2['cluster_labels'] = kmeans.labels_
cluster2
income | spending | cluster_labels | |
---|---|---|---|
0 | 84835.0 | 1190 | 3 |
1 | 57091.0 | 577 | 2 |
2 | 67267.0 | 251 | 2 |
3 | 32474.0 | 11 | 0 |
4 | 21474.0 | 91 | 0 |
... | ... | ... | ... |
2232 | 66476.0 | 689 | 3 |
2233 | 31056.0 | 55 | 0 |
2234 | 46310.0 | 309 | 2 |
2235 | 65819.0 | 1383 | 3 |
2236 | 94871.0 | 1078 | 3 |
2229 rows × 3 columns
# Filtrar las filas por los valores de cluster_labels
cluster_0 = cluster2.loc[cluster2['cluster_labels'] == 0]
cluster_1 = cluster2.loc[cluster2['cluster_labels'] == 1]
cluster_2 = cluster2.loc[cluster2['cluster_labels'] == 2]
cluster_3 = cluster2.loc[cluster2['cluster_labels'] == 3]
# Imprimir los resultados
print("Cluster 0:")
print(cluster_0)
print()
print("Cluster 1:")
print(cluster_1)
print()
print("Cluster 2:")
print(cluster_2)
print()
print("Cluster 3:")
print(cluster_3)
Cluster 0: income spending cluster_labels 3 32474.0 11 0 4 21474.0 91 0 13 26872.0 72 0 14 4428.0 359 0 17 32173.0 22 0 ... ... ... ... 2220 7500.0 100 0 2226 42231.0 37 0 2227 20130.0 34 0 2228 20130.0 34 0 2233 31056.0 55 0 [736 rows x 3 columns] Cluster 1: income spending cluster_labels 31 79529.0 1638 1 35 95169.0 1901 1 38 84865.0 1688 1 41 66465.0 1485 1 45 64260.0 1574 1 ... ... ... ... 2205 84906.0 1631 1 2206 84906.0 1631 1 2210 90687.0 1779 1 2216 83273.0 1392 1 2217 81702.0 1633 1 [334 rows x 3 columns] Cluster 2: income spending cluster_labels 1 57091.0 577 2 2 67267.0 251 2 7 44931.0 96 2 8 65324.0 544 2 9 65324.0 544 2 ... ... ... ... 2222 44794.0 65 2 2223 44794.0 65 2 2229 42429.0 67 2 2230 42429.0 67 2 2234 46310.0 309 2 [612 rows x 3 columns] Cluster 3: income spending cluster_labels 0 84835.0 1190 3 5 71691.0 1192 3 6 63564.0 1215 3 10 81044.0 1208 3 12 67786.0 1156 3 ... ... ... ... 2225 62568.0 953 3 2231 78901.0 734 3 2232 66476.0 689 3 2235 65819.0 1383 3 2236 94871.0 1078 3 [547 rows x 3 columns]
#Converting clusters to dataframe
cluster_0_df = pd.DataFrame(cluster_0)
cluster_1_df = pd.DataFrame(cluster_1)
cluster_2_df = pd.DataFrame(cluster_2)
cluster_3_df = pd.DataFrame(cluster_3)
cluster_0_df
income | spending | cluster_labels | |
---|---|---|---|
3 | 32474.0 | 11 | 0 |
4 | 21474.0 | 91 | 0 |
13 | 26872.0 | 72 | 0 |
14 | 4428.0 | 359 | 0 |
17 | 32173.0 | 22 | 0 |
... | ... | ... | ... |
2220 | 7500.0 | 100 | 0 |
2226 | 42231.0 | 37 | 0 |
2227 | 20130.0 | 34 | 0 |
2228 | 20130.0 | 34 | 0 |
2233 | 31056.0 | 55 | 0 |
736 rows × 3 columns
cluster_0_df.shape
(736, 3)
cluster_0_df.describe()
income | spending | cluster_labels | |
---|---|---|---|
count | 736.000000 | 736.000000 | 736.0 |
mean | 28541.343750 | 72.880435 | 0.0 |
std | 8635.690514 | 69.255511 | 0.0 |
min | 1730.000000 | 5.000000 | 0.0 |
25% | 22678.750000 | 32.000000 | 0.0 |
50% | 29968.500000 | 53.500000 | 0.0 |
75% | 35528.250000 | 84.000000 | 0.0 |
max | 43269.000000 | 570.000000 | 0.0 |
Cluster size: 736 clients.
Average income: The average income of clients in this cluster is approximately 28,541$
Average expenses: Clients in this cluster have an average expense of around 72.88$.
Variability: There is considerable variability in the income and expenses of clients in this cluster, as indicated by the standard deviation. This suggests that there are clients with different levels of income and expenses within the cluster.
Range of income and expenses: The range of income goes from 1,730$ to 43,269$, while the range of expenses goes from 5$ to 570$ monetary units.
Distribution of income and expenses: The median (central value) of income is approximately 29,968$, which means that half of the clients have incomes above this value and the other half have incomes below this value. The median of the expenses is approximately 53$.
Low Spend Segment: This cluster can be considered as a segment of customers with low spending levels, since the average spend is relatively low compared to the other clusters.
With these conclusions, we can infer that this group of clients has relatively low income and expenses compared to the other clusters. This may indicate a customer profile with lower spending capacity or more conservative consumption preferences. These observations can be useful for the marketing team in targeting their strategies and campaigns more effectively towards this customer segment, for example, by offering affordable products or services and promotions tailored to their level of spending.
cluster_1_df.shape
(334, 3)
cluster_1_df.describe()
income | spending | cluster_labels | |
---|---|---|---|
count | 334.000000 | 334.000000 | 334.0 |
mean | 78690.004491 | 1688.784431 | 1.0 |
std | 8955.332848 | 272.797511 | 0.0 |
min | 51381.500000 | 1112.000000 | 1.0 |
25% | 72270.750000 | 1479.750000 | 1.0 |
50% | 79414.500000 | 1634.000000 | 1.0 |
75% | 83842.250000 | 1865.750000 | 1.0 |
max | 105471.000000 | 2525.000000 | 1.0 |
Cluster size: 334 clients.
Average income: The average income of clients in Cluster 1 is approximately 78,690$. This suggests that the customers in this cluster have relatively higher income levels compared to Cluster 0.
Average expenses: The average expenses of clients in Cluster 1 amount to around 1,688$. This indicates that the customers in this segment tend to have higher spending levels compared to Cluster 0.
Variability: The standard deviation for both income and expenses within Cluster 1 suggests a moderate degree of variability. This implies that there are clients with varying income and spending patterns within this cluster.
Range of income and expenses: The range of income in Cluster 1 spans from 51,381$ to 105,471$ while the range of expenses ranges from 1.112$ to 2,525$. This wide range indicates the presence of customers with different income and spending levels within this cluster.
Distribution of income and expenses: The median income of clients in Cluster 1 is approximately 79,414$. This suggests that half of the clients have an income below this value, while the other half has an income above it.
Overall, Cluster 1 represents a segment of customers with relatively higher income and spending levels compared to Cluster 0. These customers may have a higher purchasing power and exhibit more liberal spending behavior. The marketing department can leverage these insights to develop strategies and campaigns that cater to this customer segment. For example, they can focus on offering premium products or services, creating promotions that align with the customers' higher spending capacity, and delivering messaging that resonates with their more indulgent consumption preferences.
cluster_2_df.shape
(612, 3)
cluster_2_df.describe()
income | spending | cluster_labels | |
---|---|---|---|
count | 612.000000 | 612.000000 | 612.0 |
mean | 50399.556373 | 315.490196 | 2.0 |
std | 7572.817956 | 190.461525 | 0.0 |
min | 32632.000000 | 16.000000 | 2.0 |
25% | 44767.750000 | 147.750000 | 2.0 |
50% | 49973.500000 | 307.500000 | 2.0 |
75% | 55305.250000 | 455.750000 | 2.0 |
max | 73395.000000 | 839.000000 | 2.0 |
From this analysis, we can conclude that Cluster 3 represents a segment of customers with a moderate income level and moderate spending behavior. These customers may have a balanced approach to their finances and exhibit stable spending patterns. The marketing department can consider targeting this customer segment with products or services that align with their moderate income and spending capacity. They can also focus on providing value-based offerings and promotions that cater to their stable spending behavior.
cluster_3_df.shape
(547, 3)
cluster_3_df.describe()
income | spending | cluster_labels | |
---|---|---|---|
count | 547.000000 | 547.000000 | 547.0 |
mean | 67507.232176 | 985.332724 | 3.0 |
std | 9879.242251 | 206.307558 | 0.0 |
min | 2447.000000 | 277.000000 | 3.0 |
25% | 61194.500000 | 824.000000 | 3.0 |
50% | 66886.000000 | 988.000000 | 3.0 |
75% | 74279.000000 | 1150.000000 | 3.0 |
max | 113734.000000 | 1730.000000 | 3.0 |
Based on this analysis, we can conclude that Cluster 3 represents a segment of clients with a relatively higher average income and higher spending levels compared to other clusters. These clients may have a higher spending capacity and may be more inclined towards making larger purchases. The marketing department can consider targeting this customer segment with higher-end products or services, luxury offerings, or personalized marketing campaigns that cater to their higher spending power.
selected_columns = ['age', 'year_customer', 'income', 'dependents', 'recency']
selected_data = df[selected_columns]
mean_values = selected_data.mean().round(1)
mean_df = pd.DataFrame(mean_values, columns=['Mean'])
mean_df.index.name = 'Variable'
mean_df.columns = ['Average Customer']
mean_df.index = mean_df.index.rename('Characteristics')
mean_df
Average Customer | |
---|---|
Characteristics | |
age | 54.1 |
year_customer | 2013.0 |
income | 51619.5 |
dependents | 1.0 |
recency | 49.1 |
Wines Followed by meats
# Calcular la suma de las ventas de cada producto
sales = df[['mntwines', 'mntfruits', 'mntmeatproducts', 'mntfishproducts', 'mntsweetproducts', 'mntgoldprods']].sum()
# Crear el gráfico de pie
plt.figure(figsize=(8, 6))
plt.pie(sales, labels=sales.index, autopct='%1.1f%%')
# Configurar el título
plt.title('Product Performance by Sales')
# Mostrar el gráfico
plt.show()
# Seleccionar las columnas de interés
channels = df[['numwebvisitsmonth', 'numstorepurchases', 'numcatalogpurchases', 'numwebpurchases', 'numdealspurchases']]
# Calcular la suma de cada canal
channel_purchases = channels.sum()
# Calcular el porcentaje de compras de cada canal
total_purchases = df['total_purchases'].sum()
channel_performance = (channel_purchases / total_purchases) * 100
# Crear el gráfico de pie
plt.figure(figsize=(8, 6))
plt.pie(channel_performance, labels=channel_performance.index, autopct='%1.1f%%')
# Configurar el título
plt.title('Channels Performance by Total Purchases')
# Mostrar el gráfico
plt.show()
The most successful campaign is the most recent (column name: Response)
fig, ax = plt.subplots(1, 7, figsize=(15, 4))
sns.countplot(x=df_cat['acceptedcmp1'], ax=ax[0])
sns.countplot(x=df_cat['acceptedcmp2'], ax=ax[1])
sns.countplot(x=df_cat['acceptedcmp3'], ax=ax[2])
sns.countplot(x=df_cat['acceptedcmp4'], ax=ax[3])
sns.countplot(x=df_cat['acceptedcmp5'], ax=ax[4])
sns.countplot(x=df_cat['response'], ax=ax[5])
ax[3].legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.tight_layout()
plt.show()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
cluster
income | spending | |
---|---|---|
0 | 84835.0 | 1190 |
1 | 57091.0 | 577 |
2 | 67267.0 | 251 |
3 | 32474.0 | 11 |
4 | 21474.0 | 91 |
... | ... | ... |
2232 | 66476.0 | 689 |
2233 | 31056.0 | 55 |
2234 | 46310.0 | 309 |
2235 | 65819.0 | 1383 |
2236 | 94871.0 | 1078 |
2229 rows × 2 columns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
#X-y split
Y=cluster['spending']
X=cluster.drop('spending',axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Normalizar los datos en X_train y X_test usando StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Crear y ajustar el modelo de regresión lineal
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train)
LinearRegression()
# Realizar las predicciones en conjunto de entrenamiento y conjunto de prueba
y_train_pred = regression_model.predict(X_train_scaled)
y_test_pred = regression_model.predict(X_test_scaled)
# Calcular las métricas en conjunto de entrenamiento
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = sqrt(mse_train)
# Calcular las métricas en conjunto de prueba
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = sqrt(mse_test)
# Imprimir las métricas
print("Métricas en conjunto de entrenamiento:")
print("R^2:", r2_train)
print("MSE:", mse_train)
print("MAE:", mae_train)
print("RMSE:", rmse_train)
print("------------------------------")
print("Métricas en conjunto de prueba:")
print("R^2:", r2_test)
print("MSE:", mse_test)
print("MAE:", mae_test)
print("RMSE:", rmse_test)
Métricas en conjunto de entrenamiento: R^2: 0.6784919073987676 MSE: 118045.350349332 MAE: 268.46887171988595 RMSE: 343.57728439076413 ------------------------------ Métricas en conjunto de prueba: R^2: 0.6469237899197506 MSE: 118895.62031097068 MAE: 267.66162931031494 RMSE: 344.8124422218124
df_cat
education | marital_status | acceptedcmp3 | acceptedcmp4 | acceptedcmp5 | acceptedcmp1 | acceptedcmp2 | response | complain | country | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Graduation | single | 0 | 0 | 0 | 0 | 0 | 1 | 0 | SP |
1 | Graduation | single | 0 | 0 | 0 | 0 | 1 | 1 | 0 | CA |
2 | Graduation | couple | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US |
3 | Graduation | couple | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
4 | Graduation | single | 1 | 1 | 0 | 0 | 0 | 1 | 0 | SP |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2232 | PhD | single | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US |
2233 | 2n Cycle | couple | 0 | 0 | 0 | 0 | 0 | 0 | 0 | SP |
2234 | Graduation | single | 0 | 0 | 0 | 0 | 0 | 0 | 0 | SP |
2235 | Graduation | couple | 0 | 0 | 0 | 0 | 0 | 0 | 0 | IND |
2236 | PhD | couple | 0 | 0 | 1 | 0 | 0 | 1 | 0 | CA |
2229 rows × 10 columns
df_cat_l=df_cat[['education', 'marital_status', 'country']]
df_cat_l= df_cat_l.reset_index()
df_cat_l = df_cat_l.drop(columns=['index'])
df_cat_l
education | marital_status | country | |
---|---|---|---|
0 | Graduation | single | SP |
1 | Graduation | single | CA |
2 | Graduation | couple | US |
3 | Graduation | couple | AUS |
4 | Graduation | single | SP |
... | ... | ... | ... |
2224 | PhD | single | US |
2225 | 2n Cycle | couple | SP |
2226 | Graduation | single | SP |
2227 | Graduation | couple | IND |
2228 | PhD | couple | CA |
2229 rows × 3 columns
#Encodign categorical data
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder(drop='first').fit(df_cat_l)
encoder=encoder.transform(df_cat_l).toarray()
encoder
array([[0., 1., 0., ..., 0., 1., 0.], [0., 1., 0., ..., 0., 0., 0.], [0., 1., 0., ..., 0., 0., 1.], ..., [0., 1., 0., ..., 0., 1., 0.], [0., 1., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]])
df_num
id | income | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | numdealspurchases | numwebpurchases | numcatalogpurchases | numstorepurchases | numwebvisitsmonth | age | spending | dependents | year_customer | total_purchases | total_campaigns_acc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1826 | 84835.0 | 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1 | 4 | 4 | 6 | 1 | 53 | 1190 | 0 | 2014 | 15 | 1.0 |
1 | 1 | 57091.0 | 0 | 464 | 5 | 64 | 7 | 0 | 37 | 1 | 7 | 3 | 7 | 5 | 62 | 577 | 0 | 2014 | 18 | 11.0 |
2 | 10476 | 67267.0 | 0 | 134 | 11 | 59 | 15 | 2 | 30 | 1 | 3 | 2 | 5 | 2 | 65 | 251 | 1 | 2014 | 11 | 0.0 |
3 | 1386 | 32474.0 | 0 | 10 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 7 | 56 | 11 | 2 | 2014 | 4 | 0.0 |
4 | 5371 | 21474.0 | 0 | 6 | 16 | 24 | 11 | 0 | 34 | 2 | 3 | 1 | 2 | 7 | 34 | 91 | 1 | 2014 | 8 | 110001.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2232 | 10142 | 66476.0 | 99 | 372 | 18 | 126 | 47 | 48 | 78 | 2 | 5 | 2 | 11 | 4 | 47 | 689 | 1 | 2013 | 20 | 0.0 |
2233 | 5263 | 31056.0 | 99 | 5 | 10 | 13 | 3 | 8 | 16 | 1 | 1 | 0 | 3 | 8 | 46 | 55 | 1 | 2013 | 5 | 0.0 |
2234 | 22 | 46310.0 | 99 | 185 | 2 | 88 | 15 | 5 | 14 | 2 | 6 | 1 | 5 | 8 | 47 | 309 | 1 | 2012 | 14 | 0.0 |
2235 | 528 | 65819.0 | 99 | 267 | 38 | 701 | 149 | 165 | 63 | 1 | 5 | 4 | 10 | 3 | 45 | 1383 | 0 | 2012 | 20 | 0.0 |
2236 | 4070 | 94871.0 | 99 | 169 | 24 | 553 | 188 | 0 | 144 | 1 | 8 | 5 | 4 | 7 | 54 | 1078 | 2 | 2012 | 18 | 1001.0 |
2229 rows × 20 columns
df_num_l=df_num[['income', 'numdealspurchases', 'numwebpurchases', 'numcatalogpurchases', 'numstorepurchases','numwebvisitsmonth', 'age', 'dependents', 'spending']]
df_num_l= df_num_l.reset_index()
df_num_l = df_num_l.drop(columns=['index'])
df_num_l
income | numdealspurchases | numwebpurchases | numcatalogpurchases | numstorepurchases | numwebvisitsmonth | age | dependents | spending | |
---|---|---|---|---|---|---|---|---|---|
0 | 84835.0 | 1 | 4 | 4 | 6 | 1 | 53 | 0 | 1190 |
1 | 57091.0 | 1 | 7 | 3 | 7 | 5 | 62 | 0 | 577 |
2 | 67267.0 | 1 | 3 | 2 | 5 | 2 | 65 | 1 | 251 |
3 | 32474.0 | 1 | 1 | 0 | 2 | 7 | 56 | 2 | 11 |
4 | 21474.0 | 2 | 3 | 1 | 2 | 7 | 34 | 1 | 91 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2224 | 66476.0 | 2 | 5 | 2 | 11 | 4 | 47 | 1 | 689 |
2225 | 31056.0 | 1 | 1 | 0 | 3 | 8 | 46 | 1 | 55 |
2226 | 46310.0 | 2 | 6 | 1 | 5 | 8 | 47 | 1 | 309 |
2227 | 65819.0 | 1 | 5 | 4 | 10 | 3 | 45 | 0 | 1383 |
2228 | 94871.0 | 1 | 8 | 5 | 4 | 7 | 54 | 2 | 1078 |
2229 rows × 9 columns
#X-y split
y=df_num_l['spending']
X=df_num_l.drop(['spending'],axis=1)
X
income | numdealspurchases | numwebpurchases | numcatalogpurchases | numstorepurchases | numwebvisitsmonth | age | dependents | |
---|---|---|---|---|---|---|---|---|
0 | 84835.0 | 1 | 4 | 4 | 6 | 1 | 53 | 0 |
1 | 57091.0 | 1 | 7 | 3 | 7 | 5 | 62 | 0 |
2 | 67267.0 | 1 | 3 | 2 | 5 | 2 | 65 | 1 |
3 | 32474.0 | 1 | 1 | 0 | 2 | 7 | 56 | 2 |
4 | 21474.0 | 2 | 3 | 1 | 2 | 7 | 34 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2224 | 66476.0 | 2 | 5 | 2 | 11 | 4 | 47 | 1 |
2225 | 31056.0 | 1 | 1 | 0 | 3 | 8 | 46 | 1 |
2226 | 46310.0 | 2 | 6 | 1 | 5 | 8 | 47 | 1 |
2227 | 65819.0 | 1 | 5 | 4 | 10 | 3 | 45 | 0 |
2228 | 94871.0 | 1 | 8 | 5 | 4 | 7 | 54 | 2 |
2229 rows × 8 columns
# Normalizar datos numéricos utilizando StandardScaler
scaler = StandardScaler()
X_es = scaler.fit_transform(X)
X_es
array([[ 1.6126693 , -0.69656739, -0.03605431, ..., -1.79512893, -0.09494056, -1.26627659], [ 0.2656505 , -0.69656739, 1.04508986, ..., -0.13842988, 0.67397452, -1.26627659], [ 0.75971276, -0.69656739, -0.3964357 , ..., -1.38095417, 0.93027955, 0.06385089], ..., [-0.25778554, -0.1688146 , 0.68470847, ..., 1.10409441, -0.60755062, 0.06385089], [ 0.68940988, -0.69656739, 0.32432708, ..., -0.96677941, -0.77842064, -1.26627659], [ 2.09993432, -0.69656739, 1.40547125, ..., 0.68991965, -0.00950556, 1.39397838]])
X=np.concatenate((X_es ,encoder),axis=1)
X
array([[ 1.6126693 , -0.69656739, -0.03605431, ..., 0. , 1. , 0. ], [ 0.2656505 , -0.69656739, 1.04508986, ..., 0. , 0. , 0. ], [ 0.75971276, -0.69656739, -0.3964357 , ..., 0. , 0. , 1. ], ..., [-0.25778554, -0.1688146 , 0.68470847, ..., 0. , 1. , 0. ], [ 0.68940988, -0.69656739, 0.32432708, ..., 0. , 0. , 0. ], [ 2.09993432, -0.69656739, 1.40547125, ..., 0. , 0. , 0. ]])
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled
array([[-0.61140256, -0.68389372, -1.11115159, ..., -0.41585133, -0.96225045, -0.22786351], [ 0.70526191, 0.33801644, 1.75100978, ..., -0.41585133, 1.03923048, -0.22786351], [ 0.14579357, 0.84897152, -0.39561125, ..., -0.41585133, -0.96225045, -0.22786351], ..., [-0.15850648, -0.68389372, -1.11115159, ..., -0.41585133, 1.03923048, -0.22786351], [ 1.10278267, -0.68389372, -0.03784108, ..., 2.40470553, -0.96225045, -0.22786351], [ 0.82513769, -0.68389372, 1.75100978, ..., -0.41585133, -0.96225045, -0.22786351]])
# Crear y ajustar el modelo de regresión lineal
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train)
LinearRegression()
# Realizar las predicciones en conjunto de entrenamiento y conjunto de prueba
y_train_pred = regression_model.predict(X_train_scaled)
y_test_pred = regression_model.predict(X_test_scaled)
# Calcular las métricas
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = sqrt(mse_train)
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = sqrt(mse_test)
# Imprimir las métricas
print("Métricas en conjunto de entrenamiento:")
print("R^2:", r2_train)
print("MSE:", mse_train)
print("MAE:", mae_train)
print("RMSE:", rmse_train)
print("---------------------------")
print("Métricas en conjunto de prueba:")
print("R^2:", r2_test)
print("MSE:", mse_test)
print("MAE:", mae_test)
print("RMSE:", rmse_test)
Métricas en conjunto de entrenamiento: R^2: 0.8156524015032155 MSE: 68735.41992095124 MAE: 188.13034539025102 RMSE: 262.1744074484602 --------------------------- Métricas en conjunto de prueba: R^2: 0.8082373964076417 MSE: 63965.346222295244 MAE: 178.48083942988046 RMSE: 252.91371299772428
df_num_l=df_num[['numdealspurchases', 'numwebpurchases', 'numcatalogpurchases', 'numstorepurchases','numwebvisitsmonth', 'age', 'dependents', 'spending']]
df_num_l= df_num_l.reset_index()
df_num_l = df_num_l.drop(columns=['index'])
df_num_l
numdealspurchases | numwebpurchases | numcatalogpurchases | numstorepurchases | numwebvisitsmonth | age | dependents | spending | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 4 | 4 | 6 | 1 | 53 | 0 | 1190 |
1 | 1 | 7 | 3 | 7 | 5 | 62 | 0 | 577 |
2 | 1 | 3 | 2 | 5 | 2 | 65 | 1 | 251 |
3 | 1 | 1 | 0 | 2 | 7 | 56 | 2 | 11 |
4 | 2 | 3 | 1 | 2 | 7 | 34 | 1 | 91 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2224 | 2 | 5 | 2 | 11 | 4 | 47 | 1 | 689 |
2225 | 1 | 1 | 0 | 3 | 8 | 46 | 1 | 55 |
2226 | 2 | 6 | 1 | 5 | 8 | 47 | 1 | 309 |
2227 | 1 | 5 | 4 | 10 | 3 | 45 | 0 | 1383 |
2228 | 1 | 8 | 5 | 4 | 7 | 54 | 2 | 1078 |
2229 rows × 8 columns
#X-y split
y=df_num_l['spending']
X=df_num_l.drop(['spending'],axis=1)
X
numdealspurchases | numwebpurchases | numcatalogpurchases | numstorepurchases | numwebvisitsmonth | age | dependents | |
---|---|---|---|---|---|---|---|
0 | 1 | 4 | 4 | 6 | 1 | 53 | 0 |
1 | 1 | 7 | 3 | 7 | 5 | 62 | 0 |
2 | 1 | 3 | 2 | 5 | 2 | 65 | 1 |
3 | 1 | 1 | 0 | 2 | 7 | 56 | 2 |
4 | 2 | 3 | 1 | 2 | 7 | 34 | 1 |
... | ... | ... | ... | ... | ... | ... | ... |
2224 | 2 | 5 | 2 | 11 | 4 | 47 | 1 |
2225 | 1 | 1 | 0 | 3 | 8 | 46 | 1 |
2226 | 2 | 6 | 1 | 5 | 8 | 47 | 1 |
2227 | 1 | 5 | 4 | 10 | 3 | 45 | 0 |
2228 | 1 | 8 | 5 | 4 | 7 | 54 | 2 |
2229 rows × 7 columns
# Normalizar datos numéricos utilizando StandardScaler
scaler = StandardScaler()
X_es = scaler.fit_transform(X)
X_es
array([[-0.69656739, -0.03605431, 0.48770355, ..., -1.79512893, -0.09494056, -1.26627659], [-0.69656739, 1.04508986, 0.12999009, ..., -0.13842988, 0.67397452, -1.26627659], [-0.69656739, -0.3964357 , -0.22772338, ..., -1.38095417, 0.93027955, 0.06385089], ..., [-0.1688146 , 0.68470847, -0.58543684, ..., 1.10409441, -0.60755062, 0.06385089], [-0.69656739, 0.32432708, 0.48770355, ..., -0.96677941, -0.77842064, -1.26627659], [-0.69656739, 1.40547125, 0.84541702, ..., 0.68991965, -0.00950556, 1.39397838]])
X=np.concatenate((X_es ,encoder),axis=1)
X
array([[-0.69656739, -0.03605431, 0.48770355, ..., 0. , 1. , 0. ], [-0.69656739, 1.04508986, 0.12999009, ..., 0. , 0. , 0. ], [-0.69656739, -0.3964357 , -0.22772338, ..., 0. , 0. , 1. ], ..., [-0.1688146 , 0.68470847, -0.58543684, ..., 0. , 1. , 0. ], [-0.69656739, 0.32432708, 0.48770355, ..., 0. , 0. , 0. ], [-0.69656739, 1.40547125, 0.84541702, ..., 0. , 0. , 0. ]])
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled
array([[-0.68389372, -1.11115159, -0.92884441, ..., -0.41585133, -0.96225045, -0.22786351], [ 0.33801644, 1.75100978, 0.85168801, ..., -0.41585133, 1.03923048, -0.22786351], [ 0.84897152, -0.39561125, -0.57273793, ..., -0.41585133, -0.96225045, -0.22786351], ..., [-0.68389372, -1.11115159, -0.92884441, ..., -0.41585133, 1.03923048, -0.22786351], [-0.68389372, -0.03784108, 0.13947504, ..., 2.40470553, -0.96225045, -0.22786351], [-0.68389372, 1.75100978, 1.56390097, ..., -0.41585133, -0.96225045, -0.22786351]])
# Crear y ajustar el modelo de regresión lineal
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train)
LinearRegression()
# Realizar las predicciones en conjunto de entrenamiento y conjunto de prueba
y_train_pred = regression_model.predict(X_train_scaled)
y_test_pred = regression_model.predict(X_test_scaled)
# Calcular las métricas
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = sqrt(mse_train)
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = sqrt(mse_test)
# Imprimir las métricas
print("Métricas en conjunto de entrenamiento:")
print("R^2:", r2_train)
print("MSE:", mse_train)
print("MAE:", mae_train)
print("RMSE:", rmse_train)
print("---------------------------")
print("Métricas en conjunto de prueba:")
print("R^2:", r2_test)
print("MSE:", mse_test)
print("MAE:", mae_test)
print("RMSE:", rmse_test)
Métricas en conjunto de entrenamiento: R^2: 0.7532939530028419 MSE: 91986.2470445113 MAE: 205.98088193206826 RMSE: 303.29234583897977 --------------------------- Métricas en conjunto de prueba: R^2: 0.7535298353825518 MSE: 82213.8890371711 MAE: 194.52090187890965 RMSE: 286.7296445036179