1.DATA CLEANING- DATA WRANGLING¶

In [11]:

#Importing libraries
import pandas as pd 
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
pd.set_option('display.max_columns', None)
%matplotlib inline

In [12]:

#Loading,reading and storing dataset
df = pd.read_csv('marketing_data.csv')

In [13]:

#Review of data fields
df

Out[13]:

	ID	Year_Birth	Education	Marital_Status	Income	Kidhome	Teenhome	Dt_Customer	Recency	MntWines	MntFruits	MntMeatProducts	MntFishProducts	MntSweetProducts	MntGoldProds	NumDealsPurchases	NumWebPurchases	NumCatalogPurchases	NumStorePurchases	NumWebVisitsMonth	AcceptedCmp3	AcceptedCmp4	AcceptedCmp5	AcceptedCmp1	AcceptedCmp2	Response	Complain	Country
0	1826	1970	Graduation	Divorced	$84,835.00	0	0	6/16/14	0	189	104	379	111	189	218	1	4	4	6	1	0	0	0	0	0	1	0	SP
1	1	1961	Graduation	Single	$57,091.00	0	0	6/15/14	0	464	5	64	7	0	37	1	7	3	7	5	0	0	0	0	1	1	0	CA
2	10476	1958	Graduation	Married	$67,267.00	0	1	5/13/14	0	134	11	59	15	2	30	1	3	2	5	2	0	0	0	0	0	0	0	US
3	1386	1967	Graduation	Together	$32,474.00	1	1	5/11/14	0	10	0	1	0	0	0	1	1	0	2	7	0	0	0	0	0	0	0	AUS
4	5371	1989	Graduation	Single	$21,474.00	1	0	4/8/14	0	6	16	24	11	0	34	2	3	1	2	7	1	0	0	0	0	1	0	SP
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2235	10142	1976	PhD	Divorced	$66,476.00	0	1	3/7/13	99	372	18	126	47	48	78	2	5	2	11	4	0	0	0	0	0	0	0	US
2236	5263	1977	2n Cycle	Married	$31,056.00	1	0	1/22/13	99	5	10	13	3	8	16	1	1	0	3	8	0	0	0	0	0	0	0	SP
2237	22	1976	Graduation	Divorced	$46,310.00	1	0	12/3/12	99	185	2	88	15	5	14	2	6	1	5	8	0	0	0	0	0	0	0	SP
2238	528	1978	Graduation	Married	$65,819.00	0	0	11/29/12	99	267	38	701	149	165	63	1	5	4	10	3	0	0	0	0	0	0	0	IND
2239	4070	1969	PhD	Married	$94,871.00	0	2	9/1/12	99	169	24	553	188	0	144	1	8	5	4	7	0	1	1	0	0	1	0	CA

2240 rows × 28 columns

Understanding data¶

ID=Customer's unique identifier
Year_Birth=Customer's birth year
Education=Customer's education level
Marital_Status=Customer's marital status
Income=Customer's yearly household income
Kidhome=Number of children in customer's household
Teenhome=Number of teenagers in customer's household
Dt_Customer=Date of customer's enrollment with the company
Recency=Number of days since customer's last purchase
MntWines=Amount spent on wine in the last 2 years
MntFruits=Amount spent on fruits in the last 2 years
MntMeatProducts=Amount spent on meat in the last 2 years
MntFishProducts=Amount spent on fish in the last 2 years
MntSweetProducts=Amount spent on sweets in the last 2 years
MntGoldProds=Amount spent on gold in the last 2 years
NumDealsPurchases=Number of purchases made with a discount
NumWebPurchases=Number of purchases made through the company's web site
NumCatalogPurchases=Number of purchases made using a catalogue
NumStorePurchases=Number of purchases made directly in stores
NumWebVisitsMonth=Number of visits to company's web site in the last month
AcceptedCmp3=1 if customer accepted the offer in the 3rd campaign, 0 otherwise
AcceptedCmp4=1 if customer accepted the offer in the 4th campaign, 0 otherwise
AcceptedCmp5=1 if customer accepted the offer in the 5th campaign, 0 otherwise
AcceptedCmp1=1 if customer accepted the offer in the 1st campaign, 0 otherwise
AcceptedCmp2=1 if customer accepted the offer in the 2nd campaign, 0 otherwise
Response=1 if customer accepted the offer in the last campaign, 0 otherwise
Complain=1 if customer complained in the last 2 years, 0 otherwise
Country=Customer's location

In [14]:

#Show the DataFrame's shape
df.shape

Out[14]:

(2240, 28)

In [15]:

 # Standardize column names
df.columns=[e.lower().replace(' ', '') for e in df.columns]  
df.columns

Out[15]:

Index(['id', 'year_birth', 'education', 'marital_status', 'income', 'kidhome',
       'teenhome', 'dt_customer', 'recency', 'mntwines', 'mntfruits',
       'mntmeatproducts', 'mntfishproducts', 'mntsweetproducts',
       'mntgoldprods', 'numdealspurchases', 'numwebpurchases',
       'numcatalogpurchases', 'numstorepurchases', 'numwebvisitsmonth',
       'acceptedcmp3', 'acceptedcmp4', 'acceptedcmp5', 'acceptedcmp1',
       'acceptedcmp2', 'response', 'complain', 'country'],
      dtype='object')

In [16]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   2240 non-null   int64 
 1   year_birth           2240 non-null   int64 
 2   education            2240 non-null   object
 3   marital_status       2240 non-null   object
 4   income               2216 non-null   object
 5   kidhome              2240 non-null   int64 
 6   teenhome             2240 non-null   int64 
 7   dt_customer          2240 non-null   object
 8   recency              2240 non-null   int64 
 9   mntwines             2240 non-null   int64 
 10  mntfruits            2240 non-null   int64 
 11  mntmeatproducts      2240 non-null   int64 
 12  mntfishproducts      2240 non-null   int64 
 13  mntsweetproducts     2240 non-null   int64 
 14  mntgoldprods         2240 non-null   int64 
 15  numdealspurchases    2240 non-null   int64 
 16  numwebpurchases      2240 non-null   int64 
 17  numcatalogpurchases  2240 non-null   int64 
 18  numstorepurchases    2240 non-null   int64 
 19  numwebvisitsmonth    2240 non-null   int64 
 20  acceptedcmp3         2240 non-null   int64 
 21  acceptedcmp4         2240 non-null   int64 
 22  acceptedcmp5         2240 non-null   int64 
 23  acceptedcmp1         2240 non-null   int64 
 24  acceptedcmp2         2240 non-null   int64 
 25  response             2240 non-null   int64 
 26  complain             2240 non-null   int64 
 27  country              2240 non-null   object
dtypes: int64(23), object(5)
memory usage: 490.1+ KB

We have 18 numerical variables and 10 categorical variables: Education, Marital_Status, AcceptedCmp1-AcceptedCmp5,Response, Country and Complain

In [17]:

#Transformon income to numerical
df['income'] = df['income'].str.replace('$', '')
df['income'] = df['income'].str.replace(',', '').astype('float')

C:\Users\danie\AppData\Local\Temp\ipykernel_26540\1407881261.py:2: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
  df['income'] = df['income'].str.replace('$', '')

In [18]:

#Transformon AcceptedCmp1-5, Respomnse, Complain to object
df['acceptedcmp1'] = df['acceptedcmp1'].astype(str)
df['acceptedcmp2'] = df['acceptedcmp2'].astype(str)
df['acceptedcmp3'] = df['acceptedcmp3'].astype(str)
df['acceptedcmp4'] = df['acceptedcmp3'].astype(str)
df['acceptedcmp5'] = df['acceptedcmp5'].astype(str)
df['response'] = df['response'].astype(str)
df['complain'] = df['complain'].astype(str)

In [19]:

#Checking unique values of dataset
for column in df.columns:
    unique_values = df[column].unique()
    print(f"Valores únicos de la columna '{column}':")
    print(unique_values)
    print()

Valores únicos de la columna 'id':
[ 1826     1 10476 ...    22   528  4070]

Valores únicos de la columna 'year_birth':
[1970 1961 1958 1967 1989 1954 1947 1979 1959 1981 1969 1977 1960 1966
 1976 1965 1956 1975 1971 1986 1972 1974 1990 1987 1984 1968 1955 1983
 1973 1978 1952 1962 1964 1982 1963 1957 1980 1945 1949 1948 1953 1946
 1985 1992 1944 1951 1988 1950 1994 1993 1991 1893 1996 1995 1899 1943
 1941 1940 1900]

Valores únicos de la columna 'education':
['Graduation' 'PhD' '2n Cycle' 'Master' 'Basic']

Valores únicos de la columna 'marital_status':
['Divorced' 'Single' 'Married' 'Together' 'Widow' 'YOLO' 'Alone' 'Absurd']

Valores únicos de la columna 'income':
[84835. 57091. 67267. ... 46310. 65819. 94871.]

Valores únicos de la columna 'kidhome':
[0 1 2]

Valores únicos de la columna 'teenhome':
[0 1 2]

Valores únicos de la columna 'dt_customer':
['6/16/14' '6/15/14' '5/13/14' '5/11/14' '4/8/14' '3/17/14' '1/29/14'
 '1/18/14' '1/11/14' '12/27/13' '12/9/13' '12/7/13' '10/16/13' '10/5/13'
 '9/11/13' '8/1/13' '7/23/13' '7/1/13' '5/28/13' '3/26/13' '3/15/13'
 '2/12/13' '11/23/12' '10/13/12' '9/14/12' '6/29/14' '5/31/14' '5/30/14'
 '4/27/14' '4/11/14' '10/29/13' '10/9/13' '5/10/13' '5/9/13' '4/25/13'
 '4/20/13' '3/30/13' '3/1/13' '2/14/13' '1/11/13' '1/3/13' '12/19/12'
 '12/15/12' '12/2/12' '9/17/12' '9/11/12' '5/12/14' '4/28/14' '3/29/14'
 '3/6/14' '3/4/14' '2/4/14' '2/3/14' '1/1/14' '12/12/13' '11/15/13'
 '9/20/13' '9/5/13' '8/31/13' '7/30/13' '7/27/13' '6/22/13' '1/5/13'
 '11/21/12' '11/11/12' '9/28/12' '9/27/12' '9/7/12' '8/13/12' '8/11/12'
 '8/2/12' '6/25/14' '5/28/14' '4/14/14' '3/10/14' '2/27/14' '2/7/14'
 '1/28/14' '11/17/13' '11/7/13' '10/17/13' '10/13/13' '10/12/13' '9/30/13'
 '7/3/13' '6/10/13' '5/29/13' '4/29/13' '3/10/13' '1/2/13' '11/2/12'
 '10/18/12' '10/1/12' '9/3/12' '8/26/12' '5/23/14' '5/17/14' '4/21/14'
 '3/23/14' '12/16/13' '11/26/13' '11/14/13' '11/6/13' '10/6/13' '9/27/13'
 '9/18/13' '9/9/13' '7/18/13' '7/8/13' '5/27/13' '3/5/13' '2/20/13'
 '1/12/13' '12/24/12' '11/19/12' '3/28/14' '2/24/14' '9/2/13' '8/20/13'
 '6/23/13' '5/5/13' '4/5/13' '1/4/13' '12/27/12' '11/10/12' '10/29/12'
 '9/22/12' '3/31/14' '3/21/14' '2/9/14' '9/23/13' '6/27/13' '3/28/13'
 '3/12/13' '1/16/13' '1/8/13' '12/29/12' '12/12/12' '11/25/12' '9/21/12'
 '9/9/12' '9/5/12' '8/17/12' '6/22/14' '5/1/14' '1/3/14' '10/11/13'
 '8/13/13' '6/9/13' '5/7/13' '10/2/12' '9/12/12' '3/19/14' '3/3/14'
 '2/22/14' '1/24/14' '12/4/13' '11/28/13' '11/5/13' '10/3/13' '8/9/13'
 '8/7/13' '7/17/13' '7/9/13' '6/11/13' '5/17/13' '3/23/13' '2/19/13'
 '1/19/13' '1/10/13' '1/1/13' '11/12/12' '5/18/14' '3/30/14' '1/30/14'
 '1/26/14' '1/22/14' '1/15/14' '12/13/13' '8/4/13' '5/1/13' '4/24/13'
 '4/3/13' '2/3/13' '11/16/12' '8/3/12' '4/18/14' '4/1/14' '3/18/14'
 '2/10/14' '11/23/13' '11/21/13' '10/2/13' '7/21/13' '6/18/13' '3/24/13'
 '12/6/12' '11/9/12' '2/14/14' '10/22/13' '10/4/13' '9/21/13' '8/5/13'
 '7/14/13' '7/4/13' '4/12/13' '4/10/13' '4/8/13' '3/31/13' '3/17/13'
 '1/21/13' '12/10/12' '9/24/12' '8/6/12' '6/18/14' '4/5/14' '12/21/13'
 '10/27/13' '10/21/13' '9/19/13' '9/4/13' '6/25/13' '4/27/13' '4/18/13'
 '12/30/12' '8/22/12' '8/8/12' '6/19/14' '4/20/14' '2/28/14' '12/17/13'
 '11/25/13' '10/28/13' '8/15/13' '7/5/13' '6/19/13' '6/16/13' '4/22/13'
 '3/19/13' '2/23/13' '2/15/13' '10/31/12' '10/7/12' '8/9/12' '5/6/14'
 '4/15/14' '3/5/14' '2/19/14' '9/7/13' '8/6/13' '7/25/13' '4/30/13'
 '9/10/12' '3/20/14' '9/28/13' '9/24/13' '2/16/13' '11/22/12' '9/18/12'
 '8/16/12' '6/5/14' '4/13/14' '4/10/14' '4/3/14' '2/12/14' '12/15/13'
 '10/30/13' '8/26/13' '2/2/13' '1/25/13' '11/17/12' '11/13/12' '11/7/12'
 '11/1/12' '10/16/12' '5/8/14' '3/2/14' '6/24/13' '6/13/13' '4/23/13'
 '4/15/13' '1/29/13' '10/30/12' '10/23/12' '4/17/14' '2/25/14' '12/11/13'
 '10/10/13' '5/20/13' '5/18/13' '4/7/13' '3/3/13' '12/7/12' '11/28/12'
 '10/27/12' '9/15/12' '6/17/14' '5/29/14' '3/1/14' '2/15/14' '12/23/13'
 '11/29/13' '10/25/13' '8/17/13' '6/6/13' '3/29/13' '9/23/12' '8/30/12'
 '8/1/12' '2/8/14' '1/25/14' '11/27/13' '10/19/13' '3/7/13' '2/28/13'
 '1/17/13' '11/20/12' '11/5/12' '11/3/12' '8/31/12' '8/12/12' '5/15/14'
 '4/12/14' '4/6/14' '2/6/14' '7/29/13' '6/29/13' '6/17/13' '6/8/13'
 '5/26/13' '11/8/12' '8/4/12' '4/30/14' '4/7/14' '3/12/14' '4/13/13'
 '2/13/13' '6/3/14' '3/25/14' '2/17/14' '2/5/14' '1/27/14' '1/14/14'
 '7/11/13' '6/2/13' '6/1/13' '5/4/13' '3/18/13' '12/3/12' '11/24/12'
 '10/26/12' '6/20/14' '1/19/14' '1/9/14' '12/29/13' '12/26/13' '12/8/13'
 '11/20/13' '8/23/13' '8/19/13' '7/24/13' '10/6/12' '8/18/12' '5/7/14'
 '11/9/13' '8/25/13' '5/16/13' '4/1/13' '3/27/13' '2/8/13' '9/20/12'
 '5/22/14' '12/30/13' '11/2/13' '8/21/13' '7/12/13' '6/28/13' '6/4/13'
 '5/31/13' '3/6/13' '2/18/13' '9/26/12' '8/19/12' '5/2/14' '4/29/14'
 '2/2/14' '1/5/14' '12/5/13' '11/18/13' '9/10/13' '8/3/13' '2/21/13'
 '2/10/13' '1/31/13' '12/9/12' '9/29/12' '6/9/14' '4/2/14' '3/24/14'
 '1/23/14' '9/16/13' '9/12/13' '7/15/13' '3/9/13' '2/9/13' '12/14/12'
 '10/17/12' '6/23/14' '6/12/14' '6/7/14' '4/9/14' '2/13/14' '12/6/13'
 '10/20/13' '6/20/13' '5/8/13' '3/11/13' '9/6/12' '3/9/14' '2/11/14'
 '10/8/13' '8/28/13' '7/6/13' '5/30/13' '5/22/13' '4/2/13' '3/20/13'
 '3/14/13' '1/22/13' '9/8/12' '8/25/12' '8/14/12' '11/19/13' '6/3/13'
 '12/21/12' '10/10/12' '8/7/12' '12/24/13' '12/14/13' '5/15/13' '5/6/13'
 '1/7/13' '11/29/12' '4/24/14' '3/8/14' '7/16/13' '2/22/13' '1/20/13'
 '1/13/13' '12/25/12' '12/11/12' '6/27/14' '3/16/14' '11/3/13' '9/25/13'
 '9/15/13' '9/1/13' '8/2/13' '8/27/12' '4/4/14' '9/22/13' '12/22/12'
 '12/16/12' '8/20/12' '1/7/14' '12/1/13' '9/26/13' '2/25/13' '10/24/12'
 '10/22/12' '7/31/12' '5/19/14' '5/3/14' '4/16/14' '12/31/13' '12/2/13'
 '7/22/13' '4/21/13' '4/11/13' '3/22/14' '2/6/13' '12/4/12' '11/6/12'
 '8/28/12' '7/2/13' '10/12/12' '5/16/14' '4/25/14' '11/13/13' '9/6/13'
 '11/18/12' '10/15/12' '6/14/14' '1/17/14' '2/7/13' '12/20/13' '9/13/13'
 '1/6/13' '5/26/14' '1/13/14' '8/8/13' '4/6/13' '2/26/14' '5/14/13'
 '8/24/12' '5/27/14' '2/23/14' '1/10/14' '7/19/13' '3/25/13' '2/11/13'
 '1/15/13' '12/5/12' '6/13/14' '6/2/14' '11/1/13' '8/16/13' '2/17/13'
 '2/4/13' '10/19/12' '6/26/14' '10/23/13' '4/14/13' '10/28/12' '10/1/13'
 '3/8/13' '11/14/12' '1/12/14' '11/4/13' '8/22/13' '6/21/13' '1/23/13'
 '10/21/12' '10/4/12' '1/31/14' '1/21/14' '12/28/13' '8/11/13' '5/13/13'
 '9/2/12' '6/24/14' '6/8/14' '5/24/14' '10/18/13' '9/17/13' '8/14/13'
 '7/20/13' '6/30/13' '5/11/13' '4/16/13' '5/25/14' '5/10/14' '5/4/14'
 '8/29/13' '3/22/13' '6/4/14' '5/23/13' '2/1/13' '2/16/14' '10/24/13'
 '3/2/13' '12/18/12' '11/4/12' '6/11/14' '6/14/13' '6/10/14' '5/5/14'
 '4/19/14' '8/18/13' '2/26/13' '8/30/13' '6/12/13' '5/12/13' '10/9/12'
 '11/10/13' '8/24/13' '9/4/12' '2/27/13' '1/6/14' '7/7/13' '11/26/12'
 '8/29/12' '5/2/13' '3/4/13' '1/27/13' '8/23/12' '10/14/13' '12/23/12'
 '12/1/12' '8/5/12' '8/27/13' '12/17/12' '6/21/14' '3/26/14' '11/22/13'
 '8/21/12' '4/22/14' '10/26/13' '5/9/14' '4/17/13' '3/21/13' '1/24/13'
 '12/28/12' '3/13/14' '2/1/14' '10/15/13' '1/14/13' '10/5/12' '7/13/13'
 '4/23/14' '2/18/14' '11/12/13' '8/12/13' '12/31/12' '6/28/14' '12/3/13'
 '12/26/12' '7/30/12' '1/2/14' '4/19/13' '1/26/13' '10/14/12' '9/30/12'
 '3/11/14' '9/14/13' '7/28/13' '5/19/13' '4/28/13' '1/9/13' '10/20/12'
 '7/31/13' '5/21/13' '9/25/12' '5/3/13' '12/8/12' '3/27/14' '12/18/13'
 '11/30/13' '8/10/13' '3/16/13' '11/30/12' '3/7/14' '12/19/13' '10/25/12'
 '12/25/13' '1/4/14' '11/8/13' '11/27/12' '7/26/13' '12/20/12' '10/11/12'
 '4/26/14' '12/22/13' '6/26/13' '5/24/13' '8/15/12' '12/10/13' '9/19/12'
 '8/10/12' '6/6/14' '5/25/13' '4/9/13' '9/1/12']

Valores únicos de la columna 'recency':
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]

Valores únicos de la columna 'mntwines':
[ 189  464  134   10    6  336  769   78  384  450  140  431    3   16
   63   18   53    5  213  275   40  308  266   80  454   27  184  155
  423    7  408    1 1285   71 1248  378   12 1200  709   94    4  539
   13  670  158  283  496  292   46   34  167  318  522   67   28   58
    2  229   14  622  362   38 1074  983  262  739  610   50  650    9
  458  520   20  345   22   42  463  260  180   62  421  154  502  145
  322 1099  890  863  448  399   85   97   35  315  738  179  381  247
  711   30  288  212  173  604  482  531  230   33  784  600  168  493
   29  835 1296  760   70   68  325  303  121  561  462  376  341  595
   23  530  594  852  194  216  428 1092  559  606   11  588  316  279
 1462  546  277  948  664  268  199   73   96  587   56   45    8  508
  234  992  125  174 1478 1001  392  388   65  177  577  460  219   31
  117  236  120  200  532  297  151  997  797  823  966   37  416  314
  342  629  201  964   72  123  159  209  100 1170  387  357  912  625
  420  641  712  465   39  514  565  667   66  129  185 1023  338  647
  163   19  182  779   32  298  488  817  459  492  558  383 1043  400
  691  783   43  777  402  144  840   88   74   26   15  162 1302   47
    0  918  172  931  666   24   25  391   81  674  267  224 1239  412
 1205  304   76 1004  584  422  269  613  113  261  856  452  658  451
  395  688 1035  365  181  331  256  836  233  881  305  778   69   79
  344 1184  490 1349  153   84 1000   52  301  519  238  620  572  380
  443  252  796  410  507   55  327  397  105 1126  280  108  102  820
   86  217  352  240  124  736 1006  580  112  227  290  284  656  953
  626  547  795  895  899  178  265  349  848  962  347  426   48 1076
   36  733  523 1083   98   99  138  509  527  340  434  208  438 1308
   64  673 1050  787  833  605  800  861  161  359  324  370  245  631
 1156  398  122  386  379  243  358  258  846  724  770  815  819  479
  938  909  156  940   51  141  160  557  196  295  231 1230   91 1060
   17  302  728  110  995  704  415  241  135  254  957  293   21  754
  822  373 1171  371  801  244  562  707  356  743  235  393  721  896
  879  367  693  816  483 1181  491  755  671 1009  972  598  139  702
  239  183  771   83  170   59 1063  789  228  512  355  204  556  317
  897  187  826  513  313  329  703  571 1218  478 1332 1032  296  635
  294  913  731  281  489  551  471  176  534  768  652 1142   41  207
  299  525  676  494  353 1253  521  977  603  967  273  425 1394  171
  554  191  131 1073  157  226  186 1276  130  614 1288  563   95  210
  306  505  758  824  980  526  690  202  792  545  109  382  132  750
  223   57  407   90   61   82  220  517  615  480  752  741  456  143
  375 1215  106  871  368 1193  441   87  437  965  510  574  332  642
  354  390  476  627  165  806  763  205 1003  790 1148  984  941  799
  406  128  377  919  198  215 1493 1090  536 1149  586   77  794  774
  328  901  866  753 1048  960  888  654  499  576   44  735  548  206
  611  713  593 1311 1166 1486  515 1492  533  575  726  445 1259  248
 1111  710 1241  529  447  570  867  757  882  291  333  749  203  619
  320  311 1103  404  411  211  446  374  221 1047 1206  274  889  146
 1115   54  251  166   75  364 1016  432  270  722  516  350  403  218
  560  729 1224 1121  503 1245 1459  621  430  979  116  853   93  127
  689  686  630  466  910  389 1379  504   60 1067  264  101  581  952
  925  457 1017  249  104  111  618 1039  107  225  440 1168   89  255
  424  944  908  597  847  543  742  968  708  152  747  958  330  637
  418  986  825 1298 1132  746  812 1315  307  860 1396  583  473  553
  675 1324  582 1449  829  335  612  175  197  623  544  714  537  285
  366 1252  934  518  734 1045  717  830 1020  680  524 1013  864  164
  653  369 1012  133  188  811  639  444  372  928  272   92  555  351
  263  982  495  312  737  751  346  684  309  115  136  756 1279  538
  920  433  192  453  321  169]

Valores únicos de la columna 'mntfruits':
[104   5  11   0  16 130  80  26   4  82  10   6   1   9   2  21 174   7
  42  12  22  45 169   3  35  36  51   8  50  37  76  17 107 105  81  53
  96  86  32  19 193  63  83  28  49  34  69  40  48  13  20 148  73  23
 103  64  61 142  97 117 134  60  25  30 153  58  33  57  14  24  18 106
  88 133  99  68  72  38 129  93  74  27 185  15  79 162  71  56 168  98
  44 172  54 140 194  91 183 151 197 178 189 102 155 115  77  90 114  39
  59 199 154 123 108 137  66  31  43 120  84  29 112  46 160 159  65 111
 147 143 161 144  47 181  89  62  41 132  67 138  55 184 122  75  70  85
 149 152 100 164 101 126  87  92 166 124 190 131 163 127]

Valores únicos de la columna 'mntmeatproducts':
[ 379   64   59    1   24  411  252   11  102  535   61  441    8   12
   57    2    5    3   76   68   23   73  300   37  171  256   80  706
   21    9  449  112    6  349  189   17  204  115   33  816  249  179
   38  460    4  981   13    7   43  407  257   26   18  140   16  431
   22  518  184  309  125   28  653  780  356  154  528  333  559  348
   44   20  536  202  132  459   50   45  292  547   30   41   67  322
  232  520  215  159  217   69  100  471  469  192  849  560   14  350
  444  206  223  380  311  466  751  785  113  291   83  678  786  207
   56  273  214  592  503  228  161   88  128   48   86  240   60   96
  898   29   99   77  694   81  403   91  218   46   71  422   31  873
  111  168   10   89  269  293  282  241   53   19   15   70  172  137
  142   40   79  594  278  569  271  170  242  452  456   84  538  732
  548  850  259  651  391   90  298 1725  537  697  687   32  622  209
   97   35  731  106  804   42  243    0   98  842  253  124  108   25
  413  235  230  145  212  238  320  495  319  424   74  109  599  570
  410  483   39  565  670  117  103  575  101  501   27  689   58  482
  673  167  195  286  733  165  827  590  364   52 1622  104  211   49
  711   75  373  216  119  267  279   54  389  177  160  480  812  545
  303  144  323  797  194  127  180  134  649  497  352  400  396  107
  199  549  399  573  297  558  136   62  522  420  314  186   92  572
   63  388  239  915  367   95  157  143  890   72   82  754  372  114
  196   36  761  512  835  131  133  499  417  853  555  175  255  625
  219  181  164  925  690  509   51  153  120  270  231  562  655  595
  141   78  345  151  951  130  514  845  247  288   66   65   47  317
  222  601  523  272  193  779  254  376  155  353  152  792  510  860
  818  561  631  203   93  929  470  447  201  974  426  753  110  263
  617  178  305  708  182  156  250  158  921  234  398  280   85  746
  445  613  384  500  385  401  162  118  265  275   34  174  185  274
  221  368  123  476  169  704  639  281 1607  550  377  462  597  147
  138  432  397  604  375  374  540  505  507  454  224  226  183  815
  961  446  546  567  749  553  264  359  287  329  493  607  428  363
  843   55  790  899  337   94  614  387  883  294  260  640  530  335
  768  554  116  304  685  473  334  129  276  534  425  461  341  568
  442  419  332  672  414  935  163  369  266  747  713  740   87  611
  591  295  405  724  602  464  756  758  603  248  149  750  188  586
  693  490  674  940  606  324  716  408  430  913  717  494  519  735
  205  342  237  932  813  654  360  139  409  315  176  135  415  984
  612  213  435  122  227  491  629  395  465  742  338  864  968  392
  487  121  946  498  421  736  197  936  166  382  450  455  635  233
  351  832  801  838  354 1582  757  650  774  208  126  701]

Valores únicos de la columna 'mntfishproducts':
[111   7  15   0  11 240  21  73  80   3   2  13   4  25  65   8  50 106
 138  43  97   6  38  30  20 189 224  16 150  32  10 134 193 180 140 137
  28  27  19  98 168  63  76  82  39 205  86  52  46  84 172 119  49 229
  42  29 116 114  45  17 259 127  33  78 130 145 218  12 110  62  71 247
   1  51  91  26  23  69  34  72 124  99 185  89  47 182 160 136  64 175
 162 216 142 207  41 101 108 192  55  59  40  31  24 123 166 201  58  90
 169 219  37 125  85  77 151 242  95 234 253 258  36 227  93 188 104 128
  94  54 141 250 159 121 232 184 120 179 158 153  35 171 112 202  56 173
  81 132 164  75 197 210  60  68 199 181 237 129 156 149 167 231 102 220
 212 198  67 208 133 103 254 177  44 246 223 146  48 186 225 147  61   5
 194 115]

Valores únicos de la columna 'mntsweetproducts':
[189   0   2  32  34  98  13  20  16   4   1   3   7   8  19  30 197  14
  89 172  29 160  12   5  28  60  23  35  92 138  10  80  42  21 167  50
  75  53   9 178   6  26  25  99 101 123  82  96  68  37  48 176  49  73
  69  58  44  62 128 151 133  11 134  36  41 148  15  51  22 262  18  97
  54  77  76 121  45  64 142 198  83  55  67 149  24 175 162  17  40 137
  71  94 114  38  74  46  43 102  65 141 110 152 263  27  33 112  70  47
 115  59  85 126  61 163  91  95  31 120 116 125 144 122  57  56  81 106
  88 185 130 107 143  66 105 111 108 179 118  93 103  84 161 147 194  72
 192 109 150  86 153 165 187  78 132 191 174  87 196 157 169  39 136 139
 100 129 166 173 188 182 156  79  63 195 127 145 146 124 113]

Valores únicos de la columna 'mntgoldprods':
[218  37  30   0  34  43  65   7   5  26   4 102  32 321  22   2  10  23
  44   3 197  17  20  29  16 172  14  45  12   6   9 125  27   1  13   8
  66 262  11  15  54 129  39  35  21  40  97  67  90  31 145  42  33  62
  41  24 143  50  47 109 168  28 150  91  53 128  48 148  80  25  51 191
 108 107  69 121 147  57  71  64  63  61  55 181 135 160  56  86 119  19
 112 153 130  18  58 133 152  95  83  88 134  38  68  76 140  79  99  52
 116 138 166  59 241 157 114 219 231 110 183 205  74  36 177 192 246 127
 122  96  49  77 362  72 144 120 141 248  82 196 139  46  93 190  75 174
 170 182  78 169 106  60  92 233 146  89 198 176 171 242 111 158 101 124
 118 232 227 203  81 142 117 200  84  85 224 207 154 216  70 151  73 132
  94 223 137 100 247 163 126 103 149 162 185 204 173 245 195 161  98 131
 187 215 159 249 210 180 115 178 229 155 291 199 175 165 123]

Valores únicos de la columna 'numdealspurchases':
[ 1  2  3  0  4 12  7  5  6 11  9  8 10 15 13]

Valores únicos de la columna 'numwebpurchases':
[ 4  7  3  1 10  2  6  5 25  8  9  0 11 27 23]

Valores únicos de la columna 'numcatalogpurchases':
[ 4  3  2  0  1  7 10  6  8  5  9 11 28 22]

Valores únicos de la columna 'numstorepurchases':
[ 6  7  5  2  3  9 10  0  8  4 13 12  1 11]

Valores únicos de la columna 'numwebvisitsmonth':
[ 1  5  2  7  6  4  8  3  9  0 17 13 10 14 19 20]

Valores únicos de la columna 'acceptedcmp3':
['0' '1']

Valores únicos de la columna 'acceptedcmp4':
['0' '1']

Valores únicos de la columna 'acceptedcmp5':
['0' '1']

Valores únicos de la columna 'acceptedcmp1':
['0' '1']

Valores únicos de la columna 'acceptedcmp2':
['0' '1']

Valores únicos de la columna 'response':
['1' '0']

Valores únicos de la columna 'complain':
['0' '1']

Valores únicos de la columna 'country':
['SP' 'CA' 'US' 'AUS' 'GER' 'IND' 'SA' 'ME']

In [20]:

#Checking for duplicate rows in the data to remove if any
df.duplicated().sum()

Out[20]:

In [21]:

##hecking null values, I will remove columns have more than 75%
df.isnull().sum().sort_values(ascending=False)

Out[21]:

income                 24
id                      0
numdealspurchases       0
complain                0
response                0
acceptedcmp2            0
acceptedcmp1            0
acceptedcmp5            0
acceptedcmp4            0
acceptedcmp3            0
numwebvisitsmonth       0
numstorepurchases       0
numcatalogpurchases     0
numwebpurchases         0
mntgoldprods            0
year_birth              0
mntsweetproducts        0
mntfishproducts         0
mntmeatproducts         0
mntfruits               0
mntwines                0
recency                 0
dt_customer             0
teenhome                0
kidhome                 0
marital_status          0
education               0
country                 0
dtype: int64

In [22]:

#Plot Income variable to identify best strategy for imputation
plt.figure(figsize=(8,4))
sns.distplot(df['income'], kde=False, hist=True)
plt.title('Distribution', size=16)
plt.ylabel('count');

C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Findings:

Plot shows that most incomes are distributed between 0-100.000$ with a few outliers.

Choosing the median as the imputation strategy is a common approach when dealing with skewed distributions or the presence of outliers. The median is robust to outliers because it is not affected by extreme values, unlike the mean. By using the median, the imputed values will be representative of the central tendency of the non-null income values and provide a more accurate estimation compared to the mean. This provides a better approximation of the actual values and helps maintain the integrity and quality of the data.

In [23]:

df['income'].plot(kind='box', figsize=(3,4), patch_artist=True)

Out[23]:

<AxesSubplot:>

In [24]:

#Impute null values in Income with median value (to avoid skewing of the mean due to outliers):
df['income'] = df['income'].fillna(df['income'].median())

In [25]:

plt.figure(figsize=(8,4))
sns.distplot(df['income'], kde=False, hist=True)
plt.title('Distribution', size=16)
plt.ylabel('count');

C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

DEALING WITH OUTLIERS¶

In [26]:

#PLot outliers of numerical data

plt.show()# select columns to plot
df_to_plot = df.drop(columns=['id', 'acceptedcmp1', 'acceptedcmp2', 'acceptedcmp3', 'acceptedcmp4', 'acceptedcmp5', 'response', 'complain']).select_dtypes(include=np.number)

# subplots
df_to_plot.plot(subplots=True, layout=(4,4), kind='box', figsize=(12,14), patch_artist=True)
plt.subplots_adjust(wspace=0.5);

Multiple features contain outliers (see boxplots below), but the only that likely indicate data entry errors are Year_Birth <= 1900

In [27]:

#Remove rows where Year_Birth <= 1900
df = df[df['year_birth'] > 1900].reset_index(drop=True)

plt.figure(figsize=(3,4))
df['year_birth'].plot(kind='box', patch_artist=True);

Outliers Income column¶

We identify Income outliers and then exclude them from the data frame

In [28]:

Q1 = df.income.quantile(0.25)
Q3 = df.income.quantile(0.75)
IQR = Q3 - Q1

income_outliers = df[df.income > Q3 + 1.5*IQR]
df = df[df.income < Q3 + 1.5*IQR]
income_outliers.head()

Out[28]:

	id	year_birth	education	marital_status	income	kidhome	teenhome	dt_customer	recency	mntwines	mntfruits	mntmeatproducts	mntfishproducts	mntsweetproducts	mntgoldprods	numdealspurchases	numwebpurchases	numcatalogpurchases	numstorepurchases	numwebvisitsmonth	country
325	4931	1977	Graduation	Together	157146.0	0	0	4/29/13	13	1	0	1725	2	1	1	0	0	28	0	1	SA
497	1501	1982	PhD	Married	160803.0	0	0	8/4/12	21	55	16	1622	17	3	4	15	0	28	1	0	US
526	9432	1977	Graduation	Together	666666.0	1	0	6/2/13	23	9	14	18	8	1	12	4	3	1	3	6	SA
730	1503	1976	PhD	Together	162397.0	1	1	6/3/13	31	85	1	16	2	1	2	0	0	0	1	1	SP
851	5336	1971	Master	Together	157733.0	1	0	6/4/13	37	39	1	9	2	0	8	0	1	0	1	1	SP

In [29]:

plt.figure(figsize=(8,4))
sns.distplot(df['income'], kde=False, hist=True)
plt.title('Distribution', size=16)
plt.ylabel('count');

C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

In [30]:

 plt.figure(figsize=(3,4))
df['income'].plot(kind='box', patch_artist=True);

In [31]:

#Counting outliers for each colum

for column in df_to_plot:
    q1 = np.percentile(df_to_plot[column], 25)
    q3 = np.percentile(df_to_plot[column], 75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    num_outliers = ((df_to_plot[column] < lower_bound) | (df_to_plot[column] > upper_bound)).sum()

    print("Column '{}': {}".format(column, num_outliers))

Column 'year_birth': 3
Column 'income': 8
Column 'kidhome': 0
Column 'teenhome': 0
Column 'recency': 0
Column 'mntwines': 35
Column 'mntfruits': 227
Column 'mntmeatproducts': 175
Column 'mntfishproducts': 223
Column 'mntsweetproducts': 248
Column 'mntgoldprods': 207
Column 'numdealspurchases': 86
Column 'numwebpurchases': 4
Column 'numcatalogpurchases': 23
Column 'numstorepurchases': 0
Column 'numwebvisitsmonth': 8

In [32]:

#Counting percentage of outliers

for column in df_to_plot:
    q1 = np.percentile(df_to_plot[column], 25)
    q3 = np.percentile(df_to_plot[column], 75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    outliers = ((df_to_plot[column] < lower_bound) | (df_to_plot[column] > upper_bound))
    num_outliers = outliers.sum()
    percentage = (num_outliers / len(df_to_plot[column])) * 100

    print("Column '{}': {:.2f}%".format(column, percentage))

Column 'year_birth': 0.13%
Column 'income': 0.36%
Column 'kidhome': 0.00%
Column 'teenhome': 0.00%
Column 'recency': 0.00%
Column 'mntwines': 1.56%
Column 'mntfruits': 10.13%
Column 'mntmeatproducts': 7.81%
Column 'mntfishproducts': 9.96%
Column 'mntsweetproducts': 11.07%
Column 'mntgoldprods': 9.24%
Column 'numdealspurchases': 3.84%
Column 'numwebpurchases': 0.18%
Column 'numcatalogpurchases': 1.03%
Column 'numstorepurchases': 0.00%
Column 'numwebvisitsmonth': 0.36%

We are going to work on the predictions with the rest of the ouliers since once we segment the clients to make the predictions we can lose a lot of data and this could invalidate the model. Once we get the first results we can test the model without outliers.

In [33]:

#Transform yaer_birth and dt_customer into date format
df['year_birth'] = pd.to_datetime(df['year_birth'], format='%Y')
df['dt_customer'] = pd.to_datetime(df['dt_customer'])

In [34]:

#Transform yaer_birth and dt_customer into date format

Feature engineering¶

Having a first look at the row data enables us to start thinking at some useful variables we could create in order to better understand our dataset and reveal precious information.

We will create several variables :

Variable Age in replacement of the variable Year_birth
Variable Spending as the sum of the amount spent on the product categories
Variable Marital_Situation to group the different marital status in only 2 comprehensive categories : Couple(Married or Together) vs Alone (Divorced' 'Single', 'Widow' 'YOLO' 'Alone' 'Absurd'
Variable dependents in the home ('Dependents') can be engineered from the sum of 'Kidhome' and 'Teenhome'
The year of becoming a customer ('Year_Customer') can be engineered from 'Dt_Customer'
The total purchases ('TotalPurchases') can be engineered from the sum of all features containing the keyword 'Purchases'
The total number of campains accepted ('TotalCampaignsAcc') can be engineered from the sum of all features containing the keywords 'Cmp' and 'Response' (the latest campaign)

We will remove the unused variables for this analysis

In [35]:

counts = df['marital_status'].value_counts()
print(counts)

Married     861
Together    575
Single      479
Divorced    230
Widow        77
Alone         3
YOLO          2
Absurd        2
Name: marital_status, dtype: int64

In [36]:

#Age
df['age'] = datetime.now().year - df['year_birth'].dt.year

# Spending
mnt_cols = [col for col in df.columns if 'mnt' in col]
df['spending'] = df[mnt_cols].sum(axis=1)

#Marital Status

# Create a mapping dictionary for states

estado_mapeo = {
    'Divorced': 'single',
    'Single': 'single',
    'Married': 'couple',
    'Together': 'couple',
    'Widow':'single',
    'YOLO':'single',
    'Alone':'single',
    'Absurd':'single',
}

# Apply mapping to 'marital_status' column
df['marital_status'] = df['marital_status'].map(estado_mapeo)


# Dependents
df['dependents'] = df['kidhome'] + df['teenhome']

# Year becoming a Customer
df['year_customer'] = pd.DatetimeIndex(df['dt_customer']).year


# Total Purchases
purchases_cols = [col for col in df.columns if 'purchases' in col]
df['total_purchases'] = df[purchases_cols].sum(axis=1)

# Total Campaigns Accepted
campaigns_cols = [col for col in df.columns if 'cmp' in col] + ['response'] # 'Response' is for the latest campaign
df['total_campaigns_acc'] = df[campaigns_cols].sum(axis=1)

In [ ]:

In [37]:

#Drop columns we don´t need
df = df.drop(['year_birth', 'teenhome', 'kidhome','dt_customer'], axis=1)

In [38]:

df.head(3)

Out[38]:

	id	education	marital_status	income	mntwines	mntfruits	mntmeatproducts	mntfishproducts	mntsweetproducts	mntgoldprods	numdealspurchases	numwebpurchases	numcatalogpurchases	numstorepurchases	numwebvisitsmonth	acceptedcmp2	response	country	age	spending	dependents	year_customer	total_purchases	total_campaigns_acc
0	1826	Graduation	single	84835.0	189	104	379	111	189	218	1	4	4	6	1	0	1	SP	53	1190	0	2014	15	1.0
1	1	Graduation	single	57091.0	464	5	64	7	0	37	1	7	3	7	5	1	1	CA	62	577	0	2014	18	11.0
2	10476	Graduation	couple	67267.0	134	11	59	15	2	30	1	3	2	5	2	0	0	US	65	251	1	2014	11	0.0

In [39]:

#Saving clean df
df.to_csv('marketing_analysis_clean.csv', index=False)

In [40]:

suma_spending = df['spending'].sum()
suma_spending

Out[40]:

In [41]:

fila_cliente_1386 = df[df['id'] == 9365]

# Imprimir la fila seleccionada
print(fila_cliente_1386)

        id education marital_status   income  recency  mntwines  mntfruits  \
1321  9365       PhD         single  60000.0       57      1048          0   

      mntmeatproducts  mntfishproducts  mntsweetproducts  mntgoldprods  \
1321              217                0                 0            12   

      numdealspurchases  numwebpurchases  numcatalogpurchases  \
1321                  5               11                    3   

      numstorepurchases  numwebvisitsmonth acceptedcmp3 acceptedcmp4  \
1321                  5                  6            0            0   

     acceptedcmp5 acceptedcmp1 acceptedcmp2 response complain country  age  \
1321            0            0            0        0        0      SP   47   

      spending  dependents  year_customer  total_purchases  \
1321      1277           1           2012               24   

      total_campaigns_acc  
1321                  0.0

In [42]:

df.shape

Out[42]:

(2229, 30)

Statistical summary¶

In [43]:

df.describe()

Out[43]:

	id	income	recency	mntwines	mntfruits	mntmeatproducts	mntfishproducts	mntsweetproducts	mntgoldprods	numdealspurchases	numwebpurchases	numcatalogpurchases	numstorepurchases	numwebvisitsmonth	age	spending	dependents	year_customer	total_purchases	total_campaigns_acc
count	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000	2229.000000
mean	5589.296097	51619.504262	49.106326	304.991476	26.348587	165.283984	37.642441	27.161507	44.113055	2.319874	4.100045	2.636608	5.812472	5.334231	54.111261	605.541050	0.951996	2013.027367	14.869000	8123.388964
std	3244.840538	20601.216248	28.946476	336.761943	39.764060	219.336589	54.700548	41.338945	52.091255	1.895252	2.775461	2.796161	3.242144	2.414981	11.707430	601.032228	0.751976	0.684966	7.622187	28669.066626
min	0.000000	1730.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	27.000000	5.000000	0.000000	2012.000000	0.000000	0.000000
25%	2829.000000	35416.000000	24.000000	24.000000	2.000000	16.000000	3.000000	1.000000	9.000000	1.000000	2.000000	0.000000	3.000000	3.000000	46.000000	69.000000	0.000000	2013.000000	8.000000	0.000000
50%	5455.000000	51381.500000	49.000000	176.000000	8.000000	67.000000	12.000000	8.000000	24.000000	2.000000	4.000000	2.000000	5.000000	6.000000	53.000000	397.000000	1.000000	2013.000000	15.000000	0.000000
75%	8420.000000	68118.000000	74.000000	505.000000	33.000000	231.000000	50.000000	34.000000	56.000000	3.000000	6.000000	4.000000	8.000000	7.000000	64.000000	1044.000000	1.000000	2013.000000	21.000000	0.000000
max	11191.000000	113734.000000	99.000000	1493.000000	199.000000	1725.000000	259.000000	263.000000	362.000000	15.000000	27.000000	28.000000	13.000000	20.000000	83.000000	2525.000000	3.000000	2014.000000	43.000000	111111.000000

The describe function generates for us asummary, particularly useful as a first step in our preliminary investigation. Analyzing the statistical summary gives us insightful information in one look :

-Average income is 52.227$ while median income is 51.381$. The distribution is right skewed with the possible presence of outliers
The maximum value is 666.666$ and the 3rd quartile 68.281$ dollars so we could deduce that the distribution is skewed to the right with the possible presence of outliers.
-Average spending in the last 2 years is 605$ while median spending is 396$.
-Average age is 54 years old and the oldest customer is 83 years old.
-Average store purchases is the highest with 5.794.397 following by average of web purchases with 4.087.170.

The highest average spending on products that have been sold in the last two years has been wine (303.995530), meat(166.916853, gold products(43.968708).
The campaign with the highest mean response has been Response, which was the last campaign launched.

EDA¶

In [44]:

#Split data into categorical and numerical  for exploration

In [45]:

df_cat=df.select_dtypes(include=['object'])
df_cat.head(5)

Out[45]:

	education	marital_status	acceptedcmp3	acceptedcmp4	acceptedcmp2	response	country
0	Graduation	single	0	0	0	1	SP
1	Graduation	single	0	0	1	1	CA
2	Graduation	couple	0	0	0	0	US
3	Graduation	couple	0	0	0	0	AUS
4	Graduation	single	1	1	0	1	SP

In [46]:

#Save numerical data
df_num= df.select_dtypes(exclude=["object"])
df_num.head(3)

Out[46]:

	id	income	mntwines	mntfruits	mntmeatproducts	mntfishproducts	mntsweetproducts	mntgoldprods	numdealspurchases	numwebpurchases	numcatalogpurchases	numstorepurchases	numwebvisitsmonth	age	spending	dependents	year_customer	total_purchases	total_campaigns_acc
0	1826	84835.0	189	104	379	111	189	218	1	4	4	6	1	53	1190	0	2014	15	1.0
1	1	57091.0	464	5	64	7	0	37	1	7	3	7	5	62	577	0	2014	18	11.0
2	10476	67267.0	134	11	59	15	2	30	1	3	2	5	2	65	251	1	2014	11	0.0

CUSTOMER PROFILE ANALYSIS¶

There are columsn relevant for customer profile: Education:ordinal Incomo:continuos Marital:nominal

In [47]:

#Displaying categorical data

fig, ax = plt.subplots(1, 4, figsize=(15, 4))

sns.countplot(x=df_cat['marital_status'], ax=ax[0])
sns.countplot(x=df_cat['education'], ax=ax[1])
sns.countplot(x=df_cat['country'], ax=ax[2])


df_plot = df_cat.groupby(['marital_status', 'education']).size().reset_index().pivot(columns='marital_status', index='education', values=0)
df_plot.apply(lambda x: x / x.sum(), axis=1).plot(kind='bar', stacked=True, ax=ax[3], colormap='Paired')
ax[3].legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

plt.tight_layout()
plt.show()

In [48]:

fig, ax = plt.subplots(1, 7, figsize=(15, 4))

sns.countplot(x=df_cat['acceptedcmp1'], ax=ax[0])
sns.countplot(x=df_cat['acceptedcmp2'], ax=ax[1])
sns.countplot(x=df_cat['acceptedcmp3'], ax=ax[2])
sns.countplot(x=df_cat['acceptedcmp4'], ax=ax[3])
sns.countplot(x=df_cat['acceptedcmp5'], ax=ax[4])
sns.countplot(x=df_cat['response'], ax=ax[5])
sns.countplot(x=df_cat['complain'], ax=ax[6])

ax[3].legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

plt.tight_layout()
plt.show()

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.

Association between marital status and education¶

In [49]:

df_cat_con= df_cat[['marital_status', 'education']]
df_cat_con

Out[49]:

	marital_status	education
0	single	Graduation
1	single	Graduation
2	couple	Graduation
3	couple	Graduation
4	single	Graduation
...	...	...
2232	single	PhD
2233	couple	2n Cycle
2234	single	Graduation
2235	couple	Graduation
2236	couple	PhD

2229 rows × 2 columns

In [50]:

from scipy.stats import chi2_contingency
# Crear la tabla de contingencia
contingency_table= pd.crosstab(df_cat_con['marital_status'], df_cat_con['education'])

# Realizar la prueba de chi-cuadrado
chi2, p_value, _, _ = chi2_contingency(contingency_table)

# Mostrar la tabla de contingencia
print(contingency_table)

# Mostrar el valor de chi-cuadrado y el valor p
print(f"Chi-cuadrado: {chi2}")
print(f"Valor p: {p_value}")

education       2n Cycle  Basic  Graduation  Master  PhD
marital_status                                          
couple               138     34         717     243  304
single                63     20         407     126  177
Chi-cuadrado: 2.4612027478482332
Valor p: 0.651596254517518

The result obtained indicates that the chi-square value is approximately 2.461 and the p-value is approximately 0.6516.

The chi-square value is a measure of the association between the variables "marital_status" (marital status) and "education" (education) in your data set. A larger chi-square value indicates a stronger association between the variables, while a value close to zero indicates that the variables are independent.

The p-value is the probability of obtaining a chi-square value equal to or more extreme than that observed if the variables were completely independent. In other words, it represents the probability that the association between the variables is simply the result of chance, given the size of the sample.

In your case, the p-value is 0.6516, which means that the probability of getting a chi-square value equal to or more extreme than 2.461 under the null hypothesis (that the variables are independent) is about 0.6516. Generally, if the p-value is greater than a predefined significance level (for example, 0.05), we do not have sufficient evidence to reject the null hypothesis and we can consider the variables to be independent.

Therefore, in this case, the result suggests that there is no significant association between marital status and education level in the data set you are analyzing.

In [51]:

#Displaying numerical dataç
df_num.columns

Out[51]:

Index(['id', 'income', 'recency', 'mntwines', 'mntfruits', 'mntmeatproducts',
       'mntfishproducts', 'mntsweetproducts', 'mntgoldprods',
       'numdealspurchases', 'numwebpurchases', 'numcatalogpurchases',
       'numstorepurchases', 'numwebvisitsmonth', 'age', 'spending',
       'dependents', 'year_customer', 'total_purchases',
       'total_campaigns_acc'],
      dtype='object')

In [52]:

# Crear el lienzo y los ejes
fig, axes = plt.subplots(6, 4, figsize=(15, 20))

# Aplanar el arreglo de ejes para facilitar el acceso
axes = axes.flatten()

# Graficar los histogramas de las columnas numéricas
for i, col in enumerate(df_num.columns):
    sns.histplot(data=df_num, x=col, ax=axes[i], kde=True)

# Ajustar el espaciado entre subplots
plt.tight_layout()

# Mostrar el gráfico
plt.show()

In [ ]:

Features Correlation Analysis¶

In [53]:

corr = df_num.corr()
ones = np.ones_like(corr, dtype="bool")
mask = np.triu(ones)
mask = mask[1:, :-1]
corr = corr.iloc[1:,:-1].copy()
fig, ax = plt.subplots(figsize=(40,30))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap="Greens", linewidths=.6, cbar_kws={"shrink":.9})
ax.xaxis.tick_bottom()
plt.title("Correlations heatmap for numerical data".upper(), fontdict={"fontsize": 18}, loc="left")

Out[53]:

Text(0.0, 1.0, 'CORRELATIONS HEATMAP FOR NUMERICAL DATA')

In [67]:

sns.set(rc ={'figure.figsize':(8,8)}, font_scale=0.5)
sns.heatmap(corr[corr>0.8], annot=True, fmt=".3f")

Out[67]:

<AxesSubplot:>

In [54]:

#High correlation:.80 to 1
#Mean correlation:.60 to .80
#Low correlation: .40 to .60

#We see that our target to predict  has mean correlation with total purchases

In [55]:

corr = df_num.corr()
high_corr = corr[corr > 0.80]
high_corr = high_corr.unstack().dropna()
high_corr = high_corr[high_corr.index.get_level_values(0) != high_corr.index.get_level_values(1)]

for index, value in high_corr.items():
    variable_1, variable_2 = index
    correlation_value = value
    print("Variables:", variable_1, "and", variable_2)
    print("Correlation:", correlation_value)
    print("------------------------")

Variables: income and spending
Correlation: 0.8202215056408394
------------------------
Variables: mntwines and spending
Correlation: 0.8959193067149777
------------------------
Variables: mntmeatproducts and spending
Correlation: 0.8534350636911707
------------------------
Variables: numstorepurchases and total_purchases
Correlation: 0.8317509633068462
------------------------
Variables: spending and income
Correlation: 0.8202215056408394
------------------------
Variables: spending and mntwines
Correlation: 0.8959193067149777
------------------------
Variables: spending and mntmeatproducts
Correlation: 0.8534350636911707
------------------------
Variables: total_purchases and numstorepurchases
Correlation: 0.8317509633068462
------------------------

Correlation Results¶

Income is a proxy for several other features, such as spending capacity or total purchases. It shows a positive relationship with expenses on meat and wine, indicating that individuals with higher income tend to spend more on these items. On the other hand, income has a negative correlation with the number of children at home and the frequency of website visits, suggesting that individuals with higher income tend to have fewer children at home and visit websites less frequently.
Furthermore, the amount spent on wine is not only influenced by high income but also by the amount spent on meat. These two variables are interconnected, meaning that individuals who spend more on meat are also likely to spend more on wine. Moreover, individuals can purchase wine either from catalogs or from physical stores.
The number of children at home is negatively associated with spending and income, and consequently, wine consumption. This implies that individuals with more children tend to have lower income and spend less overall, including on wine. Conversely, higher income levels are also linked to a greater willingness to accept advertising campaigns.
In summary, income serves as a proxy for several factors, with positive correlations to spending, meat and wine expenses, while showing negative associations with the number of children at home and website visits. The relationship between income, spending habits, and preferences for specific products such as wine can provide insights into consumer behavior and marketing strategies.

In [88]:

#VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

features = ['income', 'mntmeatproducts','mntwines','total_purchases']

def calculate_vif(df, features):    
    vif, tolerance, r2sq = {}, {}, {}
    # all the features that you want to examine
    for feature in features:
        # extract all the other features you will regress against
        X = [f for f in features if f != feature]        
        X, y = df[X], df[feature]
        # extract r-squared from the fit
        r2 = LinearRegression().fit(X, y).score(X, y)                
        
        # calculate tolerance
        r2sq[feature] = r2
        tolerance[feature] = 1 - r2
        # calculate VIF
        vif[feature] = 1/(tolerance[feature])
    # return VIF DataFrame
    return pd.DataFrame({'VIF': vif, 'Tolerance': tolerance, 'R2': r2sq})

In [95]:

calculate_vif(df_num, df_num)

C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars
  vif[feature] = 1/(tolerance[feature])
C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars
  vif[feature] = 1/(tolerance[feature])
C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars
  vif[feature] = 1/(tolerance[feature])
C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars
  vif[feature] = 1/(tolerance[feature])
C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars
  vif[feature] = 1/(tolerance[feature])
C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars
  vif[feature] = 1/(tolerance[feature])
C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars
  vif[feature] = 1/(tolerance[feature])
C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars
  vif[feature] = 1/(tolerance[feature])
C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars
  vif[feature] = 1/(tolerance[feature])
C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars
  vif[feature] = 1/(tolerance[feature])
C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars
  vif[feature] = 1/(tolerance[feature])
C:\Users\danie\AppData\Local\Temp\ipykernel_26540\2798046051.py:21: RuntimeWarning: divide by zero encountered in double_scalars
  vif[feature] = 1/(tolerance[feature])

Out[95]:

	VIF	Tolerance	R2
id	1.009077	0.991004	0.008996
income	5.114370	0.195528	0.804472
recency	1.011846	0.988293	0.011707
mntwines	inf	0.000000	1.000000
mntfruits	inf	0.000000	1.000000
mntmeatproducts	inf	0.000000	1.000000
mntfishproducts	inf	0.000000	1.000000
mntsweetproducts	inf	0.000000	1.000000
mntgoldprods	inf	0.000000	1.000000
numdealspurchases	inf	0.000000	1.000000
numwebpurchases	inf	0.000000	1.000000
numcatalogpurchases	inf	0.000000	1.000000
numstorepurchases	inf	0.000000	1.000000
numwebvisitsmonth	2.776575	0.360156	0.639844
age	1.121263	0.891851	0.108149
spending	inf	0.000000	1.000000
dependents	2.027792	0.493147	0.506853
year_customer	1.236980	0.808421	0.191579
total_purchases	inf	0.000000	1.000000
total_campaigns_acc	1.090735	0.916813	0.083187

In [192]:

#remving spending
features_2 = ['mntmeatproducts', 'mntwines', 'total_purchases', 'mntfishproducts', 'mntsweetproducts', 'mntgoldprods', 'numwebpurchases', 'numwebvisitsmonth', 'age', 'dependents']

In [193]:

calculate_vif(df,features_2)

Out[193]:

	VIF	Tolerance	R2
mntmeatproducts	2.507714	0.398770	0.601230
mntwines	2.423295	0.412661	0.587339
total_purchases	4.861589	0.205694	0.794306
mntfishproducts	1.964291	0.509090	0.490910
mntsweetproducts	1.798715	0.555952	0.444048
mntgoldprods	1.446274	0.691432	0.308568
numwebpurchases	3.036062	0.329374	0.670626
numwebvisitsmonth	1.689992	0.591719	0.408281
age	1.096738	0.911794	0.088206
dependents	1.551958	0.644347	0.355653

Interpreting R square k - If $R^2_k$ equals zero, variable k is not correlated with any other independent variable. Usually, multicollinearity is a potential problem when $R^2_k$ is greater than 0.75 and, a serious problem when $R^2_k$ is greater than 0.9.

Interpretation of the variance inflation factor: If $VIF_k = 1$, variable k is not correlated with any other independent variable. Multicollinearity is a potential problem when VIFk is greater than 5 and, a serious problem when it is greater than 10.

Interpretation of tolerance A tolerance of less than 0.20 or 0.10 indicates a multicollinearity problem.

In [ ]:

In [96]:

#Plot illustrating the effect of income on spending
plt.figure(figsize=(8, 8))
plt.scatter(df['income'], df['spending'])
plt.title('Income vs. Spending')
plt.xlabel('Income')
plt.ylabel('Spending')
plt.show()

In [97]:

#Plot illustrating negative effect of having dependents (kids & teens) on spending
plt.figure(figsize=(4,4))
sns.boxplot(x='dependents', y='spending', data=df)

Out[97]:

<AxesSubplot:xlabel='dependents', ylabel='spending'>

In [98]:

#Plot illustrating positive effect of having dependents (kids & teens) on number of deals purchased
plt.figure(figsize=(4,4))
sns.boxplot(x='dependents', y='numdealspurchases', data=df);

In [ ]:

In [99]:

#REMOVING SPENDING AND TOTAL PURCHASE to check Multicolinearity

In [100]:

corr_de = df_num.drop(['spending', 'total_purchases'], axis=1)

In [101]:

fig, ax = plt.subplots(figsize=(40,30)  )
heatmap = sns.heatmap(corr_de.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')

Segmentation of Customers¶

Choosing columsn for segmentation¶

I choose the spending and income variables because they provide interesting information for the segmentation.

In [ ]:

In [136]:

df_num.head(2)

Out[136]:

	id	income	recency	mntwines	mntfruits	mntmeatproducts	mntfishproducts	mntsweetproducts	mntgoldprods	numdealspurchases	numwebpurchases	numcatalogpurchases	numstorepurchases	numwebvisitsmonth	age	spending	dependents	year_customer	total_purchases	total_campaigns_acc
0	1826	84835.0	0	189	104	379	111	189	218	1	4	4	6	1	53	1190	0	2014	15	1.0
1	1	57091.0	0	464	5	64	7	0	37	1	7	3	7	5	62	577	0	2014	18	11.0

In [137]:

cluster = df_num.iloc[:,[1,15,]]

#income
#recency, mntwines	mntfruits	mntmeatproducts	mntfishproducts	mntsweetproducts	mntgoldprods	numdealspurchases	numwebpurchases	numcatalogpurchases	numstorepurchases numwebvisitsmonth	age	spending,total_purchases

In [135]:

cluster

Out[135]:

In [ ]:

Standardizing numerical data with standard scaler

In [105]:

#scaler = StandardScaler()

#cluster_encoded = scaler.fit_transform(cluster)

In [106]:

#cluster_encoded

In [138]:

scaler = MinMaxScaler()
cluster_encoded= scaler.fit_transform(cluster)
cluster_encoded

Out[138]:

array([[0.74198243, 0.4702381 ],
       [0.49427699, 0.22698413],
       [0.58513089, 0.09761905],
       ...,
       [0.3980215 , 0.12063492],
       [0.57220278, 0.5468254 ],
       [0.83158637, 0.42579365]])

In [108]:

cluster_encoded.shape

Out[108]:

(2229, 2)

Choosing the number of clusters WCSS -> Within Clusters Sum of Squares

In [109]:

## finding wcss value for different number of clusters

wcss = []

for i in range(1,11):
  kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
  kmeans.fit(cluster_encoded)

  wcss.append(kmeans.inertia_)

In [110]:

# plot an elbow graph

sns.set()
plt.plot(range(1,11), wcss)
plt.title('The Elbow Point Graph')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

Optimum Number of Clusters = 4
Training the k-Means Clustering Model

In [111]:

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)

# return a label for each data point based on their cluster
Y = kmeans.fit_predict(cluster_encoded)

print(Y)

[3 2 2 ... 2 3 3]

In [112]:

#Visualization
plt.figure(figsize=(8,8))
plt.scatter(cluster_encoded[Y==0,0 ], cluster_encoded[Y==0,1 ], s=50, c='green', label='Cluster 1')
plt.scatter(cluster_encoded[Y==1,0 ], cluster_encoded[Y==1,1 ], s=50, c='red', label='Cluster 2')
plt.scatter(cluster_encoded[Y==2,0 ], cluster_encoded[Y==2,1 ], s=50, c='yellow', label='Cluster 3')
plt.scatter(cluster_encoded[Y==3,0 ], cluster_encoded[Y==3,1 ], s=50, c='blue', label='Cluster 4')
                      

plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], s=100, c='cyan', label='Centroids')




# Configura las etiquetas de los ejes y el título de la gráfica
plt.ylabel('Spending')
plt.xlabel('Income')
plt.title('Customers Groups')

# Muestra la gráfica
plt.show()

In [113]:

kmeans.labels_

Out[113]:

array([3, 2, 2, ..., 2, 3, 3])

In [114]:

cluster_1_data = cluster_encoded[Y == 0]
cluster_1_data

Out[114]:

array([[0.2744902 , 0.00238095],
       [0.17627942, 0.03412698],
       [0.22447413, 0.0265873 ],
       ...,
       [0.16427985, 0.01150794],
       [0.16427985, 0.01150794],
       [0.26182993, 0.01984127]])

In [ ]:

In [115]:

# Aplica K-means para obtener los clusters
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(cluster_encoded)

# Obtiene las etiquetas asignadas a cada cliente
labels = kmeans.labels_

# Cuenta la cantidad de clientes en cada cluster
conteo_clientes = pd.Series(labels).value_counts()

# Grafica los grupos de clientes por ingresos y gastos
plt.figure(figsize=(8, 8))
plt.scatter(cluster_encoded[:, 0], cluster_encoded[:, 1], c=labels, cmap='viridis')

# Grafica los centroides de los clusters
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=100, c='red', label='Centroids')

# Configura las etiquetas de los ejes y el título de la gráfica
plt.xlabel('Income')
plt.ylabel('Spending')
plt.title('Customers Groups')



# Muestra la cantidad de clientes en cada cluster
for cluster, count in conteo_clientes.items():
    centroid = kmeans.cluster_centers_[cluster]
    plt.text(centroid[0], centroid[1], f'{count}', fontsize=12, color='black', ha='center')

# Muestra la gráfica
plt.show()

I create a copy of the original dataframe and assign the labels to it to be able to filter the rows for each label and analyze the customer groups

In [139]:

cluster2 = cluster.copy()

In [140]:

cluster2['cluster_labels'] = kmeans.labels_
cluster2

Out[140]:

	income	spending	cluster_labels
0	84835.0	1190	3
1	57091.0	577	2
2	67267.0	251	2
3	32474.0	11	0
4	21474.0	91	0
...	...	...	...
2232	66476.0	689	3
2233	31056.0	55	0
2234	46310.0	309	2
2235	65819.0	1383	3
2236	94871.0	1078	3

2229 rows × 3 columns

In [141]:

# Filtrar las filas por los valores de cluster_labels
cluster_0 = cluster2.loc[cluster2['cluster_labels'] == 0]
cluster_1 = cluster2.loc[cluster2['cluster_labels'] == 1]
cluster_2 = cluster2.loc[cluster2['cluster_labels'] == 2]
cluster_3 = cluster2.loc[cluster2['cluster_labels'] == 3]

# Imprimir los resultados
print("Cluster 0:")
print(cluster_0)
print()

print("Cluster 1:")
print(cluster_1)
print()

print("Cluster 2:")
print(cluster_2)
print()

print("Cluster 3:")
print(cluster_3)

Cluster 0:
       income  spending  cluster_labels
3     32474.0        11               0
4     21474.0        91               0
13    26872.0        72               0
14     4428.0       359               0
17    32173.0        22               0
...       ...       ...             ...
2220   7500.0       100               0
2226  42231.0        37               0
2227  20130.0        34               0
2228  20130.0        34               0
2233  31056.0        55               0

[736 rows x 3 columns]

Cluster 1:
       income  spending  cluster_labels
31    79529.0      1638               1
35    95169.0      1901               1
38    84865.0      1688               1
41    66465.0      1485               1
45    64260.0      1574               1
...       ...       ...             ...
2205  84906.0      1631               1
2206  84906.0      1631               1
2210  90687.0      1779               1
2216  83273.0      1392               1
2217  81702.0      1633               1

[334 rows x 3 columns]

Cluster 2:
       income  spending  cluster_labels
1     57091.0       577               2
2     67267.0       251               2
7     44931.0        96               2
8     65324.0       544               2
9     65324.0       544               2
...       ...       ...             ...
2222  44794.0        65               2
2223  44794.0        65               2
2229  42429.0        67               2
2230  42429.0        67               2
2234  46310.0       309               2

[612 rows x 3 columns]

Cluster 3:
       income  spending  cluster_labels
0     84835.0      1190               3
5     71691.0      1192               3
6     63564.0      1215               3
10    81044.0      1208               3
12    67786.0      1156               3
...       ...       ...             ...
2225  62568.0       953               3
2231  78901.0       734               3
2232  66476.0       689               3
2235  65819.0      1383               3
2236  94871.0      1078               3

[547 rows x 3 columns]

In [142]:

#Converting clusters to dataframe

cluster_0_df = pd.DataFrame(cluster_0)
cluster_1_df = pd.DataFrame(cluster_1)
cluster_2_df = pd.DataFrame(cluster_2)
cluster_3_df = pd.DataFrame(cluster_3)

In [143]:

cluster_0_df

Out[143]:

	income	spending	cluster_labels
3	32474.0	11	0
4	21474.0	91	0
13	26872.0	72	0
14	4428.0	359	0
17	32173.0	22	0
...	...	...	...
2220	7500.0	100	0
2226	42231.0	37	0
2227	20130.0	34	0
2228	20130.0	34	0
2233	31056.0	55	0

736 rows × 3 columns

In [144]:

cluster_0_df.shape

Out[144]:

(736, 3)

In [145]:

cluster_0_df.describe()

Out[145]:

	income	spending	cluster_labels
count	736.000000	736.000000	736.0
mean	28541.343750	72.880435	0.0
std	8635.690514	69.255511	0.0
min	1730.000000	5.000000	0.0
25%	22678.750000	32.000000	0.0
50%	29968.500000	53.500000	0.0
75%	35528.250000	84.000000	0.0
max	43269.000000	570.000000	0.0

As a data analyst, looking at the Cluster 1 of customer segmentation, cluster_0_df, we can make the following conclusions:¶

Cluster size: 736 clients.
Average income: The average income of clients in this cluster is approximately 28,541$
Average expenses: Clients in this cluster have an average expense of around 72.88$.
Variability: There is considerable variability in the income and expenses of clients in this cluster, as indicated by the standard deviation. This suggests that there are clients with different levels of income and expenses within the cluster.
Range of income and expenses: The range of income goes from 1,730$ to 43,269$, while the range of expenses goes from 5$ to 570$ monetary units.
Distribution of income and expenses: The median (central value) of income is approximately 29,968$, which means that half of the clients have incomes above this value and the other half have incomes below this value. The median of the expenses is approximately 53$.
Low Spend Segment: This cluster can be considered as a segment of customers with low spending levels, since the average spend is relatively low compared to the other clusters.
With these conclusions, we can infer that this group of clients has relatively low income and expenses compared to the other clusters. This may indicate a customer profile with lower spending capacity or more conservative consumption preferences. These observations can be useful for the marketing team in targeting their strategies and campaigns more effectively towards this customer segment, for example, by offering affordable products or services and promotions tailored to their level of spending.

CLUSTER 2 CONCLUSIONS¶

In [146]:

cluster_1_df.shape

Out[146]:

(334, 3)

In [147]:

cluster_1_df.describe()

Out[147]:

	income	spending	cluster_labels
count	334.000000	334.000000	334.0
mean	78690.004491	1688.784431	1.0
std	8955.332848	272.797511	0.0
min	51381.500000	1112.000000	1.0
25%	72270.750000	1479.750000	1.0
50%	79414.500000	1634.000000	1.0
75%	83842.250000	1865.750000	1.0
max	105471.000000	2525.000000	1.0

Cluster size: 334 clients.
Average income: The average income of clients in Cluster 1 is approximately 78,690$. This suggests that the customers in this cluster have relatively higher income levels compared to Cluster 0.
Average expenses: The average expenses of clients in Cluster 1 amount to around 1,688$. This indicates that the customers in this segment tend to have higher spending levels compared to Cluster 0.
Variability: The standard deviation for both income and expenses within Cluster 1 suggests a moderate degree of variability. This implies that there are clients with varying income and spending patterns within this cluster.
Range of income and expenses: The range of income in Cluster 1 spans from 51,381$ to 105,471$ while the range of expenses ranges from 1.112$ to 2,525$. This wide range indicates the presence of customers with different income and spending levels within this cluster.
Distribution of income and expenses: The median income of clients in Cluster 1 is approximately 79,414$. This suggests that half of the clients have an income below this value, while the other half has an income above it.
Overall, Cluster 1 represents a segment of customers with relatively higher income and spending levels compared to Cluster 0. These customers may have a higher purchasing power and exhibit more liberal spending behavior. The marketing department can leverage these insights to develop strategies and campaigns that cater to this customer segment. For example, they can focus on offering premium products or services, creating promotions that align with the customers' higher spending capacity, and delivering messaging that resonates with their more indulgent consumption preferences.

CLUSTER 3 CONCLUSIONS¶

In [148]:

cluster_2_df.shape

Out[148]:

(612, 3)

In [149]:

cluster_2_df.describe()

Out[149]:

	income	spending	cluster_labels
count	612.000000	612.000000	612.0
mean	50399.556373	315.490196	2.0
std	7572.817956	190.461525	0.0
min	32632.000000	16.000000	2.0
25%	44767.750000	147.750000	2.0
50%	49973.500000	307.500000	2.0
75%	55305.250000	455.750000	2.0
max	73395.000000	839.000000	2.0

Cluster size: 612 clients.
Average income: The average income of clients in Cluster 3 is approximately 50,399$.
Average expenses: Clients in Cluster 3 have an average expense of around 315$. This indicates that the customers in this segment tend to have moderate spending levels than Cluster 2.
Variability: The standard deviation for both income and expenses within Cluster 2 suggests a moderate degree of variability. This implies that there are clients with different income and spending patterns within this cluster.
Range of income and expenses: The range of income in Cluster 3 spans from 32,632$ to 73,395$ while the range of expenses ranges from 16$ to 839$. This wide range indicates the presence of customers with varying income and spending levels within this cluster.
Distribution of income and expenses: The median income of clients in Cluster 2 is approximately 49,973$.

From this analysis, we can conclude that Cluster 3 represents a segment of customers with a moderate income level and moderate spending behavior. These customers may have a balanced approach to their finances and exhibit stable spending patterns. The marketing department can consider targeting this customer segment with products or services that align with their moderate income and spending capacity. They can also focus on providing value-based offerings and promotions that cater to their stable spending behavior.

CLUSTER 4 CONCLUSIONS¶

In [150]:

cluster_3_df.shape

Out[150]:

(547, 3)

In [151]:

cluster_3_df.describe()

Out[151]:

	income	spending	cluster_labels
count	547.000000	547.000000	547.0
mean	67507.232176	985.332724	3.0
std	9879.242251	206.307558	0.0
min	2447.000000	277.000000	3.0
25%	61194.500000	824.000000	3.0
50%	66886.000000	988.000000	3.0
75%	74279.000000	1150.000000	3.0
max	113734.000000	1730.000000	3.0

Cluster size: 547 clients.
Average income: 67,507.23$.
Average expenses: Clients in Cluster 3 have an average expense of around 985$.
Variability: The standard deviation for both income and expenses within Cluster 3 indicates a moderate degree of variability, suggesting that there are clients with different income and spending patterns within this cluster.
Range of income and expenses: The range of income in Cluster 3 spans from 2,447$ to 113,734$ while the range of expenses ranges from 277$ to 1,730$.
Distribution of income and expenses: The median income of clients in Cluster 3 is approximately 66,886$

Based on this analysis, we can conclude that Cluster 3 represents a segment of clients with a relatively higher average income and higher spending levels compared to other clusters. These clients may have a higher spending capacity and may be more inclined towards making larger purchases. The marketing department can consider targeting this customer segment with higher-end products or services, luxury offerings, or personalized marketing campaigns that cater to their higher spending power.

In [ ]:

Answer the following questions of the CMO:¶

What does the Average customer look like for our company?¶

In [152]:

selected_columns = ['age', 'year_customer', 'income', 'dependents', 'recency']
selected_data = df[selected_columns]
mean_values = selected_data.mean().round(1)
mean_df = pd.DataFrame(mean_values, columns=['Mean'])
mean_df.index.name = 'Variable'
mean_df.columns = ['Average Customer']
mean_df.index = mean_df.index.rename('Characteristics')
mean_df

Out[152]:

	Average Customer
Characteristics
age	54.1
year_customer	2013.0
income	51619.5
dependents	1.0
recency	49.1

What Products are best performing?¶

Wines Followed by meats

In [153]:

# Calcular la suma de las ventas de cada producto
sales = df[['mntwines', 'mntfruits', 'mntmeatproducts', 'mntfishproducts', 'mntsweetproducts', 'mntgoldprods']].sum()

# Crear el gráfico de pie
plt.figure(figsize=(8, 6))
plt.pie(sales, labels=sales.index, autopct='%1.1f%%')

# Configurar el título
plt.title('Product Performance by Sales')

# Mostrar el gráfico
plt.show()

Which channels are underperforming?¶

In [154]:

# Seleccionar las columnas de interés
channels = df[['numwebvisitsmonth', 'numstorepurchases', 'numcatalogpurchases', 'numwebpurchases', 'numdealspurchases']]

# Calcular la suma de cada canal
channel_purchases = channels.sum()

# Calcular el porcentaje de compras de cada canal
total_purchases = df['total_purchases'].sum()
channel_performance = (channel_purchases / total_purchases) * 100

# Crear el gráfico de pie
plt.figure(figsize=(8, 6))
plt.pie(channel_performance, labels=channel_performance.index, autopct='%1.1f%%')

# Configurar el título
plt.title('Channels Performance by Total Purchases')

# Mostrar el gráfico
plt.show()

Which Marketing Campaigns were most successful?¶

The most successful campaign is the most recent (column name: Response)

In [155]:

fig, ax = plt.subplots(1, 7, figsize=(15, 4))

sns.countplot(x=df_cat['acceptedcmp1'], ax=ax[0])
sns.countplot(x=df_cat['acceptedcmp2'], ax=ax[1])
sns.countplot(x=df_cat['acceptedcmp3'], ax=ax[2])
sns.countplot(x=df_cat['acceptedcmp4'], ax=ax[3])
sns.countplot(x=df_cat['acceptedcmp5'], ax=ax[4])
sns.countplot(x=df_cat['response'], ax=ax[5])

ax[3].legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

plt.tight_layout()
plt.show()

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.

In [ ]:

Purchase amount spending prediction: Spending/Income¶

In [156]:

cluster

Out[156]:

	income	spending
0	84835.0	1190
1	57091.0	577
2	67267.0	251
3	32474.0	11
4	21474.0	91
...	...	...
2232	66476.0	689
2233	31056.0	55
2234	46310.0	309
2235	65819.0	1383
2236	94871.0	1078

2229 rows × 2 columns

In [157]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

#X-y split 
Y=cluster['spending']
X=cluster.drop('spending',axis=1)

In [158]:

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [159]:

# Normalizar los datos en X_train y X_test usando StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [160]:

# Crear y ajustar el modelo de regresión lineal
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train)

Out[160]:

LinearRegression()

In [161]:

# Realizar las predicciones en conjunto de entrenamiento y conjunto de prueba
y_train_pred = regression_model.predict(X_train_scaled)
y_test_pred = regression_model.predict(X_test_scaled)

In [162]:

# Calcular las métricas en conjunto de entrenamiento
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = sqrt(mse_train)

In [163]:

# Calcular las métricas en conjunto de prueba
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = sqrt(mse_test)

In [164]:

# Imprimir las métricas
print("Métricas en conjunto de entrenamiento:")
print("R^2:", r2_train)
print("MSE:", mse_train)
print("MAE:", mae_train)
print("RMSE:", rmse_train)
print("------------------------------")
print("Métricas en conjunto de prueba:")
print("R^2:", r2_test)
print("MSE:", mse_test)
print("MAE:", mae_test)
print("RMSE:", rmse_test)

Métricas en conjunto de entrenamiento:
R^2: 0.6784919073987676
MSE: 118045.350349332
MAE: 268.46887171988595
RMSE: 343.57728439076413
------------------------------
Métricas en conjunto de prueba:
R^2: 0.6469237899197506
MSE: 118895.62031097068
MAE: 267.66162931031494
RMSE: 344.8124422218124

In [ ]:

Purchase amount spending prediction with other variables¶

In [165]:

df_cat

Out[165]:

	education	marital_status	acceptedcmp3	acceptedcmp4	acceptedcmp5	acceptedcmp1	acceptedcmp2	response	complain	country
0	Graduation	single	0	0	0	0	0	1	0	SP
1	Graduation	single	0	0	0	0	1	1	0	CA
2	Graduation	couple	0	0	0	0	0	0	0	US
3	Graduation	couple	0	0	0	0	0	0	0	AUS
4	Graduation	single	1	1	0	0	0	1	0	SP
...	...	...	...	...	...	...	...	...	...	...
2232	PhD	single	0	0	0	0	0	0	0	US
2233	2n Cycle	couple	0	0	0	0	0	0	0	SP
2234	Graduation	single	0	0	0	0	0	0	0	SP
2235	Graduation	couple	0	0	0	0	0	0	0	IND
2236	PhD	couple	0	0	1	0	0	1	0	CA

2229 rows × 10 columns

In [166]:

df_cat_l=df_cat[['education', 'marital_status', 'country']]
df_cat_l= df_cat_l.reset_index()
df_cat_l = df_cat_l.drop(columns=['index'])
df_cat_l

Out[166]:

	education	marital_status	country
0	Graduation	single	SP
1	Graduation	single	CA
2	Graduation	couple	US
3	Graduation	couple	AUS
4	Graduation	single	SP
...	...	...	...
2224	PhD	single	US
2225	2n Cycle	couple	SP
2226	Graduation	single	SP
2227	Graduation	couple	IND
2228	PhD	couple	CA

2229 rows × 3 columns

In [167]:

#Encodign categorical data 
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder(drop='first').fit(df_cat_l)
encoder=encoder.transform(df_cat_l).toarray()
encoder

Out[167]:

array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [168]:

df_num

Out[168]:

	id	income	recency	mntwines	mntfruits	mntmeatproducts	mntfishproducts	mntsweetproducts	mntgoldprods	numdealspurchases	numwebpurchases	numcatalogpurchases	numstorepurchases	numwebvisitsmonth	age	spending	dependents	year_customer	total_purchases	total_campaigns_acc
0	1826	84835.0	0	189	104	379	111	189	218	1	4	4	6	1	53	1190	0	2014	15	1.0
1	1	57091.0	0	464	5	64	7	0	37	1	7	3	7	5	62	577	0	2014	18	11.0
2	10476	67267.0	0	134	11	59	15	2	30	1	3	2	5	2	65	251	1	2014	11	0.0
3	1386	32474.0	0	10	0	1	0	0	0	1	1	0	2	7	56	11	2	2014	4	0.0
4	5371	21474.0	0	6	16	24	11	0	34	2	3	1	2	7	34	91	1	2014	8	110001.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2232	10142	66476.0	99	372	18	126	47	48	78	2	5	2	11	4	47	689	1	2013	20	0.0
2233	5263	31056.0	99	5	10	13	3	8	16	1	1	0	3	8	46	55	1	2013	5	0.0
2234	22	46310.0	99	185	2	88	15	5	14	2	6	1	5	8	47	309	1	2012	14	0.0
2235	528	65819.0	99	267	38	701	149	165	63	1	5	4	10	3	45	1383	0	2012	20	0.0
2236	4070	94871.0	99	169	24	553	188	0	144	1	8	5	4	7	54	1078	2	2012	18	1001.0

2229 rows × 20 columns

In [169]:

df_num_l=df_num[['income', 'numdealspurchases',	'numwebpurchases', 'numcatalogpurchases', 'numstorepurchases','numwebvisitsmonth', 'age', 'dependents', 'spending']]
df_num_l= df_num_l.reset_index()
df_num_l = df_num_l.drop(columns=['index'])
df_num_l

Out[169]:

	income	numdealspurchases	numwebpurchases	numcatalogpurchases	numstorepurchases	numwebvisitsmonth	age	dependents	spending
0	84835.0	1	4	4	6	1	53	0	1190
1	57091.0	1	7	3	7	5	62	0	577
2	67267.0	1	3	2	5	2	65	1	251
3	32474.0	1	1	0	2	7	56	2	11
4	21474.0	2	3	1	2	7	34	1	91
...	...	...	...	...	...	...	...	...	...
2224	66476.0	2	5	2	11	4	47	1	689
2225	31056.0	1	1	0	3	8	46	1	55
2226	46310.0	2	6	1	5	8	47	1	309
2227	65819.0	1	5	4	10	3	45	0	1383
2228	94871.0	1	8	5	4	7	54	2	1078

2229 rows × 9 columns

In [170]:

#X-y split 
y=df_num_l['spending']
X=df_num_l.drop(['spending'],axis=1)
X

Out[170]:

	income	numdealspurchases	numwebpurchases	numcatalogpurchases	numstorepurchases	numwebvisitsmonth	age	dependents
0	84835.0	1	4	4	6	1	53	0
1	57091.0	1	7	3	7	5	62	0
2	67267.0	1	3	2	5	2	65	1
3	32474.0	1	1	0	2	7	56	2
4	21474.0	2	3	1	2	7	34	1
...	...	...	...	...	...	...	...	...
2224	66476.0	2	5	2	11	4	47	1
2225	31056.0	1	1	0	3	8	46	1
2226	46310.0	2	6	1	5	8	47	1
2227	65819.0	1	5	4	10	3	45	0
2228	94871.0	1	8	5	4	7	54	2

2229 rows × 8 columns

In [ ]:

In [171]:

# Normalizar datos numéricos utilizando StandardScaler
scaler = StandardScaler()
X_es = scaler.fit_transform(X)
X_es

Out[171]:

array([[ 1.6126693 , -0.69656739, -0.03605431, ..., -1.79512893,
        -0.09494056, -1.26627659],
       [ 0.2656505 , -0.69656739,  1.04508986, ..., -0.13842988,
         0.67397452, -1.26627659],
       [ 0.75971276, -0.69656739, -0.3964357 , ..., -1.38095417,
         0.93027955,  0.06385089],
       ...,
       [-0.25778554, -0.1688146 ,  0.68470847, ...,  1.10409441,
        -0.60755062,  0.06385089],
       [ 0.68940988, -0.69656739,  0.32432708, ..., -0.96677941,
        -0.77842064, -1.26627659],
       [ 2.09993432, -0.69656739,  1.40547125, ...,  0.68991965,
        -0.00950556,  1.39397838]])

In [172]:

X=np.concatenate((X_es ,encoder),axis=1)
X

Out[172]:

array([[ 1.6126693 , -0.69656739, -0.03605431, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.2656505 , -0.69656739,  1.04508986, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.75971276, -0.69656739, -0.3964357 , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.25778554, -0.1688146 ,  0.68470847, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.68940988, -0.69656739,  0.32432708, ...,  0.        ,
         0.        ,  0.        ],
       [ 2.09993432, -0.69656739,  1.40547125, ...,  0.        ,
         0.        ,  0.        ]])

In [173]:

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [174]:

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled

Out[174]:

array([[-0.61140256, -0.68389372, -1.11115159, ..., -0.41585133,
        -0.96225045, -0.22786351],
       [ 0.70526191,  0.33801644,  1.75100978, ..., -0.41585133,
         1.03923048, -0.22786351],
       [ 0.14579357,  0.84897152, -0.39561125, ..., -0.41585133,
        -0.96225045, -0.22786351],
       ...,
       [-0.15850648, -0.68389372, -1.11115159, ..., -0.41585133,
         1.03923048, -0.22786351],
       [ 1.10278267, -0.68389372, -0.03784108, ...,  2.40470553,
        -0.96225045, -0.22786351],
       [ 0.82513769, -0.68389372,  1.75100978, ..., -0.41585133,
        -0.96225045, -0.22786351]])

In [175]:

# Crear y ajustar el modelo de regresión lineal
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train)

Out[175]:

LinearRegression()

In [176]:

# Realizar las predicciones en conjunto de entrenamiento y conjunto de prueba
y_train_pred = regression_model.predict(X_train_scaled)
y_test_pred = regression_model.predict(X_test_scaled)

In [177]:

# Calcular las métricas
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = sqrt(mse_train)

r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = sqrt(mse_test)

In [178]:

# Imprimir las métricas
print("Métricas en conjunto de entrenamiento:")
print("R^2:", r2_train)
print("MSE:", mse_train)
print("MAE:", mae_train)
print("RMSE:", rmse_train)
print("---------------------------")
print("Métricas en conjunto de prueba:")
print("R^2:", r2_test)
print("MSE:", mse_test)
print("MAE:", mae_test)
print("RMSE:", rmse_test)

Métricas en conjunto de entrenamiento:
R^2: 0.8156524015032155
MSE: 68735.41992095124
MAE: 188.13034539025102
RMSE: 262.1744074484602
---------------------------
Métricas en conjunto de prueba:
R^2: 0.8082373964076417
MSE: 63965.346222295244
MAE: 178.48083942988046
RMSE: 252.91371299772428

In [ ]:

Prediction removing income of last prediciton¶

In [179]:

df_num_l=df_num[['numdealspurchases',	'numwebpurchases', 'numcatalogpurchases', 'numstorepurchases','numwebvisitsmonth', 'age', 'dependents', 'spending']]
df_num_l= df_num_l.reset_index()
df_num_l = df_num_l.drop(columns=['index'])
df_num_l

Out[179]:

	numdealspurchases	numwebpurchases	numcatalogpurchases	numstorepurchases	numwebvisitsmonth	age	dependents	spending
0	1	4	4	6	1	53	0	1190
1	1	7	3	7	5	62	0	577
2	1	3	2	5	2	65	1	251
3	1	1	0	2	7	56	2	11
4	2	3	1	2	7	34	1	91
...	...	...	...	...	...	...	...	...
2224	2	5	2	11	4	47	1	689
2225	1	1	0	3	8	46	1	55
2226	2	6	1	5	8	47	1	309
2227	1	5	4	10	3	45	0	1383
2228	1	8	5	4	7	54	2	1078

2229 rows × 8 columns

In [180]:

#X-y split 
y=df_num_l['spending']
X=df_num_l.drop(['spending'],axis=1)
X

Out[180]:

	numdealspurchases	numwebpurchases	numcatalogpurchases	numstorepurchases	numwebvisitsmonth	age	dependents
0	1	4	4	6	1	53	0
1	1	7	3	7	5	62	0
2	1	3	2	5	2	65	1
3	1	1	0	2	7	56	2
4	2	3	1	2	7	34	1
...	...	...	...	...	...	...	...
2224	2	5	2	11	4	47	1
2225	1	1	0	3	8	46	1
2226	2	6	1	5	8	47	1
2227	1	5	4	10	3	45	0
2228	1	8	5	4	7	54	2

2229 rows × 7 columns

In [181]:

# Normalizar datos numéricos utilizando StandardScaler
scaler = StandardScaler()
X_es = scaler.fit_transform(X)
X_es

Out[181]:

array([[-0.69656739, -0.03605431,  0.48770355, ..., -1.79512893,
        -0.09494056, -1.26627659],
       [-0.69656739,  1.04508986,  0.12999009, ..., -0.13842988,
         0.67397452, -1.26627659],
       [-0.69656739, -0.3964357 , -0.22772338, ..., -1.38095417,
         0.93027955,  0.06385089],
       ...,
       [-0.1688146 ,  0.68470847, -0.58543684, ...,  1.10409441,
        -0.60755062,  0.06385089],
       [-0.69656739,  0.32432708,  0.48770355, ..., -0.96677941,
        -0.77842064, -1.26627659],
       [-0.69656739,  1.40547125,  0.84541702, ...,  0.68991965,
        -0.00950556,  1.39397838]])

In [182]:

X=np.concatenate((X_es ,encoder),axis=1)
X

Out[182]:

array([[-0.69656739, -0.03605431,  0.48770355, ...,  0.        ,
         1.        ,  0.        ],
       [-0.69656739,  1.04508986,  0.12999009, ...,  0.        ,
         0.        ,  0.        ],
       [-0.69656739, -0.3964357 , -0.22772338, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.1688146 ,  0.68470847, -0.58543684, ...,  0.        ,
         1.        ,  0.        ],
       [-0.69656739,  0.32432708,  0.48770355, ...,  0.        ,
         0.        ,  0.        ],
       [-0.69656739,  1.40547125,  0.84541702, ...,  0.        ,
         0.        ,  0.        ]])

In [183]:

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [184]:

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled

Out[184]:

array([[-0.68389372, -1.11115159, -0.92884441, ..., -0.41585133,
        -0.96225045, -0.22786351],
       [ 0.33801644,  1.75100978,  0.85168801, ..., -0.41585133,
         1.03923048, -0.22786351],
       [ 0.84897152, -0.39561125, -0.57273793, ..., -0.41585133,
        -0.96225045, -0.22786351],
       ...,
       [-0.68389372, -1.11115159, -0.92884441, ..., -0.41585133,
         1.03923048, -0.22786351],
       [-0.68389372, -0.03784108,  0.13947504, ...,  2.40470553,
        -0.96225045, -0.22786351],
       [-0.68389372,  1.75100978,  1.56390097, ..., -0.41585133,
        -0.96225045, -0.22786351]])

In [185]:

# Crear y ajustar el modelo de regresión lineal
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train)

Out[185]:

LinearRegression()

In [186]:

# Realizar las predicciones en conjunto de entrenamiento y conjunto de prueba
y_train_pred = regression_model.predict(X_train_scaled)
y_test_pred = regression_model.predict(X_test_scaled)

In [187]:

# Calcular las métricas
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = sqrt(mse_train)

r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = sqrt(mse_test)

In [188]:

# Imprimir las métricas
print("Métricas en conjunto de entrenamiento:")
print("R^2:", r2_train)
print("MSE:", mse_train)
print("MAE:", mae_train)
print("RMSE:", rmse_train)
print("---------------------------")
print("Métricas en conjunto de prueba:")
print("R^2:", r2_test)
print("MSE:", mse_test)
print("MAE:", mae_test)
print("RMSE:", rmse_test)

Métricas en conjunto de entrenamiento:
R^2: 0.7532939530028419
MSE: 91986.2470445113
MAE: 205.98088193206826
RMSE: 303.29234583897977
---------------------------
Métricas en conjunto de prueba:
R^2: 0.7535298353825518
MSE: 82213.8890371711
MAE: 194.52090187890965
RMSE: 286.7296445036179

In [ ]: