Table of Contents Data Wrangling Feature Engineering Exploratory Data Analysis Data Preprocessing Modeling_1 Modeling_2 Modeling_Train_Results Modeling_Test_Results

In [1]:

#link python site packages folder
import sys
source_directory = "/Users/pauljacob/Library/Python/3.8/lib/python/site-packages"
sys.path.append(source_directory)

In [2]:

#file structure
!pip3 install cookiecutter

#general
!pip3 install --upgrade pip
!pip3 install ipython-autotime --quiet
!pip3 install watermark

Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: cookiecutter in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (2.1.1)
Requirement already satisfied: binaryornot>=0.4.4 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from cookiecutter) (0.4.4)
Requirement already satisfied: Jinja2<4.0.0,>=2.7 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from cookiecutter) (3.1.2)
Requirement already satisfied: click<9.0.0,>=7.0 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from cookiecutter) (8.1.3)
Requirement already satisfied: pyyaml>=5.3.1 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from cookiecutter) (6.0)
Requirement already satisfied: jinja2-time>=0.2.0 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from cookiecutter) (0.2.0)
Requirement already satisfied: python-slugify>=4.0.0 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from cookiecutter) (6.1.2)
Requirement already satisfied: requests>=2.23.0 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from cookiecutter) (2.28.1)
Requirement already satisfied: chardet>=3.0.2 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from binaryornot>=0.4.4->cookiecutter) (5.0.0)
Requirement already satisfied: MarkupSafe>=2.0 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from Jinja2<4.0.0,>=2.7->cookiecutter) (2.1.1)
Requirement already satisfied: arrow in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from jinja2-time>=0.2.0->cookiecutter) (1.2.3)
Requirement already satisfied: text-unidecode>=1.3 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from python-slugify>=4.0.0->cookiecutter) (1.3)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from requests>=2.23.0->cookiecutter) (2.1.1)
Requirement already satisfied: idna<4,>=2.5 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from requests>=2.23.0->cookiecutter) (3.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from requests>=2.23.0->cookiecutter) (1.26.12)
Requirement already satisfied: certifi>=2017.4.17 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from requests>=2.23.0->cookiecutter) (2022.9.24)
Requirement already satisfied: python-dateutil>=2.7.0 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from arrow->jinja2-time>=0.2.0->cookiecutter) (2.8.2)
Requirement already satisfied: six>=1.5 in /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/site-packages (from python-dateutil>=2.7.0->arrow->jinja2-time>=0.2.0->cookiecutter) (1.15.0)
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: pip in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (23.2.1)
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: watermark in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (2.3.1)
Requirement already satisfied: ipython in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from watermark) (8.5.0)
Requirement already satisfied: backcall in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from ipython->watermark) (0.2.0)
Requirement already satisfied: decorator in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from ipython->watermark) (5.1.1)
Requirement already satisfied: jedi>=0.16 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from ipython->watermark) (0.18.1)
Requirement already satisfied: matplotlib-inline in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from ipython->watermark) (0.1.6)
Requirement already satisfied: pickleshare in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from ipython->watermark) (0.7.5)
Requirement already satisfied: prompt-toolkit<3.1.0,>3.0.1 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from ipython->watermark) (3.0.31)
Requirement already satisfied: pygments>=2.4.0 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from ipython->watermark) (2.13.0)
Requirement already satisfied: stack-data in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from ipython->watermark) (0.5.1)
Requirement already satisfied: traitlets>=5 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from ipython->watermark) (5.4.0)
Requirement already satisfied: pexpect>4.3 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from ipython->watermark) (4.8.0)
Requirement already satisfied: appnope in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from ipython->watermark) (0.1.3)
Requirement already satisfied: parso<0.9.0,>=0.8.0 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from jedi>=0.16->ipython->watermark) (0.8.3)
Requirement already satisfied: ptyprocess>=0.5 in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from pexpect>4.3->ipython->watermark) (0.7.0)
Requirement already satisfied: wcwidth in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from prompt-toolkit<3.1.0,>3.0.1->ipython->watermark) (0.2.5)
Requirement already satisfied: executing in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from stack-data->ipython->watermark) (1.1.0)
Requirement already satisfied: asttokens in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from stack-data->ipython->watermark) (2.0.8)
Requirement already satisfied: pure-eval in /Users/pauljacob/Library/Python/3.8/lib/python/site-packages (from stack-data->ipython->watermark) (0.2.2)
Requirement already satisfied: six in /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/site-packages (from asttokens->stack-data->ipython->watermark) (1.15.0)

In [3]:

%load_ext autotime

time: 283 µs (started: 2023-09-28 17:19:55 -07:00)

Import Libraries¶

In [4]:

#get libraries
import pandas as pd
import os
import numpy as np
import itertools
from itertools import combinations
import warnings

#data wrangling
from functools import reduce

#get visualization libraries
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import matplotlib.patches as mpatches
from matplotlib.ticker import FuncFormatter
import matplotlib.ticker as mtick
from matplotlib.ticker import PercentFormatter
from matplotlib.patches import Patch

#ML preprocessing
from sklearn.preprocessing import StandardScaler

#get ML functions
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV, StratifiedKFold, learning_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import __version__ as sklearn_version
import datetime

#get ML metric functions
from sklearn.metrics import accuracy_score, precision_score, recall_score, auc, precision_recall_curve, confusion_matrix

#get icr module and associated common functions
import in_vehicle_coupon_recommendation as icr
from in_vehicle_coupon_recommendation import p, rcp, rpp, rcr, pl, pdc, save_and_return_data_frame, initialize_custom_notebook_settings

time: 2.26 s (started: 2023-09-28 17:19:55 -07:00)

In [5]:

#initialize notebook
st='no'; number_of_replicates=100 if st=='yes' else 10000

filename_version='4dot3'

initialize_custom_notebook_settings()

%load_ext autoreload
%autoreload 1
%aimport in_vehicle_coupon_recommendation 
%load_ext watermark

time: 31.4 ms (started: 2023-09-28 17:19:57 -07:00)

Get In-Vehicle Coupon Recommendation Dataset¶

In [6]:

df = pd.read_csv(os.path.join('..', 'data', 'raw', 'in-vehicle-coupon-recommendation.csv'))
df = df.sample(frac=1, random_state=200) #row shuffle DataFrame
#p(df)

time: 48.1 ms (started: 2023-09-28 17:19:57 -07:00)

In [ ]:

Table of Contents Data Wrangling Feature Engineering Exploratory Data Analysis Data Preprocessing Modeling_1 Modeling_2 Modeling_Train_Results Modeling_Test_Results

Data Wrangling

In [7]:

#rename 'passanger' and 'coupon' column
df = df.rename(columns={'passanger':'passenger', 'coupon':'coupon_venue_type'})

#decode string 'age' values to (explicit) string age ranges
column_name_age_value_list = list(df.loc[:, 'age'].unique())
column_name_age_value_list_cleaned_up = ['21-25', '46-49', '26-30', '31-35', '41-45', '50+', '36-40', '<21']
column_name_age_value_dict = dict(zip(column_name_age_value_list, column_name_age_value_list_cleaned_up))
df.loc[:, 'age'] = df.loc[:, 'age'].replace(column_name_age_value_dict)

del column_name_age_value_dict, column_name_age_value_list, column_name_age_value_list_cleaned_up


#drop column 'toCoupon_GEQ5min' because all 1's
column_name_list_not_toCoupon_GEQ5min = [column_name for column_name in df.columns if column_name != 'toCoupon_GEQ5min']
df = df.loc[:, column_name_list_not_toCoupon_GEQ5min]

del column_name_list_not_toCoupon_GEQ5min


#rename values in columns 'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', and 'Restaurant20To50' to math language...
#get column name list for column values uniques that are the same as column name 'CarryAway' value uniques
column_name_list_same_unique_values = []
for column_name in df.columns:    
    value = icr.column_name_value_sets_equal(df, column_name1='CarryAway', column_name2=column_name)
    if value == 1:
        column_name_list_same_unique_values = column_name_list_same_unique_values + [column_name]
#rename values in columns 'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', and 'Restaurant20To50' to math language
column_name_Bar_value_list = list(df.loc[:, 'Bar'].unique())
column_name_Bar_value_list_cleaned_up = ['never', '<1', '1-3', '>8', np.nan, '4-8']
column_name_Bar_value_dict = dict(zip(column_name_Bar_value_list, column_name_Bar_value_list_cleaned_up))
df.loc[:, column_name_list_same_unique_values] = df.loc[:, column_name_list_same_unique_values].replace(column_name_Bar_value_dict)

del column_name_Bar_value_dict, column_name_Bar_value_list, column_name_Bar_value_list_cleaned_up, column_name_list_same_unique_values


#clean up columns 'direction_same' and 'direction_opp' to 'direction_same_or_opposite'
#drop column 'direction_same'
column_name_list_not_direction_same = [column_name for column_name in df.columns if column_name != 'direction_same']
df = df.loc[:, column_name_list_not_direction_same]

del column_name_list_not_direction_same

#rename 'direction_opp' to 'direction_same_or_opposite'
df = df.rename(columns={'direction_opp':'direction_same_or_opposite'})


#solution1: fill in missing values with 'no response' or 'unknown'
column_name_list = ['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50', 'car']
df.loc[:, column_name_list] = df.loc[:, column_name_list].fillna('no response')


#fix income values
income_list = df.loc[:, 'income'].drop_duplicates().to_list()
income_list_fixed = ['\$12500 - \$24999', '\$87500 - \$99999', '\$37500 - \$49999', '\$50000 - \$62499', '\$75000 - \$87499', '\$25000 - \$37499', '\$100000 or More', 'Less than \$12500', '\$62500 - \$74999']
income_dictionary = dict(zip(income_list, income_list_fixed))
df.loc[:, 'income'] = df.loc[:, 'income'].replace(income_dictionary)
p(df)

(12684, 24)

Out[7]:

	destination	passenger	weather	temperature	time	coupon_venue_type	expiration	gender	age	maritalStatus	has_children	education	occupation	income	car	Bar	CoffeeHouse	CarryAway	RestaurantLessThan20	Restaurant20To50	toCoupon_GEQ15min	toCoupon_GEQ25min	direction_same_or_opposite	Y
11199	Home	Alone	Sunny	80	6PM	Carry out & Take away	2h	Male	21-25	Single	0	Bachelors degree	Student	\$12500 - \$24999	no response	never	no response	no response	no response	never	1	0	0	1
1474	Work	Alone	Sunny	55	7AM	Bar	1d	Male	46-49	Married partner	1	Graduate degree (Masters or Doctorate)	Management	\$87500 - \$99999	no response	never	<1	<1	never	<1	1	1	1	0
10836	Home	Alone	Sunny	30	6PM	Carry out & Take away	1d	Male	26-30	Single	0	Some college - no degree	Sales & Related	\$37500 - \$49999	no response	<1	no response	>8	<1	never	0	0	1	1
4567	No Urgent Place	Alone	Sunny	80	10AM	Bar	1d	Female	21-25	Unmarried partner	0	Graduate degree (Masters or Doctorate)	Education&Training&Library	\$37500 - \$49999	no response	1-3	<1	<1	<1	never	0	0	1	0
5658	No Urgent Place	Alone	Sunny	80	2PM	Restaurant(<20)	2h	Female	31-35	Single	1	Bachelors degree	Production Occupations	\$37500 - \$49999	no response	never	never	>8	4-8	never	1	0	1	1
11343	Work	Alone	Sunny	80	7AM	Restaurant(20-50)	2h	Female	36-40	Single	1	Bachelors degree	Food Preparation & Serving Related	\$12500 - \$24999	no response	1-3	<1	>8	never	never	0	0	0	0
9036	Home	Alone	Snowy	30	10PM	Restaurant(<20)	2h	Male	26-30	Single	0	Some college - no degree	Student	\$12500 - \$24999	no response	<1	never	>8	never	1-3	1	1	1	0
11050	Home	Alone	Sunny	80	6PM	Restaurant(20-50)	1d	Male	46-49	Single	0	Some college - no degree	Sales & Related	Less than \$12500	no response	<1	<1	>8	>8	>8	0	0	0	0
784	Work	Alone	Sunny	80	7AM	Carry out & Take away	2h	Female	21-25	Single	0	Graduate degree (Masters or Doctorate)	Legal	\$25000 - \$37499	no response	<1	<1	<1	no response	<1	0	0	0	1
1818	No Urgent Place	Kid(s)	Sunny	80	10AM	Bar	1d	Female	36-40	Married partner	1	Bachelors degree	Retired	\$50000 - \$62499	no response	1-3	never	>8	<1	never	1	0	1	0

time: 93.4 ms (started: 2023-09-28 17:19:57 -07:00)

In [8]:

#takeaway: the most represented scenario is heading to Work, alone, sunny weather, 55 degrees, 7am, bar, 1 day til expiration, >25 minute away in the opposite drive direction of destination
column_name_list_scenario = ['destination', 'passenger', 'weather', 'temperature', 'time', 'coupon_venue_type', 'expiration', 'toCoupon_GEQ15min', 'toCoupon_GEQ25min', 'direction_same_or_opposite']
print(df.loc[:, column_name_list_scenario].drop_duplicates().shape)
df.loc[:, column_name_list_scenario].value_counts().to_frame().reset_index().rename(columns={0:'Count'}).head(10)

(201, 10)

Out[8]:

	destination	passenger	weather	temperature	time	coupon_venue_type	expiration	toCoupon_GEQ15min	toCoupon_GEQ25min	direction_same_or_opposite	Count
0	Work	Alone	Sunny	55	7AM	Bar	1d	1	1	1	194
1	No Urgent Place	Friend(s)	Sunny	55	2PM	Carry out & Take away	1d	1	0	1	181
2	No Urgent Place	Friend(s)	Sunny	80	2PM	Coffee House	2h	1	0	1	181
3	Work	Alone	Sunny	80	7AM	Carry out & Take away	2h	0	0	0	181
4	No Urgent Place	Alone	Sunny	55	2PM	Restaurant(<20)	1d	0	0	1	181
5	No Urgent Place	Friend(s)	Sunny	80	10AM	Carry out & Take away	2h	1	0	1	181
6	Work	Alone	Sunny	80	7AM	Restaurant(20-50)	1d	1	0	1	181
7	No Urgent Place	Friend(s)	Sunny	80	6PM	Restaurant(<20)	2h	1	0	1	181
8	No Urgent Place	Friend(s)	Sunny	80	10AM	Coffee House	2h	0	0	1	180
9	Home	Alone	Sunny	80	6PM	Coffee House	2h	0	0	1	180

time: 23.8 ms (started: 2023-09-28 17:19:57 -07:00)

In [9]:

#takeaway: the most represented demographic is Female, 31-35, Married partner, 0, Some college - no degree	Computer & Mathematical, Income $100000 or More, car no response, Bar never, CoffeeHouse never, Takeout no response, low-cost restaurant >8, mid-range restaurant never
column_name_list_not_demographic=['destination', 'passenger', 'weather', 'temperature', 'time', 'coupon_venue_type', 'expiration', 'toCoupon_GEQ15min', 'toCoupon_GEQ25min', 'direction_same_or_opposite', 'Y',]
column_name_list_demographic=[column_name for column_name in df.columns if not column_name in column_name_list_not_demographic]
df.loc[:, column_name_list_demographic].value_counts().reset_index().rename(columns={0:'Count'}).head(10)

Out[9]:

	gender	age	maritalStatus	has_children	education	occupation	income	car	Bar	CoffeeHouse	CarryAway	RestaurantLessThan20	Restaurant20To50	Count
0	Female	31-35	Married partner	0	Some college - no degree	Computer & Mathematical	\$100000 or More	no response	never	never	no response	>8	never	110
1	Male	46-49	Married partner	1	Graduate degree (Masters or Doctorate)	Management	\$87500 - \$99999	no response	never	<1	<1	never	<1	66
2	Female	26-30	Married partner	1	Associates degree	Unemployed	\$50000 - \$62499	no response	1-3	<1	<1	<1	<1	66
3	Male	21-25	Single	0	High School Graduate	Unemployed	\$37500 - \$49999	no response	no response	no response	no response	no response	no response	66
4	Female	31-35	Married partner	0	Some college - no degree	Arts Design Entertainment Sports & Media	\$100000 or More	no response	never	never	no response	>8	never	44
5	Female	21-25	Single	0	Bachelors degree	Unemployed	Less than \$12500	no response	>8	<1	no response	>8	<1	44
6	Female	41-45	Married partner	1	Graduate degree (Masters or Doctorate)	Computer & Mathematical	\$75000 - \$87499	no response	never	>8	no response	<1	never	44
7	Male	<21	Married partner	1	Associates degree	Computer & Mathematical	\$100000 or More	no response	1-3	never	>8	<1	never	44
8	Female	<21	Divorced	1	Graduate degree (Masters or Doctorate)	Student	\$12500 - \$24999	no response	1-3	no response	no response	no response	<1	44
9	Female	50+	Unmarried partner	0	High School Graduate	Student	Less than \$12500	no response	1-3	>8	<1	>8	never	44

time: 23 ms (started: 2023-09-28 17:19:57 -07:00)

In [ ]:

Table of Contents Data Wrangling Feature Engineering Exploratory Data Analysis Data Preprocessing Modeling_1 Modeling_2 Modeling_Train_Results Modeling_Test_Results

Feature Engineering

Get Category Representative Numeric Encoding for Features: expiration, time, age, income¶

In [10]:

#category representative numeric encoding: expiration, time, age, income

expiration_category_representative_numeric_encoding={'expiration':{'2h':2, '1d':24}}
time_category_representative_numeric_encoding={'time':{'7AM':7,'10AM':10, '2PM':14, '6PM':18, '10PM':22}}
age_category_representative_numeric_encoding={'age':{'<21':18, '21-25':23, '26-30':28, '31-35':33, '36-40':38, '41-45':43, '46-49':48, '50+':56}}
income_category_representative_numeric_encoding={'income':{'Less than \$12500':6250, '\$12500 - \$24999':18749.5, '\$25000 - \$37499':31249.5, '\$37500 - \$49999':43749.5, '\$50000 - \$62499':56249.5, '\$62500 - \$74999':68749.5, '\$75000 - \$87499':81249.5, '\$87500 - \$99999':93749.5,'\$100000 or More':150000}}

#combine category representative numeric encoding dictionaries
category_representative_numeric_encoding_dictionary = expiration_category_representative_numeric_encoding|time_category_representative_numeric_encoding|age_category_representative_numeric_encoding|income_category_representative_numeric_encoding

del expiration_category_representative_numeric_encoding, time_category_representative_numeric_encoding, age_category_representative_numeric_encoding, income_category_representative_numeric_encoding

category_representative_numeric_encoding_key_list = list(category_representative_numeric_encoding_dictionary.keys())

df_category_representative_numeric_encoding = df.loc[:, category_representative_numeric_encoding_key_list].replace(category_representative_numeric_encoding_dictionary)

del category_representative_numeric_encoding_dictionary, 

#rename columns
category_representative_numeric_encoding_substring='_category_representative_numeric_encoding'

column_name_list_category_representative_numeric_encoding = [str(column_name) + category_representative_numeric_encoding_substring for column_name in category_representative_numeric_encoding_key_list]

column_name_dictionary_category_representative_numeric_encoding = dict(zip(category_representative_numeric_encoding_key_list, column_name_list_category_representative_numeric_encoding))

df_category_representative_numeric_encoding = df_category_representative_numeric_encoding.rename(columns=column_name_dictionary_category_representative_numeric_encoding)

del column_name_dictionary_category_representative_numeric_encoding, column_name_list_category_representative_numeric_encoding, category_representative_numeric_encoding_key_list, category_representative_numeric_encoding_substring

#p(df_category_representative_numeric_encoding)

time: 27.8 ms (started: 2023-09-28 17:19:57 -07:00)

Get Binary Encoding for Features: gender, expiration¶

In [11]:

#binary encoding: gender, expiration
gender_binary_encoding={'gender':{'Female':0, 'Male':1}}
expiration_binary_encoding={'expiration':{'2h':0, '1d':1}}

binary_encoding_dictionary = gender_binary_encoding|expiration_binary_encoding

del gender_binary_encoding, expiration_binary_encoding

binary_encoding_key_list = list(binary_encoding_dictionary.keys())

df_binary_encoding = df.loc[:, binary_encoding_key_list].replace(binary_encoding_dictionary)

del binary_encoding_dictionary


#rename columns
binary_encoding_substring = '_binary_encoding'
column_name_list_binary_encoding = [str(column_name) + binary_encoding_substring for column_name in binary_encoding_key_list]
column_name_dictionary_binary_encoding = dict(zip(binary_encoding_key_list, column_name_list_binary_encoding))
df_binary_encoding = df_binary_encoding.rename(columns=column_name_dictionary_binary_encoding)

del column_name_dictionary_binary_encoding, column_name_list_binary_encoding, binary_encoding_key_list, binary_encoding_substring

#p(df_binary_encoding)

time: 11.9 ms (started: 2023-09-28 17:19:57 -07:00)

Get Ordinal Integer Encoding for Features: coupon_venue_type, education, income, age, time, temperature¶

In [12]:

#ordinal integer encoding: coupon_venue_type, education, income, age, time, temperature

#category ordinal features to ordinal integer encoding
coupon_venue_type_ordinal_integer_encoding={'coupon_venue_type':{'Coffee House':1, 'Bar':2, 'Carry out & Take away':3, 'Restaurant(<20)':4, 'Restaurant(20-50)':5}}
education_ordinal_integer_encoding={'education':{'Some High School':1, 'High School Graduate':2, 'Some college - no degree':3, 'Associates degree':4, 'Bachelors degree':5,'Graduate degree (Masters or Doctorate)':6}}
income_ordinal_integer_encoding={'income':{'Less than \$12500':1, '\$12500 - \$24999':2, '\$25000 - \$37499':3, '\$37500 - \$49999':4, '\$50000 - \$62499':5, '\$62500 - \$74999':6, '\$75000 - \$87499':7, '\$87500 - \$99999':8, '\$100000 or More':9}}
age_ordinal_integer_encoding={'age':{'<21':1, '21-25':2, '26-30':3, '31-35':4, '36-40':5, '41-45':6, '46-49':7, '50+':8}}
time_ordinal_integer_encoding={'time':{'7AM':1, '10AM':2, '2PM':3, '6PM':4, '10PM':5}}

#numeric to ordinal integer encoding
temperature_ordinal_integer_encoding={'temperature':{30:1, 55:2, 80:3}}


ordinal_integer_encoding_dictionary = coupon_venue_type_ordinal_integer_encoding|education_ordinal_integer_encoding|income_ordinal_integer_encoding|age_ordinal_integer_encoding|time_ordinal_integer_encoding|temperature_ordinal_integer_encoding

del coupon_venue_type_ordinal_integer_encoding, education_ordinal_integer_encoding, income_ordinal_integer_encoding, age_ordinal_integer_encoding, time_ordinal_integer_encoding, temperature_ordinal_integer_encoding

ordinal_integer_encoding_key_list = list(ordinal_integer_encoding_dictionary.keys())

df_ordinal_integer_encoding = df.loc[:, ordinal_integer_encoding_key_list].replace(ordinal_integer_encoding_dictionary)

del ordinal_integer_encoding_dictionary

#rename columns
ordinal_integer_encoding_substring='_ordinal_integer_encoding'
column_name_list_ordinal_integer_encoding=[str(column_name) + ordinal_integer_encoding_substring for column_name in ordinal_integer_encoding_key_list]
column_name_dictionary_ordinal_integer_encoding=dict(zip(ordinal_integer_encoding_key_list,column_name_list_ordinal_integer_encoding))
df_ordinal_integer_encoding=df_ordinal_integer_encoding.rename(columns=column_name_dictionary_ordinal_integer_encoding)

del column_name_dictionary_ordinal_integer_encoding, column_name_list_ordinal_integer_encoding, ordinal_integer_encoding_key_list, ordinal_integer_encoding_substring

#p(df_ordinal_integer_encoding)

time: 32.8 ms (started: 2023-09-28 17:19:57 -07:00)

Get Venue Type Visits Per Month Yes Response to Ordinal Integer Encoding for Features: bar, coffeehouse, carryaway, restaurantlessthan20, restaurant20to50¶

In [13]:

#venue type visits per month yes response to ordinal integer encoding: bar, coffeehouse, carryaway, restaurantlessthan20, restaurant20to50
bar_venue_visit_frequency_yes_response_ordinal_integer_encoding={'Bar':{'never':1, '<1':2, '1-3':3, '4-8':4, '>8':5, 'no response':0}}
coffeehouse_venue_visit_frequency_yes_response_ordinal_integer_encoding={'CoffeeHouse':{'never':1, '<1':2, '1-3':3, '4-8':4, '>8':5, 'no response':0}}
carryaway_venue_visit_frequency_yes_response_ordinal_integer_encoding={'CarryAway':{'never':1, '<1':2, '1-3':3, '4-8':4, '>8':5, 'no response':0}}
restaurantlessthan20_venue_visit_frequency_yes_response_ordinal_integer_encoding={'RestaurantLessThan20':{'never':1, '<1':2, '1-3':3, '4-8':4, '>8':5, 'no response':0}}
restaurant20to50_venue_visit_frequency_yes_response_ordinal_integer_encoding={'Restaurant20To50':{'never':1, '<1':2, '1-3':3, '4-8':4, '>8':5, 'no response':0}}


venue_visit_frequency_yes_response_ordinal_integer_encoding_dictionary=bar_venue_visit_frequency_yes_response_ordinal_integer_encoding|coffeehouse_venue_visit_frequency_yes_response_ordinal_integer_encoding|carryaway_venue_visit_frequency_yes_response_ordinal_integer_encoding|restaurantlessthan20_venue_visit_frequency_yes_response_ordinal_integer_encoding|restaurant20to50_venue_visit_frequency_yes_response_ordinal_integer_encoding

del bar_venue_visit_frequency_yes_response_ordinal_integer_encoding, carryaway_venue_visit_frequency_yes_response_ordinal_integer_encoding, restaurantlessthan20_venue_visit_frequency_yes_response_ordinal_integer_encoding, restaurant20to50_venue_visit_frequency_yes_response_ordinal_integer_encoding
#del bar_venue_visit_frequency_yes_response_ordinal_integer_encoding, coffeehouse_venue_visit_frequency_yes_response_ordinal_integer_encoding, carryaway_venue_visit_frequency_yes_response_ordinal_integer_encoding, restaurantlessthan20_venue_visit_frequency_yes_response_ordinal_integer_encoding, restaurant20to50_venue_visit_frequency_yes_response_ordinal_integer_encoding

venue_visit_frequency_yes_response_ordinal_integer_encoding_key_list=list(venue_visit_frequency_yes_response_ordinal_integer_encoding_dictionary.keys())

df_venue_visit_frequency_yes_response_ordinal_integer_encoding = df.loc[:, venue_visit_frequency_yes_response_ordinal_integer_encoding_key_list].replace(venue_visit_frequency_yes_response_ordinal_integer_encoding_dictionary)

del venue_visit_frequency_yes_response_ordinal_integer_encoding_dictionary

#rename columns
venue_visit_frequency_yes_response_ordinal_integer_encoding_substring='_venue_visit_frequency_yes_response_ordinal_integer_encoding'
column_name_list_venue_visit_frequency_yes_response_ordinal_integer_encoding=[str(column_name) + venue_visit_frequency_yes_response_ordinal_integer_encoding_substring for column_name in venue_visit_frequency_yes_response_ordinal_integer_encoding_key_list]
column_name_dictionary_venue_visit_frequency_yes_response_ordinal_integer_encoding=dict(zip(venue_visit_frequency_yes_response_ordinal_integer_encoding_key_list, column_name_list_venue_visit_frequency_yes_response_ordinal_integer_encoding))
df_venue_visit_frequency_yes_response_ordinal_integer_encoding = df_venue_visit_frequency_yes_response_ordinal_integer_encoding.rename(columns=column_name_dictionary_venue_visit_frequency_yes_response_ordinal_integer_encoding)

del column_name_dictionary_venue_visit_frequency_yes_response_ordinal_integer_encoding, venue_visit_frequency_yes_response_ordinal_integer_encoding_key_list, venue_visit_frequency_yes_response_ordinal_integer_encoding_substring, column_name_list_venue_visit_frequency_yes_response_ordinal_integer_encoding

#p(df_venue_visit_frequency_yes_response_ordinal_integer_encoding)

time: 31.6 ms (started: 2023-09-28 17:19:57 -07:00)

Get Venue Type Visits Per Month No Response to Indicator Variable for Features: bar, coffeehouse, carryaway, restaurantlessthan20, restaurant20to50¶

In [14]:

#venue type visits per month no response to indicator variable: bar, coffeehouse, carryaway, restaurantlessthan20, restaurant20to50
bar_venue_visit_frequency_no_response_indicator_variable={'Bar':{'never':0, '<1':0, '1-3':0, '4-8':0, '>8':0, 'no response':1}}
coffeehouse_venue_visit_frequency_no_response_indicator_variable={'CoffeeHouse':{'never':0, '<1':0, '1-3':0, '4-8':0, '>8':0, 'no response':1}}
carryaway_venue_visit_frequency_no_response_indicator_variable={'CarryAway':{'never':0, '<1':0, '1-3':0, '4-8':0, '>8':0, 'no response':1}}
restaurantlessthan20_venue_visit_frequency_no_response_indicator_variable={'RestaurantLessThan20':{'never':0, '<1':0, '1-3':0, '4-8':0, '>8':0, 'no response':1}}
restaurant20to50_venue_visit_frequency_no_response_indicator_variable={'Restaurant20To50':{'never':0, '<1':0, '1-3':0, '4-8':0, '>8':0, 'no response':1}}

venue_visit_frequency_no_response_indicator_variable_dictionary=bar_venue_visit_frequency_no_response_indicator_variable|coffeehouse_venue_visit_frequency_no_response_indicator_variable|carryaway_venue_visit_frequency_no_response_indicator_variable|restaurantlessthan20_venue_visit_frequency_no_response_indicator_variable|restaurant20to50_venue_visit_frequency_no_response_indicator_variable

del bar_venue_visit_frequency_no_response_indicator_variable, coffeehouse_venue_visit_frequency_no_response_indicator_variable, carryaway_venue_visit_frequency_no_response_indicator_variable, restaurantlessthan20_venue_visit_frequency_no_response_indicator_variable, restaurant20to50_venue_visit_frequency_no_response_indicator_variable

venue_visit_frequency_no_response_indicator_variable_key_list=list(venue_visit_frequency_no_response_indicator_variable_dictionary.keys())

df_venue_visit_frequency_no_response_indicator_variable=df.loc[:, venue_visit_frequency_no_response_indicator_variable_key_list].replace(venue_visit_frequency_no_response_indicator_variable_dictionary)

del venue_visit_frequency_no_response_indicator_variable_dictionary

#rename columns
venue_visit_frequency_no_response_indicator_variable='_venue_visit_frequency_no_response_indicator'
column_name_list_venue_visit_frequency_no_response_indicator_variable=[str(column_name) + venue_visit_frequency_no_response_indicator_variable for column_name in venue_visit_frequency_no_response_indicator_variable_key_list]
column_name_dictionary_venue_visit_frequency_no_response_indicator_variable=dict(zip(venue_visit_frequency_no_response_indicator_variable_key_list, column_name_list_venue_visit_frequency_no_response_indicator_variable))
df_venue_visit_frequency_no_response_indicator_variable=df_venue_visit_frequency_no_response_indicator_variable.rename(columns=column_name_dictionary_venue_visit_frequency_no_response_indicator_variable)

del column_name_dictionary_venue_visit_frequency_no_response_indicator_variable, column_name_list_venue_visit_frequency_no_response_indicator_variable, venue_visit_frequency_no_response_indicator_variable_key_list, venue_visit_frequency_no_response_indicator_variable

#p(df_venue_visit_frequency_no_response_indicator_variable)

time: 31.6 ms (started: 2023-09-28 17:19:57 -07:00)

Concatenate the Data Frames¶

In [15]:

#concatenate the data frames

#get data frame list
data_frame_list=[df, df_category_representative_numeric_encoding, df_binary_encoding, df_ordinal_integer_encoding, df_venue_visit_frequency_yes_response_ordinal_integer_encoding, df_venue_visit_frequency_no_response_indicator_variable]
del df_category_representative_numeric_encoding, df_binary_encoding, df_ordinal_integer_encoding, df_venue_visit_frequency_yes_response_ordinal_integer_encoding, df_venue_visit_frequency_no_response_indicator_variable

#concat data frames
df = pd.concat(data_frame_list, axis=1)
del data_frame_list

p(df)

(12684, 46)

Out[15]:

	destination	passenger	weather	temperature	time	coupon_venue_type	expiration	gender	age	maritalStatus	has_children	education	occupation	income	car	Bar	CoffeeHouse	CarryAway	RestaurantLessThan20	Restaurant20To50	toCoupon_GEQ15min	toCoupon_GEQ25min	direction_same_or_opposite	Y	expiration_category_representative_numeric_encoding	time_category_representative_numeric_encoding	age_category_representative_numeric_encoding	income_category_representative_numeric_encoding	gender_binary_encoding	expiration_binary_encoding	coupon_venue_type_ordinal_integer_encoding	education_ordinal_integer_encoding	income_ordinal_integer_encoding	age_ordinal_integer_encoding	time_ordinal_integer_encoding	temperature_ordinal_integer_encoding	Bar_venue_visit_frequency_yes_response_ordinal_integer_encoding	CoffeeHouse_venue_visit_frequency_yes_response_ordinal_integer_encoding	CarryAway_venue_visit_frequency_yes_response_ordinal_integer_encoding	RestaurantLessThan20_venue_visit_frequency_yes_response_ordinal_integer_encoding	Restaurant20To50_venue_visit_frequency_yes_response_ordinal_integer_encoding	CoffeeHouse_venue_visit_frequency_no_response_indicator	CarryAway_venue_visit_frequency_no_response_indicator	RestaurantLessThan20_venue_visit_frequency_no_response_indicator
11199	Home	Alone	Sunny	80	6PM	Carry out & Take away	2h	Male	21-25	Single	0	Bachelors degree	Student	\$12500 - \$24999	no response	never	no response	no response	no response	never	1	0	0	1	2	18	23	18749.5	1	0	3	5	2	2	4	3	1	0	0	0	1	1	1	1
1474	Work	Alone	Sunny	55	7AM	Bar	1d	Male	46-49	Married partner	1	Graduate degree (Masters or Doctorate)	Management	\$87500 - \$99999	no response	never	<1	<1	never	<1	1	1	1	0	24	7	48	93749.5	1	1	2	6	8	7	1	2	1	2	2	1	2	0	0	0
10836	Home	Alone	Sunny	30	6PM	Carry out & Take away	1d	Male	26-30	Single	0	Some college - no degree	Sales & Related	\$37500 - \$49999	no response	<1	no response	>8	<1	never	0	0	1	1	24	18	28	43749.5	1	1	3	3	4	3	4	1	2	0	5	2	1	1	0	0
4567	No Urgent Place	Alone	Sunny	80	10AM	Bar	1d	Female	21-25	Unmarried partner	0	Graduate degree (Masters or Doctorate)	Education&Training&Library	\$37500 - \$49999	no response	1-3	<1	<1	<1	never	0	0	1	0	24	10	23	43749.5	0	1	2	6	4	2	2	3	3	2	2	2	1	0	0	0
5658	No Urgent Place	Alone	Sunny	80	2PM	Restaurant(<20)	2h	Female	31-35	Single	1	Bachelors degree	Production Occupations	\$37500 - \$49999	no response	never	never	>8	4-8	never	1	0	1	1	2	14	33	43749.5	0	0	4	5	4	4	3	3	1	1	5	4	1	0	0	0
11343	Work	Alone	Sunny	80	7AM	Restaurant(20-50)	2h	Female	36-40	Single	1	Bachelors degree	Food Preparation & Serving Related	\$12500 - \$24999	no response	1-3	<1	>8	never	never	0	0	0	0	2	7	38	18749.5	0	0	5	5	2	5	1	3	3	2	5	1	1	0	0	0
9036	Home	Alone	Snowy	30	10PM	Restaurant(<20)	2h	Male	26-30	Single	0	Some college - no degree	Student	\$12500 - \$24999	no response	<1	never	>8	never	1-3	1	1	1	0	2	22	28	18749.5	1	0	4	3	2	3	5	1	2	1	5	1	3	0	0	0
11050	Home	Alone	Sunny	80	6PM	Restaurant(20-50)	1d	Male	46-49	Single	0	Some college - no degree	Sales & Related	Less than \$12500	no response	<1	<1	>8	>8	>8	0	0	0	0	24	18	48	6250.0	1	1	5	3	1	7	4	3	2	2	5	5	5	0	0	0
784	Work	Alone	Sunny	80	7AM	Carry out & Take away	2h	Female	21-25	Single	0	Graduate degree (Masters or Doctorate)	Legal	\$25000 - \$37499	no response	<1	<1	<1	no response	<1	0	0	0	1	2	7	23	31249.5	0	0	3	6	3	2	1	3	2	2	2	0	2	0	0	1
1818	No Urgent Place	Kid(s)	Sunny	80	10AM	Bar	1d	Female	36-40	Married partner	1	Bachelors degree	Retired	\$50000 - \$62499	no response	1-3	never	>8	<1	never	1	0	1	0	24	10	38	56249.5	0	1	2	5	5	5	2	3	3	1	5	2	1	0	0	0

time: 24.5 ms (started: 2023-09-28 17:19:57 -07:00)

In [16]:

#save df collection with four object train test splits before preprocessed DataFrame
df_collection_filename='df_collection_train_test_v' + filename_version + '.pkl'


df_readback = icr.return_processed_collection_if_it_exists(filename=df_collection_filename)
if df_readback != None:
    df_collection = df_readback
else:
    df_collection = {}
    
    #split the data into train and test
    df_collection['X_train'], df_collection['X_test'], df_collection['Y_train'], df_collection['Y_test'] = \
    train_test_split(df.drop(columns=['Y']), df.loc[:, 'Y'], test_size=.2, random_state=200)
    
    #save preprocessed data frame collection
    df_collection = icr.save_and_return_collection(data_frame_collection=df_collection, filename=df_collection_filename)

p(df_collection['X_train'])

This file already exists
(10147, 45)

Out[16]:

	destination	passenger	weather	temperature	time	coupon_venue_type	expiration	gender	age	maritalStatus	has_children	education	occupation	income	car	Bar	CoffeeHouse	CarryAway	RestaurantLessThan20	Restaurant20To50	toCoupon_GEQ15min	direction_same_or_opposite	expiration_category_representative_numeric_encoding	time_category_representative_numeric_encoding	age_category_representative_numeric_encoding	income_category_representative_numeric_encoding	gender_binary_encoding	expiration_binary_encoding	coupon_venue_type_ordinal_integer_encoding	education_ordinal_integer_encoding	income_ordinal_integer_encoding	age_ordinal_integer_encoding	time_ordinal_integer_encoding	temperature_ordinal_integer_encoding	Bar_venue_visit_frequency_yes_response_ordinal_integer_encoding	CoffeeHouse_venue_visit_frequency_yes_response_ordinal_integer_encoding	CarryAway_venue_visit_frequency_yes_response_ordinal_integer_encoding	RestaurantLessThan20_venue_visit_frequency_yes_response_ordinal_integer_encoding	Restaurant20To50_venue_visit_frequency_yes_response_ordinal_integer_encoding	Bar_venue_visit_frequency_no_response_indicator	CarryAway_venue_visit_frequency_no_response_indicator
8630	No Urgent Place	Friend(s)	Sunny	80	6PM	Coffee House	1d	Female	21-25	Married partner	0	Bachelors degree	Unemployed	\$87500 - \$99999	no response	<1	never	4-8	never	never	0	1	24	18	23	93749.5	0	1	1	5	8	2	4	3	2	1	4	1	1	0	0
2418	No Urgent Place	Friend(s)	Sunny	55	2PM	Coffee House	2h	Female	26-30	Single	0	Bachelors degree	Arts Design Entertainment Sports & Media	\$12500 - \$24999	no response	>8	>8	>8	>8	never	0	1	2	14	28	18749.5	0	0	1	5	2	3	3	2	5	5	5	5	1	0	0
10804	No Urgent Place	Alone	Rainy	55	2PM	Carry out & Take away	2h	Female	31-35	Single	0	Graduate degree (Masters or Doctorate)	Legal	\$75000 - \$87499	no response	no response	>8	>8	>8	>8	0	1	2	14	33	81249.5	0	0	3	6	7	4	3	2	0	5	5	5	5	1	0
747	No Urgent Place	Friend(s)	Sunny	80	2PM	Coffee House	1d	Female	36-40	Married partner	0	Some college - no degree	Healthcare Support	\$25000 - \$37499	no response	<1	<1	>8	>8	<1	1	1	24	14	38	31249.5	0	1	1	3	3	5	3	3	2	2	5	5	2	0	0
7333	No Urgent Place	Friend(s)	Sunny	80	6PM	Coffee House	1d	Male	<21	Unmarried partner	1	Some college - no degree	Building & Grounds Cleaning & Maintenance	\$25000 - \$37499	no response	<1	never	<1	<1	never	0	1	24	18	18	31249.5	1	1	1	3	3	1	4	3	2	1	2	2	1	0	0
10949	No Urgent Place	Kid(s)	Snowy	30	6PM	Bar	1d	Female	31-35	Married partner	1	Bachelors degree	Student	\$37500 - \$49999	no response	1-3	never	no response	>8	never	0	1	24	18	33	43749.5	0	1	2	5	4	4	4	1	3	1	0	5	1	0	1
11937	No Urgent Place	Alone	Rainy	55	10AM	Bar	1d	Female	31-35	Married partner	0	Some college - no degree	Computer & Mathematical	\$100000 or More	no response	never	never	no response	>8	never	1	1	24	10	33	150000.0	0	1	2	3	9	4	2	2	1	1	0	5	1	0	1
735	Home	Alone	Sunny	55	6PM	Restaurant(20-50)	1d	Male	41-45	Single	0	Some college - no degree	Sales & Related	\$37500 - \$49999	no response	1-3	1-3	>8	<1	never	1	1	24	18	43	43749.5	1	1	5	3	4	6	4	2	3	3	5	2	1	0	0
67	No Urgent Place	Friend(s)	Sunny	80	10AM	Coffee House	2h	Male	46-49	Married partner	1	Bachelors degree	Education&Training&Library	\$75000 - \$87499	no response	never	<1	<1	<1	never	0	1	2	10	48	81249.5	1	0	1	5	7	7	2	3	1	2	2	2	1	0	0
3251	No Urgent Place	Friend(s)	Sunny	80	10AM	Coffee House	2h	Female	26-30	Married partner	1	Some college - no degree	Unemployed	\$25000 - \$37499	no response	1-3	never	<1	never	never	0	1	2	10	28	31249.5	0	0	1	3	3	3	2	3	3	1	2	1	1	0	0

time: 30 ms (started: 2023-09-28 17:19:57 -07:00)

In [ ]:

Table of Contents Data Wrangling Feature Engineering Exploratory Data Analysis Data Preprocessing Modeling_1 Modeling_2 Modeling_Train_Results Modeling_Test_Results

Exploratory Data Analysis

Initialize Variables¶

In [17]:

color_list = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6']

time: 575 µs (started: 2023-09-28 17:19:57 -07:00)

Coupon Acceptance Percentage and Frequency Distribution (Vertical Bar Plot)¶

In [18]:

bar_orientation='vertical'
dpi=100
figure_filename = '../reports/figures/figure_coupon_acceptance_percentage_and_frequency_distribution_'+ str(bar_orientation) + '_bar_plot_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'
figure_filename_axes1 = '../reports/figures/figure_coupon_acceptance_percentage_and_frequency_distribution_'+ str(bar_orientation) + '_bar_plot_axes1_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'
figure_filename_axes2 = '../reports/figures/figure_coupon_acceptance_percentage_and_frequency_distribution_'+ str(bar_orientation) + '_bar_plot_axes2_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

plot_save_together_separate = 'together'

figsize=(12, 6.2)
bar_width=0.8

xlabel = 'Coupon Acceptance'
ylabel = 'Percentage'
title = 'Coupon Acceptance '+ str(ylabel) + ' Distribution'

target_value_dictionary = {1: 'Yes', 0:'No'}

df_Y_frequency = df.loc[:, ['Y']].value_counts().reset_index().rename(columns={0:'frequency'})
df_Y_frequency.loc[:, 'percentage'] = df_Y_frequency.loc[:, 'frequency'] / df_Y_frequency.loc[:, 'frequency'].sum(axis=0) * 100
df_Y_frequency_percentage = df_Y_frequency
del df_Y_frequency

x_label_list = df_Y_frequency_percentage.loc[:, 'Y'].replace(target_value_dictionary).to_list()


if plot_save_together_separate == 'together':
    figure, (axes1, axes2) = plt.subplots(nrows=1, ncols=2, figsize=figsize)
elif plot_save_together_separate == 'separate':
    figure1, axes1 = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    figure2, axes2 = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    

#make percentage subplot
annotation_round_by_number = 0

percentage_list = df_Y_frequency_percentage.loc[:, 'percentage'].to_list()

axes1.bar(x=x_label_list, height=percentage_list, width=bar_width, color=[color_list[3], color_list[0]])

axes1.set_title(label=title, fontsize=18)
axes1.set_xlabel(xlabel=xlabel, fontsize=17)
axes1.set_ylabel(ylabel=ylabel, fontsize=17)
axes1.tick_params(axis='both', which='major', labelsize=15)

for rectangle, annotation in zip(axes1.patches, percentage_list):
    axes1.text(rectangle.get_x() + rectangle.get_width() / 2, rectangle.get_height(), format(round(annotation, annotation_round_by_number), '.0f') + '%', ha="center", va="bottom", fontsize=15)

#make frequency subplot
ylabel = 'Frequency'
title = 'Coupon Acceptance '+ str(ylabel) + ' Distribution'
annotation_round_by_number = -2

frequency_list = df_Y_frequency_percentage.loc[:, 'frequency'].to_list()

axes2.bar(x=x_label_list, height=frequency_list, width=bar_width, color=[color_list[3], color_list[0]])

axes2.set_title(label=title, fontsize=18)
axes2.set_xlabel(xlabel=xlabel, fontsize=17)
axes2.set_ylabel(ylabel=ylabel, fontsize=17)
axes2.tick_params(axis='both', which='major', labelsize=15)

for rectangle, annotation in zip(axes2.patches, frequency_list):
    axes2.text(rectangle.get_x() + rectangle.get_width() / 2, rectangle.get_height() + 5, round(annotation, annotation_round_by_number), ha="center", va="bottom", fontsize=15)

plt.tight_layout()

if plot_save_together_separate == 'together':
    figure.savefig(figure_filename, bbox_inches='tight', dpi=dpi)
elif plot_save_together_separate == 'separate':
    figure1.savefig(figure_filename_axes1, bbox_inches='tight', dpi=dpi)
    figure2.savefig(figure_filename_axes2, bbox_inches='tight', dpi=dpi)


plt.show()

time: 262 ms (started: 2023-09-28 17:19:57 -07:00)

In [19]:

#sample size, population size, margin of error, confidence interval

#intialize parameters
z_score = 1.96; confidence_interval = .95

p_value = 0.5684326710816777
margin_of_error = .0381
N_population_size = 2.4 * 10**8

#known 652 surveys acceptance, https://jmlr.org/papers/volume18/16-003/16-003.pdf



#calculate sample size
sample_size = (z_score**2*p_value*(1-p_value)/margin_of_error**2)/(1+(z_score**2*p_value*(1-p_value)/margin_of_error**2/N_population_size))
print('The '+ str(round(confidence_interval*100, None)) +'% confidence interval for a ' + str(round(sample_size)) + ' sample size has a ' \
      + str(margin_of_error) + ' margin of error and ' + str(round(p_value, 5)) + ' p-value representing a '+ '{:,}'.format(round(N_population_size)) +' population size'
      + '. The sample of 652 survey participants is slightly more than 649.\n')

round_by_number = None
print('Therefore, '+ str(round(confidence_interval*100, None)) +'% of the time, we expect a coupon acceptance rate between ' \
      + str(round((p_value-margin_of_error/2)*100, round_by_number)) + '% and ' \
      + str(round((p_value+margin_of_error/2)*100, round_by_number)) + '%')

The 95% confidence interval for a 649 sample size has a 0.0381 margin of error and 0.56843 p-value representing a 240,000,000 population size. The sample of 652 survey participants is slightly more than 649.

Therefore, 95% of the time, we expect a coupon acceptance rate between 55% and 59%
time: 1.66 ms (started: 2023-09-28 17:19:58 -07:00)

In [20]:

feature_column_name = 'expiration'
feature_column_name_label=feature_column_name.capitalize()
xlabel=feature_column_name_label
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'frequency'
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = ['1d', '2h']
xtick_dictionary = {'1d':'1 day', '2h':'2 hours'}

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]


figsize=(8,8)


#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=14, dpi=dpi, xtick_rotation=xtick_rotation, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number)

time: 170 ms (started: 2023-09-28 17:19:58 -07:00)

In [21]:

feature_column_name = 'income'
feature_column_name_label=feature_column_name.capitalize()
xlabel=feature_column_name_label
ylabel='Percentage'

xtick_rotation=90
bar_orientation = 'vertical'
annotation_type = 'percentage'
frequency_annotation_round_by_number=-2
rectangle_annotation_y_offset = -45

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = ['Less than \$12500', '\$12500 - \$24999', '\$25000 - \$37499', '\$37500 - \$49999', '\$50000 - \$62499', '\$62500 - \$74999',  '\$75000 - \$87499', '\$87500 - \$99999', '\$100000 or More',]
xtick_dictionary = None

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]


figsize=(8,8)


#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=14, dpi=dpi, xtick_rotation=xtick_rotation, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, rectangle_annotation_y_offset=rectangle_annotation_y_offset)

time: 305 ms (started: 2023-09-28 17:19:58 -07:00)

Coupon Venue Drive Direction Frequency Distribution (Stacked Bar Plot Percentage Annotation)¶

In [22]:

feature_column_name = 'direction_same_or_opposite'
feature_column_name_label=feature_column_name.capitalize()
xlabel=feature_column_name_label
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'percentage'
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = [0,1]
xtick_dictionary = {0:'Same', 1:'Opposite'}

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,8)


#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=14, dpi=dpi, xtick_rotation=xtick_rotation, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit)

time: 150 ms (started: 2023-09-28 17:19:58 -07:00)

Expiration Frequency Distribution (Stacked Bar Plot Percentage Annotation)¶

In [23]:

feature_column_name = 'expiration'
feature_column_name_label='Coupon '+feature_column_name.capitalize()
xlabel='Coupon '+feature_column_name
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'percentage'
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = ['1d', '2h']
xtick_dictionary = {'1d':'1 day', '2h':'2 hours'}

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,8)


#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=14, dpi=dpi, xtick_rotation=xtick_rotation, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit)

time: 159 ms (started: 2023-09-28 17:19:58 -07:00)

Gender Frequency Distribution (Stacked Bar Plot Percentage Annotation)¶

In [24]:

feature_column_name = 'gender'
feature_column_name_label=feature_column_name.capitalize()
xlabel=feature_column_name_label
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'percentage'
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = ['Female','Male', ]
xtick_dictionary = None

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=7500

figsize=(8,8)


#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=14, dpi=dpi, xtick_rotation=xtick_rotation, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit)

time: 164 ms (started: 2023-09-28 17:19:58 -07:00)

In [25]:

feature_column_name = 'destination'
feature_column_name_label=feature_column_name.capitalize()
xlabel=feature_column_name_label
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'percentage'
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = ['No Urgent Place', 'Work', 'Home']
xtick_dictionary = {'No Urgent Place':'No urgent place', 'Work':'Work', 'Home':'Home'}

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,8)
rectangle_annotation_y_offset = -100


#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=14, dpi=dpi, xtick_rotation=xtick_rotation, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset)

time: 172 ms (started: 2023-09-28 17:19:59 -07:00)

In [26]:

feature_column_name = 'passenger'
feature_column_name_label=feature_column_name.capitalize()
xlabel=feature_column_name_label
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'percentage'
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = ['Alone', 'Friend(s)', 'Partner', 'Kid(s)', ]
xtick_dictionary = None

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,8)

rectangle_annotation_y_offset=-140

#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=14, dpi=dpi, xtick_rotation=xtick_rotation, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset)

time: 188 ms (started: 2023-09-28 17:19:59 -07:00)

In [27]:

feature_column_name = 'weather'
feature_column_name_label=feature_column_name.capitalize()
xlabel=feature_column_name_label
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'percentage'
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = ['Sunny', 'Snowy', 'Rainy', ]
xtick_dictionary = None

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,8)

rectangle_annotation_y_offset=-140

#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=14, dpi=dpi, xtick_rotation=xtick_rotation, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset)

time: 166 ms (started: 2023-09-28 17:19:59 -07:00)

In [28]:

feature_column_name = 'time'
feature_column_name_label=feature_column_name.capitalize()
xlabel=feature_column_name_label
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'percentage'
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = ['7AM', '10AM', '2PM', '6PM', '10PM']
xtick_dictionary = None

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,8)

rectangle_annotation_y_offset=-80

#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=14, dpi=dpi, xtick_rotation=xtick_rotation, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset)

time: 206 ms (started: 2023-09-28 17:19:59 -07:00)

In [29]:

feature_column_name = 'coupon_venue_type'
feature_column_name_label='Coupon Venue Type'#feature_column_name.capitalize()
xlabel='Coupon venue type'#feature_column_name_label
ylabel='Frequency'

xtick_rotation=90
bar_orientation = 'vertical'
annotation_type = 'percentage'
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = ['Coffee House', 'Restaurant(<20)', 'Carry out & Take away', 'Bar', 'Restaurant(20-50)']
xtick_dictionary = {'Coffee House':'Coffee house', 'Restaurant(<20)':'Low-cost restaurant', 'Carry out & Take away':'Takeout', 'Bar':'Bar', 'Restaurant(20-50)':'Mid-range restaurant'}

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,6)

rectangle_annotation_y_offset=-80

#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=14, dpi=dpi, xtick_rotation=xtick_rotation, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset, figsize=figsize)

time: 240 ms (started: 2023-09-28 17:19:59 -07:00)

In [30]:

feature_column_name = 'age'
feature_column_name_label='Age Group'#feature_column_name.capitalize()
xlabel='Age'#feature_column_name_label
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'percentage'
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = ['<21', '21-25', '26-30', '31-35', '36-40', '41-45', '46-49', '50+']
xtick_dictionary = None #{}

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,6)

rectangle_annotation_y_offset=-40

#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=14, dpi=dpi, xtick_rotation=xtick_rotation, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset, figsize=figsize)

time: 236 ms (started: 2023-09-28 17:20:00 -07:00)

In [31]:

feature_column_name = 'maritalStatus'
feature_column_name_label='Marital Status'#feature_column_name.capitalize()
xlabel='Marital status'#feature_column_name_label
ylabel='Frequency'

xtick_rotation=90
bar_orientation = 'vertical'
annotation_type = 'percentage'
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = ['Married partner', 'Single', 'Unmarried partner', 'Divorced', 'Widowed']
xtick_dictionary = None #{}

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,8)

rectangle_annotation_y_offset=-40

#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=14, dpi=dpi, xtick_rotation=xtick_rotation, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset, figsize=figsize)

time: 217 ms (started: 2023-09-28 17:20:00 -07:00)

In [32]:

feature_column_name = 'education'
feature_column_name_label=feature_column_name.capitalize()
xlabel=feature_column_name_label
ylabel='Frequency'

xtick_rotation=90
bar_orientation = 'vertical'
annotation_type = 'percentage'
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = ['Some High School', 'High School Graduate', 'Some college - no degree', 'Associates degree', 'Bachelors degree', 'Graduate degree (Masters or Doctorate)',]
xtick_dictionary = None #{}

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,8)

rectangle_annotation_y_offset=-40

#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=14, dpi=dpi, xtick_rotation=xtick_rotation, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset, figsize=figsize)

time: 253 ms (started: 2023-09-28 17:20:00 -07:00)

In [33]:

feature_column_name = 'occupation'
feature_column_name_label=feature_column_name.capitalize()
xlabel=feature_column_name_label
ylabel='Frequency'

xtick_rotation=90
bar_orientation = 'vertical'
annotation_type = 'percentage'
annotation_text_size = 13
frequency_annotation_round_by_number=-1

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = \
['Unemployed', 'Student', 'Computer & Mathematical', 'Sales & Related', 'Education&Training&Library', 'Management', 
 'Office & Administrative Support', 'Arts Design Entertainment Sports & Media',
 'Business & Financial', 'Retired', 'Food Preparation & Serving Related', 'Healthcare Practitioners & Technical',
 'Healthcare Support', 'Community & Social Services', 'Legal', 'Transportation & Material Moving',
 'Architecture & Engineering', 'Personal Care & Service', 'Protective Service', 'Life Physical Social Science',
 'Construction & Extraction', 'Installation Maintenance & Repair', 'Production Occupations', 'Farming Fishing & Forestry',
 'Building & Grounds Cleaning & Maintenance']

xtick_dictionary = None #{}

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(19,10)

rectangle_annotation_y_offset=-20


#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=annotation_text_size, dpi=dpi, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset, figsize=figsize, xtick_rotation=xtick_rotation)

time: 787 ms (started: 2023-09-28 17:20:00 -07:00)

Bar Monthly Visit Frequency Frequency Distribution¶

In [34]:

icr.reverse_key_value_of_dictionary({'never':1, '<1':2, '1-3':3, '4-8':4, '>8':5, 'no response':0})

Out[34]:

{1: 'never', 2: '<1', 3: '1-3', 4: '4-8', 5: '>8', 0: 'no response'}

time: 2.36 ms (started: 2023-09-28 17:20:01 -07:00)

In [35]:

feature_column_name = 'Bar_venue_visit_frequency_yes_response_ordinal_integer_encoding'
feature_column_name_label='Bar Monthly Visit'
xlabel='Bar monthly visits'
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'percentage'
annotation_text_size = 13
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = [0, 1, 2, 3, 4, 5]
xtick_dictionary = {1: 'never', 2: '<1', 3: '1-3', 4: '4-8', 5: '>8', 0: 'no response'}

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,6)

rectangle_annotation_y_offset=-20


#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=annotation_text_size, dpi=dpi, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset, figsize=figsize, xtick_rotation=xtick_rotation)

time: 202 ms (started: 2023-09-28 17:20:01 -07:00)

In [36]:

feature_column_name = 'CoffeeHouse_venue_visit_frequency_yes_response_ordinal_integer_encoding'
feature_column_name_label='Coffee House Monthly Visit'
xlabel='Coffee house monthly visits'
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'percentage'
annotation_text_size = 13
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = [0, 1, 2, 3, 4, 5]
xtick_dictionary = {1: 'never', 2: '<1', 3: '1-3', 4: '4-8', 5: '>8', 0: 'no response'}

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,6)

rectangle_annotation_y_offset=-20


#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=annotation_text_size, dpi=dpi, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset, figsize=figsize, xtick_rotation=xtick_rotation)

time: 199 ms (started: 2023-09-28 17:20:01 -07:00)

In [37]:

feature_column_name = 'CarryAway_venue_visit_frequency_yes_response_ordinal_integer_encoding'
feature_column_name_label='Takeout Monthly Visit'
xlabel='Takeout monthly visits'
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'percentage'
annotation_text_size = 13
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = [0, 1, 2, 3, 4, 5]
xtick_dictionary = {1: 'never', 2: '<1', 3: '1-3', 4: '4-8', 5: '>8', 0: 'no response'}

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,6)

rectangle_annotation_y_offset=-20


#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=annotation_text_size, dpi=dpi, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset, figsize=figsize, xtick_rotation=xtick_rotation)

time: 190 ms (started: 2023-09-28 17:20:01 -07:00)

In [38]:

feature_column_name = 'RestaurantLessThan20_venue_visit_frequency_yes_response_ordinal_integer_encoding'
feature_column_name_label='Low-Cost Restaurant Monthly Visit'
xlabel='Low-cost restaurant monthly visits'
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'percentage'
annotation_text_size = 13
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = [0, 1, 2, 3, 4, 5]
xtick_dictionary = {1: 'never', 2: '<1', 3: '1-3', 4: '4-8', 5: '>8', 0: 'no response'}

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,6)

rectangle_annotation_y_offset=-20


#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=annotation_text_size, dpi=dpi, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset, figsize=figsize, xtick_rotation=xtick_rotation)

time: 195 ms (started: 2023-09-28 17:20:02 -07:00)

In [39]:

feature_column_name = 'Restaurant20To50_venue_visit_frequency_yes_response_ordinal_integer_encoding'
feature_column_name_label='Mid-Range Restaurant Monthly Visit'
xlabel='Mid-Range restaurant monthly visits'
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'percentage'
annotation_text_size = 13
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = [0, 1, 2, 3, 4, 5]
xtick_dictionary = {1: 'never', 2: '<1', 3: '1-3', 4: '4-8', 5: '>8', 0: 'no response'}

title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,6)

rectangle_annotation_y_offset=-20


#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=annotation_text_size, dpi=dpi, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset, figsize=figsize, xtick_rotation=xtick_rotation)

time: 199 ms (started: 2023-09-28 17:20:02 -07:00)

In [40]:

feature_column_name = 'temperature_ordinal_integer_encoding'
feature_column_name_label='Temperature'
xlabel='Temperature (F)'
ylabel='Frequency'

xtick_rotation=0
bar_orientation = 'vertical'
annotation_type = 'percentage'
annotation_text_size = 13
frequency_annotation_round_by_number=-2

dpi=100
figure_filename = '../reports/figures/figure_'+ str(feature_column_name) +'_frequency_distribution_'+ str(bar_orientation) + '_stacked_bar_coupon_acceptance_coupon_refusal_'+ str(annotation_type) + '_annotation_all_data_dpi_' + str(dpi) + '_v' + filename_version + '.png'

feature_value_order_list = [1, 2, 3,]
xtick_dictionary = {1: 30, 2: 55, 3: 80,}


title= str(feature_column_name_label) + ' Frequency Distribution'

color_index_list=[3,0]
colors = [color_list[color_index_list[0]], color_list[color_index_list[1]]]

y_upper_limit=None

figsize=(8,6)

rectangle_annotation_y_offset=-20


#get column name, coupon acceptance (frequency), coupon refusal (frequency)
df_income_coupon_acceptance_coupon_refusal = icr.get_feature_target_frequency_data_frame(df, feature_column_name=feature_column_name, append_percentage_true_false=True)

#sort by value order list
df_income_coupon_acceptance_coupon_refusal = icr.sort_data_frame(df=df_income_coupon_acceptance_coupon_refusal, feature_column_name=feature_column_name, feature_value_order_list=feature_value_order_list, ascending_true_false=True).drop(columns=[str(feature_column_name)+'_rank'])

#index feature column
df_index_income_coupon_refusal_coupon_acceptance = df_income_coupon_acceptance_coupon_refusal.set_index(feature_column_name)

icr.plot_vertical_stacked_bar_graph(df=df_index_income_coupon_refusal_coupon_acceptance, feature_column_name=feature_column_name, feature_column_name_label=feature_column_name_label, ylabel=ylabel, xlabel=xlabel, figure_filename=figure_filename, annotation_text_size=annotation_text_size, dpi=dpi, xtick_dictionary=xtick_dictionary, colors=colors, annotation_type=annotation_type, frequency_annotation_round_by_number=frequency_annotation_round_by_number, y_upper_limit=y_upper_limit, rectangle_annotation_y_offset=rectangle_annotation_y_offset, figsize=figsize, xtick_rotation=xtick_rotation)

time: 161 ms (started: 2023-09-28 17:20:02 -07:00)

In [ ]:

Table of Contents Data Wrangling Feature Engineering Exploratory Data Analysis Data Preprocessing Modeling_1 Modeling_2 Modeling_Train_Results Modeling_Test_Results

Data Preprocessing

Convert to Indicator Variables from Categorical Variables and Add Back to Numeric Variables¶

In [41]:

#get column name lists
column_name_list_numeric = list(df.select_dtypes('number').columns)
column_name_list_not_numeric = [column_name for column_name in df.columns if not column_name in column_name_list_numeric]

#get categorical features, numeric features, and target data frame
df_categorical_features = df.loc[:, column_name_list_not_numeric]
df_numeric_features = df.loc[:, column_name_list_numeric].drop(columns='Y')
df_target = df.loc[:, ['Y']]

#get category indicators data frame from df_categorical_features
df_category_indicator_features = pd.get_dummies(df_categorical_features)

del df_categorical_features

#combine category indicator features and numeric features
df_indicator_numeric_features = pd.concat([df_category_indicator_features, df_numeric_features], axis=1)

del df_numeric_features, 

p(df_indicator_numeric_features)

(12684, 140)

Out[41]:

	destination_Home	destination_No Urgent Place	destination_Work	passenger_Alone	passenger_Kid(s)	weather_Snowy	weather_Sunny	time_10AM	time_10PM	time_2PM	time_6PM	time_7AM	coupon_venue_type_Bar	coupon_venue_type_Carry out & Take away	coupon_venue_type_Restaurant(20-50)	coupon_venue_type_Restaurant(<20)	expiration_1d	expiration_2h	gender_Female	gender_Male	age_21-25	age_26-30	age_31-35	age_36-40	age_46-49	maritalStatus_Married partner	maritalStatus_Single	maritalStatus_Unmarried partner	education_Bachelors degree	education_Graduate degree (Masters or Doctorate)	education_Some college - no degree	occupation_Education&Training&Library	occupation_Food Preparation & Serving Related	occupation_Legal	occupation_Management	occupation_Production Occupations	occupation_Retired	occupation_Sales & Related	occupation_Student	income_Less than \$12500	income_\$12500 - \$24999	income_\$25000 - \$37499	income_\$37500 - \$49999	income_\$50000 - \$62499	income_\$87500 - \$99999	car_no response	Bar_1-3	Bar_<1	Bar_never	CoffeeHouse_<1	CoffeeHouse_never	CoffeeHouse_no response	CarryAway_<1	CarryAway_>8	CarryAway_no response	RestaurantLessThan20_4-8	RestaurantLessThan20_<1	RestaurantLessThan20_>8	RestaurantLessThan20_never	RestaurantLessThan20_no response	Restaurant20To50_1-3	Restaurant20To50_<1	Restaurant20To50_>8	Restaurant20To50_never	temperature	has_children	toCoupon_GEQ15min	toCoupon_GEQ25min	direction_same_or_opposite	expiration_category_representative_numeric_encoding	time_category_representative_numeric_encoding	age_category_representative_numeric_encoding	income_category_representative_numeric_encoding	gender_binary_encoding	expiration_binary_encoding	coupon_venue_type_ordinal_integer_encoding	education_ordinal_integer_encoding	income_ordinal_integer_encoding	age_ordinal_integer_encoding	time_ordinal_integer_encoding	temperature_ordinal_integer_encoding	Bar_venue_visit_frequency_yes_response_ordinal_integer_encoding	CoffeeHouse_venue_visit_frequency_yes_response_ordinal_integer_encoding	CarryAway_venue_visit_frequency_yes_response_ordinal_integer_encoding	RestaurantLessThan20_venue_visit_frequency_yes_response_ordinal_integer_encoding	Restaurant20To50_venue_visit_frequency_yes_response_ordinal_integer_encoding	CoffeeHouse_venue_visit_frequency_no_response_indicator	CarryAway_venue_visit_frequency_no_response_indicator	RestaurantLessThan20_venue_visit_frequency_no_response_indicator
11199	1	0	0	1	0	0	1	0	0	0	1	0	0	1	0	0	0	1	0	1	1	0	0	0	0	0	1	0	1	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	0	1	0	0	1	0	0	1	0	0	1	0	0	0	0	1	0	0	0	1	80	0	1	0	0	2	18	23	18749.5	1	0	3	5	2	2	4	3	1	0	0	0	1	1	1	1
1474	0	0	1	1	0	0	1	0	0	0	0	1	1	0	0	0	1	0	0	1	0	0	0	0	1	1	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	0	0	0	1	1	0	0	1	1	0	0	1	0	0	0	0	0	1	0	0	1	0	0	55	1	1	1	1	24	7	48	93749.5	1	1	2	6	8	7	1	2	1	2	2	1	2	0	0	0
10836	1	0	0	1	0	0	1	0	0	0	1	0	0	1	0	0	1	0	0	1	0	1	0	0	0	0	1	0	0	0	1	0	0	0	0	0	0	1	0	0	0	0	1	0	0	1	0	1	0	0	0	1	0	1	0	0	1	0	0	0	0	0	0	1	30	0	0	0	1	24	18	28	43749.5	1	1	3	3	4	3	4	1	2	0	5	2	1	1	0	0
4567	0	1	0	1	0	0	1	1	0	0	0	0	1	0	0	0	1	0	1	0	1	0	0	0	0	0	0	1	0	1	0	1	0	0	0	0	0	0	0	0	0	0	1	0	0	1	1	0	0	1	0	0	1	0	0	0	1	0	0	0	0	0	0	1	80	0	0	0	1	24	10	23	43749.5	0	1	2	6	4	2	2	3	3	2	2	2	1	0	0	0
5658	0	1	0	1	0	0	1	0	0	1	0	0	0	0	0	1	0	1	1	0	0	0	1	0	0	0	1	0	1	0	0	0	0	0	0	1	0	0	0	0	0	0	1	0	0	1	0	0	1	0	1	0	0	1	0	1	0	0	0	0	0	0	0	1	80	1	1	0	1	2	14	33	43749.5	0	0	4	5	4	4	3	3	1	1	5	4	1	0	0	0
11343	0	0	1	1	0	0	1	0	0	0	0	1	0	0	1	0	0	1	1	0	0	0	0	1	0	0	1	0	1	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0	0	1	1	0	0	1	0	0	0	1	0	0	0	0	1	0	0	0	0	1	80	1	0	0	0	2	7	38	18749.5	0	0	5	5	2	5	1	3	3	2	5	1	1	0	0	0
9036	1	0	0	1	0	1	0	0	1	0	0	0	0	0	0	1	0	1	0	1	0	1	0	0	0	0	1	0	0	0	1	0	0	0	0	0	0	0	1	0	1	0	0	0	0	1	0	1	0	0	1	0	0	1	0	0	0	0	1	0	1	0	0	0	30	0	1	1	1	2	22	28	18749.5	1	0	4	3	2	3	5	1	2	1	5	1	3	0	0	0
11050	1	0	0	1	0	0	1	0	0	0	1	0	0	0	1	0	1	0	0	1	0	0	0	0	1	0	1	0	0	0	1	0	0	0	0	0	0	1	0	1	0	0	0	0	0	1	0	1	0	1	0	0	0	1	0	0	0	1	0	0	0	0	1	0	80	0	0	0	0	24	18	48	6250.0	1	1	5	3	1	7	4	3	2	2	5	5	5	0	0	0
784	0	0	1	1	0	0	1	0	0	0	0	1	0	1	0	0	0	1	1	0	1	0	0	0	0	0	1	0	0	1	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0	1	0	1	0	1	0	0	1	0	0	0	0	0	0	1	0	1	0	0	80	0	0	0	0	2	7	23	31249.5	0	0	3	6	3	2	1	3	2	2	2	0	2	0	0	1
1818	0	1	0	0	1	0	1	1	0	0	0	0	1	0	0	0	1	0	1	0	0	0	0	1	0	1	0	0	1	0	0	0	0	0	0	0	1	0	0	0	0	0	0	1	0	1	1	0	0	0	1	0	0	1	0	0	1	0	0	0	0	0	0	1	80	1	1	0	1	24	10	38	56249.5	0	1	2	5	5	5	2	3	3	1	5	2	1	0	0	0

time: 61 ms (started: 2023-09-28 17:20:02 -07:00)

Get Data Frame Collection With Train and Test Features¶

In [42]:

data_frame_collection_filename='data_frame_collection_train_test_v' + filename_version + '.pkl'

df_readback = icr.return_processed_collection_if_it_exists(filename=data_frame_collection_filename, parse_dates=False)
if df_readback != None:
    data_frame_collection = df_readback
else:
    data_frame_collection = {}

    #split the data into train and test
    data_frame_collection['X_train'], data_frame_collection['X_test'], data_frame_collection['Y_train'], data_frame_collection['Y_test'] = \
    train_test_split(df_indicator_numeric_features, df_target, test_size=.2, random_state=200)

    del df_indicator_numeric_features

    #save preprocessed data frame collection
    data_frame_collection = icr.save_and_return_collection(data_frame_collection=data_frame_collection, filename=data_frame_collection_filename)

p(data_frame_collection['X_train'])

This file already exists
(10147, 140)

Out[42]:

	destination_Home	destination_No Urgent Place	passenger_Alone	passenger_Friend(s)	passenger_Kid(s)	weather_Rainy	weather_Snowy	weather_Sunny	time_10AM	time_2PM	time_6PM	coupon_venue_type_Bar	coupon_venue_type_Carry out & Take away	coupon_venue_type_Coffee House	coupon_venue_type_Restaurant(20-50)	expiration_1d	expiration_2h	gender_Female	gender_Male	age_21-25	age_26-30	age_31-35	age_36-40	age_41-45	age_46-49	age_<21	maritalStatus_Married partner	maritalStatus_Single	maritalStatus_Unmarried partner	education_Bachelors degree	education_Graduate degree (Masters or Doctorate)	education_Some college - no degree	occupation_Arts Design Entertainment Sports & Media	occupation_Building & Grounds Cleaning & Maintenance	occupation_Computer & Mathematical	occupation_Education&Training&Library	occupation_Healthcare Support	occupation_Legal	occupation_Sales & Related	occupation_Student	occupation_Unemployed	income_\$100000 or More	income_\$12500 - \$24999	income_\$25000 - \$37499	income_\$37500 - \$49999	income_\$75000 - \$87499	income_\$87500 - \$99999	car_no response	Bar_1-3	Bar_<1	Bar_>8	Bar_never	Bar_no response	CoffeeHouse_1-3	CoffeeHouse_<1	CoffeeHouse_>8	CoffeeHouse_never	CarryAway_4-8	CarryAway_<1	CarryAway_>8	CarryAway_no response	RestaurantLessThan20_<1	RestaurantLessThan20_>8	RestaurantLessThan20_never	Restaurant20To50_<1	Restaurant20To50_>8	Restaurant20To50_never	temperature	has_children	toCoupon_GEQ15min	direction_same_or_opposite	expiration_category_representative_numeric_encoding	time_category_representative_numeric_encoding	age_category_representative_numeric_encoding	income_category_representative_numeric_encoding	gender_binary_encoding	expiration_binary_encoding	coupon_venue_type_ordinal_integer_encoding	education_ordinal_integer_encoding	income_ordinal_integer_encoding	age_ordinal_integer_encoding	time_ordinal_integer_encoding	temperature_ordinal_integer_encoding	Bar_venue_visit_frequency_yes_response_ordinal_integer_encoding	CoffeeHouse_venue_visit_frequency_yes_response_ordinal_integer_encoding	CarryAway_venue_visit_frequency_yes_response_ordinal_integer_encoding	RestaurantLessThan20_venue_visit_frequency_yes_response_ordinal_integer_encoding	Restaurant20To50_venue_visit_frequency_yes_response_ordinal_integer_encoding	Bar_venue_visit_frequency_no_response_indicator	CarryAway_venue_visit_frequency_no_response_indicator
8630	0	1	0	1	0	0	0	1	0	0	1	0	0	1	0	1	0	1	0	1	0	0	0	0	0	0	1	0	0	1	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	1	1	0	1	0	0	0	0	0	0	1	1	0	0	0	0	0	1	0	0	1	80	0	0	1	24	18	23	93749.5	0	1	1	5	8	2	4	3	2	1	4	1	1	0	0
2418	0	1	0	1	0	0	0	1	0	1	0	0	0	1	0	0	1	1	0	0	1	0	0	0	0	0	0	1	0	1	0	0	1	0	0	0	0	0	0	0	0	0	1	0	0	0	0	1	0	0	1	0	0	0	0	1	0	0	0	1	0	0	1	0	0	0	1	55	0	0	1	2	14	28	18749.5	0	0	1	5	2	3	3	2	5	5	5	5	1	0	0
10804	0	1	1	0	0	1	0	0	0	1	0	0	1	0	0	0	1	1	0	0	0	1	0	0	0	0	0	1	0	0	1	0	0	0	0	0	0	1	0	0	0	0	0	0	0	1	0	1	0	0	0	0	1	0	0	1	0	0	0	1	0	0	1	0	0	1	0	55	0	0	1	2	14	33	81249.5	0	0	3	6	7	4	3	2	0	5	5	5	5	1	0
747	0	1	0	1	0	0	0	1	0	1	0	0	0	1	0	1	0	1	0	0	0	0	1	0	0	0	1	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	1	0	0	0	1	0	1	0	0	0	0	1	0	0	0	0	1	0	0	1	0	1	0	0	80	0	1	1	24	14	38	31249.5	0	1	1	3	3	5	3	3	2	2	5	5	2	0	0
7333	0	1	0	1	0	0	0	1	0	0	1	0	0	1	0	1	0	0	1	0	0	0	0	0	0	1	0	0	1	0	0	1	0	1	0	0	0	0	0	0	0	0	0	1	0	0	0	1	0	1	0	0	0	0	0	0	1	0	1	0	0	1	0	0	0	0	1	80	1	0	1	24	18	18	31249.5	1	1	1	3	3	1	4	3	2	1	2	2	1	0	0
10949	0	1	0	0	1	0	1	0	0	0	1	1	0	0	0	1	0	1	0	0	0	1	0	0	0	0	1	0	0	1	0	0	0	0	0	0	0	0	0	1	0	0	0	0	1	0	0	1	1	0	0	0	0	0	0	0	1	0	0	0	1	0	1	0	0	0	1	30	1	0	1	24	18	33	43749.5	0	1	2	5	4	4	4	1	3	1	0	5	1	0	1
11937	0	1	1	0	0	1	0	0	1	0	0	1	0	0	0	1	0	1	0	0	0	1	0	0	0	0	1	0	0	0	0	1	0	0	1	0	0	0	0	0	0	1	0	0	0	0	0	1	0	0	0	1	0	0	0	0	1	0	0	0	1	0	1	0	0	0	1	55	0	1	1	24	10	33	150000.0	0	1	2	3	9	4	2	2	1	1	0	5	1	0	1
735	1	0	1	0	0	0	0	1	0	0	1	0	0	0	1	1	0	0	1	0	0	0	0	1	0	0	0	1	0	0	0	1	0	0	0	0	0	0	1	0	0	0	0	0	1	0	0	1	1	0	0	0	0	1	0	0	0	0	0	1	0	1	0	0	0	0	1	55	0	1	1	24	18	43	43749.5	1	1	5	3	4	6	4	2	3	3	5	2	1	0	0
67	0	1	0	1	0	0	0	1	1	0	0	0	0	1	0	0	1	0	1	0	0	0	0	0	1	0	1	0	0	1	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	1	0	0	1	0	0	0	1	0	0	1	0	0	0	0	1	80	1	0	1	2	10	48	81249.5	1	0	1	5	7	7	2	3	1	2	2	2	1	0	0
3251	0	1	0	1	0	0	0	1	1	0	0	0	0	1	0	0	1	1	0	0	1	0	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	0	0	1	0	0	1	0	0	0	1	1	0	0	0	0	0	0	0	1	0	1	0	0	0	0	1	0	0	1	80	1	0	1	2	10	28	31249.5	0	0	1	3	3	3	2	3	3	1	2	1	1	0	0

time: 36.8 ms (started: 2023-09-28 17:20:02 -07:00)

Get Stratified 5-Fold Cross Validation X_train, X_test, Y_train, Y_test Collection¶

In [43]:

#stratified 5-fold cross validation train-test data row selection

#Stratified 5-Fold object
StratifiedKFold_5_splits = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

#get stratified 5-fold data frame splits
stratified_fold_number_X_train_X_test_Y_train_Y_test_collection = {}


for index, (train_index, test_index) in enumerate(StratifiedKFold_5_splits.split(data_frame_collection['X_train'], data_frame_collection['Y_train'].loc[:, 'Y'])):
    print('fold ' + str(index) + " TRAIN:", train_index, "TEST:", test_index)
    stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(index)] = \
    {'X_train' : data_frame_collection['X_train'].iloc[train_index, :], \
     'X_test' : data_frame_collection['X_train'].iloc[test_index, :], \
     'Y_train' : data_frame_collection['Y_train'].loc[:, 'Y'].iloc[train_index], \
     'Y_test' : data_frame_collection['Y_train'].loc[:, 'Y'].iloc[test_index]}

#use .loc for selection by index label, .iloc for selection by index location

print()
p(stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold 0']['X_train'])

fold 0 TRAIN: [ 2028  2030  2031 ... 10144 10145 10146] TEST: [   0    1    2 ... 2027 2029 2032]
fold 1 TRAIN: [    0     1     2 ... 10144 10145 10146] TEST: [2028 2030 2031 ... 4095 4101 4102]
fold 2 TRAIN: [    0     1     2 ... 10144 10145 10146] TEST: [4009 4010 4012 ... 6131 6133 6135]
fold 3 TRAIN: [    0     1     2 ... 10144 10145 10146] TEST: [6036 6037 6043 ... 8126 8127 8128]
fold 4 TRAIN: [   0    1    2 ... 8126 8127 8128] TEST: [ 8099  8105  8109 ... 10144 10145 10146]

(8117, 140)

Out[43]:

	destination_Home	destination_No Urgent Place	destination_Work	passenger_Alone	passenger_Friend(s)	passenger_Kid(s)	passenger_Partner	weather_Rainy	weather_Snowy	weather_Sunny	time_10AM	time_6PM	time_7AM	coupon_venue_type_Bar	coupon_venue_type_Carry out & Take away	coupon_venue_type_Coffee House	coupon_venue_type_Restaurant(20-50)	coupon_venue_type_Restaurant(<20)	expiration_1d	expiration_2h	gender_Female	gender_Male	age_26-30	age_31-35	age_36-40	age_41-45	age_46-49	age_<21	maritalStatus_Married partner	maritalStatus_Single	maritalStatus_Unmarried partner	education_Associates degree	education_Bachelors degree	education_Some college - no degree	occupation_Business & Financial	occupation_Computer & Mathematical	occupation_Education&Training&Library	occupation_Healthcare Support	occupation_Office & Administrative Support	occupation_Retired	occupation_Sales & Related	occupation_Student	occupation_Unemployed	income_\$100000 or More	income_\$25000 - \$37499	income_\$37500 - \$49999	income_\$50000 - \$62499	income_\$75000 - \$87499	car_no response	Bar_1-3	Bar_<1	Bar_never	CoffeeHouse_1-3	CoffeeHouse_<1	CoffeeHouse_>8	CoffeeHouse_never	CarryAway_<1	CarryAway_>8	CarryAway_no response	RestaurantLessThan20_<1	RestaurantLessThan20_>8	RestaurantLessThan20_never	RestaurantLessThan20_no response	Restaurant20To50_<1	Restaurant20To50_never	temperature	has_children	toCoupon_GEQ15min	toCoupon_GEQ25min	direction_same_or_opposite	expiration_category_representative_numeric_encoding	time_category_representative_numeric_encoding	age_category_representative_numeric_encoding	income_category_representative_numeric_encoding	gender_binary_encoding	expiration_binary_encoding	coupon_venue_type_ordinal_integer_encoding	education_ordinal_integer_encoding	income_ordinal_integer_encoding	age_ordinal_integer_encoding	time_ordinal_integer_encoding	temperature_ordinal_integer_encoding	Bar_venue_visit_frequency_yes_response_ordinal_integer_encoding	CoffeeHouse_venue_visit_frequency_yes_response_ordinal_integer_encoding	CarryAway_venue_visit_frequency_yes_response_ordinal_integer_encoding	RestaurantLessThan20_venue_visit_frequency_yes_response_ordinal_integer_encoding	Restaurant20To50_venue_visit_frequency_yes_response_ordinal_integer_encoding	CarryAway_venue_visit_frequency_no_response_indicator	RestaurantLessThan20_venue_visit_frequency_no_response_indicator
11981	0	1	0	0	1	0	0	0	0	1	1	0	0	0	1	0	0	0	0	1	1	0	1	0	0	0	0	0	1	0	0	0	1	0	0	0	0	0	0	0	0	0	1	0	0	1	0	0	1	1	0	0	0	0	0	1	1	0	0	1	0	0	0	0	1	30	1	0	0	1	2	10	28	43749.5	0	0	3	5	4	3	2	1	3	1	2	2	1	0	0
3996	0	0	1	1	0	0	0	0	0	1	0	0	1	0	0	0	0	1	1	0	0	1	0	0	0	0	0	1	0	1	0	0	1	0	1	0	0	0	0	0	0	0	0	1	0	0	0	0	1	0	0	1	0	1	0	0	0	0	1	0	0	0	1	1	0	55	0	0	0	1	24	7	18	150000.0	1	1	4	5	9	1	1	2	1	2	0	0	2	1	1
1301	0	0	1	1	0	0	0	0	0	1	0	0	1	0	0	0	0	1	1	0	0	1	0	0	1	0	0	0	0	0	1	1	0	0	0	0	0	0	0	1	0	0	0	0	0	0	1	0	1	1	0	0	0	0	1	0	0	1	0	1	0	0	0	0	1	55	1	0	0	1	24	7	38	56249.5	1	1	4	4	5	5	1	2	3	5	5	2	1	0	0
9720	1	0	0	1	0	0	0	1	0	0	0	1	0	1	0	0	0	0	1	0	1	0	0	0	0	0	1	0	0	1	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	1	0	0	1	1	0	0	0	1	0	0	0	0	1	0	1	0	0	1	0	55	0	1	1	1	24	18	48	43749.5	0	1	2	3	4	7	4	2	3	2	0	5	2	1	0
752	0	1	0	0	0	0	1	0	0	1	1	0	0	1	0	0	0	0	0	1	1	0	0	0	1	0	0	0	1	0	0	0	0	1	0	0	0	1	0	0	0	0	0	0	1	0	0	0	1	0	1	0	0	1	0	0	0	1	0	0	1	0	0	1	0	80	0	0	0	1	2	10	38	31249.5	0	0	2	3	3	5	2	3	2	2	5	5	2	0	0
10949	0	1	0	0	0	1	0	0	1	0	0	1	0	1	0	0	0	0	1	0	1	0	0	1	0	0	0	0	1	0	0	0	1	0	0	0	0	0	0	0	0	1	0	0	0	1	0	0	1	1	0	0	0	0	0	1	0	0	1	0	1	0	0	0	1	30	1	0	0	1	24	18	33	43749.5	0	1	2	5	4	4	4	1	3	1	0	5	1	1	0
11937	0	1	0	1	0	0	0	1	0	0	1	0	0	1	0	0	0	0	1	0	1	0	0	1	0	0	0	0	1	0	0	0	0	1	0	1	0	0	0	0	0	0	0	1	0	0	0	0	1	0	0	1	0	0	0	1	0	0	1	0	1	0	0	0	1	55	0	1	0	1	24	10	33	150000.0	0	1	2	3	9	4	2	2	1	1	0	5	1	1	0
735	1	0	0	1	0	0	0	0	0	1	0	1	0	0	0	0	1	0	1	0	0	1	0	0	0	1	0	0	0	1	0	0	0	1	0	0	0	0	0	0	1	0	0	0	0	1	0	0	1	1	0	0	1	0	0	0	0	1	0	1	0	0	0	0	1	55	0	1	0	1	24	18	43	43749.5	1	1	5	3	4	6	4	2	3	3	5	2	1	0	0
67	0	1	0	0	1	0	0	0	0	1	1	0	0	0	0	1	0	0	0	1	0	1	0	0	0	0	1	0	1	0	0	0	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	1	1	0	0	1	0	1	0	0	1	0	0	1	0	0	0	0	1	80	1	0	0	1	2	10	48	81249.5	1	0	1	5	7	7	2	3	1	2	2	2	1	0	0
3251	0	1	0	0	1	0	0	0	0	1	1	0	0	0	0	1	0	0	0	1	1	0	1	0	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	0	0	1	0	1	0	0	0	1	1	0	0	0	0	0	1	1	0	0	0	0	1	0	0	1	80	1	0	0	1	2	10	28	31249.5	0	0	1	3	3	3	2	3	3	1	2	1	1	0	0

time: 68.8 ms (started: 2023-09-28 17:20:02 -07:00)

Standardize and Scale Features of Stratified Data Frame Collection for Hyperparameter Tuning¶

In [44]:

#standardize and scale fold 0 thru 4 train and test sets

number_of_folds=5

for fold_number in range(number_of_folds):
    
    standard_scaler = StandardScaler()
    standardized_scaled_stratified_fold_number_X_train_ndarray=standard_scaler.fit_transform(stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(fold_number)]['X_train'])
    

    stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(fold_number)]['X_train'] = \
    pd.DataFrame(standardized_scaled_stratified_fold_number_X_train_ndarray, 
                 columns=stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(fold_number)]['X_train'].columns, 
                 index=stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(fold_number)]['X_train'].index)
    
    
    standardized_scaled_stratified_fold_number_X_test_ndarray=standard_scaler.transform(stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(fold_number)]['X_test'])
    
    stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(fold_number)]['X_test'] = \
    pd.DataFrame(standardized_scaled_stratified_fold_number_X_test_ndarray, 
                 columns=stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(fold_number)]['X_test'].columns, 
                 index=stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(fold_number)]['X_test'].index)

print()
p(stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold 0']['X_test'])

(2030, 140)

Out[44]:

	destination_Home	destination_No Urgent Place	destination_Work	passenger_Alone	passenger_Friend(s)	passenger_Kid(s)	passenger_Partner	weather_Rainy	weather_Snowy	weather_Sunny	time_10AM	time_10PM	time_2PM	time_6PM	time_7AM	coupon_venue_type_Bar	coupon_venue_type_Carry out & Take away	coupon_venue_type_Coffee House	coupon_venue_type_Restaurant(20-50)	coupon_venue_type_Restaurant(<20)	expiration_1d	expiration_2h	gender_Female	gender_Male	age_21-25	age_26-30	age_31-35	age_36-40	age_41-45	age_46-49	age_50+	age_<21	maritalStatus_Divorced	maritalStatus_Married partner	maritalStatus_Single	maritalStatus_Unmarried partner	maritalStatus_Widowed	education_Associates degree	education_Bachelors degree	education_Graduate degree (Masters or Doctorate)	education_High School Graduate	education_Some High School	education_Some college - no degree	occupation_Architecture & Engineering	occupation_Arts Design Entertainment Sports & Media	occupation_Building & Grounds Cleaning & Maintenance	occupation_Business & Financial	occupation_Community & Social Services	occupation_Computer & Mathematical	occupation_Construction & Extraction	occupation_Education&Training&Library	occupation_Farming Fishing & Forestry	occupation_Food Preparation & Serving Related	occupation_Healthcare Practitioners & Technical	occupation_Healthcare Support	occupation_Installation Maintenance & Repair	occupation_Legal	occupation_Life Physical Social Science	occupation_Management	occupation_Office & Administrative Support	occupation_Personal Care & Service	occupation_Production Occupations	occupation_Protective Service	occupation_Retired	occupation_Sales & Related	occupation_Student	occupation_Transportation & Material Moving	occupation_Unemployed	income_Less than \$12500	income_\$100000 or More	income_\$12500 - \$24999	income_\$25000 - \$37499	income_\$37500 - \$49999	income_\$50000 - \$62499	income_\$62500 - \$74999	income_\$75000 - \$87499	income_\$87500 - \$99999	car_Car that is too old to install Onstar :D	car_Mazda5	car_Scooter and motorcycle	car_crossover	car_do not drive	car_no response	Bar_1-3	Bar_4-8	Bar_<1	Bar_>8	Bar_never	Bar_no response	CoffeeHouse_1-3	CoffeeHouse_4-8	CoffeeHouse_<1	CoffeeHouse_>8	CoffeeHouse_never	CoffeeHouse_no response	CarryAway_1-3	CarryAway_4-8	CarryAway_<1	CarryAway_>8	CarryAway_never	CarryAway_no response	RestaurantLessThan20_1-3	RestaurantLessThan20_4-8	RestaurantLessThan20_<1	RestaurantLessThan20_>8	RestaurantLessThan20_never	RestaurantLessThan20_no response	Restaurant20To50_1-3	Restaurant20To50_4-8	Restaurant20To50_<1	Restaurant20To50_>8	Restaurant20To50_never	Restaurant20To50_no response	temperature	has_children	toCoupon_GEQ15min	toCoupon_GEQ25min	direction_same_or_opposite	expiration_category_representative_numeric_encoding	time_category_representative_numeric_encoding	age_category_representative_numeric_encoding	income_category_representative_numeric_encoding	gender_binary_encoding	expiration_binary_encoding	coupon_venue_type_ordinal_integer_encoding	education_ordinal_integer_encoding	income_ordinal_integer_encoding	age_ordinal_integer_encoding	time_ordinal_integer_encoding	temperature_ordinal_integer_encoding	Bar_venue_visit_frequency_yes_response_ordinal_integer_encoding	CoffeeHouse_venue_visit_frequency_yes_response_ordinal_integer_encoding	CarryAway_venue_visit_frequency_yes_response_ordinal_integer_encoding	RestaurantLessThan20_venue_visit_frequency_yes_response_ordinal_integer_encoding	Restaurant20To50_venue_visit_frequency_yes_response_ordinal_integer_encoding	Bar_venue_visit_frequency_no_response_indicator	CoffeeHouse_venue_visit_frequency_no_response_indicator	CarryAway_venue_visit_frequency_no_response_indicator	RestaurantLessThan20_venue_visit_frequency_no_response_indicator	Restaurant20To50_venue_visit_frequency_no_response_indicator
8630	-0.587167	1.005312	-0.571612	-1.159860	1.683508	-0.296765	-0.304077	-0.325822	-0.350262	0.508187	-0.469292	-0.434891	-0.434490	1.707507	-0.571612	-0.438291	-0.481562	1.466632	-0.363860	-0.525411	0.884749	-0.884749	0.977221	-0.977221	1.991112	-0.511448	-0.437092	-0.412026	-0.304561	-0.242267	-0.209411	-0.339527	-0.203573	1.212198	-0.766297	-0.459687	-0.104691	-0.314360	1.390123	-0.415089	-0.280739	-0.085568	-0.720544	-0.111685	-0.226771	-0.062912	-0.211327	-0.143153	-0.3549	-0.115578	-0.281759	-0.055583	-0.154401	-0.141803	-0.138143	-0.102259	-0.128573	-0.122497	-0.263770	-0.226169	-0.114479	-0.096571	-0.116668	-0.203573	-0.310798	-0.375272	-0.132959	2.388038	-0.301167	-0.39489	-0.405457	-0.430677	-0.413048	-0.389454	-0.269074	-0.268547	3.588205	-0.045812	-0.036838	-0.041566	-0.041566	-0.040052	0.092594	-0.837023	-0.101022	2.022924	-0.297502	-0.612995	-0.166909	-0.551684	-0.131026	-0.587546	-0.404426	1.669106	-0.311513	-0.109976	8.864448	-0.765487	-0.706780	-0.417941	-0.376553	-0.134865	-0.105291	-0.860871	-0.620607	2.251900	-0.338398	-0.445063	-0.128077	-0.597604	-0.243404	1.042931	-0.144492	0.875073	-0.842996	-1.149405	-0.367756	0.520827	0.884749	0.768378	-0.905191	0.807280	-0.977221	0.884749	-1.171215	0.688807	1.282399	-0.920802	0.768470	0.875073	-0.290234	-0.856444	0.749330	-0.885148	-0.750207	-0.166909	-0.311513	-0.376553	-0.338398	-0.144492
2418	-0.587167	1.005312	-0.571612	-1.159860	1.683508	-0.296765	-0.304077	-0.325822	-0.350262	0.508187	-0.469292	-0.434891	2.301549	-0.585649	-0.571612	-0.438291	-0.481562	1.466632	-0.363860	-0.525411	-1.130264	1.130264	0.977221	-0.977221	-0.502232	1.955233	-0.437092	-0.412026	-0.304561	-0.242267	-0.209411	-0.339527	-0.203573	-0.824947	1.304977	-0.459687	-0.104691	-0.314360	1.390123	-0.415089	-0.280739	-0.085568	-0.720544	-0.111685	4.409744	-0.062912	-0.211327	-0.143153	-0.3549	-0.115578	-0.281759	-0.055583	-0.154401	-0.141803	-0.138143	-0.102259	-0.128573	-0.122497	-0.263770	-0.226169	-0.114479	-0.096571	-0.116668	-0.203573	-0.310798	-0.375272	-0.132959	-0.418754	-0.301167	-0.39489	2.466353	-0.430677	-0.413048	-0.389454	-0.269074	-0.268547	-0.278691	-0.045812	-0.036838	-0.041566	-0.041566	-0.040052	0.092594	-0.837023	-0.101022	-0.494334	3.361322	-0.612995	-0.166909	-0.551684	-0.131026	-0.587546	2.472637	-0.599123	-0.311513	-0.109976	-0.112810	-0.765487	1.414867	-0.417941	-0.376553	-0.134865	-0.105291	-0.860871	1.611325	-0.444069	-0.338398	-0.445063	-0.128077	-0.597604	-0.243404	1.042931	-0.144492	-0.433506	-0.842996	-1.149405	-0.367756	0.520827	-1.130264	0.028112	-0.389169	-0.932032	-0.977221	-1.130264	-1.171215	0.688807	-1.080025	-0.388227	0.069571	-0.433506	2.206229	1.889114	1.300065	1.469349	-0.750207	-0.166909	-0.311513	-0.376553	-0.338398	-0.144492
10804	-0.587167	1.005312	-0.571612	0.862173	-0.593998	-0.296765	-0.304077	3.069164	-0.350262	-1.967778	-0.469292	-0.434891	2.301549	-0.585649	-0.571612	-0.438291	2.076577	-0.681834	-0.363860	-0.525411	-1.130264	1.130264	0.977221	-0.977221	-0.502232	-0.511448	2.287849	-0.412026	-0.304561	-0.242267	-0.209411	-0.339527	-0.203573	-0.824947	1.304977	-0.459687	-0.104691	-0.314360	-0.719361	2.409119	-0.280739	-0.085568	-0.720544	-0.111685	-0.226771	-0.062912	-0.211327	-0.143153	-0.3549	-0.115578	-0.281759	-0.055583	-0.154401	-0.141803	-0.138143	-0.102259	7.777688	-0.122497	-0.263770	-0.226169	-0.114479	-0.096571	-0.116668	-0.203573	-0.310798	-0.375272	-0.132959	-0.418754	-0.301167	-0.39489	-0.405457	-0.430677	-0.413048	-0.389454	-0.269074	3.723748	-0.278691	-0.045812	-0.036838	-0.041566	-0.041566	-0.040052	0.092594	-0.837023	-0.101022	-0.494334	-0.297502	-0.612995	5.991282	-0.551684	-0.131026	-0.587546	2.472637	-0.599123	-0.311513	-0.109976	-0.112810	-0.765487	1.414867	-0.417941	-0.376553	-0.134865	-0.105291	-0.860871	1.611325	-0.444069	-0.338398	-0.445063	-0.128077	-0.597604	4.108388	-0.958836	-0.144492	-0.433506	-0.842996	-1.149405	-0.367756	0.520827	-1.130264	0.028112	0.126854	0.517395	-0.977221	-1.130264	0.244745	1.476279	0.888662	0.144347	0.069571	-0.433506	-1.954543	1.889114	1.300065	1.469349	2.798930	5.991282	-0.311513	-0.376553	-0.338398	-0.144492
747	-0.587167	1.005312	-0.571612	-1.159860	1.683508	-0.296765	-0.304077	-0.325822	-0.350262	0.508187	-0.469292	-0.434891	2.301549	-0.585649	-0.571612	-0.438291	-0.481562	1.466632	-0.363860	-0.525411	0.884749	-0.884749	0.977221	-0.977221	-0.502232	-0.511448	-0.437092	2.427034	-0.304561	-0.242267	-0.209411	-0.339527	-0.203573	1.212198	-0.766297	-0.459687	-0.104691	-0.314360	-0.719361	-0.415089	-0.280739	-0.085568	1.387840	-0.111685	-0.226771	-0.062912	-0.211327	-0.143153	-0.3549	-0.115578	-0.281759	-0.055583	-0.154401	-0.141803	7.238875	-0.102259	-0.128573	-0.122497	-0.263770	-0.226169	-0.114479	-0.096571	-0.116668	-0.203573	-0.310798	-0.375272	-0.132959	-0.418754	-0.301167	-0.39489	-0.405457	2.321926	-0.413048	-0.389454	-0.269074	-0.268547	-0.278691	-0.045812	-0.036838	-0.041566	-0.041566	-0.040052	0.092594	-0.837023	-0.101022	2.022924	-0.297502	-0.612995	-0.166909	-0.551684	-0.131026	1.701994	-0.404426	-0.599123	-0.311513	-0.109976	-0.112810	-0.765487	1.414867	-0.417941	-0.376553	-0.134865	-0.105291	-0.860871	1.611325	-0.444069	-0.338398	-0.445063	-0.128077	1.673348	-0.243404	-0.958836	-0.144492	0.875073	-0.842996	0.870015	-0.367756	0.520827	0.884749	0.028112	0.642876	-0.642147	-0.977221	0.884749	-1.171215	-0.886136	-0.686288	0.676921	0.069571	0.875073	-0.290234	-0.170054	1.300065	1.469349	0.137077	-0.166909	-0.311513	-0.376553	-0.338398	-0.144492
7333	-0.587167	1.005312	-0.571612	-1.159860	1.683508	-0.296765	-0.304077	-0.325822	-0.350262	0.508187	-0.469292	-0.434891	-0.434490	1.707507	-0.571612	-0.438291	-0.481562	1.466632	-0.363860	-0.525411	0.884749	-0.884749	-1.023310	1.023310	-0.502232	-0.511448	-0.437092	-0.412026	-0.304561	-0.242267	-0.209411	2.945270	-0.203573	-0.824947	-0.766297	2.175395	-0.104691	-0.314360	-0.719361	-0.415089	-0.280739	-0.085568	1.387840	-0.111685	-0.226771	15.895164	-0.211327	-0.143153	-0.3549	-0.115578	-0.281759	-0.055583	-0.154401	-0.141803	-0.138143	-0.102259	-0.128573	-0.122497	-0.263770	-0.226169	-0.114479	-0.096571	-0.116668	-0.203573	-0.310798	-0.375272	-0.132959	-0.418754	-0.301167	-0.39489	-0.405457	2.321926	-0.413048	-0.389454	-0.269074	-0.268547	-0.278691	-0.045812	-0.036838	-0.041566	-0.041566	-0.040052	0.092594	-0.837023	-0.101022	2.022924	-0.297502	-0.612995	-0.166909	-0.551684	-0.131026	-0.587546	-0.404426	1.669106	-0.311513	-0.109976	-0.112810	1.306357	-0.706780	-0.417941	-0.376553	-0.134865	-0.105291	1.161615	-0.620607	-0.444069	-0.338398	-0.445063	-0.128077	-0.597604	-0.243404	1.042931	-0.144492	0.875073	1.186245	-1.149405	-0.367756	0.520827	0.884749	0.768378	-1.421214	-0.642147	1.023310	0.884749	-1.171215	-0.886136	-0.686288	-1.453376	0.768470	0.875073	-0.290234	-0.856444	-0.352139	-0.296524	-0.750207	-0.166909	-0.311513	-0.376553	-0.338398	-0.144492
1460	-0.587167	1.005312	-0.571612	-1.159860	1.683508	-0.296765	-0.304077	-0.325822	-0.350262	0.508187	-0.469292	-0.434891	2.301549	-0.585649	-0.571612	-0.438291	-0.481562	1.466632	-0.363860	-0.525411	-1.130264	1.130264	-1.023310	1.023310	-0.502232	-0.511448	-0.437092	-0.412026	-0.304561	4.127684	-0.209411	-0.339527	-0.203573	1.212198	-0.766297	-0.459687	-0.104691	-0.314360	-0.719361	2.409119	-0.280739	-0.085568	-0.720544	-0.111685	-0.226771	-0.062912	-0.211327	-0.143153	-0.3549	-0.115578	-0.281759	-0.055583	-0.154401	-0.141803	-0.138143	-0.102259	-0.128573	-0.122497	3.791188	-0.226169	-0.114479	-0.096571	-0.116668	-0.203573	-0.310798	-0.375272	-0.132959	-0.418754	-0.301167	-0.39489	-0.405457	-0.430677	-0.413048	-0.389454	-0.269074	-0.268547	3.588205	-0.045812	-0.036838	-0.041566	-0.041566	-0.040052	0.092594	-0.837023	-0.101022	-0.494334	-0.297502	1.631335	-0.166909	-0.551684	-0.131026	1.701994	-0.404426	-0.599123	-0.311513	-0.109976	-0.112810	1.306357	-0.706780	-0.417941	-0.376553	-0.134865	-0.105291	-0.860871	-0.620607	2.251900	-0.338398	-0.445063	-0.128077	1.673348	-0.243404	-0.958836	-0.144492	0.875073	1.186245	0.870015	-0.367756	0.520827	-1.130264	0.028112	1.674921	0.807280	1.023310	-1.130264	-1.171215	1.476279	1.282399	1.742070	0.069571	0.875073	-1.122388	-0.170054	-0.352139	-0.885148	0.137077	-0.166909	-0.311513	-0.376553	-0.338398	-0.144492
8449	-0.587167	-0.994716	1.749437	0.862173	-0.593998	-0.296765	-0.304077	-0.325822	2.855008	-1.967778	-0.469292	-0.434891	-0.434490	-0.585649	1.749437	-0.438291	-0.481562	-0.681834	2.748311	-0.525411	0.884749	-0.884749	-1.023310	1.023310	-0.502232	-0.511448	-0.437092	-0.412026	3.283419	-0.242267	-0.209411	-0.339527	-0.203573	1.212198	-0.766297	-0.459687	-0.104691	3.181066	-0.719361	-0.415089	-0.280739	-0.085568	-0.720544	-0.111685	-0.226771	-0.062912	-0.211327	-0.143153	-0.3549	-0.115578	-0.281759	17.991109	-0.154401	-0.141803	-0.138143	-0.102259	-0.128573	-0.122497	-0.263770	-0.226169	-0.114479	-0.096571	-0.116668	-0.203573	-0.310798	-0.375272	-0.132959	-0.418754	-0.301167	-0.39489	-0.405457	2.321926	-0.413048	-0.389454	-0.269074	-0.268547	-0.278691	-0.045812	-0.036838	-0.041566	-0.041566	-0.040052	0.092594	1.194710	-0.101022	-0.494334	-0.297502	-0.612995	-0.166909	1.812633	-0.131026	-0.587546	-0.404426	-0.599123	-0.311513	9.092877	-0.112810	-0.765487	-0.706780	-0.417941	-0.376553	-0.134865	-0.105291	1.161615	-0.620607	-0.444069	-0.338398	2.246872	-0.128077	-0.597604	-0.243404	-0.958836	-0.144492	-1.742085	1.186245	0.870015	2.719191	0.520827	0.884749	-1.267352	1.158899	-0.642147	1.023310	0.884749	1.660704	-0.098664	-0.686288	1.209496	-1.328226	-1.742085	0.541920	0.516335	0.198596	-0.296524	1.024361	-0.166909	-0.311513	-0.376553	-0.338398	-0.144492
11224	1.703094	-0.994716	-0.571612	0.862173	-0.593998	-0.296765	-0.304077	3.069164	-0.350262	-1.967778	-0.469292	2.299429	-0.434490	-0.585649	-0.571612	-0.438291	-0.481562	1.466632	-0.363860	-0.525411	-1.130264	1.130264	-1.023310	1.023310	-0.502232	-0.511448	-0.437092	-0.412026	-0.304561	-0.242267	4.775303	-0.339527	-0.203573	-0.824947	-0.766297	2.175395	-0.104691	-0.314360	-0.719361	-0.415089	-0.280739	-0.085568	1.387840	-0.111685	-0.226771	-0.062912	-0.211327	-0.143153	-0.3549	-0.115578	-0.281759	-0.055583	-0.154401	-0.141803	-0.138143	-0.102259	-0.128573	-0.122497	-0.263770	-0.226169	-0.114479	-0.096571	-0.116668	-0.203573	-0.310798	2.664732	-0.132959	-0.418754	-0.301167	-0.39489	-0.405457	2.321926	-0.413048	-0.389454	-0.269074	-0.268547	-0.278691	-0.045812	-0.036838	-0.041566	-0.041566	-0.040052	0.092594	1.194710	-0.101022	-0.494334	-0.297502	-0.612995	-0.166909	1.812633	-0.131026	-0.587546	-0.404426	-0.599123	-0.311513	-0.109976	-0.112810	1.306357	-0.706780	-0.417941	-0.376553	-0.134865	-0.105291	-0.860871	-0.620607	2.251900	-0.338398	2.246872	-0.128077	-0.597604	-0.243404	-0.958836	-0.144492	-0.433506	-0.842996	-1.149405	-0.367756	-1.920023	-1.130264	1.508643	2.500558	-0.642147	1.023310	-1.130264	-1.171215	-0.886136	-0.686288	2.274644	1.467368	-0.433506	0.541920	0.516335	-0.352139	-0.885148	1.024361	-0.166909	-0.311513	-0.376553	-0.338398	-0.144492
5085	-0.587167	1.005312	-0.571612	-1.159860	1.683508	-0.296765	-0.304077	-0.325822	-0.350262	0.508187	-0.469292	-0.434891	2.301549	-0.585649	-0.571612	-0.438291	-0.481562	-0.681834	-0.363860	1.903270	-1.130264	1.130264	-1.023310	1.023310	-0.502232	1.955233	-0.437092	-0.412026	-0.304561	-0.242267	-0.209411	-0.339527	-0.203573	-0.824947	1.304977	-0.459687	-0.104691	-0.314360	-0.719361	-0.415089	3.562026	-0.085568	-0.720544	-0.111685	-0.226771	-0.062912	-0.211327	-0.143153	-0.3549	-0.115578	-0.281759	-0.055583	-0.154401	-0.141803	-0.138143	-0.102259	-0.128573	-0.122497	-0.263770	-0.226169	-0.114479	-0.096571	-0.116668	-0.203573	-0.310798	-0.375272	-0.132959	2.388038	-0.301167	-0.39489	-0.405457	2.321926	-0.413048	-0.389454	-0.269074	-0.268547	-0.278691	-0.045812	-0.036838	-0.041566	-0.041566	-0.040052	0.092594	-0.837023	-0.101022	-0.494334	-0.297502	1.631335	-0.166909	-0.551684	-0.131026	1.701994	-0.404426	-0.599123	-0.311513	-0.109976	-0.112810	-0.765487	1.414867	-0.417941	-0.376553	-0.134865	-0.105291	1.161615	-0.620607	-0.444069	-0.338398	-0.445063	-0.128077	-0.597604	-0.243404	1.042931	-0.144492	0.875073	-0.842996	0.870015	-0.367756	0.520827	-1.130264	0.028112	-0.389169	-0.642147	1.023310	-1.130264	0.952725	-1.673608	-0.686288	-0.388227	0.069571	0.875073	-1.122388	-0.170054	1.300065	-0.296524	-0.750207	-0.166909	-0.311513	-0.376553	-0.338398	-0.144492
7156	-0.587167	1.005312	-0.571612	-1.159860	1.683508	-0.296765	-0.304077	-0.325822	-0.350262	0.508187	-0.469292	-0.434891	2.301549	-0.585649	-0.571612	-0.438291	-0.481562	-0.681834	2.748311	-0.525411	-1.130264	1.130264	-1.023310	1.023310	-0.502232	-0.511448	2.287849	-0.412026	-0.304561	-0.242267	-0.209411	-0.339527	-0.203573	1.212198	-0.766297	-0.459687	-0.104691	-0.314360	-0.719361	-0.415089	-0.280739	-0.085568	1.387840	-0.111685	-0.226771	-0.062912	-0.211327	-0.143153	-0.3549	8.652157	-0.281759	-0.055583	-0.154401	-0.141803	-0.138143	-0.102259	-0.128573	-0.122497	-0.263770	-0.226169	-0.114479	-0.096571	-0.116668	-0.203573	-0.310798	-0.375272	-0.132959	-0.418754	-0.301167	-0.39489	-0.405457	-0.430677	2.421027	-0.389454	-0.269074	-0.268547	-0.278691	-0.045812	-0.036838	-0.041566	-0.041566	-0.040052	0.092594	1.194710	-0.101022	-0.494334	-0.297502	-0.612995	-0.166909	1.812633	-0.131026	-0.587546	-0.404426	-0.599123	-0.311513	-0.109976	-0.112810	-0.765487	1.414867	-0.417941	-0.376553	-0.134865	-0.105291	-0.860871	1.611325	-0.444069	-0.338398	-0.445063	-0.128077	1.673348	-0.243404	-0.958836	-0.144492	0.875073	1.186245	0.870015	-0.367756	0.520827	-1.130264	0.028112	0.126854	-0.352261	1.023310	-1.130264	1.660704	-0.886136	-0.292550	0.144347	0.069571	0.875073	0.541920	0.516335	1.300065	1.469349	0.137077	-0.166909	-0.311513	-0.376553	-0.338398	-0.144492

time: 135 ms (started: 2023-09-28 17:20:02 -07:00)

In [ ]:

Standardize and Scale Features of Data Frame Collection for Campaign Model Train and Test¶

In [45]:

data_frame_collection_filename='data_frame_collection_train_test_standardize_scale_v' + filename_version + '.pkl'

df_readback = icr.return_processed_collection_if_it_exists(filename=data_frame_collection_filename, parse_dates=False)
if df_readback != None:
    data_frame_collection = df_readback
else:
    #standardize and scale feature data frame
    standard_scaler = StandardScaler()
    standardized_scaled_data_frame_collection_X_train_ndarray = standard_scaler.fit_transform(data_frame_collection['X_train'])
    data_frame_collection['X_train'] = pd.DataFrame(standardized_scaled_data_frame_collection_X_train_ndarray, columns=data_frame_collection['X_train'].columns, index=data_frame_collection['X_train'].index)


    standardized_scaled_df_collection_X_test_ndarray = standard_scaler.transform(data_frame_collection['X_test'])
    data_frame_collection['X_test'] = pd.DataFrame(standardized_scaled_df_collection_X_test_ndarray, columns=data_frame_collection['X_test'].columns, index=data_frame_collection['X_test'].index)

    data_frame_collection = icr.save_and_return_collection(data_frame_collection=data_frame_collection, filename=data_frame_collection_filename)

    del standardized_scaled_data_frame_collection_X_train_ndarray, standardized_scaled_df_collection_X_test_ndarray, standard_scaler

print(data_frame_collection['X_train'].shape, data_frame_collection['X_test'].shape, data_frame_collection['Y_train'].shape, data_frame_collection['Y_test'].shape)
p(data_frame_collection['X_train'])

This file already exists
(10147, 140) (2537, 140) (10147, 1) (2537, 1)
(10147, 140)

Out[45]:

	destination_Home	destination_No Urgent Place	destination_Work	passenger_Alone	passenger_Friend(s)	passenger_Kid(s)	passenger_Partner	weather_Rainy	weather_Snowy	weather_Sunny	time_10AM	time_10PM	time_2PM	time_6PM	time_7AM	coupon_venue_type_Bar	coupon_venue_type_Carry out & Take away	coupon_venue_type_Coffee House	coupon_venue_type_Restaurant(20-50)	coupon_venue_type_Restaurant(<20)	expiration_1d	expiration_2h	gender_Female	gender_Male	age_21-25	age_26-30	age_31-35	age_36-40	age_41-45	age_46-49	age_50+	age_<21	maritalStatus_Divorced	maritalStatus_Married partner	maritalStatus_Single	maritalStatus_Unmarried partner	maritalStatus_Widowed	education_Associates degree	education_Bachelors degree	education_Graduate degree (Masters or Doctorate)	education_High School Graduate	education_Some High School	education_Some college - no degree	occupation_Architecture & Engineering	occupation_Arts Design Entertainment Sports & Media	occupation_Building & Grounds Cleaning & Maintenance	occupation_Business & Financial	occupation_Community & Social Services	occupation_Computer & Mathematical	occupation_Construction & Extraction	occupation_Education&Training&Library	occupation_Farming Fishing & Forestry	occupation_Food Preparation & Serving Related	occupation_Healthcare Practitioners & Technical	occupation_Healthcare Support	occupation_Installation Maintenance & Repair	occupation_Legal	occupation_Life Physical Social Science	occupation_Management	occupation_Office & Administrative Support	occupation_Personal Care & Service	occupation_Production Occupations	occupation_Protective Service	occupation_Retired	occupation_Sales & Related	occupation_Student	occupation_Transportation & Material Moving	occupation_Unemployed	income_Less than \$12500	income_\$100000 or More	income_\$12500 - \$24999	income_\$25000 - \$37499	income_\$37500 - \$49999	income_\$50000 - \$62499	income_\$62500 - \$74999	income_\$75000 - \$87499	income_\$87500 - \$99999	car_Car that is too old to install Onstar :D	car_Mazda5	car_Scooter and motorcycle	car_crossover	car_do not drive	car_no response	Bar_1-3	Bar_4-8	Bar_<1	Bar_>8	Bar_never	Bar_no response	CoffeeHouse_1-3	CoffeeHouse_4-8	CoffeeHouse_<1	CoffeeHouse_>8	CoffeeHouse_never	CoffeeHouse_no response	CarryAway_1-3	CarryAway_4-8	CarryAway_<1	CarryAway_>8	CarryAway_never	CarryAway_no response	RestaurantLessThan20_1-3	RestaurantLessThan20_4-8	RestaurantLessThan20_<1	RestaurantLessThan20_>8	RestaurantLessThan20_never	RestaurantLessThan20_no response	Restaurant20To50_1-3	Restaurant20To50_4-8	Restaurant20To50_<1	Restaurant20To50_>8	Restaurant20To50_never	Restaurant20To50_no response	temperature	has_children	toCoupon_GEQ15min	toCoupon_GEQ25min	direction_same_or_opposite	expiration_category_representative_numeric_encoding	time_category_representative_numeric_encoding	age_category_representative_numeric_encoding	income_category_representative_numeric_encoding	gender_binary_encoding	expiration_binary_encoding	coupon_venue_type_ordinal_integer_encoding	education_ordinal_integer_encoding	income_ordinal_integer_encoding	age_ordinal_integer_encoding	time_ordinal_integer_encoding	temperature_ordinal_integer_encoding	Bar_venue_visit_frequency_yes_response_ordinal_integer_encoding	CoffeeHouse_venue_visit_frequency_yes_response_ordinal_integer_encoding	CarryAway_venue_visit_frequency_yes_response_ordinal_integer_encoding	RestaurantLessThan20_venue_visit_frequency_yes_response_ordinal_integer_encoding	Restaurant20To50_venue_visit_frequency_yes_response_ordinal_integer_encoding	Bar_venue_visit_frequency_no_response_indicator	CoffeeHouse_venue_visit_frequency_no_response_indicator	CarryAway_venue_visit_frequency_no_response_indicator	RestaurantLessThan20_venue_visit_frequency_no_response_indicator	Restaurant20To50_venue_visit_frequency_no_response_indicator
8630	-0.587858	1.003257	-0.569346	-1.154170	1.678978	-0.298862	-0.304693	-0.327333	-0.350292	0.509468	-0.468919	-0.436673	-0.434912	1.705934	-0.569346	-0.435552	-0.480608	1.478469	-0.364148	-0.534082	0.886994	-0.886994	0.976717	-0.976717	1.987399	-0.513302	-0.438271	-0.407826	-0.307965	-0.241458	-0.210362	-0.336300	-0.203894	1.216187	-0.768725	-0.459509	-0.105167	-0.315953	1.385337	-0.409960	-0.277188	-0.085126	-0.724533	-0.115245	-0.22790	-0.062115	-0.209851	-0.140709	-0.354533	-0.112132	-0.283324	-0.057121	-0.156638	-0.137022	-0.138877	-0.104205	-0.129358	-0.120404	-0.270534	-0.225256	-0.117851	-0.095654	-0.116989	-0.202051	-0.30854	-0.375171	-0.134005	2.401912	-0.301398	-0.398243	-0.410943	-0.429612	-0.40799	-0.387547	-0.267797	-0.270954	3.618433	-0.042155	-0.040966	-0.042155	-0.038477	-0.040966	0.091911	-0.834929	-0.097216	2.031585	-0.301787	-0.615056	-0.166278	-0.551733	-0.132087	-0.582547	-0.408319	1.661606	-0.309306	-0.11303	8.918057	-0.768562	-0.702825	-0.413561	-0.381299	-0.131701	-0.103721	-0.862943	-0.619167	2.249784	-0.339559	-0.447346	-0.126171	-0.595448	-0.243508	1.041565	-0.143595	0.873060	-0.839015	-1.140802	-0.367266	0.519885	0.886994	0.764335	-0.907620	0.800884	-0.976717	0.886994	-1.181367	0.694779	1.277192	-0.923430	0.764521	0.873060	-0.291221	-0.858924	0.754965	-0.882039	-0.750685	-0.166278	-0.309306	-0.381299	-0.339559	-0.143595
2418	-0.587858	1.003257	-0.569346	-1.154170	1.678978	-0.298862	-0.304693	-0.327333	-0.350292	0.509468	-0.468919	-0.436673	2.299318	-0.586189	-0.569346	-0.435552	-0.480608	1.478469	-0.364148	-0.534082	-1.127403	1.127403	0.976717	-0.976717	-0.503170	1.948171	-0.438271	-0.407826	-0.307965	-0.241458	-0.210362	-0.336300	-0.203894	-0.822242	1.300856	-0.459509	-0.105167	-0.315953	1.385337	-0.409960	-0.277188	-0.085126	-0.724533	-0.115245	4.38788	-0.062115	-0.209851	-0.140709	-0.354533	-0.112132	-0.283324	-0.057121	-0.156638	-0.137022	-0.138877	-0.104205	-0.129358	-0.120404	-0.270534	-0.225256	-0.117851	-0.095654	-0.116989	-0.202051	-0.30854	-0.375171	-0.134005	-0.416335	-0.301398	-0.398243	2.433425	-0.429612	-0.40799	-0.387547	-0.267797	-0.270954	-0.276363	-0.042155	-0.040966	-0.042155	-0.038477	-0.040966	0.091911	-0.834929	-0.097216	-0.492227	3.313598	-0.615056	-0.166278	-0.551733	-0.132087	-0.582547	2.449067	-0.601827	-0.309306	-0.11303	-0.112132	-0.768562	1.422829	-0.413561	-0.381299	-0.131701	-0.103721	-0.862943	1.615074	-0.444487	-0.339559	-0.447346	-0.126171	-0.595448	-0.243508	1.041565	-0.143595	-0.437240	-0.839015	-1.140802	-0.367266	0.519885	-1.127403	0.024145	-0.391530	-0.929130	-0.976717	-1.127403	-1.181367	0.694779	-1.075831	-0.390648	0.065576	-0.437240	2.193379	1.876853	1.305812	1.473237	-0.750685	-0.166278	-0.309306	-0.381299	-0.339559	-0.143595
10804	-0.587858	1.003257	-0.569346	0.866424	-0.595600	-0.298862	-0.304693	3.054995	-0.350292	-1.962830	-0.468919	-0.436673	2.299318	-0.586189	-0.569346	-0.435552	2.080699	-0.676375	-0.364148	-0.534082	-1.127403	1.127403	0.976717	-0.976717	-0.503170	-0.513302	2.281692	-0.407826	-0.307965	-0.241458	-0.210362	-0.336300	-0.203894	-0.822242	1.300856	-0.459509	-0.105167	-0.315953	-0.721846	2.439262	-0.277188	-0.085126	-0.724533	-0.115245	-0.22790	-0.062115	-0.209851	-0.140709	-0.354533	-0.112132	-0.283324	-0.057121	-0.156638	-0.137022	-0.138877	-0.104205	7.730490	-0.120404	-0.270534	-0.225256	-0.117851	-0.095654	-0.116989	-0.202051	-0.30854	-0.375171	-0.134005	-0.416335	-0.301398	-0.398243	-0.410943	-0.429612	-0.40799	-0.387547	-0.267797	3.690669	-0.276363	-0.042155	-0.040966	-0.042155	-0.038477	-0.040966	0.091911	-0.834929	-0.097216	-0.492227	-0.301787	-0.615056	6.014025	-0.551733	-0.132087	-0.582547	2.449067	-0.601827	-0.309306	-0.11303	-0.112132	-0.768562	1.422829	-0.413561	-0.381299	-0.131701	-0.103721	-0.862943	1.615074	-0.444487	-0.339559	-0.447346	-0.126171	-0.595448	4.106633	-0.960094	-0.143595	-0.437240	-0.839015	-1.140802	-0.367266	0.519885	-1.127403	0.024145	0.124559	0.512548	-0.976717	-1.127403	0.234404	1.485928	0.885022	0.142135	0.065576	-0.437240	-1.947620	1.876853	1.305812	1.473237	2.798803	6.014025	-0.309306	-0.381299	-0.339559	-0.143595
747	-0.587858	1.003257	-0.569346	-1.154170	1.678978	-0.298862	-0.304693	-0.327333	-0.350292	0.509468	-0.468919	-0.436673	2.299318	-0.586189	-0.569346	-0.435552	-0.480608	1.478469	-0.364148	-0.534082	0.886994	-0.886994	0.976717	-0.976717	-0.503170	-0.513302	-0.438271	2.452028	-0.307965	-0.241458	-0.210362	-0.336300	-0.203894	1.216187	-0.768725	-0.459509	-0.105167	-0.315953	-0.721846	-0.409960	-0.277188	-0.085126	1.380200	-0.115245	-0.22790	-0.062115	-0.209851	-0.140709	-0.354533	-0.112132	-0.283324	-0.057121	-0.156638	-0.137022	7.200622	-0.104205	-0.129358	-0.120404	-0.270534	-0.225256	-0.117851	-0.095654	-0.116989	-0.202051	-0.30854	-0.375171	-0.134005	-0.416335	-0.301398	-0.398243	-0.410943	2.327679	-0.40799	-0.387547	-0.267797	-0.270954	-0.276363	-0.042155	-0.040966	-0.042155	-0.038477	-0.040966	0.091911	-0.834929	-0.097216	2.031585	-0.301787	-0.615056	-0.166278	-0.551733	-0.132087	1.716599	-0.408319	-0.601827	-0.309306	-0.11303	-0.112132	-0.768562	1.422829	-0.413561	-0.381299	-0.131701	-0.103721	-0.862943	1.615074	-0.444487	-0.339559	-0.447346	-0.126171	1.679407	-0.243508	-0.960094	-0.143595	0.873060	-0.839015	0.876576	-0.367266	0.519885	0.886994	0.024145	0.640649	-0.640794	-0.976717	0.886994	-1.181367	-0.887518	-0.683661	0.674918	0.065576	0.873060	-0.291221	-0.174980	1.305812	1.473237	0.136687	-0.166278	-0.309306	-0.381299	-0.339559	-0.143595
7333	-0.587858	1.003257	-0.569346	-1.154170	1.678978	-0.298862	-0.304693	-0.327333	-0.350292	0.509468	-0.468919	-0.436673	-0.434912	1.705934	-0.569346	-0.435552	-0.480608	1.478469	-0.364148	-0.534082	0.886994	-0.886994	-1.023838	1.023838	-0.503170	-0.513302	-0.438271	-0.407826	-0.307965	-0.241458	-0.210362	2.973533	-0.203894	-0.822242	-0.768725	2.176237	-0.105167	-0.315953	-0.721846	-0.409960	-0.277188	-0.085126	1.380200	-0.115245	-0.22790	16.099052	-0.209851	-0.140709	-0.354533	-0.112132	-0.283324	-0.057121	-0.156638	-0.137022	-0.138877	-0.104205	-0.129358	-0.120404	-0.270534	-0.225256	-0.117851	-0.095654	-0.116989	-0.202051	-0.30854	-0.375171	-0.134005	-0.416335	-0.301398	-0.398243	-0.410943	2.327679	-0.40799	-0.387547	-0.267797	-0.270954	-0.276363	-0.042155	-0.040966	-0.042155	-0.038477	-0.040966	0.091911	-0.834929	-0.097216	2.031585	-0.301787	-0.615056	-0.166278	-0.551733	-0.132087	-0.582547	-0.408319	1.661606	-0.309306	-0.11303	-0.112132	1.301130	-0.702825	-0.413561	-0.381299	-0.131701	-0.103721	1.158826	-0.619167	-0.444487	-0.339559	-0.447346	-0.126171	-0.595448	-0.243508	1.041565	-0.143595	0.873060	1.191874	-1.140802	-0.367266	0.519885	0.886994	0.764335	-1.423709	-0.640794	1.023838	0.886994	-1.181367	-0.887518	-0.683661	-1.456213	0.764521	0.873060	-0.291221	-0.858924	-0.346729	-0.293220	-0.750685	-0.166278	-0.309306	-0.381299	-0.339559	-0.143595
10949	-0.587858	1.003257	-0.569346	-1.154170	-0.595600	3.346030	-0.304693	-0.327333	2.854765	-1.962830	-0.468919	-0.436673	-0.434912	1.705934	-0.569346	2.295936	-0.480608	-0.676375	-0.364148	-0.534082	0.886994	-0.886994	0.976717	-0.976717	-0.503170	-0.513302	2.281692	-0.407826	-0.307965	-0.241458	-0.210362	-0.336300	-0.203894	1.216187	-0.768725	-0.459509	-0.105167	-0.315953	1.385337	-0.409960	-0.277188	-0.085126	-0.724533	-0.115245	-0.22790	-0.062115	-0.209851	-0.140709	-0.354533	-0.112132	-0.283324	-0.057121	-0.156638	-0.137022	-0.138877	-0.104205	-0.129358	-0.120404	-0.270534	-0.225256	-0.117851	-0.095654	-0.116989	-0.202051	-0.30854	2.665452	-0.134005	-0.416335	-0.301398	-0.398243	-0.410943	-0.429612	2.45104	-0.387547	-0.267797	-0.270954	-0.276363	-0.042155	-0.040966	-0.042155	-0.038477	-0.040966	0.091911	1.197707	-0.097216	-0.492227	-0.301787	-0.615056	-0.166278	-0.551733	-0.132087	-0.582547	-0.408319	1.661606	-0.309306	-0.11303	-0.112132	-0.768562	-0.702825	-0.413561	2.622614	-0.131701	-0.103721	-0.862943	1.615074	-0.444487	-0.339559	-0.447346	-0.126171	-0.595448	-0.243508	1.041565	-0.143595	-1.747541	1.191874	-1.140802	-0.367266	0.519885	0.886994	0.764335	0.124559	-0.352458	-0.976717	0.886994	-0.473481	0.694779	-0.291490	0.142135	0.764521	-1.747541	0.536979	-0.858924	-1.448423	1.473237	-0.750685	-0.166278	-0.309306	2.622614	-0.339559	-0.143595
11937	-0.587858	1.003257	-0.569346	0.866424	-0.595600	-0.298862	-0.304693	3.054995	-0.350292	-1.962830	2.132567	-0.436673	-0.434912	-0.586189	-0.569346	2.295936	-0.480608	-0.676375	-0.364148	-0.534082	0.886994	-0.886994	0.976717	-0.976717	-0.503170	-0.513302	2.281692	-0.407826	-0.307965	-0.241458	-0.210362	-0.336300	-0.203894	1.216187	-0.768725	-0.459509	-0.105167	-0.315953	-0.721846	-0.409960	-0.277188	-0.085126	1.380200	-0.115245	-0.22790	-0.062115	-0.209851	-0.140709	2.820615	-0.112132	-0.283324	-0.057121	-0.156638	-0.137022	-0.138877	-0.104205	-0.129358	-0.120404	-0.270534	-0.225256	-0.117851	-0.095654	-0.116989	-0.202051	-0.30854	-0.375171	-0.134005	-0.416335	-0.301398	2.511027	-0.410943	-0.429612	-0.40799	-0.387547	-0.267797	-0.270954	-0.276363	-0.042155	-0.040966	-0.042155	-0.038477	-0.040966	0.091911	-0.834929	-0.097216	-0.492227	-0.301787	1.625868	-0.166278	-0.551733	-0.132087	-0.582547	-0.408319	1.661606	-0.309306	-0.11303	-0.112132	-0.768562	-0.702825	-0.413561	2.622614	-0.131701	-0.103721	-0.862943	1.615074	-0.444487	-0.339559	-0.447346	-0.126171	-0.595448	-0.243508	1.041565	-0.143595	-0.437240	-0.839015	0.876576	-0.367266	0.519885	0.886994	-0.716044	0.124559	2.098405	-0.976717	0.886994	-0.473481	-0.887518	1.669363	0.142135	-0.633370	-0.437240	-1.119421	-0.858924	-1.448423	1.473237	-0.750685	-0.166278	-0.309306	2.622614	-0.339559	-0.143595
735	1.701090	-0.996753	-0.569346	0.866424	-0.595600	-0.298862	-0.304693	-0.327333	-0.350292	0.509468	-0.468919	-0.436673	-0.434912	1.705934	-0.569346	-0.435552	-0.480608	-0.676375	2.746133	-0.534082	0.886994	-0.886994	-1.023838	1.023838	-0.503170	-0.513302	-0.438271	-0.407826	3.247122	-0.241458	-0.210362	-0.336300	-0.203894	-0.822242	1.300856	-0.459509	-0.105167	-0.315953	-0.721846	-0.409960	-0.277188	-0.085126	1.380200	-0.115245	-0.22790	-0.062115	-0.209851	-0.140709	-0.354533	-0.112132	-0.283324	-0.057121	-0.156638	-0.137022	-0.138877	-0.104205	-0.129358	-0.120404	-0.270534	-0.225256	-0.117851	-0.095654	-0.116989	-0.202051	3.24107	-0.375171	-0.134005	-0.416335	-0.301398	-0.398243	-0.410943	-0.429612	2.45104	-0.387547	-0.267797	-0.270954	-0.276363	-0.042155	-0.040966	-0.042155	-0.038477	-0.040966	0.091911	1.197707	-0.097216	-0.492227	-0.301787	-0.615056	-0.166278	1.812471	-0.132087	-0.582547	-0.408319	-0.601827	-0.309306	-0.11303	-0.112132	-0.768562	1.422829	-0.413561	-0.381299	-0.131701	-0.103721	1.158826	-0.619167	-0.444487	-0.339559	-0.447346	-0.126171	-0.595448	-0.243508	1.041565	-0.143595	-0.437240	-0.839015	0.876576	-0.367266	0.519885	0.886994	0.764335	1.156738	-0.352458	1.023838	0.886994	1.650174	-0.887518	-0.291490	1.207700	0.764521	-0.437240	0.536979	0.508964	1.305812	-0.293220	-0.750685	-0.166278	-0.309306	-0.381299	-0.339559	-0.143595
67	-0.587858	1.003257	-0.569346	-1.154170	1.678978	-0.298862	-0.304693	-0.327333	-0.350292	0.509468	2.132567	-0.436673	-0.434912	-0.586189	-0.569346	-0.435552	-0.480608	1.478469	-0.364148	-0.534082	-1.127403	1.127403	-1.023838	1.023838	-0.503170	-0.513302	-0.438271	-0.407826	-0.307965	4.141504	-0.210362	-0.336300	-0.203894	1.216187	-0.768725	-0.459509	-0.105167	-0.315953	1.385337	-0.409960	-0.277188	-0.085126	-0.724533	-0.115245	-0.22790	-0.062115	-0.209851	-0.140709	-0.354533	-0.112132	3.529527	-0.057121	-0.156638	-0.137022	-0.138877	-0.104205	-0.129358	-0.120404	-0.270534	-0.225256	-0.117851	-0.095654	-0.116989	-0.202051	-0.30854	-0.375171	-0.134005	-0.416335	-0.301398	-0.398243	-0.410943	-0.429612	-0.40799	-0.387547	-0.267797	3.690669	-0.276363	-0.042155	-0.040966	-0.042155	-0.038477	-0.040966	0.091911	-0.834929	-0.097216	-0.492227	-0.301787	1.625868	-0.166278	-0.551733	-0.132087	1.716599	-0.408319	-0.601827	-0.309306	-0.11303	-0.112132	1.301130	-0.702825	-0.413561	-0.381299	-0.131701	-0.103721	1.158826	-0.619167	-0.444487	-0.339559	-0.447346	-0.126171	-0.595448	-0.243508	1.041565	-0.143595	0.873060	1.191874	-1.140802	-0.367266	0.519885	-1.127403	-0.716044	1.672828	0.512548	1.023838	-1.127403	-1.181367	0.694779	0.885022	1.740483	-0.633370	0.873060	-1.119421	-0.174980	-0.346729	-0.293220	-0.750685	-0.166278	-0.309306	-0.381299	-0.339559	-0.143595
3251	-0.587858	1.003257	-0.569346	-1.154170	1.678978	-0.298862	-0.304693	-0.327333	-0.350292	0.509468	2.132567	-0.436673	-0.434912	-0.586189	-0.569346	-0.435552	-0.480608	1.478469	-0.364148	-0.534082	-1.127403	1.127403	0.976717	-0.976717	-0.503170	1.948171	-0.438271	-0.407826	-0.307965	-0.241458	-0.210362	-0.336300	-0.203894	1.216187	-0.768725	-0.459509	-0.105167	-0.315953	-0.721846	-0.409960	-0.277188	-0.085126	1.380200	-0.115245	-0.22790	-0.062115	-0.209851	-0.140709	-0.354533	-0.112132	-0.283324	-0.057121	-0.156638	-0.137022	-0.138877	-0.104205	-0.129358	-0.120404	-0.270534	-0.225256	-0.117851	-0.095654	-0.116989	-0.202051	-0.30854	-0.375171	-0.134005	2.401912	-0.301398	-0.398243	-0.410943	2.327679	-0.40799	-0.387547	-0.267797	-0.270954	-0.276363	-0.042155	-0.040966	-0.042155	-0.038477	-0.040966	0.091911	1.197707	-0.097216	-0.492227	-0.301787	-0.615056	-0.166278	-0.551733	-0.132087	-0.582547	-0.408319	1.661606	-0.309306	-0.11303	-0.112132	1.301130	-0.702825	-0.413561	-0.381299	-0.131701	-0.103721	-0.862943	-0.619167	2.249784	-0.339559	-0.447346	-0.126171	-0.595448	-0.243508	1.041565	-0.143595	0.873060	1.191874	-1.140802	-0.367266	0.519885	-1.127403	-0.716044	-0.391530	-0.640794	-0.976717	-1.127403	-1.181367	-0.887518	-0.683661	-0.390648	-0.633370	0.873060	0.536979	-0.858924	-0.346729	-0.882039	-0.750685	-0.166278	-0.309306	-0.381299	-0.339559	-0.143595

time: 66.8 ms (started: 2023-09-28 17:20:03 -07:00)

Convert to Y Train Data Frame from Y Test Data Frame Collection¶

In [46]:

#convert to y_actual data frame from y_actual data frame collection
data_frame_list_stratified_fold_number_Y_test = [stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(fold_number)]['Y_test'] for fold_number in range(5)]

df_stratified_fold_number_y_test = pd.concat(data_frame_list_stratified_fold_number_Y_test, axis=0, join='outer', ignore_index=False, copy=True).to_frame()
p(df_stratified_fold_number_y_test)

(10147, 1)

Out[46]:

	Y
8630	1
2418	1
10804	0
747	1
7333	1
10949	0
11937	0
735	1
67	1
3251	1

time: 4.08 ms (started: 2023-09-28 17:20:03 -07:00)

In [ ]:

Table of Contents Data Wrangling Feature Engineering Exploratory Data Analysis Data Preprocessing Modeling_1 Modeling_2 Modeling_Train_Results Modeling_Test_Results

Modeling

Initialize Filename & Results Collection Variables¶

In [47]:

classifier_name_list = ['random_forest_classifier', 'gradient_boosting_classifier']

#filename
grid_search_models_filename_collection={}
grid_search_models_local_optimum_filename_collection={}

model_filename_collection = {}
model_cross_validation_results_filename_collection = {}

cross_validation_model_collection_filename_collection = {}
model_cross_validation_prediction_probability_collection_filename_collection = {}
model_cross_validation_prediction_collection_filename_collection = {}

learning_curve_results_filename_collection = {}

for classifier_name in classifier_name_list:
    grid_search_models_filename_collection[classifier_name]='stratified_5_fold_grid_search_cross_validation_' + classifier_name + '_v' + filename_version + '.pkl'
    grid_search_models_local_optimum_filename_collection[classifier_name]='stratified_5_fold_grid_search_cross_validation_' + classifier_name + '_local_optimum_v' + filename_version + '.pkl'
    
    model_filename_collection[classifier_name]='best_stratified_5_fold_grid_search_cross_validation_'+ classifier_name + '_v' + filename_version + '.pkl'
    model_cross_validation_results_filename_collection[classifier_name]='best_' + classifier_name + '_stratified_5_fold_cross_validation_results_v' + filename_version + '.pkl'

    cross_validation_model_collection_filename_collection[classifier_name] = 'stratified_5_fold_cross_validation_' + str(classifier_name) + '_collection_v' + filename_version + '.pkl'
    model_cross_validation_prediction_probability_collection_filename_collection[classifier_name] = 'stratified_5_fold_cross_validation_' + str(classifier_name) + '_prediction_probability_collection_v' + filename_version + '.pkl'
    model_cross_validation_prediction_collection_filename_collection[classifier_name] = 'stratified_5_fold_cross_validation_' + str(classifier_name) + '_prediction_collection_v' + filename_version + '.pkl'
    
    learning_curve_results_filename_collection[classifier_name]='learning_curve_results_best_' + classifier_name + '_v' + filename_version + '.pkl'


test_random_forest_metric_replicate_filename_collection={}; column_name_list=['Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant']; model_name='random_forest'
for column_name in column_name_list:
    test_random_forest_metric_replicate_filename_collection[column_name]='df_test_'+str(model_name)+'_number_metric_estimated_'+str(number_of_replicates)+'_metric_replicates_from_'+str(number_of_replicates)+'_nonparametric_subsamples_'+str(column_name.lower().replace(' ','_'))+'_v'+str(filename_version)+'.csv'

model_version=filename_version

time: 1.76 ms (started: 2023-09-28 17:20:03 -07:00)

Initialize Other Variables¶

In [48]:

StratifiedKFold_5_splits = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

stratified_5_fold_cross_validation_model_classifier_prediction_probability_series_collection = {}

stratified_5_fold_cross_validation_model_classifier_prediction_probability_data_frame_collection = {}

stratified_5_fold_cross_validation_model_classifier_prediction_data_frame_collection = {}

model_stratified_5_fold_cross_validation_results_collection = {}

time: 903 µs (started: 2023-09-28 17:20:03 -07:00)

In [ ]:

Table of Contents Data Wrangling Feature Engineering Exploratory Data Analysis Data Preprocessing Modeling_1 Modeling_2 Modeling_Train_Results Modeling_Test_Results

Modeling 1

Get Random Forest Classifier Stratified 5-Fold Cross Validation Grid Search by Accuracy¶

In [49]:

#get grid search stratified 5-fold cross validation results
models_readback = icr.return_saved_model_if_it_exists(filename=grid_search_models_filename_collection['random_forest_classifier'])
if models_readback != None:
    stratified_5_fold_grid_search_cross_validation_random_forest_classifier = models_readback
else:
    #random forest hyperparameter tuning using 5-fold cross validation

    # Create the parameter grid based on the results of random search
    param_grid = {
        'n_estimators': [200], #more is better, otherwise these are just random variation
        'criterion':['gini'],
        'max_depth': [3, 5, 10, 20, 25, None],
        'min_samples_split': [1, 10, 50, 100],
        'min_samples_leaf':[1],
        'min_weight_fraction_leaf':[0.0],
        'max_features':['auto'],
        'max_leaf_nodes':[None],
        'min_impurity_decrease':[0.0],
        'bootstrap': [False ,True],
        'oob_score': [False],
        'n_jobs': [None],
        'warm_start':[False],
        'class_weight':[None],
        'ccp_alpha':[0.0],
        'max_samples':[None],
    }

    # Create a base model
    random_forest_classifier = RandomForestClassifier(random_state=200)

    # Instantiate the Stratified 5-Fold Grid Search Cross Validation
    stratified_5_fold_grid_search_cross_validation_random_forest_classifier = GridSearchCV(estimator=random_forest_classifier, 
                                                                                           param_grid=param_grid, 
                                                                                           cv=StratifiedKFold_5_splits,
                                                                                           n_jobs=-1, 
                                                                                           verbose=0,
                                                                                           scoring=None)
    # Fit the grid search to the data
    stratified_5_fold_grid_search_cross_validation_random_forest_classifier.fit(data_frame_collection['X_train'], data_frame_collection['Y_train'].loc[:, 'Y'])

    #save it
    stratified_5_fold_grid_search_cross_validation_random_forest_classifier = icr.save_and_return_model(stratified_5_fold_grid_search_cross_validation_random_forest_classifier, 
                                                                                                        filename=grid_search_models_filename_collection['random_forest_classifier'])

print()
print('Global Optimum Grid Search Cross Validation Object:')
print(stratified_5_fold_grid_search_cross_validation_random_forest_classifier)
print()
print('Best Random Forest Classifier by GridSearchCV Global Optimum:')
print(stratified_5_fold_grid_search_cross_validation_random_forest_classifier.best_estimator_)
print('Accuracy Score: '+str(stratified_5_fold_grid_search_cross_validation_random_forest_classifier.best_score_))

This file already exists

Global Optimum Grid Search Cross Validation Object:
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(random_state=200), n_jobs=-1,
             param_grid={'bootstrap': [False, True], 'ccp_alpha': [0.0],
                         'class_weight': [None], 'criterion': ['gini'],
                         'max_depth': [3, 5, 10, 20, 25, None],
                         'max_features': ['auto'], 'max_leaf_nodes': [None],
                         'max_samples': [None], 'min_impurity_decrease': [0.0],
                         'min_samples_leaf': [1],
                         'min_samples_split': [1, 10, 50, 100],
                         'min_weight_fraction_leaf': [0.0],
                         'n_estimators': [200], 'n_jobs': [None],
                         'oob_score': [False], 'warm_start': [False]})

Best Random Forest Classifier by GridSearchCV Global Optimum:
RandomForestClassifier(bootstrap=False, max_depth=25, min_samples_split=10,
                       n_estimators=200, random_state=200)
Accuracy Score: 0.7606187133849818
time: 72.2 ms (started: 2023-09-28 17:20:03 -07:00)

Get Random Forest Classifier Stratified 5-Fold Cross Validation Grid Search by Accuracy For Local Optimum¶

In [50]:

models_readback = icr.return_saved_model_if_it_exists(filename=grid_search_models_local_optimum_filename_collection['random_forest_classifier'])
if models_readback != None:
    stratified_5_fold_grid_search_cross_validation_random_forest_classifier_local_optimum = models_readback
else:
    #random forest hyperparameter tuning using 5-fold cross validation
    param_grid = {
        'bootstrap': [False ,True],
        'criterion':['gini'],
        'max_features':['auto'],
        'max_depth': [3, 5, 10, 20, 25, None], # lower numbers reduce growth
        'min_samples_split': [1, 5, 10, 15, 20], # higher numbers reduce growth
        'min_samples_leaf':[1],
        'n_estimators': [200], #more is better, otherwise these are just random variation
        'warm_start':[False]
    }
    #min_samples_leaf can be used as an alternative to 'min_samples_split'

    
    # Create a base model
    random_forest_classifier = RandomForestClassifier(random_state=200)

    # Instantiate the Stratified 5-Fold Grid Search Cross Validation
    stratified_5_fold_grid_search_cross_validation_random_forest_classifier_local_optimum = GridSearchCV(estimator=random_forest_classifier, 
                                                                                                         param_grid=param_grid, 
                                                                                                         cv=StratifiedKFold_5_splits,
                                                                                                         n_jobs=-1, 
                                                                                                         verbose=0,
                                                                                                         scoring=None)
    # Fit the grid search to the data
    stratified_5_fold_grid_search_cross_validation_random_forest_classifier_local_optimum.fit(data_frame_collection['X_train'], data_frame_collection['Y_train'].loc[:, 'Y'])

    #save it
    stratified_5_fold_grid_search_cross_validation_random_forest_classifier_local_optimum = icr.save_and_return_model(stratified_5_fold_grid_search_cross_validation_random_forest_classifier_local_optimum, 
                                                                                                                      filename=grid_search_models_local_optimum_filename_collection['random_forest_classifier'])

#del stratified_5_fold_grid_search_cross_validation_random_forest_classifier

print()
print('Local Optimum Grid Search Cross Validation Object:')
print(stratified_5_fold_grid_search_cross_validation_random_forest_classifier_local_optimum)

print()
print('Best Random Forest Classifier by GridSearchCV Local Optimum:')
print(stratified_5_fold_grid_search_cross_validation_random_forest_classifier_local_optimum.best_estimator_)
print('Accuracy Score: '+str(stratified_5_fold_grid_search_cross_validation_random_forest_classifier_local_optimum.best_score_))

This file already exists

Local Optimum Grid Search Cross Validation Object:
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(random_state=200), n_jobs=-1,
             param_grid={'bootstrap': [False, True], 'criterion': ['gini'],
                         'max_depth': [3, 5, 10, 20, 25, None],
                         'max_features': ['auto'], 'min_samples_leaf': [1],
                         'min_samples_split': [1, 5, 10, 15, 20],
                         'n_estimators': [200], 'warm_start': [False]})

Best Random Forest Classifier by GridSearchCV Local Optimum:
RandomForestClassifier(bootstrap=False, max_depth=25, min_samples_split=5,
                       n_estimators=200, random_state=200)
Accuracy Score: 0.7640680575012079
time: 114 ms (started: 2023-09-28 17:20:03 -07:00)

Get Best Random Forest Classifier by Accuracy¶

In [51]:

#get best random forest classifier
model_readback = icr.return_saved_model_if_it_exists(filename=model_filename_collection['random_forest_classifier'])
if model_readback != None:
    best_stratified_5_fold_grid_search_cross_validation_random_forest_classifier = model_readback
else:   
    #add model environment data to model

    #get best random forest classifier from grid search cross validation
    best_stratified_5_fold_grid_search_cross_validation_random_forest_classifier = stratified_5_fold_grid_search_cross_validation_random_forest_classifier_local_optimum.best_estimator_

    best_stratified_5_fold_grid_search_cross_validation_random_forest_classifier.version = model_version
    best_stratified_5_fold_grid_search_cross_validation_random_forest_classifier.pandas_version = pd.__version__
    best_stratified_5_fold_grid_search_cross_validation_random_forest_classifier.numpy_version = np.__version__
    best_stratified_5_fold_grid_search_cross_validation_random_forest_classifier.sklearn_version = sklearn_version
    best_stratified_5_fold_grid_search_cross_validation_random_forest_classifier.X_columns = [column_name for column_name in data_frame_collection['X_train'].columns]
    best_stratified_5_fold_grid_search_cross_validation_random_forest_classifier.build_datetime = datetime.datetime.now()

    
    best_stratified_5_fold_grid_search_cross_validation_random_forest_classifier = icr.save_and_return_model(best_stratified_5_fold_grid_search_cross_validation_random_forest_classifier, 
                                                                                                              filename=model_filename_collection['random_forest_classifier'])
    
best_stratified_5_fold_grid_search_cross_validation_random_forest_classifier

This file already exists

Out[51]:

RandomForestClassifier(bootstrap=False, max_depth=25, min_samples_split=5,
                       n_estimators=200, random_state=200)

time: 96.7 ms (started: 2023-09-28 17:20:03 -07:00)

Get Stratified 5-Fold Cross Validation Random Forest Classifier Collection¶

In [52]:

models_readback = icr.return_saved_model_if_it_exists(filename=cross_validation_model_collection_filename_collection['random_forest_classifier'])
if models_readback != None:
    stratified_5_fold_cross_validation_random_forest_classifier_collection = models_readback
else:
    stratified_5_fold_cross_validation_random_forest_classifier_collection = {}
    for index in range(5):

        #create random forest classifier
        random_forest_classifier=RandomForestClassifier(bootstrap=False, 
                                                        max_depth=25, 
                                                        min_samples_split=5, 
                                                        n_estimators=200,
                                                        min_samples_leaf=1,
                                                        max_features='auto',
                                                        criterion='gini',
                                                        warm_start='False',
                                                        random_state=200)

        #train random forest classifier and save random forest classifer per fold
        random_forest_classifier.fit(X=stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(index)]['X_train'],
                                     y=stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(index)]['Y_train'])

        stratified_5_fold_cross_validation_random_forest_classifier_collection['fold ' + str(index)] = random_forest_classifier

    #save stratified 5-fold cross validation random forest classifier collection
    stratified_5_fold_cross_validation_random_forest_classifier_collection = icr.save_and_return_model(stratified_5_fold_cross_validation_random_forest_classifier_collection, 
                                                                                                       filename=cross_validation_model_collection_filename_collection['random_forest_classifier'])
stratified_5_fold_cross_validation_random_forest_classifier_collection

This file already exists

Out[52]:

{'fold 0': RandomForestClassifier(bootstrap=False, max_depth=25, min_samples_split=5,
                        n_estimators=200, random_state=200, warm_start='False'),
 'fold 1': RandomForestClassifier(bootstrap=False, max_depth=25, min_samples_split=5,
                        n_estimators=200, random_state=200, warm_start='False'),
 'fold 2': RandomForestClassifier(bootstrap=False, max_depth=25, min_samples_split=5,
                        n_estimators=200, random_state=200, warm_start='False'),
 'fold 3': RandomForestClassifier(bootstrap=False, max_depth=25, min_samples_split=5,
                        n_estimators=200, random_state=200, warm_start='False'),
 'fold 4': RandomForestClassifier(bootstrap=False, max_depth=25, min_samples_split=5,
                        n_estimators=200, random_state=200, warm_start='False')}

time: 333 ms (started: 2023-09-28 17:20:03 -07:00)

Get Stratified 5-Fold Cross Validation Random Forest Classifier Prediction Probability Collection¶

In [53]:

prediction_probability_ndarray_collection = icr.return_processed_collection_if_it_exists(filename=model_cross_validation_prediction_probability_collection_filename_collection['random_forest_classifier'])
if prediction_probability_ndarray_collection != None:
    stratified_5_fold_cross_validation_random_forest_classifier_prediction_probability_collection = prediction_probability_ndarray_collection
else:
    stratified_5_fold_cross_validation_random_forest_classifier_prediction_probability_collection = {}
    for index in range(5):
        #get predictions for test per fold
        stratified_5_fold_cross_validation_random_forest_classifier_prediction_probability_collection['fold ' + str(index)] = \
        stratified_5_fold_cross_validation_random_forest_classifier_collection['fold ' + str(index)]\
        .predict_proba(stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(index)]['X_test'])

    #save stratified 5-fold cross validation random forest classifier prediction collection
    stratified_5_fold_cross_validation_random_forest_classifier_prediction_probability_collection = icr.save_and_return_collection(data_frame_collection=stratified_5_fold_cross_validation_random_forest_classifier_prediction_probability_collection, 
                                                                                                                                   filename=model_cross_validation_prediction_probability_collection_filename_collection['random_forest_classifier'])
stratified_5_fold_cross_validation_random_forest_classifier_prediction_probability_collection['fold 0']

This file already exists

Out[53]:

array([[0.22768785, 0.77231215],
       [0.28875   , 0.71125   ],
       [0.59625   , 0.40375   ],
       ...,
       [0.75791667, 0.24208333],
       [0.14683716, 0.85316284],
       [0.51916667, 0.48083333]])

time: 3.91 ms (started: 2023-09-28 17:20:03 -07:00)

Get Stratified 5-Fold Cross Validation Random Forest Classifier Prediction Probability Series¶

In [54]:

#get data frame list
data_frame_list_stratified_5_fold_cross_validation_random_forest_classifier_prediction_probability = \
[pd.DataFrame(stratified_5_fold_cross_validation_random_forest_classifier_prediction_probability_collection['fold ' + str(number)]).loc[:, 1] for number in range(5)]

stratified_5_fold_cross_validation_model_classifier_prediction_probability_series_collection['random_forest_classifier'] = \
pd.concat(data_frame_list_stratified_5_fold_cross_validation_random_forest_classifier_prediction_probability)

p(stratified_5_fold_cross_validation_model_classifier_prediction_probability_series_collection['random_forest_classifier'])

(10147,)

Out[54]:

0       0.772312
1       0.711250
2       0.403750
3       0.824488
4       0.710913
2024    0.261250
2025    0.652119
2026    0.669310
2027    0.911190
2028    0.640171
Name: 1, dtype: float64

time: 3.71 ms (started: 2023-09-28 17:20:03 -07:00)

In [55]:

warnings.filterwarnings('ignore')

time: 501 µs (started: 2023-09-28 17:20:03 -07:00)

In [ ]:

Table of Contents Data Wrangling Feature Engineering Exploratory Data Analysis Data Preprocessing Modeling_1 Modeling_2 Modeling_Train_Results Modeling_Test_Results

Modeling 2

Get Gradient Boosting Classifier Stratified 5-Fold Grid Seach Cross Validation by Accuracy Score¶

In [56]:

#get grid search stratified 5-fold cross validation results
models_readback = icr.return_saved_model_if_it_exists(filename=grid_search_models_filename_collection['gradient_boosting_classifier'])
if models_readback != None:
    stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier = models_readback
else:
    gradient_boosting_classifier = GradientBoostingClassifier(random_state=200)
    param_grid = {'loss' : ['log_loss', 'exponential'],
                  'learning_rate' : [0.01, 0.1, 1, 10, 100],
                  'n_estimators' : [5, 50, 250, 500],
                  'subsample' : [1.0],
                  'criterion' : ['friedman_mse', 'squared_error'],
                  'min_samples_split' : [2],
                  'min_samples_leaf' : [1],
                  'min_weight_fraction_leaf' : [0.0],
                  'max_depth' : [1, 3, 5, 7, 9, None],
                  'min_impurity_decrease' : [0.0],
                  'init' : [None],
                  'max_features' : [None],
                  'max_leaf_nodes' : [None],
                  'warm_start' : [False],
                  'n_iter_no_change' : [None],
                 }

    # Instantiate the Stratified 5-Fold Grid Search Cross Validation
    stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier = GridSearchCV(estimator=gradient_boosting_classifier, 
                                                                                               param_grid=param_grid, 
                                                                                               cv=StratifiedKFold_5_splits,
                                                                                               n_jobs=-1, 
                                                                                               verbose=0,
                                                                                               scoring=None,
                                                                                               pre_dispatch="2*n_jobs")

    stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier.fit(X=data_frame_collection['X_train'], 
                                                                                    y=data_frame_collection['Y_train'].loc[:, 'Y'],
                                                                                    groups=None)

    #save it
    stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier = icr.save_and_return_model(stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier, 
                                                                                                             filename=grid_search_models_filename_collection['gradient_boosting_classifier'],
                                                                                                             add_compressed_file=False)
    
stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier

This file already exists

Out[56]:

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=GradientBoostingClassifier(random_state=200), n_jobs=-1,
             param_grid={'criterion': ['friedman_mse', 'squared_error'],
                         'init': [None],
                         'learning_rate': [0.01, 0.1, 1, 10, 100],
                         'loss': ['log_loss', 'exponential'],
                         'max_depth': [1, 3, 5, 7, 9, None],
                         'max_features': [None], 'max_leaf_nodes': [None],
                         'min_impurity_decrease': [0.0],
                         'min_samples_leaf': [1], 'min_samples_split': [2],
                         'min_weight_fraction_leaf': [0.0],
                         'n_estimators': [5, 50, 250, 500],
                         'n_iter_no_change': [None], 'subsample': [1.0],
                         'warm_start': [False]})

time: 23.1 ms (started: 2023-09-28 17:20:03 -07:00)

Get Best Gradient Boosting Classifier by Accuracy Score¶

In [57]:

model_readback = icr.return_saved_model_if_it_exists(filename=model_filename_collection['gradient_boosting_classifier'])
if model_readback != None:
    best_stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier = model_readback
else:   
    #add model environment data to model

    #get best random forest classifier from grid search cross validation
    best_stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier = stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier.best_estimator_

    best_stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier.version = model_version
    best_stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier.pandas_version = pd.__version__
    best_stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier.numpy_version = np.__version__
    best_stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier.sklearn_version = sklearn_version
    best_stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier.X_columns = [column_name for column_name in data_frame_collection['X_train'].columns]
    best_stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier.build_datetime = datetime.datetime.now()    

    best_stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier = icr.save_and_return_model(best_stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier, 
                                                                                                                  filename=model_filename_collection['gradient_boosting_classifier'],
                                                                                                                  add_compressed_file=False)
    
best_stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier

This file already exists

Out[57]:

GradientBoostingClassifier(loss='exponential', max_depth=9, n_estimators=250,
                           random_state=200)

time: 19.3 ms (started: 2023-09-28 17:20:03 -07:00)

Get Stratified 5-Fold Cross Validation Gradient Boosting Classifier Collection¶

In [58]:

models_readback = icr.return_saved_model_if_it_exists(filename=cross_validation_model_collection_filename_collection['gradient_boosting_classifier'])
if models_readback != None:
    stratified_5_fold_cross_validation_gradient_boosting_classifier_collection = models_readback
else:
    stratified_5_fold_cross_validation_gradient_boosting_classifier_collection = {}
    for index in range(5):

        #create gradient boosting classifier
        gradient_boosting_classifier = GradientBoostingClassifier(criterion='squared_error',
                                                                  loss='exponential',
                                                                  learning_rate = 0.1, 
                                                                  max_depth = 9, 
                                                                  n_estimators = 250,
                                                                  random_state=200,
                                                                  subsample=1.0,
                                                                  min_samples_split=2,
                                                                  min_samples_leaf=1,
                                                                  min_weight_fraction_leaf=0.0,
                                                                  min_impurity_decrease=0.0,
                                                                  init=None,
                                                                  max_features=None,
                                                                  verbose=0,
                                                                  max_leaf_nodes=None,
                                                                  warm_start=False,
                                                                  n_iter_no_change=None)

        
        #train gradient boosting classifier and save gradient boosting classifer per fold
        gradient_boosting_classifier.fit(X=stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(index)]['X_train'],
                                         y=stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(index)]['Y_train'])

        stratified_5_fold_cross_validation_gradient_boosting_classifier_collection['fold ' + str(index)] = gradient_boosting_classifier


    #save stratified 5-fold cross validation gradient boosting classifier collection
    stratified_5_fold_cross_validation_gradient_boosting_classifier_collection = icr.save_and_return_model(stratified_5_fold_cross_validation_gradient_boosting_classifier_collection, 
                                                                                                           filename=cross_validation_model_collection_filename_collection['gradient_boosting_classifier'])
stratified_5_fold_cross_validation_gradient_boosting_classifier_collection

This file already exists

Out[58]:

{'fold 0': GradientBoostingClassifier(criterion='squared_error', loss='exponential',
                            max_depth=9, n_estimators=250, random_state=200),
 'fold 1': GradientBoostingClassifier(criterion='squared_error', loss='exponential',
                            max_depth=9, n_estimators=250, random_state=200),
 'fold 2': GradientBoostingClassifier(criterion='squared_error', loss='exponential',
                            max_depth=9, n_estimators=250, random_state=200),
 'fold 3': GradientBoostingClassifier(criterion='squared_error', loss='exponential',
                            max_depth=9, n_estimators=250, random_state=200),
 'fold 4': GradientBoostingClassifier(criterion='squared_error', loss='exponential',
                            max_depth=9, n_estimators=250, random_state=200)}

time: 89.6 ms (started: 2023-09-28 17:20:03 -07:00)

Get Stratified 5-Fold Cross Validation Gradient Boosting Classifier Prediction Probability Collection¶

In [59]:

prediction_probability_ndarray_collection = icr.return_processed_collection_if_it_exists(filename=model_cross_validation_prediction_probability_collection_filename_collection['gradient_boosting_classifier'])
if prediction_probability_ndarray_collection != None:
    stratified_5_fold_cross_validation_gradient_boosting_classifier_prediction_probability_collection = prediction_probability_ndarray_collection
else:
    stratified_5_fold_cross_validation_gradient_boosting_classifier_prediction_probability_collection = {}
    for index in range(5):
        #get predictions for test per fold
        stratified_5_fold_cross_validation_gradient_boosting_classifier_prediction_probability_collection['fold ' + str(index)] = \
        stratified_5_fold_cross_validation_gradient_boosting_classifier_collection['fold ' + str(index)]\
        .predict_proba(stratified_fold_number_X_train_X_test_Y_train_Y_test_collection['fold ' + str(index)]['X_test'])
    

    #save stratified 5-fold cross validation random forest classifier prediction probability collection
    stratified_5_fold_cross_validation_gradient_boosting_classifier_prediction_probability_collection = icr.save_and_return_collection(data_frame_collection=stratified_5_fold_cross_validation_gradient_boosting_classifier_prediction_probability_collection, 
                                                                                                                                   filename=model_cross_validation_prediction_probability_collection_filename_collection['gradient_boosting_classifier'])
stratified_5_fold_cross_validation_gradient_boosting_classifier_prediction_probability_collection['fold 0']

This file already exists

Out[59]:

array([[2.48407057e-01, 7.51592943e-01],
       [2.05103256e-02, 9.79489674e-01],
       [9.98658831e-01, 1.34116928e-03],
       ...,
       [9.99920319e-01, 7.96805318e-05],
       [3.63162218e-05, 9.99963684e-01],
       [4.67138033e-02, 9.53286197e-01]])

time: 3.76 ms (started: 2023-09-28 17:20:03 -07:00)

Get Stratified 5-Fold Cross Validation Gradient Boosting Classifier Prediction Probability Data Frame¶

In [60]:

#get data frame list
data_frame_list_stratified_5_fold_cross_validation_gradient_boosting_classifier_prediction_probability = \
[pd.DataFrame(stratified_5_fold_cross_validation_gradient_boosting_classifier_prediction_probability_collection['fold ' + str(number)]).loc[:, 1] for number in range(5)]

stratified_5_fold_cross_validation_model_classifier_prediction_probability_series_collection['gradient_boosting_classifier'] = \
pd.concat(data_frame_list_stratified_5_fold_cross_validation_gradient_boosting_classifier_prediction_probability)

p(stratified_5_fold_cross_validation_model_classifier_prediction_probability_series_collection['gradient_boosting_classifier'])

(10147,)

Out[60]:

0       0.751593
1       0.979490
2       0.001341
3       0.993639
4       0.826168
2024    0.577856
2025    0.947631
2026    0.991084
2027    0.999991
2028    0.938312
Name: 1, dtype: float64

time: 5.61 ms (started: 2023-09-28 17:20:03 -07:00)

In [ ]:

Table of Contents Data Wrangling Feature Engineering Exploratory Data Analysis Data Preprocessing Modeling_1 Modeling_2 Modeling_Train_Results Modeling_Test_Results

Modeling Train Results

Learning Curve for Random Forest and Gradient Boosting Classifier¶

In [61]:

dpi=100
figure_filename='../reports/figures/figure_train_size_score_random_forest_classifier_gradient_boosting_classifier_dpi_'+str(dpi)+'_v'+filename_version+'.png'

figure_filename_exists=os.path.isfile(figure_filename)
if figure_filename_exists == True:
    img = mpimg.imread(figure_filename)
    plt.figure(figsize=(60, 15))
    plt.grid(False)
    plt.axis('off')
    plt.imshow(img)
else:
    StratifiedKFold_5_splits = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
    
    fig, axes = plt.subplots(3, 2, figsize=(10, 15))

    title=r"Learning Curves (Random Forest)"
    estimator=RandomForestClassifier(bootstrap=False, max_depth=25, min_samples_split=5, n_estimators=200, random_state=200, min_samples_leaf=1, max_features='auto', criterion='gini', warm_start='False',)

    plt, learning_curve_random_forest_classifier=icr.plot_learning_curve(estimator=estimator, 
                                                                           title=title, 
                                                                           X=data_frame_collection['X_train'], 
                                                                           y=data_frame_collection['Y_train'].loc[:, 'Y'],
                                                                           filename=learning_curve_results_filename_collection['random_forest_classifier'],
                                                                           axes=axes[:, 0], 
                                                                           ylim=(0.65, 1.01), 
                                                                           cv=StratifiedKFold_5_splits, 
                                                                           n_jobs=4, 
                                                                           scoring="accuracy",
                                                                           train_sizes=np.linspace(0.1, 1.0, 5))


    title = r"Learning Curves (Gradient Boosting)"
    estimator = GradientBoostingClassifier(learning_rate=0.1, 
                                           max_depth=9, 
                                           n_estimators = 250,
                                           random_state=200,
                                           max_features=None,
                                           verbose=0,
                                           max_leaf_nodes=None,
                                           warm_start=False,
                                           n_iter_no_change=None)

    plt, learning_curve_gradient_boosting_classifier=icr.plot_learning_curve(estimator=estimator, 
                                                                               title=title, 
                                                                               X=data_frame_collection['X_train'], 
                                                                               y=data_frame_collection['Y_train'].loc[:, 'Y'],
                                                                               filename=learning_curve_results_filename_collection['gradient_boosting_classifier'],
                                                                               axes=axes[:, 1], 
                                                                               ylim=(0.65, 1.01), 
                                                                               cv=StratifiedKFold_5_splits, 
                                                                               n_jobs=4,
                                                                               scoring="accuracy",
                                                                               train_sizes=np.linspace(0.1, 1.0, 5))
    
    plt.title('Model Learning Curve')

    fig.subplots_adjust(wspace=.4)

    plt.savefig(figure_filename, bbox_inches='tight', dpi=dpi)

plt.show()

time: 192 ms (started: 2023-09-28 17:20:03 -07:00)

Get Model Coupon Acceptance Rate vs Pecentage of Coupon Acceptances Captured Plot¶

In [62]:

xlabel_string='Percentage of Coupon Acceptances Captured'
ylabel_string='Coupon Acceptance Rate'

filename = '../reports/figures/figure_precision_recall_curve_random_forest_gradient_boosting_metric_auc_v' + filename_version + '.png'
markersize=1
linewidth=1

figure, axes = plt.subplots(ncols=1, nrows=1, figsize=(12,9))

plt.rcParams.update({'font.size': 16})

#calculate precision-recall points
random_forest_classifier_precision_array, random_forest_classifier_recall_array, random_forest_classifier_decision_threshold_array = \
precision_recall_curve(y_true=data_frame_collection['Y_train'].loc[:, 'Y'], probas_pred=stratified_5_fold_cross_validation_model_classifier_prediction_probability_series_collection['random_forest_classifier'])


#calculate precision-recall curve auc
random_forest_classifier_auc = auc(random_forest_classifier_recall_array, random_forest_classifier_precision_array)

# plot the precision-recall curve
plt.plot(random_forest_classifier_recall_array, random_forest_classifier_precision_array, marker='.', markersize=markersize, linewidth=linewidth, label='Random Forest AUC=' + str(round(random_forest_classifier_auc, 3)))


#calculate precision-recall points
gradient_boosting_classifier_precision_array, gradient_boosting_classifier_recall_array, gradient_boosting_classifier_threshold_array = \
precision_recall_curve(y_true=data_frame_collection['Y_train'].loc[:, 'Y'], probas_pred=stratified_5_fold_cross_validation_model_classifier_prediction_probability_series_collection['gradient_boosting_classifier'])

#calculate precision-recall curve auc
gradient_boosting_classifier_auc = auc(gradient_boosting_classifier_recall_array, gradient_boosting_classifier_precision_array)

# plot the precision-recall curve
plt.plot(gradient_boosting_classifier_recall_array, gradient_boosting_classifier_precision_array, marker='.', markersize=markersize, linewidth=linewidth, label='Gradient Boosting AUC=' + str(round(gradient_boosting_classifier_auc, 3)))


#calculate no skill classifier curve
no_skill_classifier_auc = data_frame_collection['Y_train'].loc[data_frame_collection['Y_train'].loc[:, 'Y']==1, 'Y'].shape[0] / data_frame_collection['Y_train'].loc[:, 'Y'].shape[0]
plt.plot([0, 1], [no_skill_classifier_auc, no_skill_classifier_auc], linestyle='--', label='No Skill AUC='+str(round(no_skill_classifier_auc, 3)))

plt.xticks([.0, .1 ,.2, .3 ,.4, .5, .6 ,.7, .8, .9, 1 ])
plt.yticks([.6 ,.7, .8, .9, 1 ])
plt.gca().xaxis.set_major_formatter(PercentFormatter(1))
plt.gca().set_yticklabels(['{:.0f}%'.format(y*100) for y in plt.gca().get_yticks()])
plt.xlabel(xlabel_string)
plt.ylabel(ylabel_string)
plt.title(str(ylabel_string)+' vs. '+str(xlabel_string))
plt.legend()


#save it
plt.savefig(filename, bbox_inches='tight', dpi=100)

plt.show()

time: 247 ms (started: 2023-09-28 17:20:04 -07:00)

In [ ]:

Get Random Forest Classifier Decision Threshold, Precision, Recall DataFrame¶

In [63]:

#get data frame random forest decision threshold, precision, and recall
random_forest_classifier_decision_threshold_array = np.append(0, random_forest_classifier_decision_threshold_array)

df_random_forest_decision_threshold_precision_recall = pd.DataFrame({'random_forest_decision_threshold':random_forest_classifier_decision_threshold_array, 'random_forest_precision':random_forest_classifier_precision_array, 'random_forest_recall':random_forest_classifier_recall_array})
del random_forest_classifier_decision_threshold_array

#get random forest 90% precision decision threshold
random_forest_90_precision_estimated_recall=df_random_forest_decision_threshold_precision_recall.loc[df_random_forest_decision_threshold_precision_recall.loc[:, 'random_forest_precision']>=.9, :].head(1).loc[:, 'random_forest_recall'].values[0]

p(df_random_forest_decision_threshold_precision_recall)

(8701, 3)

Out[63]:

	random_forest_decision_threshold	random_forest_precision	random_forest_recall
0	0.000000	0.568763	1.000000
1	0.034167	0.568720	0.999826
2	0.035000	0.568777	0.999826
3	0.036347	0.568833	0.999826
4	0.036667	0.568889	0.999826
8696	0.987500	1.000000	0.000694
8697	0.987708	1.000000	0.000521
8698	0.988750	1.000000	0.000347
8699	0.990833	1.000000	0.000174
8700	0.993750	1.000000	0.000000

time: 7.32 ms (started: 2023-09-28 17:20:04 -07:00)

In [ ]:

Get Gradient Boosting Classifier Decision Threshold, Precision, Recall DataFrame¶

In [64]:

#get data frame gradient boosting decision threshold, precision, and recall
gradient_boosting_classifier_threshold_array = np.append(0, gradient_boosting_classifier_threshold_array)

df_gradient_boosting_decision_threshold_precision_recall = pd.DataFrame({'gradient_boosting_decision_threshold':gradient_boosting_classifier_threshold_array, 'gradient_boosting_precision':gradient_boosting_classifier_precision_array, 'gradient_boosting_recall':gradient_boosting_classifier_recall_array})
del gradient_boosting_classifier_threshold_array

p(df_gradient_boosting_decision_threshold_precision_recall)

(10108, 3)

Out[64]:

	gradient_boosting_decision_threshold	gradient_boosting_precision	gradient_boosting_recall
0	0.000000	0.569494	1.000000
1	0.000004	0.569451	0.999826
2	0.000004	0.569508	0.999826
3	0.000004	0.569564	0.999826
4	0.000005	0.569620	0.999826
10103	0.999999	1.000000	0.000694
10104	1.000000	1.000000	0.000521
10105	1.000000	1.000000	0.000347
10106	1.000000	1.000000	0.000174
10107	1.000000	1.000000	0.000000

time: 5.06 ms (started: 2023-09-28 17:20:04 -07:00)

Get Random Forest Classifier Top 50 Features Bar Plot¶

In [65]:

#get feature importants plot
top_number_features = 50
figsize=(6, 10)
dpi=100
figure_filename = '../reports/figures/figure_random_forest_classifier_train_top_'+ str(top_number_features) +'_feature_importances_v' + filename_version + '.png'


plt.subplots(figsize=figsize)

feature_importances = best_stratified_5_fold_grid_search_cross_validation_random_forest_classifier.feature_importances_
series_random_forest_classifier_feature_importances = pd.Series(feature_importances, 
                                                                 index=best_stratified_5_fold_grid_search_cross_validation_random_forest_classifier.X_columns).sort_values(ascending=False)
series_random_forest_classifier_feature_importances[0:top_number_features].sort_values(ascending=True).plot(kind='barh')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.title('Random Forest Classifier Top ' + str(top_number_features)   + ' Feature Importances');

plt.savefig(figure_filename, bbox_inches='tight', dpi=dpi)

plt.show()

time: 1.1 s (started: 2023-09-28 17:20:04 -07:00)

In [66]:

#get random forest top 50 features by importance
column_name_list_random_forest_classifier_top_50_by_feature_importance = series_random_forest_classifier_feature_importances[0:50].index.to_list()

column_name_list_random_forest_classifier_top_50_by_feature_importance_target = column_name_list_random_forest_classifier_top_50_by_feature_importance + ['Y']
del column_name_list_random_forest_classifier_top_50_by_feature_importance

time: 609 µs (started: 2023-09-28 17:20:05 -07:00)

Get Random Forest Classifier Top 50 Features Correlation Heat Map¶

In [67]:

dpi=100
figure_filename = '../reports/figures/figure_correlation_heatmap_random_forest_classifier_train_top_50_and_target_dpi_' + str(dpi) + '_v' + filename_version + '.png'

figure_filename_exists = os.path.isfile(figure_filename)
if figure_filename_exists == True:
    img = mpimg.imread(figure_filename)
    plt.figure(figsize=(35, 28))
    plt.grid(False)
    plt.axis('off')
    plt.imshow(img)
else:
    #correlation heatmap of top 50 features and target
    
    df_train = pd.concat([data_frame_collection['X_train'], data_frame_collection['Y_train']], axis=1)

    df_corr = df_train.loc[:, column_name_list_random_forest_classifier_top_50_by_feature_importance_target].corr()

    fig, ax = plt.subplots(figsize=(30, 24))

    mask = np.triu(np.ones_like(df_corr, dtype=bool))
    mask = mask[1:, :-1]

    corr = df_corr.iloc[1:,:-1].copy()

    sns.set(font_scale=1.4)
    sns.set_style("white")

    res = sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", annot_kws={"size": 12}, cmap='YlOrBr', vmin=-1, vmax=1, cbar_kws={"shrink": .8})
    res.set_xticklabels(res.get_xmajorticklabels(), fontsize = 14, rotation=90)
    res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 14, rotation=0)
    plt.title('Correlation Heatmap Random Forest of Top 50 Features and Target', fontsize=18)

    #save it
    plt.savefig(figure_filename, bbox_inches='tight', dpi=dpi)

plt.show()

time: 1.05 s (started: 2023-09-28 17:20:05 -07:00)

Get Train DataFrame Random Forest Prediction Probability, Gradient Boosting Prediction Probability, Y Actual, Coupon Venue Type¶

In [68]:

feature_column_name_list = ['coupon_venue_type']

## Get Prediction Probabilities for Gradient Boosting, Prediction Probabilities for Random Forest, Y_actual, and coupon venue type
df_Y_train_random_forest_prediction_probability=stratified_5_fold_cross_validation_model_classifier_prediction_probability_series_collection['random_forest_classifier'].to_frame().rename(columns={1:'Y_train_random_forest_prediction_probability'}).reset_index(drop=True)
df_Y_train_gradient_boosting_prediction_probability=stratified_5_fold_cross_validation_model_classifier_prediction_probability_series_collection['gradient_boosting_classifier'].to_frame().rename(columns={1:'Y_train_gradient_boosting_prediction_probability'}).reset_index(drop=True)


df_y_train_model_name_prediction_probability_y_actual_coupon_venue_type = \
pd.concat([df_Y_train_random_forest_prediction_probability, 
           df_Y_train_gradient_boosting_prediction_probability, 
           data_frame_collection['Y_train'].reset_index(drop=True),
           df_collection['X_train'].loc[:, feature_column_name_list].reset_index(drop=True)], axis=1)

p(df_y_train_model_name_prediction_probability_y_actual_coupon_venue_type)

(10147, 4)

Out[68]:

	Y_train_random_forest_prediction_probability	Y_train_gradient_boosting_prediction_probability	Y	coupon_venue_type
0	0.772312	0.751593	1	Coffee House
1	0.711250	0.979490	1	Coffee House
2	0.403750	0.001341	0	Carry out & Take away
3	0.824488	0.993639	1	Coffee House
4	0.710913	0.826168	1	Coffee House
10142	0.261250	0.577856	0	Bar
10143	0.652119	0.947631	0	Bar
10144	0.669310	0.991084	1	Restaurant(20-50)
10145	0.911190	0.999991	1	Coffee House
10146	0.640171	0.938312	1	Coffee House

time: 8.93 ms (started: 2023-09-28 17:20:06 -07:00)

Get DataFrame Model Prediction, Survey Prediction, Y Actual, Coupon Venue Type¶

In [69]:

# Get Random Forest Classifier Y Predicted from Y Prediction Probabilities and Decision Threshold .9 Precision Estimated
model_type='random_forest'
df_Y_train_random_forest_predicted = icr.get_model_predictions_from_prediction_probabilities_and_decision_threshold_proportion_metric_estimated(df=df_random_forest_decision_threshold_precision_recall, 
                                                                                                                                                model_proportion_precision=.9, 
                                                                                                                                                model_proportion_recall=None,
                                                                                                                                                model_precision_column_name=model_type+'_precision', 
                                                                                                                                                model_recall_column_name=model_type+'_recall',
                                                                                                                                                model_decision_threshold_column_name=model_type+'_decision_threshold',
                                                                                                                                                df_Y_train_test_model_prediction_probability=df_Y_train_random_forest_prediction_probability.iloc[:, 0],
                                                                                                                                                train_test='train',
                                                                                                                                                filename_version=filename_version)
df_Y_train_random_forest_predicted=df_Y_train_random_forest_predicted.rename(columns={'Y_train_predicted':'Y_train_'+str(model_type)+'_predicted'})


#get gradient boosting 80% recall estimate predictions
model_type = 'gradient_boosting'
model_proportion_precision=None
model_proportion_recall=.8

df_Y_train_gradient_boosting_predicted = icr.get_model_predictions_from_prediction_probabilities_and_decision_threshold_proportion_metric_estimated(df=df_gradient_boosting_decision_threshold_precision_recall,
                                                                                                                                                   model_proportion_precision=model_proportion_precision,
                                                                                                                                                   model_proportion_recall=model_proportion_recall,
                                                                                                                                                   model_precision_column_name=model_type+'_precision',
                                                                                                                                                   model_recall_column_name=model_type+'_recall',
                                                                                                                                                   model_decision_threshold_column_name=model_type+'_decision_threshold',
                                                                                                                                                   df_Y_train_test_model_prediction_probability=df_Y_train_gradient_boosting_prediction_probability.iloc[:, 0],
                                                                                                                                                   train_test='train',
                                                                                                                                                   filename_version=filename_version)
df_Y_train_gradient_boosting_predicted=df_Y_train_gradient_boosting_predicted.rename(columns={'Y_train_predicted':'Y_train_'+str(model_type)+'_predicted'})



data_fold_type='train'
number_of_predictions=data_frame_collection['X_'+data_fold_type].shape[0]


#get survey 27% recall estimate predictions
recall_estimated=random_forest_90_precision_estimated_recall
df_Y_train_survey_27_recall_estimate_predicted = icr.get_survey_coupon_recommendations_by_recall_estimate(number_of_predictions=number_of_predictions, recall_estimated=recall_estimated, random_state=200, train_test='train')

#get survey 80% recall estimate predictions
recall_estimated=.8
df_Y_train_survey_80_recall_estimate_predicted=icr.get_survey_coupon_recommendations_by_recall_estimate(number_of_predictions=number_of_predictions, recall_estimated=recall_estimated, random_state=200, train_test='train')

#get survey 100% recall estimate predictions
recall_estimated=1
df_Y_train_survey_100_recall_estimate_predicted=icr.get_survey_coupon_recommendations_by_recall_estimate(number_of_predictions=number_of_predictions, recall_estimated=recall_estimated, random_state=200, train_test='train')


### Get Data Frame Y Train Random Forest Predicted, Y Train Gradient Boosting Predicted, Y Train Survey Predicted, Y Actual, and Coupon Venue Type
feature_column_name_list = ['coupon_venue_type']

df_y_train_model_name_predicted_y_train_survey_recall_estimate_predicted_y_actual_coupon_venue_type = \
pd.concat([df_Y_train_random_forest_predicted,
           df_Y_train_gradient_boosting_predicted,
           df_Y_train_survey_27_recall_estimate_predicted,
           df_Y_train_survey_80_recall_estimate_predicted,
           df_Y_train_survey_100_recall_estimate_predicted,
           data_frame_collection['Y_train'].reset_index(drop=True),
           df_collection['X_train'].loc[:, feature_column_name_list].reset_index(drop=True)], axis=1)

p(df_y_train_model_name_predicted_y_train_survey_recall_estimate_predicted_y_actual_coupon_venue_type)

This file already exists.
This file already exists.
0.26870335011282764
0.7312966498871724
0.8
0.19999999999999996
1
0
(10147, 7)

Out[69]:

	Y_train_random_forest_predicted	Y_train_gradient_boosting_predicted	Y_train_survey_27_recall_estimate_predicted	Y_train_survey_80_recall_estimate_predicted	Y_train_survey_100_recall_estimate_predicted	Y	coupon_venue_type
0	0	1	1	1	1	1	Coffee House
1	0	1	0	1	1	1	Coffee House
2	0	0	0	1	1	0	Carry out & Take away
3	0	1	0	1	1	1	Coffee House
4	0	1	1	1	1	1	Coffee House
10142	0	0	0	1	1	0	Bar
10143	0	1	0	1	1	0	Bar
10144	0	1	0	0	1	1	Restaurant(20-50)
10145	1	1	0	1	1	1	Coffee House
10146	0	1	0	0	1	1	Coffee House

time: 22.1 ms (started: 2023-09-28 17:20:06 -07:00)

Initialize Variables¶

In [70]:

multiple_index=icr.get_metric_multiple_index(proportion_or_percentage='proportion')

feature_column_name_filter_value_list_dictionary_key_list=['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant']
feature_column_name_filter_value_two_dimensional_list=[['Coffee House', 'Bar', 'Carry out & Take away', 'Restaurant(<20)', 'Restaurant(20-50)'], ['Coffee House'], ['Bar'], ['Carry out & Take away'], ['Restaurant(<20)'], ['Restaurant(20-50)']]

feature_column_name_filter_value_list_dictionary=dict(zip(feature_column_name_filter_value_list_dictionary_key_list, feature_column_name_filter_value_two_dimensional_list))

feature_column_name_filter='coupon_venue_type'
y_predicted_column_name_base_survey='Y_train_survey_100_recall_estimate_predicted'
venue_type_average_sale_dictionary={'Coffee House':[5.50], 'Bar':[15], 'Takeout':[15], 'Low-Cost Restaurant':[12], 'Mid-Range Restaurant':[35],}

time: 1.85 ms (started: 2023-09-28 17:20:06 -07:00)

Get DataFrame's Coupon Venue Type Recommendation Cost Per Random Forest and Survey and Gradient Boosting and Survey Using 20% of Ad Revenue Principle¶

In [71]:

### Get Average Coupon Recommendation Cost Estimated (Per Coupon Venue Type) From Survey 100% Recall Metrics and Average Sale Estimated
column_name_y_predicted='Y_train_survey_100_recall_estimate_predicted'
column_name_y_actual='Y'

df_train_survey_100_recall_coupon_recommendation_cost_estimated_sale_estimated=\
icr.get_survey_or_model_average_coupon_recommendation_cost_estimated(df_y_train_model_name_predicted_y_train_survey_recall_estimate_predicted_y_actual_coupon_venue_type=df_y_train_model_name_predicted_y_train_survey_recall_estimate_predicted_y_actual_coupon_venue_type,
                                                                     column_name_y_predicted=column_name_y_predicted,
                                                                     column_name_y_actual=column_name_y_actual,
                                                                     feature_column_name_filter=feature_column_name_filter,
                                                                     feature_column_name_filter_value_two_dimensional_list=feature_column_name_filter_value_two_dimensional_list,
                                                                     feature_column_name_filter_value_list_dictionary_key_list=feature_column_name_filter_value_list_dictionary_key_list,
                                                                     venue_type_average_sale_dictionary=venue_type_average_sale_dictionary,
                                                                     model_survey='Control')


### Get Average Coupon Recommendation Cost Estimated (Per Coupon Venue Type) From Random Forest 90% Coupon Acceptance Rate Estimated Metrics and Average Sale Estimated
column_name_y_predicted='Y_train_random_forest_predicted'
column_name_y_actual='Y'

df_train_random_forest_coupon_recommendation_cost_estimated_sale_estimated=\
icr.get_survey_or_model_average_coupon_recommendation_cost_estimated(df_y_train_model_name_predicted_y_train_survey_recall_estimate_predicted_y_actual_coupon_venue_type=df_y_train_model_name_predicted_y_train_survey_recall_estimate_predicted_y_actual_coupon_venue_type,
                                                                     column_name_y_predicted=column_name_y_predicted,
                                                                     column_name_y_actual=column_name_y_actual,
                                                                     feature_column_name_filter=feature_column_name_filter,
                                                                     feature_column_name_filter_value_two_dimensional_list=feature_column_name_filter_value_two_dimensional_list,
                                                                     feature_column_name_filter_value_list_dictionary_key_list=feature_column_name_filter_value_list_dictionary_key_list,
                                                                     venue_type_average_sale_dictionary=venue_type_average_sale_dictionary,
                                                                     model_survey='Treatment')


### Get Average Coupon Recommendation Cost Estimated (Per Coupon Venue Type) From Gradient Boosting 80% Recall Estimated Metrics and Average Sale Estimated
column_name_y_predicted='Y_train_gradient_boosting_predicted'
column_name_y_actual='Y'


df_train_gradient_boosting_coupon_recommendation_cost_estimated_sale_estimated=\
icr.get_survey_or_model_average_coupon_recommendation_cost_estimated(df_y_train_model_name_predicted_y_train_survey_recall_estimate_predicted_y_actual_coupon_venue_type=df_y_train_model_name_predicted_y_train_survey_recall_estimate_predicted_y_actual_coupon_venue_type,
                                                                     column_name_y_predicted=column_name_y_predicted,
                                                                     column_name_y_actual=column_name_y_actual,
                                                                     feature_column_name_filter=feature_column_name_filter,
                                                                     feature_column_name_filter_value_two_dimensional_list=feature_column_name_filter_value_two_dimensional_list,
                                                                     feature_column_name_filter_value_list_dictionary_key_list=feature_column_name_filter_value_list_dictionary_key_list,
                                                                     venue_type_average_sale_dictionary=venue_type_average_sale_dictionary,
                                                                     model_survey='Treatment')




#get df_train_random_forest_29_precision_survey_100_recall_coupon_recommendation_cost_estimated_sale_estimated
df_train_random_forest_29_precision_survey_100_recall_coupon_recommendation_cost_estimated_sale_estimated=\
pd.concat([df_train_random_forest_coupon_recommendation_cost_estimated_sale_estimated,
           df_train_survey_100_recall_coupon_recommendation_cost_estimated_sale_estimated,],
          axis=0)


#get df_train_gradient_boosting_survey_100_recall_coupon_recommendation_cost_estimated_sale_estimated
df_train_gradient_boosting_80_recall_survey_100_recall_coupon_recommendation_cost_estimated_sale_estimated=\
pd.concat([df_train_gradient_boosting_coupon_recommendation_cost_estimated_sale_estimated,
           df_train_survey_100_recall_coupon_recommendation_cost_estimated_sale_estimated,],
          axis=0)

df_train_gradient_boosting_80_recall_survey_100_recall_coupon_recommendation_cost_estimated_sale_estimated

Out[71]:

		Overall	Coffee House	Bar	Takeout	Low-Cost Restaurant	Mid-Range Restaurant
Treatment	Average Coupon Recommendation Cost Estimated	NaN	0.848469	2.223881	2.456336	1.976224	4.638037
Treatment	Average Sale Estimated	NaN	5.500000	15.000000	15.000000	12.000000	35.000000
Control	Average Coupon Recommendation Cost Estimated	NaN	0.541538	1.244129	2.201155	1.707282	3.081650
Control	Average Sale Estimated	NaN	5.500000	15.000000	15.000000	12.000000	35.000000

time: 54.3 ms (started: 2023-09-28 17:20:06 -07:00)

Get Train Random Forest 90% Coupon Acceptance Rate Estimated Metrics (Per Coupon Venue Type) Table¶

In [72]:

model_type='random_forest'
survey_number_recall_estimated_y_predicted_column_name='Y_train_survey_27_recall_estimate_predicted'
metrics_coupon_venue_type_list = []

for feature_column_name_filter_value_list_dictionary_key in feature_column_name_filter_value_list_dictionary.keys():
    
    metric_list=icr.get_model_and_survey_metrics(df=df_y_train_model_name_predicted_y_train_survey_recall_estimate_predicted_y_actual_coupon_venue_type,
                                                model_y_predicted_column_name='Y_train_'+model_type+'_predicted',
                                                survey_number_recall_estimated_y_predicted_column_name=survey_number_recall_estimated_y_predicted_column_name,
                                                y_predicted_column_name_base_survey=y_predicted_column_name_base_survey,
                                                y_actual_column_name='Y',
                                                feature_column_name_filter=feature_column_name_filter,
                                                feature_column_name_filter_value_list=feature_column_name_filter_value_list_dictionary[feature_column_name_filter_value_list_dictionary_key],
                                                metrics_column_name_list=None,)
    metrics_coupon_venue_type_list+=[metric_list]

df_train_random_forest_model_survey_metrics=\
pd.DataFrame(metrics_coupon_venue_type_list, 
             index=feature_column_name_filter_value_list_dictionary_key_list,
             columns=multiple_index[0:int(len(multiple_index)*2/3)]).T

df_train_random_forest_metrics=icr.calculate_and_add_model_survey_difference(df_train_random_forest_model_survey_metrics, multiple_index)

#add Venue Type Coupon Recommendation Cost Estimated, Sale Estimated
df_train_random_forest_metrics_coupon_recommendation_cost_estimated_sale_estimated=\
pd.concat([df_train_random_forest_metrics, df_train_random_forest_29_precision_survey_100_recall_coupon_recommendation_cost_estimated_sale_estimated], axis=0)


#get and add Ad Revenue, Ad Spend, ROAS, Profit, Spend, ROI
df_train_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI=icr.get_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI(df=df_train_random_forest_metrics_coupon_recommendation_cost_estimated_sale_estimated)
del df_train_random_forest_metrics_coupon_recommendation_cost_estimated_sale_estimated


#select and reorder basic metrics, recommendation cost estimated, and average sale estimate
multiindex_basic_metrics=icr.get_the_multiindex_object_with_basic_metrics()
multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated=icr.get_the_multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated()

multiindex_basic_metrics_coupon_recommendation_cost_estimate_sale_estimated=pd.MultiIndex.from_tuples(list(multiindex_basic_metrics)+list(multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated))

#display combined metrics
df_train_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[multiindex_basic_metrics_coupon_recommendation_cost_estimate_sale_estimated,:]

Out[72]:

		Overall	Coffee House	Bar	Takeout	Low-Cost Restaurant	Mid-Range Restaurant
Treatment	Coupon Acceptance Rate	90.000000	89.411765	79.487179	88.888889	91.480996	95.000000
	Percentage of Coupon Acceptances Captured	26.870335	19.387755	4.619970	35.504653	43.570537	3.632887
	Coupon Acceptances	1548.000000	304.000000	31.000000	496.000000	698.000000	19.000000
	Coupon Acceptances Possible	5761.000000	1568.000000	671.000000	1397.000000	1602.000000	523.000000
	Coupon Recommendations	1720.000000	340.000000	39.000000	558.000000	763.000000	20.000000
	Coupon Recommendations Possible	10147.000000	3185.000000	1618.000000	1904.000000	2252.000000	1188.000000
	Ad Revenue	18618.000000	1672.000000	465.000000	7440.000000	8376.000000	665.000000
	Ad Spend	3723.600000	334.400000	93.000000	1488.000000	1675.200000	133.000000
	ROAS	500.000000	500.000000	500.000000	500.000000	500.000000	500.000000
Control	Coupon Acceptance Rate	57.722988	51.113716	41.630901	76.388889	69.666667	47.462687
	Percentage of Coupon Acceptances Captured	27.634091	27.806122	28.912072	27.559055	26.092385	30.401530
	Coupon Acceptances	1592.000000	436.000000	194.000000	385.000000	418.000000	159.000000
	Coupon Acceptances Possible	5761.000000	1568.000000	671.000000	1397.000000	1602.000000	523.000000
	Coupon Recommendations	2758.000000	853.000000	466.000000	504.000000	600.000000	335.000000
	Coupon Recommendations Possible	10147.000000	3185.000000	1618.000000	1904.000000	2252.000000	1188.000000
	Ad Revenue	21664.000000	2398.000000	2910.000000	5775.000000	5016.000000	5565.000000
	Ad Spend	6839.257701	838.950588	1111.230769	1344.000000	1317.326343	2227.750000
	ROAS	316.759522	285.833282	261.871798	429.687500	380.771251	249.803614
Uplift	Coupon Acceptance Rate	32.277012	38.298048	37.856278	12.500000	21.814329	47.537313
	Percentage of Coupon Acceptances Captured	-0.763756	-8.418367	-24.292101	7.945598	17.478152	-26.768642
	Coupon Acceptances	-44.000000	-132.000000	-163.000000	111.000000	280.000000	-140.000000
	Coupon Acceptances Possible	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
	Coupon Recommendations	-1038.000000	-513.000000	-427.000000	54.000000	163.000000	-315.000000
	Coupon Recommendations Possible	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
	Ad Revenue	-3046.000000	-726.000000	-2445.000000	1665.000000	3360.000000	-4900.000000
	Ad Spend	-3115.657701	-504.550588	-1018.230769	144.000000	357.873657	-2094.750000
	ROAS	183.240478	214.166718	238.128202	70.312500	119.228749	250.196386
Treatment	Average Coupon Recommendation Cost Estimated	NaN	0.983529	2.384615	2.666667	2.195544	6.650000
Treatment	Average Sale Estimated	NaN	5.500000	15.000000	15.000000	12.000000	35.000000
Control	Average Coupon Recommendation Cost Estimated	NaN	0.541538	1.244129	2.201155	1.707282	3.081650
Control	Average Sale Estimated	NaN	5.500000	15.000000	15.000000	12.000000	35.000000

time: 96.3 ms (started: 2023-09-28 17:20:06 -07:00)

In [73]:

pd.DataFrame(metrics_coupon_venue_type_list, 
             index=feature_column_name_filter_value_list_dictionary_key_list,
             columns=multiple_index[0:int(len(multiple_index)*2/3)]).T

Out[73]:

		Overall	Coffee House	Bar	Takeout	Low-Cost Restaurant	Mid-Range Restaurant
Treatment	Coupon Acceptance Rate	90.000000	89.411765	79.487179	88.888889	91.480996	95.000000
	Percentage of Coupon Acceptances Captured	26.870335	19.387755	4.619970	35.504653	43.570537	3.632887
	Proportion of Coupon Acceptances	1.000000	0.196382	0.020026	0.320413	0.450904	0.012274
	Coupon Acceptances	1548.000000	304.000000	31.000000	496.000000	698.000000	19.000000
	Coupon Acceptances Possible	5761.000000	1568.000000	671.000000	1397.000000	1602.000000	523.000000
	Proportion of Coupon Recommendations	1.000000	0.197674	0.022674	0.324419	0.443605	0.011628
	Coupon Recommendations	1720.000000	340.000000	39.000000	558.000000	763.000000	20.000000
	Coupon Recommendations Possible	10147.000000	3185.000000	1618.000000	1904.000000	2252.000000	1188.000000
	Coupon Acceptances to Base Survey Coupon Recommendations Ratio	0.152557	0.095447	0.019159	0.260504	0.309947	0.015993
	Coupon Acceptances to Survey Coupon Acceptances Ratio	0.972362	0.697248	0.159794	1.288312	1.669856	0.119497
	Coupon Recommendations to Survey Coupon Recommendations Ratio	0.623640	0.398593	0.083691	1.107143	1.271667	0.059701
	Coupon Recommendations to Base Survey Coupon Recommendations Ratio	0.169508	0.106750	0.024104	0.293067	0.338810	0.016835
Control	Coupon Acceptance Rate	57.722988	51.113716	41.630901	76.388889	69.666667	47.462687
	Percentage of Coupon Acceptances Captured	27.634091	27.806122	28.912072	27.559055	26.092385	30.401530
	Proportion of Coupon Acceptances	1.000000	0.273869	0.121859	0.241834	0.262563	0.099874
	Coupon Acceptances	1592.000000	436.000000	194.000000	385.000000	418.000000	159.000000
	Coupon Acceptances Possible	5761.000000	1568.000000	671.000000	1397.000000	1602.000000	523.000000
	Proportion of Coupon Recommendations	1.000000	0.309282	0.168963	0.182741	0.217549	0.121465
	Coupon Recommendations	2758.000000	853.000000	466.000000	504.000000	600.000000	335.000000
	Coupon Recommendations Possible	10147.000000	3185.000000	1618.000000	1904.000000	2252.000000	1188.000000
	Coupon Acceptances to Base Survey Coupon Recommendations Ratio	0.156894	0.136892	0.119901	0.202206	0.185613	0.133838
	Coupon Acceptances to Survey Coupon Acceptances Ratio	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000
	Coupon Recommendations to Survey Coupon Recommendations Ratio	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000
	Coupon Recommendations to Base Survey Coupon Recommendations Ratio	0.271804	0.267818	0.288010	0.264706	0.266430	0.281987

time: 9 ms (started: 2023-09-28 17:20:06 -07:00)

In [74]:

#ROI metrics
icr.profit_spend_roi_number_table(df=df_train_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI)

#del df_train_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI

Out[74]:

Additional Production Cost	200			2000			20000
Metric	Profit	Spend	ROI	Profit	Spend	ROI	Profit	Spend	ROI
Group
Control	14624.742299	7039.257701	207.759723	12824.742299	8839.257701	145.088454	-5175.257701	26839.257701	-19.282417
Treatment	14694.400000	3923.600000	374.513202	12894.400000	5723.600000	225.284786	-5105.600000	23723.600000	-21.521186
Uplift	69.657701	-3115.657701	166.753479	69.657701	-3115.657701	80.196332	69.657701	-3115.657701	-2.238768

time: 11.6 ms (started: 2023-09-28 17:20:06 -07:00)

Get Train Random Forest 90% Coupon Acceptance Rate Estimated 95% Confidence Interval Metrics (Per Coupon Venue Type) Table¶

In [75]:

model_type='random_forest'

### Get Train Random Forest 90% Coupon Acceptance Rate Estimated 95% Confidence Interval Metrics (Per Coupon Venue Type) Table 
if st != 'yes':
    number_of_replicates=10000

quantile_lower_upper_list=[0.025, 0.975]

feature_column_name_filter='coupon_venue_type'
save_metric_replicates_feature_column_name_filter_value_list_dictionary_key_list=['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant']


random_forest_model_train_survey_number_confidence_interval_metric_collection={}
df_train_random_forest_90_precision_estimated_feature_filter_number_bootstrap_replicates_metrics_collection={}

for feature_column_name_filter_value_list_dictionary_key in feature_column_name_filter_value_list_dictionary_key_list:

    random_forest_model_train_survey_number_confidence_interval_metric_collection[feature_column_name_filter_value_list_dictionary_key],\
    df_train_random_forest_90_precision_estimated_feature_filter_number_bootstrap_replicates_metrics_collection[feature_column_name_filter_value_list_dictionary_key]=\
    icr.get_metric_confidence_interval_table_by_feature_column_name_filter_value_list_dictionary_key(df_y_train_test_model_name_predicted_y_train_test_survey_recall_estimate_predicted_y_actual_feature_column_name_filter=df_y_train_model_name_predicted_y_train_survey_recall_estimate_predicted_y_actual_coupon_venue_type,
                                                                                                     feature_column_name_filter=feature_column_name_filter,
                                                                                                     feature_column_name_filter_value_list_dictionary_key=feature_column_name_filter_value_list_dictionary_key,
                                                                                                     feature_column_name_filter_value_list_dictionary=feature_column_name_filter_value_list_dictionary,
                                                                                                     multiple_index=multiple_index,
                                                                                                     number_of_replicates=number_of_replicates,
                                                                                                     quantile_lower_upper_list=quantile_lower_upper_list,
                                                                                                     model_type=model_type,
                                                                                                     survey_number_recall_estimated_y_predicted_column_name=survey_number_recall_estimated_y_predicted_column_name,
                                                                                                     save_metric_replicates_feature_column_name_filter_value_list_dictionary_key_list=save_metric_replicates_feature_column_name_filter_value_list_dictionary_key_list,
                                                                                                     filename_version=filename_version,
                                                                                                     train_test='train',
                                                                                                     sample_size=2537)


df_random_forest_model_train_survey_number_confidence_interval_metric_feature_column_name_filter_value_sample_size_2537=\
icr.convert_collection_to_data_frame_and_drop_top_column_level(random_forest_model_train_survey_number_confidence_interval_metric_collection)

#mid-range restaurant Coupon Acceptances 95% confidence interval (100, 100) looks wrong.

#select and reorder basic metrics, recommendation cost estimated, and average sale estimate
metric_list_refined=['Coupon Acceptance Rate', 'Percentage of Coupon Acceptances Captured', 'Coupon Acceptances', 'Coupon Acceptances Possible', 'Coupon Recommendations', 'Coupon Recommendations Possible']
multiindex_basic_metrics=icr.get_the_multiindex_object_with_basic_metrics(metric_list_refined=metric_list_refined)
#multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated=icr.get_the_multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated()

#multiindex_basic_metrics_coupon_recommendation_cost_estimate_sale_estimated=pd.MultiIndex.from_tuples(list(multiindex_basic_metrics)+list(multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated))

#display combined metrics
df_random_forest_model_train_survey_number_confidence_interval_metric_feature_column_name_filter_value_sample_size_2537.loc[multiindex_basic_metrics,:]

This file already exists
This file already exists
This file already exists
This file already exists
This file already exists
This file already exists

Out[75]:

		95% Confidence Interval
		Overall	Coffee House	Bar	Takeout	Low-Cost Restaurant	Mid-Range Restaurant
Treatment	Coupon Acceptance Rate	(87%, 92%)	(82%, 95%)	(50%, 100%)	(83%, 93%)	(87%, 95%)	(66%, 100%)
	Percentage of Coupon Acceptances Captured	(24%, 29%)	(15%, 23%)	(100%, 100%)	(30%, 40%)	(38%, 48%)	(78.1%, 714.3%)
	Coupon Acceptances	(352, 422)	(60, 93)	(3, 14)	(103, 145)	(150, 200)	(100, 100)
	Coupon Acceptances Possible	(1391, 1489)	(356, 428)	(143, 193)	(315, 384)	(365, 437)	(109, 153)
	Coupon Recommendations	(393, 467)	(68, 103)	(4, 16)	(118, 162)	(164, 217)	(100, 100)
	Coupon Recommendations Possible	(2537, 2537)	(750, 842)	(369, 441)	(438, 515)	(522, 604)	(266, 328)
Control	Coupon Acceptance Rate	(53%, 61%)	(44%, 57%)	(32%, 50%)	(68%, 83%)	(62%, 76%)	(36%, 58%)
	Percentage of Coupon Acceptances Captured	(25%, 29%)	(23%, 32%)	(22%, 35%)	(23%, 32%)	(21%, 30%)	(22%, 38%)
	Coupon Acceptances	(363, 434)	(89, 130)	(36, 63)	(78, 116)	(85, 124)	(28, 52)
	Coupon Acceptances Possible	(1391, 1489)	(356, 428)	(143, 193)	(315, 384)	(365, 437)	(109, 153)
	Coupon Recommendations	(645, 733)	(186, 241)	(97, 137)	(105, 148)	(127, 174)	(66, 101)
	Coupon Recommendations Possible	(2537, 2537)	(750, 842)	(369, 441)	(438, 515)	(522, 604)	(266, 328)
Uplift	Coupon Acceptance Rate	(27%, 36%)	(29%, 46%)	(7%, 62%)	(4%, 20%)	(14%, 29%)	(14%, 62%)
	Percentage of Coupon Acceptances Captured	(-4%, 2%)	(-14%, -2%)	(-31%, -16%)	(100%, 100%)	(10%, 23%)	(-35%, -18%)
	Coupon Acceptances	(-58, 36)	(-57, -10)	(-55, -27)	(3, 52)	(43, 97)	(-48, -23)
	Coupon Acceptances Possible	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)
	Coupon Recommendations	(-316, -203)	(-159, -98)	(-128, -86)	(-13, 40)	(11, 71)	(-97, -61)
	Coupon Recommendations Possible	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)

time: 1.59 s (started: 2023-09-28 17:20:06 -07:00)

In [ ]:

Get Train Gradient Boosting 80% Recall Estimated Metrics (Per Coupon Venue Type) Table¶

In [76]:

model_type='gradient_boosting'
survey_number_recall_estimated_y_predicted_column_name='Y_train_survey_80_recall_estimate_predicted'


metrics_coupon_venue_type_list = []

for feature_column_name_filter_value_list_dictionary_key in feature_column_name_filter_value_list_dictionary.keys():
    
    metric_list=icr.get_model_and_survey_metrics(df=df_y_train_model_name_predicted_y_train_survey_recall_estimate_predicted_y_actual_coupon_venue_type,
                                                 model_y_predicted_column_name='Y_train_'+model_type+'_predicted',
                                                 survey_number_recall_estimated_y_predicted_column_name=survey_number_recall_estimated_y_predicted_column_name,
                                                 y_predicted_column_name_base_survey=y_predicted_column_name_base_survey,
                                                 y_actual_column_name='Y',
                                                 feature_column_name_filter=feature_column_name_filter,
                                                 feature_column_name_filter_value_list=feature_column_name_filter_value_list_dictionary[feature_column_name_filter_value_list_dictionary_key],
                                                 metrics_column_name_list=None,)
    metrics_coupon_venue_type_list+=[metric_list]

df_train_gradient_boosting_model_survey_metrics=\
pd.DataFrame(metrics_coupon_venue_type_list, 
             index=feature_column_name_filter_value_list_dictionary_key_list, 
             columns=multiple_index[0:(int(len(multiple_index)*2/3))]).T

df_train_gradient_boosting_metrics=icr.calculate_and_add_model_survey_difference(df_train_gradient_boosting_model_survey_metrics, multiple_index)

#add Venue Type Coupon Recommendation Cost Estimated, Sale Estimated
df_train_gradient_boosting_metrics_coupon_recommendation_cost_estimated_sale_estimated=pd.concat([df_train_gradient_boosting_metrics, df_train_gradient_boosting_80_recall_survey_100_recall_coupon_recommendation_cost_estimated_sale_estimated], axis=0)

#get and add Ad Revenue, Ad Spend, ROAS, Profit, Spend, ROI
df_train_gradient_boosting_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI=icr.get_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI(df=df_train_gradient_boosting_metrics_coupon_recommendation_cost_estimated_sale_estimated)
del df_train_gradient_boosting_metrics_coupon_recommendation_cost_estimated_sale_estimated


#select and reorder basic metrics, recommendation cost estimated, and average sale estimate
multiindex_basic_metrics=icr.get_the_multiindex_object_with_basic_metrics()
multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated=icr.get_the_multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated()

multiindex_basic_metrics_coupon_recommendation_cost_estimate_sale_estimated=pd.MultiIndex.from_tuples(list(multiindex_basic_metrics)+list(multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated))

#display combined metrics
df_train_gradient_boosting_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[multiindex_basic_metrics_coupon_recommendation_cost_estimate_sale_estimated,:]

Out[76]:

		Overall	Coffee House	Bar	Takeout	Low-Cost Restaurant	Mid-Range Restaurant
Treatment	Coupon Acceptance Rate	78.673713	77.133550	74.129353	81.877873	82.342657	66.257669
	Percentage of Coupon Acceptances Captured	80.107620	75.510204	66.616990	89.262706	88.202247	61.950287
	Coupon Acceptances	4615.000000	1184.000000	447.000000	1247.000000	1413.000000	324.000000
	Coupon Acceptances Possible	5761.000000	1568.000000	671.000000	1397.000000	1602.000000	523.000000
	Coupon Recommendations	5866.000000	1535.000000	603.000000	1523.000000	1716.000000	489.000000
	Coupon Recommendations Possible	10147.000000	3185.000000	1618.000000	1904.000000	2252.000000	1188.000000
	Ad Revenue	60218.000000	6512.000000	6705.000000	18705.000000	16956.000000	11340.000000
	Ad Spend	12043.600000	1302.400000	1341.000000	3741.000000	3391.200000	2268.000000
	ROAS	500.000000	500.000000	500.000000	500.000000	500.000000	500.000000
Control	Coupon Acceptance Rate	56.995074	49.764151	41.812643	72.727273	71.412556	44.947368
	Percentage of Coupon Acceptances Captured	80.333275	80.739796	81.818182	79.599141	79.525593	81.644359
	Coupon Acceptances	4628.000000	1266.000000	549.000000	1112.000000	1274.000000	427.000000
	Coupon Acceptances Possible	5761.000000	1568.000000	671.000000	1397.000000	1602.000000	523.000000
	Coupon Recommendations	8120.000000	2544.000000	1313.000000	1529.000000	1784.000000	950.000000
	Coupon Recommendations Possible	10147.000000	3185.000000	1618.000000	1904.000000	2252.000000	1188.000000
	Ad Revenue	62111.000000	6963.000000	8235.000000	16680.000000	15288.000000	14945.000000
	Ad Spend	16765.916704	2158.505277	2919.955224	3755.738017	3525.583217	4406.134969
	ROAS	370.459910	322.584340	282.024873	444.120434	433.630383	339.186160
Uplift	Coupon Acceptance Rate	21.678639	27.369400	32.316710	9.150600	10.930101	21.310300
	Percentage of Coupon Acceptances Captured	-0.225655	-5.229592	-15.201192	9.663565	8.676654	-19.694073
	Coupon Acceptances	-13.000000	-82.000000	-102.000000	135.000000	139.000000	-103.000000
	Coupon Acceptances Possible	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
	Coupon Recommendations	-2254.000000	-1009.000000	-710.000000	-6.000000	-68.000000	-461.000000
	Coupon Recommendations Possible	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
	Ad Revenue	-1893.000000	-451.000000	-1530.000000	2025.000000	1668.000000	-3605.000000
	Ad Spend	-4722.316704	-856.105277	-1578.955224	-14.738017	-134.383217	-2138.134969
	ROAS	129.540090	177.415660	217.975127	55.879566	66.369617	160.813840
Treatment	Average Coupon Recommendation Cost Estimated	NaN	0.848469	2.223881	2.456336	1.976224	4.638037
Treatment	Average Sale Estimated	NaN	5.500000	15.000000	15.000000	12.000000	35.000000
Control	Average Coupon Recommendation Cost Estimated	NaN	0.541538	1.244129	2.201155	1.707282	3.081650
Control	Average Sale Estimated	NaN	5.500000	15.000000	15.000000	12.000000	35.000000

time: 92.6 ms (started: 2023-09-28 17:20:08 -07:00)

In [77]:

icr.profit_spend_roi_number_table(df=df_train_gradient_boosting_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI)

Out[77]:

Additional Production Cost	200			2000			20000
Metric	Profit	Spend	ROI	Profit	Spend	ROI	Profit	Spend	ROI
Group
Control	45145.083296	16965.916704	266.092803	43345.083296	18765.916704	230.977703	25345.083296	36765.916704	68.936356
Treatment	47974.400000	12243.600000	391.832468	46174.400000	14043.600000	328.793187	28174.400000	32043.600000	87.925202
Uplift	2829.316704	-4722.316704	125.739665	2829.316704	-4722.316704	97.815484	2829.316704	-4722.316704	18.988846

time: 11 ms (started: 2023-09-28 17:20:08 -07:00)

Get Train Gradient Boosting 80% Coupon Acceptance Rate Estimated 95% Confidence Interval Metrics (Per Coupon Venue Type) Table¶

In [78]:

if st != 'yes':
    number_of_replicates=10000

quantile_lower_upper_list=[0.025, 0.975]

feature_column_name_filter='coupon_venue_type'
save_metric_replicates_feature_column_name_filter_value_list_dictionary_key_list=['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant']


gradient_boosting_model_train_survey_95_confidence_interval_metric_collection={}
df_train_gradient_boosting_80_recall_estimated_feature_filter_number_bootstrap_replicates_metrics_collection={}

for feature_column_name_filter_value_list_dictionary_key in feature_column_name_filter_value_list_dictionary_key_list:

    gradient_boosting_model_train_survey_95_confidence_interval_metric_collection[feature_column_name_filter_value_list_dictionary_key],\
    df_train_gradient_boosting_80_recall_estimated_feature_filter_number_bootstrap_replicates_metrics_collection[feature_column_name_filter_value_list_dictionary_key]=\
    icr.get_metric_confidence_interval_table_by_feature_column_name_filter_value_list_dictionary_key(df_y_train_test_model_name_predicted_y_train_test_survey_recall_estimate_predicted_y_actual_feature_column_name_filter=df_y_train_model_name_predicted_y_train_survey_recall_estimate_predicted_y_actual_coupon_venue_type,
                                                                                                     feature_column_name_filter=feature_column_name_filter,
                                                                                                     feature_column_name_filter_value_list_dictionary_key=feature_column_name_filter_value_list_dictionary_key,
                                                                                                     feature_column_name_filter_value_list_dictionary=feature_column_name_filter_value_list_dictionary,
                                                                                                     multiple_index=multiple_index,
                                                                                                     number_of_replicates=number_of_replicates,
                                                                                                     quantile_lower_upper_list=quantile_lower_upper_list,
                                                                                                     model_type=model_type,
                                                                                                     survey_number_recall_estimated_y_predicted_column_name=survey_number_recall_estimated_y_predicted_column_name,
                                                                                                     save_metric_replicates_feature_column_name_filter_value_list_dictionary_key_list=save_metric_replicates_feature_column_name_filter_value_list_dictionary_key_list,
                                                                                                     filename_version=filename_version,
                                                                                                     train_test='train',
                                                                                                     sample_size=2537)


df_gradient_boosting_model_train_survey_95_confidence_interval_metric_feature_column_name_filter_value=icr.convert_collection_to_data_frame_and_drop_top_column_level(gradient_boosting_model_train_survey_95_confidence_interval_metric_collection)


#select and reorder basic metrics, recommendation cost estimated, and average sale estimate
metric_list_refined=['Coupon Acceptance Rate', 'Percentage of Coupon Acceptances Captured', 'Coupon Acceptances', 'Coupon Acceptances Possible', 'Coupon Recommendations', 'Coupon Recommendations Possible']
multiindex_basic_metrics=icr.get_the_multiindex_object_with_basic_metrics(metric_list_refined=metric_list_refined)

#display combined metrics
df_gradient_boosting_model_train_survey_95_confidence_interval_metric_feature_column_name_filter_value.loc[multiindex_basic_metrics,:]

This file already exists
This file already exists
This file already exists
This file already exists
This file already exists
This file already exists

Out[78]:

		95% Confidence Interval
		Overall	Coffee House	Bar	Takeout	Low-Cost Restaurant	Mid-Range Restaurant
Treatment	Coupon Acceptance Rate	(76%, 80%)	(72%, 81%)	(66%, 81%)	(77%, 85%)	(78%, 85%)	(57%, 74%)
	Percentage of Coupon Acceptances Captured	(78%, 82%)	(71%, 79%)	(59%, 73%)	(85%, 92%)	(84%, 91%)	(53%, 70%)
	Coupon Acceptances	(1105, 1203)	(265, 328)	(92, 133)	(280, 345)	(320, 388)	(64, 99)
	Coupon Acceptances Possible	(1391, 1489)	(356, 428)	(143, 193)	(315, 384)	(365, 437)	(109, 153)
	Coupon Recommendations	(1418, 1516)	(349, 419)	(129, 175)	(346, 417)	(393, 467)	(102, 144)
	Coupon Recommendations Possible	(2537, 2537)	(750, 842)	(369, 441)	(438, 515)	(522, 604)	(266, 328)
Control	Coupon Acceptance Rate	(54%, 59%)	(45%, 53%)	(36%, 47%)	(68%, 77%)	(67%, 75%)	(38%, 51%)
	Percentage of Coupon Acceptances Captured	(78%, 82%)	(76%, 84%)	(75%, 87%)	(75%, 83%)	(75%, 83%)	(74%, 87%)
	Coupon Acceptances	(1108, 1207)	(284, 350)	(115, 160)	(248, 310)	(286, 351)	(87, 127)
	Coupon Acceptances Possible	(1391, 1489)	(356, 428)	(143, 193)	(315, 384)	(365, 437)	(109, 153)
	Coupon Recommendations	(1991, 2069)	(593, 678)	(296, 362)	(348, 419)	(408, 484)	(209, 266)
	Coupon Recommendations Possible	(2537, 2537)	(750, 842)	(369, 441)	(438, 515)	(522, 604)	(266, 328)
Uplift	Coupon Acceptance Rate	(19%, 23%)	(23%, 31%)	(26%, 38%)	(5%, 12%)	(7%, 14%)	(14%, 28%)
	Percentage of Coupon Acceptances Captured	(-3%, 2%)	(-11%, 0%)	(-24%, -5%)	(4%, 15%)	(3%, 13%)	(-30%, -8%)
	Coupon Acceptances	(-47, 40)	(-43, 2)	(-42, -10)	(15, 53)	(14, 55)	(-41, -11)
	Coupon Acceptances Possible	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)
	Coupon Recommendations	(-627, -500)	(-291, -214)	(-208, -149)	(-26, 23)	(-44, 11)	(-140, -91)
	Coupon Recommendations Possible	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)

time: 1.55 s (started: 2023-09-28 17:20:08 -07:00)

In [ ]:

Table of Contents Data Wrangling Feature Engineering Exploratory Data Analysis Data Preprocessing Modeling_1 Modeling_2 Modeling_Train_Results Modeling_Test_Results

Modeling Test Results

Get DataFrame Random Forest Prediction Probability, Gradient Boosting Prediction Probability, Y Actual, Coupon Venue Type¶

In [79]:

feature_column_name_list = ['coupon_venue_type']

## Get Prediction Probabilities for Gradient Boosting, Prediction Probabilities for Random Forest, Y_actual, and coupon venue type
Y_test_random_forest_prediction_probability_class0_class1_ndarray = best_stratified_5_fold_grid_search_cross_validation_random_forest_classifier.predict_proba(data_frame_collection['X_test'])
df_Y_test_random_forest_prediction_probability = pd.DataFrame(Y_test_random_forest_prediction_probability_class0_class1_ndarray).loc[:, 1]


Y_test_gradient_boosting_prediction_probability_class0_class1_ndarray = best_stratified_5_fold_grid_search_cross_validation_gradient_boosting_classifier.predict_proba(data_frame_collection['X_test'])
df_Y_test_gradient_boosting_prediction_probability = pd.DataFrame(Y_test_gradient_boosting_prediction_probability_class0_class1_ndarray).loc[:, 1]


df_y_test_model_name_prediction_probability_y_actual_coupon_venue_type = \
pd.concat([df_Y_test_random_forest_prediction_probability.to_frame().rename(columns={1:'Y_test_random_forest_prediction_probability'}), 
           df_Y_test_gradient_boosting_prediction_probability.to_frame().rename(columns={1:'Y_test_gradient_boosting_prediction_probability'}), 
           data_frame_collection['Y_test'].reset_index(drop=True),
           df_collection['X_test'].loc[:, feature_column_name_list].reset_index(drop=True)], axis=1)

p(df_y_test_model_name_prediction_probability_y_actual_coupon_venue_type)

(2537, 4)

Out[79]:

	Y_test_random_forest_prediction_probability	Y_test_gradient_boosting_prediction_probability	Y	coupon_venue_type
0	0.107202	0.000190	0	Coffee House
1	0.053351	0.000020	0	Coffee House
2	0.457736	0.745344	0	Coffee House
3	0.186667	0.070547	0	Restaurant(20-50)
4	0.593144	0.962529	1	Coffee House
2532	0.208086	0.991149	0	Restaurant(<20)
2533	0.522917	0.277567	1	Coffee House
2534	0.203686	0.000110	0	Bar
2535	0.655833	0.916642	1	Carry out & Take away
2536	0.851667	0.996387	1	Carry out & Take away

time: 135 ms (started: 2023-09-28 17:20:10 -07:00)

Get Data Frame Y Test Random Forest Predicted, Y Test Gradient Boosting Predicted, Y Test Survey Predicted, Y Actual, and Coupon Venue Type¶

In [80]:

### Get Random Forest Classifier Y Predicted from Y Prediction Probabilities and Decision Threshold .9 Precision Estimated
model_type='random_forest'
df_Y_test_random_forest_predicted = icr.get_model_predictions_from_prediction_probabilities_and_decision_threshold_proportion_metric_estimated(df=df_random_forest_decision_threshold_precision_recall, 
                                                                                                                                               model_proportion_precision=.90, 
                                                                                                                                               model_proportion_recall=None,
                                                                                                                                               model_precision_column_name=model_type+'_precision', 
                                                                                                                                               model_recall_column_name=model_type+'_recall',
                                                                                                                                               model_decision_threshold_column_name=model_type+'_decision_threshold',
                                                                                                                                               df_Y_train_test_model_prediction_probability=df_Y_test_random_forest_prediction_probability,
                                                                                                                                               train_test='test',
                                                                                                                                               filename_version=filename_version)
df_Y_test_random_forest_predicted=df_Y_test_random_forest_predicted.rename(columns={'Y_test_predicted':'Y_test_'+str(model_type)+'_predicted'})


### Get Gradient Boosting Classifier Y Predicted from Y Prediction Probabilities and Decision Threshold .8 Recall Estimated
model_type = 'gradient_boosting'
model_proportion_precision=None
model_proportion_recall=.8

df_Y_test_gradient_boosting_predicted = icr.get_model_predictions_from_prediction_probabilities_and_decision_threshold_proportion_metric_estimated(df=df_gradient_boosting_decision_threshold_precision_recall,
                                                                                                                                                   model_proportion_precision=model_proportion_precision,
                                                                                                                                                   model_proportion_recall=model_proportion_recall,
                                                                                                                                                   model_precision_column_name=model_type+'_precision',
                                                                                                                                                   model_recall_column_name=model_type+'_recall',
                                                                                                                                                   model_decision_threshold_column_name=model_type+'_decision_threshold',
                                                                                                                                                   df_Y_train_test_model_prediction_probability=df_Y_test_gradient_boosting_prediction_probability,
                                                                                                                                                   train_test='test',
                                                                                                                                                   filename_version=filename_version)
df_Y_test_gradient_boosting_predicted=df_Y_test_gradient_boosting_predicted.rename(columns={'Y_test_predicted':'Y_test_'+str(model_type)+'_predicted'})




#initialize variables
data_fold_type='test'
number_of_predictions=data_frame_collection['X_'+data_fold_type].shape[0]


### Get Survey 27% Recall Predictions
recall_estimated=random_forest_90_precision_estimated_recall
df_Y_test_survey_27_recall_estimate_predicted=icr.get_survey_coupon_recommendations_by_recall_estimate(number_of_predictions=number_of_predictions, recall_estimated=recall_estimated, random_state=200)



### Get Survey 80% Recall Predictions
recall_estimated=.8
df_Y_test_survey_80_recall_estimate_predicted = icr.get_survey_coupon_recommendations_by_recall_estimate(number_of_predictions=number_of_predictions, recall_estimated=recall_estimated, random_state=200)


### Get Survey 100% Recall Predictions
recall_estimated=1
df_Y_test_survey_100_recall_estimate_predicted = icr.get_survey_coupon_recommendations_by_recall_estimate(number_of_predictions=number_of_predictions, recall_estimated=recall_estimated, random_state=200)




### Get Data Frame Y Test Random Forest Predicted, Y Test Gradient Boosting Predicted, Y Test Survey Predicted, Y Actual, and Coupon Venue Type
feature_column_name_list = ['coupon_venue_type']

df_y_test_model_name_predicted_y_test_survey_recall_estimate_predicted_y_actual_coupon_venue_type = \
pd.concat([df_Y_test_random_forest_predicted,
           df_Y_test_gradient_boosting_predicted,
           df_Y_test_survey_27_recall_estimate_predicted,
           df_Y_test_survey_80_recall_estimate_predicted,
           df_Y_test_survey_100_recall_estimate_predicted,
           data_frame_collection['Y_test'].reset_index(drop=True),
           df_collection['X_test'].loc[:, feature_column_name_list].reset_index(drop=True)], axis=1)

p(df_y_test_model_name_predicted_y_test_survey_recall_estimate_predicted_y_actual_coupon_venue_type)

This file already exists.
This file already exists.
0.26870335011282764
0.7312966498871724
0.8
0.19999999999999996
1
0
(2537, 7)

Out[80]:

	Y_test_random_forest_predicted	Y_test_gradient_boosting_predicted	Y_test_survey_27_recall_estimate_predicted	Y_test_survey_80_recall_estimate_predicted	Y_test_survey_100_recall_estimate_predicted	Y	coupon_venue_type
0	0	0	1	1	1	0	Coffee House
1	0	0	0	1	1	0	Coffee House
2	0	1	0	1	1	0	Coffee House
3	0	0	0	1	1	0	Restaurant(20-50)
4	0	1	1	1	1	1	Coffee House
2532	0	1	0	1	1	0	Restaurant(<20)
2533	0	0	0	0	1	1	Coffee House
2534	0	0	0	1	1	0	Bar
2535	0	1	0	0	1	1	Carry out & Take away
2536	1	1	0	1	1	1	Carry out & Take away

time: 20.7 ms (started: 2023-09-28 17:20:10 -07:00)

In [81]:

#intialize variables
multiple_index=icr.get_metric_multiple_index(proportion_or_percentage='proportion')

feature_column_name_filter_value_list_dictionary_key_list=['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant']
feature_column_name_filter_value_two_dimensional_list=[['Coffee House', 'Bar', 'Carry out & Take away', 'Restaurant(<20)', 'Restaurant(20-50)'], ['Coffee House'], ['Bar'], ['Carry out & Take away'], ['Restaurant(<20)'], ['Restaurant(20-50)']]

feature_column_name_filter_value_list_dictionary=\
dict(zip(feature_column_name_filter_value_list_dictionary_key_list,feature_column_name_filter_value_two_dimensional_list))

pdc(feature_column_name_filter_value_list_dictionary)

Out[81]:

{'Overall': ['Coffee House',
  'Bar',
  'Carry out & Take away',
  'Restaurant(<20)',
  'Restaurant(20-50)'],
 'Coffee House': ['Coffee House'],
 'Bar': ['Bar'],
 'Takeout': ['Carry out & Take away'],
 'Low-Cost Restaurant': ['Restaurant(<20)'],
 'Mid-Range Restaurant': ['Restaurant(20-50)']}

time: 3.55 ms (started: 2023-09-28 17:20:10 -07:00)

In [82]:

feature_column_name_filter='coupon_venue_type'
y_predicted_column_name_base_survey='Y_test_survey_100_recall_estimate_predicted'

time: 525 µs (started: 2023-09-28 17:20:10 -07:00)

Get Random Forest 90% Coupon Acceptance Rate Estimated Metrics (Per Coupon Venue Type) Table¶

In [83]:

model_type='random_forest'
survey_number_recall_estimated_y_predicted_column_name='Y_test_survey_27_recall_estimate_predicted'


metrics_coupon_venue_type_list = []

for feature_column_name_filter_value_list_dictionary_key in feature_column_name_filter_value_list_dictionary.keys():
    
    metric_list=icr.get_model_and_survey_metrics(df=df_y_test_model_name_predicted_y_test_survey_recall_estimate_predicted_y_actual_coupon_venue_type,
                                                model_y_predicted_column_name='Y_test_'+model_type+'_predicted',
                                                survey_number_recall_estimated_y_predicted_column_name=survey_number_recall_estimated_y_predicted_column_name,
                                                y_predicted_column_name_base_survey=y_predicted_column_name_base_survey,
                                                y_actual_column_name='Y',
                                                feature_column_name_filter=feature_column_name_filter,
                                                feature_column_name_filter_value_list=feature_column_name_filter_value_list_dictionary[feature_column_name_filter_value_list_dictionary_key],
                                                metrics_column_name_list=None,)
    metrics_coupon_venue_type_list+=[metric_list]

df_test_random_forest_model_survey_metrics=\
pd.DataFrame(metrics_coupon_venue_type_list, 
             index=feature_column_name_filter_value_list_dictionary_key_list,
             columns=multiple_index[0:int(len(multiple_index)*2/3)]).T

df_test_random_forest_metrics=icr.calculate_and_add_model_survey_difference(df_test_random_forest_model_survey_metrics, multiple_index)


#add Venue Type Coupon Recommendation Cost Estimated, Sale Estimated
df_test_random_forest_metrics_coupon_recommendation_cost_estimated_sale_estimated=\
pd.concat([df_test_random_forest_metrics, df_train_random_forest_29_precision_survey_100_recall_coupon_recommendation_cost_estimated_sale_estimated], axis=0)


#get and add Total Ad Spend, Total Revenue, ROAS
df_test_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI=icr.get_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI(df=df_test_random_forest_metrics_coupon_recommendation_cost_estimated_sale_estimated)
del df_test_random_forest_metrics_coupon_recommendation_cost_estimated_sale_estimated

#select and reorder basic metrics, recommendation cost estimated, and average sale estimate
multiindex_basic_metrics=icr.get_the_multiindex_object_with_basic_metrics()
multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated=icr.get_the_multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated()

multiindex_basic_metrics_coupon_recommendation_cost_estimate_sale_estimated=pd.MultiIndex.from_tuples(list(multiindex_basic_metrics)+list(multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated))

#display combined metrics
df_test_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[multiindex_basic_metrics_coupon_recommendation_cost_estimate_sale_estimated,:]

Out[83]:

		Overall	Coffee House	Bar	Takeout	Low-Cost Restaurant	Mid-Range Restaurant
Treatment	Coupon Acceptance Rate	90.604027	89.795918	92.307692	91.194969	90.116279	100.000000
	Percentage of Coupon Acceptances Captured	27.950311	20.608899	7.692308	39.944904	42.119565	3.703704
	Coupon Acceptances	405.000000	88.000000	12.000000	145.000000	155.000000	5.000000
	Coupon Acceptances Possible	1449.000000	427.000000	156.000000	363.000000	368.000000	135.000000
	Coupon Recommendations	447.000000	98.000000	13.000000	159.000000	172.000000	5.000000
	Coupon Recommendations Possible	2537.000000	811.000000	399.000000	489.000000	534.000000	304.000000
	Ad Revenue	4874.000000	484.000000	180.000000	2175.000000	1860.000000	175.000000
	Ad Spend	962.269434	96.385882	31.000000	424.000000	377.633552	33.250000
	ROAS	506.510945	502.148228	580.645161	512.971698	492.540981	526.315789
Control	Coupon Acceptance Rate	56.005789	53.879310	34.188034	73.553719	65.562914	48.571429
	Percentage of Coupon Acceptances Captured	26.708075	29.274005	25.641026	24.517906	26.902174	25.185185
	Coupon Acceptances	387.000000	125.000000	40.000000	89.000000	99.000000	34.000000
	Coupon Acceptances Possible	1449.000000	427.000000	156.000000	363.000000	368.000000	135.000000
	Coupon Recommendations	691.000000	232.000000	117.000000	121.000000	151.000000	70.000000
	Coupon Recommendations Possible	2537.000000	811.000000	399.000000	489.000000	534.000000	304.000000
	Ad Revenue	5000.500000	687.500000	600.000000	1335.000000	1188.000000	1190.000000
	Ad Spend	1626.872620	228.178824	279.000000	322.666667	331.527130	465.500000
	ROAS	307.368871	301.298775	215.053763	413.739669	358.341714	255.639098
Uplift	Coupon Acceptance Rate	34.598238	35.916608	58.119658	17.641250	24.553365	51.428571
	Percentage of Coupon Acceptances Captured	1.242236	-8.665105	-17.948718	15.426997	15.217391	-21.481481
	Coupon Acceptances	18.000000	-37.000000	-28.000000	56.000000	56.000000	-29.000000
	Coupon Acceptances Possible	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
	Coupon Recommendations	-244.000000	-134.000000	-104.000000	38.000000	21.000000	-65.000000
	Coupon Recommendations Possible	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
	Ad Revenue	-126.500000	-203.500000	-420.000000	840.000000	672.000000	-1015.000000
	Ad Spend	-664.603186	-131.792941	-248.000000	101.333333	46.106422	-432.250000
	ROAS	199.142074	200.849453	365.591398	99.232029	134.199267	270.676692
Treatment	Average Coupon Recommendation Cost Estimated	NaN	0.983529	2.384615	2.666667	2.195544	6.650000
Treatment	Average Sale Estimated	NaN	5.500000	15.000000	15.000000	12.000000	35.000000
Control	Average Coupon Recommendation Cost Estimated	NaN	0.541538	1.244129	2.201155	1.707282	3.081650
Control	Average Sale Estimated	NaN	5.500000	15.000000	15.000000	12.000000	35.000000

time: 71.4 ms (started: 2023-09-28 17:20:10 -07:00)

Get Random Forest 90% Coupon Acceptance Rate Estimated ROI (Overall) Table¶

In [84]:

#show ROI table
icr.profit_spend_roi_number_table(df=df_test_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI)

Out[84]:

Additional Production Cost	200			2000			20000
Metric	Profit	Spend	ROI	Profit	Spend	ROI	Profit	Spend	ROI
Group
Control	3173.627380	1826.872620	173.719139	1373.627380	3626.872620	37.873604	-16626.372620	21626.872620	-76.878303
Treatment	3711.730566	1162.269434	319.351990	1911.730566	2962.269434	64.536012	-16088.269434	20962.269434	-76.748701
Uplift	538.103186	-664.603186	145.632851	538.103186	-664.603186	26.662408	538.103186	-664.603186	0.129602

time: 10.9 ms (started: 2023-09-28 17:20:10 -07:00)

Get Random Forest 90% Coupon Acceptance Rate Estimated 95% Confidence Interval Metrics, Ad Revenue, Ad Spend, ROAS (Per Coupon Venue Type) Table¶

In [85]:

######################################################################################################################################################################################################
# Get Random Forest 90% Coupon Acceptance Rate Estimated 95% Confidence Interval Metrics (Per Coupon Venue Type) DataFrame (and calculate bootstrap)

filename_list=['df_test_random_forest_model_survey_95_confidence_interval_metric_feature_column_name_filter_value_v'+filename_version+'.pkl', \
               'df_random_forest_90_precision_estimated_feature_filter_number_bootstrap_replicates_metrics_collection_v'+filename_version+'.pkl']
model_type='random_forest'

df_readback=icr.return_processed_data_file_if_it_exists_v2(filename=filename_list[0], column_name_row_integer_location_list=[0, 1], index_column_integer_location_list=[0, 1])
if df_readback.empty == False:
    df_test_random_forest_model_survey_95_confidence_interval_metric_feature_column_name_filter_value=df_readback

else:
    quantile_lower_upper_list=[0.025, 0.975]
    feature_column_name_filter='coupon_venue_type'
    save_metric_replicates_feature_column_name_filter_value_list_dictionary_key_list=['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant']

    random_forest_model_survey_95_confidence_interval_metric_collection={}
    df_random_forest_90_precision_estimated_feature_filter_number_bootstrap_replicates_metrics_collection={}
    
    for feature_column_name_filter_value_list_dictionary_key in feature_column_name_filter_value_list_dictionary_key_list:

        random_forest_model_survey_95_confidence_interval_metric_collection[feature_column_name_filter_value_list_dictionary_key],\
        df_random_forest_90_precision_estimated_feature_filter_number_bootstrap_replicates_metrics_collection[feature_column_name_filter_value_list_dictionary_key]=\
        icr.get_metric_confidence_interval_table_by_feature_column_name_filter_value_list_dictionary_key(df_y_train_test_model_name_predicted_y_train_test_survey_recall_estimate_predicted_y_actual_feature_column_name_filter=df_y_test_model_name_predicted_y_test_survey_recall_estimate_predicted_y_actual_coupon_venue_type.copy(),
                                                                                                         feature_column_name_filter=feature_column_name_filter,
                                                                                                         feature_column_name_filter_value_list_dictionary_key=feature_column_name_filter_value_list_dictionary_key,
                                                                                                         feature_column_name_filter_value_list_dictionary=feature_column_name_filter_value_list_dictionary,
                                                                                                         multiple_index=multiple_index,
                                                                                                         number_of_replicates=number_of_replicates,
                                                                                                         quantile_lower_upper_list=quantile_lower_upper_list,
                                                                                                         model_type=model_type,
                                                                                                         survey_number_recall_estimated_y_predicted_column_name=survey_number_recall_estimated_y_predicted_column_name,
                                                                                                         save_metric_replicates_feature_column_name_filter_value_list_dictionary_key_list=save_metric_replicates_feature_column_name_filter_value_list_dictionary_key_list,
                                                                                                         filename_version=filename_version,
                                                                                                         sample_size=None)

    df_test_random_forest_model_survey_95_confidence_interval_metric_feature_column_name_filter_value=\
    icr.convert_collection_to_data_frame_and_drop_top_column_level(random_forest_model_survey_95_confidence_interval_metric_collection)
    
    #save it
    df_test_random_forest_model_survey_95_confidence_interval_metric_feature_column_name_filter_value=\
    icr.save_and_return_data_frame_v2(df_test_random_forest_model_survey_95_confidence_interval_metric_feature_column_name_filter_value, filename=filename_list[0])

    #save it
    df_random_forest_90_precision_estimated_feature_filter_number_bootstrap_replicates_metrics_collection=\
    icr.save_and_return_collection(df_random_forest_90_precision_estimated_feature_filter_number_bootstrap_replicates_metrics_collection, filename=filename_list[1])


######################################################################################################################################################################################################







######################################################################################################################################################################################################
#Get Random Forest 90% Coupon Acceptance Rate Estimated 95% Confidence Interval Ad Revenue, Ad Spend, ROAS, Profit, Spend, and ROI (Per Coupon Venue Type) Table
filename='df_random_forest_95_confidence_interval_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_v'+filename_version+'.pkl'

df_readback=icr.return_processed_data_file_if_it_exists_v2(filename=filename, column_name_row_integer_location_list=[0, 1], index_column_integer_location_list=[0, 1])
if df_readback.empty == False:
    df_random_forest_95_confidence_interval_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI=df_readback
else:    
    #get Random Forest Model and Survey Coupon Recommendation Cost Estimated and Sale Estimated Replicate Collection by Venue Type
    df_random_forest_model_survey_coupon_recommendation_cost_estimated_sale_estimated_replicate_collection=\
    icr.get_model_survey_coupon_recommendation_cost_estimated_and_sale_estimated_replicate_collection_venue_type(df=df_train_random_forest_29_precision_survey_100_recall_coupon_recommendation_cost_estimated_sale_estimated, 
                                                                                                                 column_name_list=['Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant'], 
                                                                                                                 column_name_drop_list=['Overall'],
                                                                                                                 number_of_replicates=number_of_replicates)
    
    
    #calculate 95% confidence interval for Ad Revenue, Ad Spend, ROAS, Profit, Spend, and ROI
    if st!='yes':
        number_of_replicates=10000


    df_random_forest_95_confidence_interval_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI=\
    icr.calculate_Overall_and_Coupon_Venue_Type_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_95_Confidence_Intervals_from_metric_replicates_and_append_to_metric_confidence_interval_table(
        df_model_name_model_survey_coupon_recommendation_cost_estimated_sale_estimated_replicate_collection=df_random_forest_model_survey_coupon_recommendation_cost_estimated_sale_estimated_replicate_collection,
        df_test_model_name_model_survey_95_confidence_interval_metric_feature_column_name_filter_value=df_test_random_forest_model_survey_95_confidence_interval_metric_feature_column_name_filter_value,
        test_model_name_metric_replicate_filename_collection=test_random_forest_metric_replicate_filename_collection,
        model_type=model_name,
        filename_version=filename_version,
        number_of_replicates=number_of_replicates,)


    #save it
    df_random_forest_95_confidence_interval_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI=\
    icr.save_and_return_data_frame_v2(df_random_forest_95_confidence_interval_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI, filename=filename)
######################################################################################################################################################################################################


#select and reorder basic metrics, recommendation cost estimated, and average sale estimate
multiindex_basic_metrics=icr.get_the_multiindex_object_with_basic_metrics()
multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated=icr.get_the_multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated()

multiindex_basic_metrics_coupon_recommendation_cost_estimate_sale_estimated=pd.MultiIndex.from_tuples(list(multiindex_basic_metrics)+list(multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated))

#display combined metrics
df_random_forest_95_confidence_interval_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[multiindex_basic_metrics_coupon_recommendation_cost_estimate_sale_estimated,:]

This file already exists
This file already exists

Out[85]:

		95% Confidence Interval
		Overall	Coffee House	Bar	Takeout	Low-Cost Restaurant	Mid-Range Restaurant
Treatment	Coupon Acceptance Rate	(87%, 93%)	(83%, 95%)	(75%, 100%)	(86%, 95%)	(85%, 94%)	(100%, 100%)
	Percentage of Coupon Acceptances Captured	(25%, 30%)	(16%, 24%)	(3%, 12%)	(34%, 45%)	(37%, 47%)	(77.5%, 714.3%)
	Coupon Acceptances	(369, 441)	(70, 107)	(6, 19)	(122, 168)	(132, 179)	(100, 100)
	Coupon Acceptances Possible	(1401, 1498)	(390, 464)	(133, 181)	(328, 398)	(334, 403)	(113, 158)
	Coupon Recommendations	(410, 485)	(79, 118)	(6, 21)	(135, 183)	(147, 197)	(100, 100)
	Coupon Recommendations Possible	(2537, 2537)	(765, 858)	(362, 435)	(450, 529)	(494, 575)	(271, 337)
	Ad Revenue	(\$4410.49, \$5348.0)	(\$385.0, \$588.5)	(\$90.0, \$285.0)	(\$1830.0, \$2520.0)	(\$1584.0, \$2148.0)	(\$35.0, \$350.0)
	Ad Spend	(\$875.96, \$1049.24)	(\$77.7, \$116.06)	(\$14.31, \$50.08)	(\$360.0, \$488.0)	(\$322.74, \$432.52)	(\$6.65, \$66.5)
	ROAS	(490.68%, 521.11%)	(466.91%, 533.79%)	(471.77%, 629.03%)	(486.58%, 536.25%)	(467.06%, 516.03%)	(526.32%, 526.32%)
Control	Coupon Acceptance Rate	(52%, 59%)	(47%, 60%)	(25%, 43%)	(65%, 81%)	(57%, 73%)	(36%, 60%)
	Percentage of Coupon Acceptances Captured	(24%, 28%)	(25%, 33%)	(19%, 32%)	(20%, 29%)	(22%, 31%)	(18%, 32%)
	Coupon Acceptances	(352, 422)	(104, 147)	(28, 53)	(72, 108)	(80, 119)	(23, 46)
	Coupon Acceptances Possible	(1401, 1498)	(390, 464)	(133, 181)	(328, 398)	(334, 403)	(113, 158)
	Coupon Recommendations	(647, 735)	(204, 261)	(97, 138)	(100, 142)	(128, 174)	(54, 86)
	Coupon Recommendations Possible	(2537, 2537)	(765, 858)	(362, 435)	(450, 529)	(494, 575)	(271, 337)
	Ad Revenue	(\$4462.99, \$5564.01)	(\$572.0, \$808.5)	(\$420.0, \$795.0)	(\$1080.0, \$1620.0)	(\$960.0, \$1428.0)	(\$805.0, \$1610.0)
	Ad Spend	(\$1498.3, \$1757.92)	(\$200.64, \$256.7)	(\$231.31, \$329.08)	(\$266.67, \$378.67)	(\$281.03, \$382.08)	(\$359.1, \$571.9)
	ROAS	(283.02%, 331.81%)	(265.16%, 336.4%)	(161.43%, 272.42%)	(369.93%, 457.03%)	(316.63%, 400.02%)	(194.33%, 317.34%)
Uplift	Coupon Acceptance Rate	(30%, 38%)	(27%, 44%)	(39%, 72%)	(9%, 25%)	(16%, 32%)	(38%, 63%)
	Percentage of Coupon Acceptances Captured	(-2%, 4%)	(-14%, -2%)	(-26%, -9%)	(8%, 22%)	(8%, 21%)	(-29%, -13%)
	Coupon Acceptances	(-29, 65)	(-62, -11)	(-42, -15)	(30, 81)	(31, 81)	(-41, -17)
	Coupon Acceptances Possible	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)
	Coupon Recommendations	(-302, -185)	(-167, -101)	(-126, -83)	(10, 65)	(-8, 50)	(-82, -48)
	Coupon Recommendations Possible	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)
	Ad Revenue	(\$-837.02, \$542.52)	(\$-341.0, \$-60.5)	(\$-630.0, \$-225.0)	(\$450.0, \$1215.0)	(\$372.0, \$972.0)	(\$-1435.0, \$-595.0)
	Ad Spend	(\$-822.91, \$-507.03)	(\$-164.25, \$-99.34)	(\$-300.46, \$-197.92)	(\$26.67, \$173.33)	(\$-17.56, \$109.78)	(\$-545.3, \$-319.2)
	ROAS	(172.28%, 226.21%)	(154.54%, 246.39%)	(246.23%, 454.93%)	(55.82%, 143.48%)	(89.93%, 177.59%)	(208.98%, 331.98%)
Treatment	Average Coupon Recommendation Cost Estimated	NaN	(\$0.98, \$0.98)	(\$2.38, \$2.38)	(\$2.67, \$2.67)	(\$2.2, \$2.2)	(\$6.65, \$6.65)
Treatment	Average Sale Estimated	NaN	(\$5.5, \$5.5)	(\$15.0, \$15.0)	(\$15.0, \$15.0)	(\$12.0, \$12.0)	(\$35.0, \$35.0)
Control	Average Coupon Recommendation Cost Estimated	NaN	(\$0.54, \$0.54)	(\$1.24, \$1.24)	(\$2.2, \$2.2)	(\$1.71, \$1.71)	(\$3.08, \$3.08)
Control	Average Sale Estimated	NaN	(\$5.5, \$5.5)	(\$15.0, \$15.0)	(\$15.0, \$15.0)	(\$12.0, \$12.0)	(\$35.0, \$35.0)

time: 21.2 ms (started: 2023-09-28 17:20:10 -07:00)

Get Random Forest 90% Coupon Acceptance Rate Estimated 95% Confidence Interval Profit, Spend, and ROI Per Additional Production Cost (Overall) Table¶

In [86]:

#show ROI 95% Confidence Interval table
icr.profit_spend_roi_number_table(df=df_random_forest_95_confidence_interval_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI)

Out[86]:

	95% Confidence Interval
	$200 Additional Production Cost			$2,000 Additional Production Cost			$20,000 Additional Production Cost
	Profit	Spend	ROI	Profit	Spend	ROI	Profit	Spend	ROI
Group
Treatment	(\$3334.66, \$4102.85)	(\$1075.96, \$1249.24)	(304.45%, 333.08%)	(\$1534.66, \$2302.85)	(\$2875.96, \$3049.24)	(53.3%, 75.7%)	(\$-16465.34, \$-15697.15)	(\$20875.96, \$21049.24)	(-78.87%, -74.59%)
Control	(\$2716.56, \$3657.38)	(\$1698.3, \$1957.92)	(151.84%, 195.44%)	(\$916.56, \$1857.38)	(\$3498.3, \$3757.92)	(25.77%, 50.2%)	(\$-17083.44, \$-16142.62)	(\$21498.3, \$21757.92)	(-79.28%, -74.37%)
Uplift	(\$-48.19, \$1101.09)	(\$-822.91, \$-507.03)	(121.24%, 169.45%)	(\$-48.19, \$1101.09)	(\$-822.91, \$-507.03)	(10.66%, 42.0%)	(\$-48.19, \$1101.09)	(\$-822.91, \$-507.03)	(-3.05%, 3.12%)

time: 17.3 ms (started: 2023-09-28 17:20:10 -07:00)

Get Random Forest 90% Precision Estimated Metric, Ad Revenue, Ad Spend, ROAS, Profit, Spend, and ROI Replicates Collection: Overall¶

In [87]:

model_type='random_forest'
number_metric='90_precision'

df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall=\
icr.combine_model_metric_replicates_and_ad_revenue_ad_spend_roas_profit_spend_roi_replicates(model_type=model_type,
                                                                                             number_metric=number_metric,
                                                                                             filename_version=filename_version,
                                                                                             number_of_replicates=number_of_replicates)

This file already exists.
time: 596 ms (started: 2023-09-28 17:20:10 -07:00)

Get Random Forest 90% Precision Estimated Metric Quantiles from Random Forest Metric Replicates: Overall¶

In [88]:

row_index_tuple_list=[('Treatment', 'Coupon Acceptance Rate'), 
                      ('Treatment', 'Percentage of Coupon Acceptances Captured'),
                      ('Uplift', 'Coupon Acceptance Rate'),
                      ('Treatment', 'Ad Revenue'),
                      ('Uplift', 'Ad Revenue'),
                      ('Treatment', 'Ad Spend'),
                      ('Uplift', 'Ad Spend'),
                      ('Treatment', 'ROAS'),
                      ('Uplift', 'ROAS'),
                      ('Treatment', 'ROI 2000'),
                      ('Uplift', 'ROI 2000')]

multiply_by_100_tuple_list=[('Treatment', 'Coupon Acceptance Rate'),
                            ('Uplift', 'Coupon Acceptance Rate'),]

random_forest_model_metric_replicate_quantile_series_collection_overall={}
for row_index_tuple in row_index_tuple_list:
    
    if row_index_tuple in multiply_by_100_tuple_list:
        random_forest_model_metric_replicate_quantile_series_collection_overall[row_index_tuple]=\
        df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_index_tuple,:].quantile(q=[0,.1,.9,1])*100
    else:
        random_forest_model_metric_replicate_quantile_series_collection_overall[row_index_tuple]=\
        df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_index_tuple,:].quantile(q=[0,.1,.9,1])

time: 12.2 ms (started: 2023-09-28 17:20:11 -07:00)

Get Random Forest 90% Precision Metric, Metric, Ad Revenue, Ad Spend, and ROAS, Replicates Collection: Coupon Venue Type¶

In [89]:

df_test_random_forest_10000_metric_replicates_Ad_Revenue_Ad_Spend_ROAS_replicates_collection=\
rpp('df_test_random_forest_'+str(number_of_replicates)+'_metric_replicates_Ad_Revenue_Ad_Spend_ROAS_replicates_collection_coupon_venue_type_v'+str(filename_version)+'.pkl')

time: 13.3 ms (started: 2023-09-28 17:20:11 -07:00)

Get Random Forest 90% Precision Estimated Metric Quantiles from Random Forest Metric Replicates: Coupon Venue Type¶

In [90]:

row_index_tuple_list=[('Treatment', 'Coupon Acceptance Rate'), 
                      ('Uplift', 'Coupon Acceptance Rate'),
                      ('Treatment', 'Ad Revenue'),
                      ('Uplift', 'Ad Revenue'),
                      ('Treatment', 'Ad Spend'),
                      ('Uplift', 'Ad Spend'),
                      ('Treatment', 'ROAS'),
                      ('Uplift', 'ROAS'),]

multiply_by_100_tuple_list=[('Treatment', 'Coupon Acceptance Rate'),
                            ('Uplift', 'Coupon Acceptance Rate'),]

coupon_venue_type_list=['Coffee House', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant', 'Bar']


random_forest_model_metric_replicate_quantile_series_collection={}
for coupon_venue_type in coupon_venue_type_list:
    if row_index_tuple in multiply_by_100_tuple_list:
        random_forest_model_metric_replicate_quantile_series_collection[coupon_venue_type]=\
        {row_index_tuple:df_test_random_forest_10000_metric_replicates_Ad_Revenue_Ad_Spend_ROAS_replicates_collection[coupon_venue_type].loc[row_index_tuple,:].quantile(q=[0,.1,.9,1])*100
         for row_index_tuple in row_index_tuple_list}
    else:
        random_forest_model_metric_replicate_quantile_series_collection[coupon_venue_type]=\
        {row_index_tuple:df_test_random_forest_10000_metric_replicates_Ad_Revenue_Ad_Spend_ROAS_replicates_collection[coupon_venue_type].loc[row_index_tuple,:].quantile(q=[0,.1,.9,1])
         for row_index_tuple in row_index_tuple_list}

random_forest_model_metric_replicate_quantile_series_collection['Coffee House'][('Uplift', 'ROAS')]

Out[90]:

0.0     97.677705
0.1    170.437496
0.9    231.005771
1.0    289.450148
Name: (Uplift, ROAS), dtype: float64

time: 46.9 ms (started: 2023-09-28 17:20:11 -07:00)

In [91]:

def format_percentage(value, tick_position):
    'The two args are the value and tick position'
    return f'{value*100:.0f}%'

def format_percentage_without_multiplier(value, tick_position):
    'The two args are the value and tick position'
    return f'{value:.0f}%'

def usd_format(value, tick_position):
    if (value < 1000) and (value >= 0):
        return f'${value:.0f}'
    elif (value < 0) and (value > -1000):
        return '-${:.0f}'.format(abs(value))
    elif (value >= 1000):
        return f'${value:,.0f}'
    elif (value <= -1000):
        return "-${:,.0f}".format(abs(value))

time: 1.03 ms (started: 2023-09-28 17:20:11 -07:00)

In [92]:

multiindex_object=pd.MultiIndex.from_tuples((('90% Confidence Interval', 'lower limit'),('90% Confidence Interval', 'upper limit')))

pilot_campaign_model_roas_uplift_90_percent_confidence_interval_dictionary=\
{coupon_venue_type:(random_forest_model_metric_replicate_quantile_series_collection[coupon_venue_type][('Uplift', 'ROAS')][.1], random_forest_model_metric_replicate_quantile_series_collection[coupon_venue_type][('Uplift', 'ROAS')][1])
 for coupon_venue_type in coupon_venue_type_list}

df_random_forest_roas_uplift_90_percent_confidence_interval=\
pd.DataFrame(pilot_campaign_model_roas_uplift_90_percent_confidence_interval_dictionary)

df_random_forest_roas_uplift_90_percent_confidence_interval.index=multiindex_object

df_random_forest_roas_uplift_90_percent_confidence_interval=\
df_random_forest_roas_uplift_90_percent_confidence_interval.T

df_random_forest_roas_uplift_90_percent_confidence_interval

Out[92]:

	90% Confidence Interval
	lower limit	upper limit
Coffee House	170.437496	289.450148
Takeout	70.465187	194.376305
Low-Cost Restaurant	105.154252	220.155957
Mid-Range Restaurant	230.263158	398.293030
Bar	292.050691	504.536290

time: 5.55 ms (started: 2023-09-28 17:20:11 -07:00)

Get Pilot Campaign Expected Coupon Acceptance Rate Distribution¶

In [93]:

model_type='random_forest'
row_name_tuple=('Treatment', 'Coupon Acceptance Rate')
xlabel_string='Coupon Acceptance Rate'
ylabel_string='Frequency'
title_string='Coupon Acceptance Rate Distribution'
xaxis_interval=.02

dpi=100
figure_filename = '../reports/figures/figure_'+model_type+'_classifier_'+xlabel_string.lower().replace(' ', '_')+'_overall_90_percent_confidence_dpi_'+str(dpi)+'_v'+filename_version+'.png'

number_of_bins = 88
bin_number_color_split=32

figure_filename_exists = os.path.isfile(figure_filename)
if figure_filename_exists == True:
    img = mpimg.imread(figure_filename)
    plt.figure(figsize=(10, 8))
    plt.grid(False)
    plt.axis('off')
    plt.imshow(img)
else:
    figsize=(6,4)

    
    
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize)    
    ax.xaxis.set_major_formatter(FuncFormatter(format_percentage))
    
    plt.xticks(np.arange(min(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:]), 
                         max(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:])+1, 
                         xaxis_interval))


    bin_count_array, bin_array, patches = ax.hist(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:],
                                                  linewidth=1,
                                                  bins=number_of_bins,
                                                  rwidth=1,
                                                  alpha=1,
                                                  edgecolor='green',)

    for i in range(0,bin_number_color_split):
        patches[i].set_facecolor('gray')
        patches[i].set_edgecolor('gray')
    for i in range(bin_number_color_split,len(patches)):    
        patches[i].set_facecolor('tab:green')
        patches[i].set_edgecolor('tab:green')

    plt.tick_params(axis='both', which='both', bottom=True, left=True, direction='out', length=6, width=1,)
    
    plt.xlabel(xlabel_string, fontsize=15)
    plt.ylabel(ylabel_string, fontsize=15)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.title(title_string, fontsize=15);
    

    plt.savefig(figure_filename, bbox_inches='tight', dpi=100)

plt.show()


(random_forest_model_metric_replicate_quantile_series_collection_overall[('Treatment', 'Coupon Acceptance Rate')][.1], random_forest_model_metric_replicate_quantile_series_collection_overall[('Treatment', 'Coupon Acceptance Rate')][1])

Out[93]:

(8886.132347411958, 9567.307692307691)

time: 71.6 ms (started: 2023-09-28 17:20:11 -07:00)

Get Pilot Campaign Expected Coupon Acceptance Rate Uplift Distribution¶

In [94]:

model_type='random_forest'
row_name_tuple=('Uplift', 'Coupon Acceptance Rate')
xlabel_string='Coupon Acceptance Rate Uplift'
ylabel_string='Frequency'
title_string='Coupon Acceptance Rate Uplift Distribution'
xaxis_interval=.02


dpi=100
figure_filename='../reports/figures/figure_'+model_type+'_classifier_'+xlabel_string.lower().replace(' ', '_')+'_overall_90_percent_confidence_dpi_'+str(dpi)+'_v'+filename_version+'.png'

number_of_bins = 88
bin_number_color_split=32

figure_filename_exists = os.path.isfile(figure_filename)
if figure_filename_exists == True:
    img = mpimg.imread(figure_filename)
    plt.figure(figsize=(10, 8))
    plt.grid(False)
    plt.axis('off')
    plt.imshow(img)
else:
    figsize=(6,4)

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    ax.xaxis.set_major_formatter(FuncFormatter(format_percentage))
    
    plt.xticks(np.arange(min(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:]), 
                         max(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:])+1, 
                         xaxis_interval))
    
    bin_count_array, bin_array, patches = ax.hist(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:],
                                                  linewidth=1,
                                                  bins=number_of_bins,
                                                  rwidth=1,
                                                  alpha=1,
                                                  edgecolor='green',)

    for i in range(0,bin_number_color_split):
        patches[i].set_facecolor('gray')
        patches[i].set_edgecolor('gray')
    for i in range(bin_number_color_split,len(patches)):    
        patches[i].set_facecolor('tab:green')
        patches[i].set_edgecolor('tab:green')

    plt.tick_params(axis='both', which='both', bottom=True, left=True, direction='out', length=6, width=1,)
    plt.xlabel(xlabel_string, fontsize=15)
    plt.ylabel(ylabel_string, fontsize=15)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.title(title_string, fontsize=15);

    plt.savefig(figure_filename, bbox_inches='tight', dpi=100)

plt.show()

(random_forest_model_metric_replicate_quantile_series_collection_overall[('Uplift', 'Coupon Acceptance Rate')][.1], random_forest_model_metric_replicate_quantile_series_collection_overall[('Uplift', 'Coupon Acceptance Rate')][1])

Out[94]:

(3182.543273985793, 4178.305835149251)

time: 56.9 ms (started: 2023-09-28 17:20:11 -07:00)

Get Pilot Campaign Expected Percentage of Coupon Acceptances Captured Distribution¶

In [95]:

model_type='random_forest'
row_name_tuple=('Treatment', 'Percentage of Coupon Acceptances Captured')
xlabel_string='Percentage of Coupon Acceptances Captured'
ylabel_string='Frequency'
title_string='Percentage of Coupon Acceptances Captured Distribution'
xaxis_interval=.02


dpi=100
figure_filename='../reports/figures/figure_'+model_type+'_classifier_'+xlabel_string.lower().replace(' ', '_')+'_overall_90_percent_confidence_dpi_'+str(dpi)+'_v'+filename_version+'.png'

number_of_bins = 88
bin_number_color_split=25

figure_filename_exists = os.path.isfile(figure_filename)
if figure_filename_exists == True:
    img = mpimg.imread(figure_filename)
    plt.figure(figsize=(10, 8))
    plt.grid(False)
    plt.axis('off')
    plt.imshow(img)
else:
    figsize=(6,4)

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    ax.xaxis.set_major_formatter(FuncFormatter(format_percentage))
    
    plt.xticks(np.arange(min(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:]), 
                         max(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:])+1, 
                         xaxis_interval))
    
    bin_count_array, bin_array, patches = ax.hist(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:],
                                                  linewidth=1,
                                                  bins=number_of_bins,
                                                  rwidth=1,
                                                  alpha=1,
                                                  edgecolor='green',)

    for i in range(0,bin_number_color_split):
        patches[i].set_facecolor('gray')
        patches[i].set_edgecolor('gray')
    for i in range(bin_number_color_split,len(patches)):    
        patches[i].set_facecolor('tab:green')
        patches[i].set_edgecolor('tab:green')

    plt.tick_params(axis='both', which='both', bottom=True, left=True, direction='out', length=6, width=1,)
    plt.xlabel(xlabel_string, fontsize=15)
    plt.ylabel(ylabel_string, fontsize=15)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.title(title_string, fontsize=15);

    plt.savefig(figure_filename, bbox_inches='tight', dpi=100)

plt.show()

(random_forest_model_metric_replicate_quantile_series_collection_overall[('Treatment', 'Percentage of Coupon Acceptances Captured')][.1], random_forest_model_metric_replicate_quantile_series_collection_overall[('Treatment', 'Percentage of Coupon Acceptances Captured')][1])

Out[95]:

(26.425553786324812, 32.765667574931875)

time: 65.9 ms (started: 2023-09-28 17:20:11 -07:00)

Get Pilot Campaign Expected Ad Revenue Distribution¶

In [96]:

#revenue 90% confidence interval


model_type='random_forest'
row_name_tuple=('Treatment', 'Ad Revenue')
xlabel_string='Ad Revenue'
ylabel_string='Frequency'
title_string='Ad Revenue Distribution'
xaxis_interval=500
round_to_nearest=500

dpi=100
figure_filename='../reports/figures/figure_'+model_type+'_classifier_'+xlabel_string.lower().replace(' ', '_')+'_overall_90_percent_confidence_dpi_'+str(dpi)+'_v'+filename_version+'.png'

number_of_bins = 88
bin_number_color_split=27

figure_filename_exists = os.path.isfile(figure_filename)
if figure_filename_exists == True:
    img = mpimg.imread(figure_filename)
    plt.figure(figsize=(10, 8))
    plt.grid(False)
    plt.axis('off')
    plt.imshow(img)
else:
    figsize=(6,4)

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    
    ax.xaxis.set_major_formatter(FuncFormatter(usd_format))
    plt.xticks(np.arange(round(min(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:])/round_to_nearest)*round_to_nearest, 
                         round(max(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:])/round_to_nearest)*round_to_nearest+1, 
                         xaxis_interval))
    
    bin_count_array, bin_array, patches = ax.hist(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:],
                                                  linewidth=1,
                                                  bins=number_of_bins,
                                                  rwidth=1,
                                                  alpha=1,
                                                  edgecolor='green',)

    for i in range(0,bin_number_color_split):
        patches[i].set_facecolor('gray')
        patches[i].set_edgecolor('gray')
    for i in range(bin_number_color_split,len(patches)):    
        patches[i].set_facecolor('tab:green')
        patches[i].set_edgecolor('tab:green')

    plt.tick_params(axis='both', which='both', bottom=True, left=True, direction='out', length=6, width=1,)
    plt.xlabel(xlabel_string, fontsize=15)
    plt.ylabel(ylabel_string, fontsize=15)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.title(title_string, fontsize=15);

    plt.savefig(figure_filename, bbox_inches='tight', dpi=100)

plt.show()


(random_forest_model_metric_replicate_quantile_series_collection_overall[('Treatment', 'Ad Revenue')][.1], random_forest_model_metric_replicate_quantile_series_collection_overall[('Treatment', 'Ad Revenue')][1])

Out[96]:

(4568.45, 5818.5)

time: 64 ms (started: 2023-09-28 17:20:11 -07:00)

Get Pilot Campaign Model Estimated ROAS Distribution¶

In [97]:

#roas 90% confidence interval

model_type='random_forest'
row_name_tuple=('Treatment', 'ROAS')
xlabel_string='ROAS'
ylabel_string='Count'
title_string='ROAS Distribution'

xaxis_interval=10
round_to_nearest=10

dpi=100
figure_filename='../reports/figures/figure_'+model_type+'_classifier_'+xlabel_string.lower().replace(' ', '_')+'_overall_90_percent_confidence_dpi_'+str(dpi)+'_v'+filename_version+'.png'

number_of_bins=88
bin_number_color_split=33

figure_filename_exists = os.path.isfile(figure_filename)
if figure_filename_exists == True:
    img = mpimg.imread(figure_filename)
    plt.figure(figsize=(10, 8))
    plt.grid(False)
    plt.axis('off')
    plt.imshow(img)
else:
    figsize=(6,4)

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    
    ax.xaxis.set_major_formatter(FuncFormatter(format_percentage_without_multiplier))
    plt.xticks(np.arange(round(min(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:])/round_to_nearest)*round_to_nearest, 
                         round(max(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:])/round_to_nearest)*round_to_nearest+1, 
                         xaxis_interval))
    
    bin_count_array, bin_array, patches = ax.hist(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:],
                                                  linewidth=1,
                                                  bins=number_of_bins,
                                                  rwidth=1,
                                                  alpha=1,
                                                  edgecolor='green',)

    for i in range(0,bin_number_color_split):
        patches[i].set_facecolor('gray')
        patches[i].set_edgecolor('gray')
    for i in range(bin_number_color_split,len(patches)):    
        patches[i].set_facecolor('tab:green')
        patches[i].set_edgecolor('tab:green')

    plt.tick_params(axis='both', which='both', bottom=True, left=True, direction='out', length=6, width=1,)
    plt.xlabel(xlabel_string, fontsize=15)
    plt.ylabel(ylabel_string, fontsize=15)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.title(title_string, fontsize=15);

    plt.savefig(figure_filename, bbox_inches='tight', dpi=100)

plt.show()


(random_forest_model_metric_replicate_quantile_series_collection_overall[('Treatment', 'ROAS')][.1], random_forest_model_metric_replicate_quantile_series_collection_overall[('Treatment', 'ROAS')][1])

Out[97]:

(496.56230745474505, 531.9878764150473)

time: 66.4 ms (started: 2023-09-28 17:20:11 -07:00)

Get Pilot Campaign Estimated ROAS Uplift Distribution¶

In [98]:

#roas uplift 90% confidence interval

model_type='random_forest'
row_name_tuple=('Uplift', 'ROAS')
xlabel_string='ROAS Uplift'
ylabel_string='Count'
title_string='ROAS Uplift Distribution'

xaxis_interval=20
round_to_nearest=20



dpi=100
figure_filename='../reports/figures/figure_'+model_type+'_classifier_'+xlabel_string.lower().replace(' ', '_')+'_overall_90_percent_confidence_dpi_'+str(dpi)+'_v'+filename_version+'.png'

number_of_bins = 88
bin_number_color_split=30

figure_filename_exists = os.path.isfile(figure_filename)
if figure_filename_exists == True:
    img = mpimg.imread(figure_filename)
    plt.figure(figsize=(10, 8))
    plt.grid(False)
    plt.axis('off')
    plt.imshow(img)
else:
    figsize=(6,4)

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize)

    ax.xaxis.set_major_formatter(FuncFormatter(format_percentage_without_multiplier))
    plt.xticks(np.arange(round(min(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:])/round_to_nearest)*round_to_nearest, 
                         round(max(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:])/round_to_nearest)*round_to_nearest+1, 
                         xaxis_interval))
    
    bin_count_array, bin_array, patches = ax.hist(df_test_random_forest_90_precision_estimated_10000_metric_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_replicates_overall.loc[row_name_tuple,:],
                                                  linewidth=1,
                                                  bins=number_of_bins,
                                                  rwidth=1,
                                                  alpha=1,
                                                  edgecolor='green',)

    for i in range(0,bin_number_color_split):
        patches[i].set_facecolor('gray')
        patches[i].set_edgecolor('gray')
    for i in range(bin_number_color_split,len(patches)):    
        patches[i].set_facecolor('tab:green')
        patches[i].set_edgecolor('tab:green')

    plt.tick_params(axis='both', which='both', bottom=True, left=True, direction='out', length=6, width=1,)
    plt.xlabel(xlabel_string, fontsize=15)
    plt.ylabel(ylabel_string, fontsize=15)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.title(title_string, fontsize=15);

    plt.savefig(figure_filename, bbox_inches='tight', dpi=100)

plt.show()


(random_forest_model_metric_replicate_quantile_series_collection_overall[row_name_tuple][.1],
 random_forest_model_metric_replicate_quantile_series_collection_overall[row_name_tuple][1])

Out[98]:

(181.33783599369275, 252.37496977458187)

time: 64.7 ms (started: 2023-09-28 17:20:11 -07:00)

Get Gradient Boosting 80% of Coupon Acceptances Captured Estimated Metrics (Per Coupon Venue Type) Table¶

In [99]:

model_type='gradient_boosting'
survey_number_recall_estimated_y_predicted_column_name='Y_test_survey_80_recall_estimate_predicted'


metrics_coupon_venue_type_list=[]

for feature_column_name_filter_value_list_dictionary_key in feature_column_name_filter_value_list_dictionary.keys():
    
    metric_list=icr.get_model_and_survey_metrics(df=df_y_test_model_name_predicted_y_test_survey_recall_estimate_predicted_y_actual_coupon_venue_type.copy(),
                                                 model_y_predicted_column_name='Y_test_'+model_type+'_predicted',
                                                 survey_number_recall_estimated_y_predicted_column_name=survey_number_recall_estimated_y_predicted_column_name,
                                                 y_predicted_column_name_base_survey=y_predicted_column_name_base_survey,
                                                 y_actual_column_name='Y',
                                                 feature_column_name_filter=feature_column_name_filter,
                                                 feature_column_name_filter_value_list=feature_column_name_filter_value_list_dictionary[feature_column_name_filter_value_list_dictionary_key],
                                                 metrics_column_name_list=None,)
    metrics_coupon_venue_type_list+=[metric_list]

df_test_gradient_boosting_model_survey_metrics=\
pd.DataFrame(metrics_coupon_venue_type_list, 
             index=feature_column_name_filter_value_list_dictionary_key_list,
             columns=multiple_index[0:int(len(multiple_index)*2/3)]).T

df_test_gradient_boosting_metrics=icr.calculate_and_add_model_survey_difference(df_test_gradient_boosting_model_survey_metrics, multiple_index)

#add Venue Type Coupon Recommendation Cost Estimated, Sale Estimated
df_test_gradient_boosting_metrics_coupon_recommendation_cost_estimated_sale_estimated=pd.concat([df_test_gradient_boosting_metrics, df_train_gradient_boosting_80_recall_survey_100_recall_coupon_recommendation_cost_estimated_sale_estimated], axis=0)
df_test_gradient_boosting_metrics_coupon_recommendation_cost_estimated_sale_estimated



#get and add Total Ad Spend, Total Revenue, ROAS
df_test_gradient_boosting_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI=icr.get_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI(df=df_test_gradient_boosting_metrics_coupon_recommendation_cost_estimated_sale_estimated)

#select and reorder basic metrics, recommendation cost estimated, and average sale estimate
multiindex_basic_metrics=icr.get_the_multiindex_object_with_basic_metrics()
multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated=icr.get_the_multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated()

multiindex_basic_metrics_coupon_recommendation_cost_estimate_sale_estimated=pd.MultiIndex.from_tuples(list(multiindex_basic_metrics)+list(multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated))

#display combined metrics
df_test_gradient_boosting_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[multiindex_basic_metrics_coupon_recommendation_cost_estimate_sale_estimated,:]

Out[99]:

		Overall	Coffee House	Bar	Takeout	Low-Cost Restaurant	Mid-Range Restaurant
Treatment	Coupon Acceptance Rate	79.876374	80.100756	75.159236	83.418367	83.163265	62.711864
	Percentage of Coupon Acceptances Captured	80.262250	74.473068	75.641026	90.082645	88.586957	54.814815
	Coupon Acceptances	1163.000000	318.000000	118.000000	327.000000	326.000000	74.000000
	Coupon Acceptances Possible	1449.000000	427.000000	156.000000	363.000000	368.000000	135.000000
	Coupon Recommendations	1456.000000	397.000000	157.000000	392.000000	392.000000	118.000000
	Coupon Recommendations Possible	2537.000000	811.000000	399.000000	489.000000	534.000000	304.000000
	Ad Revenue	14926.000000	1749.000000	1770.000000	4905.000000	3912.000000	2590.000000
	Ad Spend	2970.843315	336.842215	349.149254	962.883782	774.679720	547.288344
	ROAS	502.416264	519.234206	506.946522	509.407271	504.982885	473.242310
Control	Coupon Acceptance Rate	56.597222	52.715655	37.537538	73.809524	67.840376	46.640316
	Percentage of Coupon Acceptances Captured	78.743961	77.283372	80.128205	76.859504	78.532609	87.407407
	Coupon Acceptances	1141.000000	330.000000	125.000000	279.000000	289.000000	118.000000
	Coupon Acceptances Possible	1449.000000	427.000000	156.000000	363.000000	368.000000	135.000000
	Coupon Recommendations	2016.000000	626.000000	333.000000	378.000000	426.000000	253.000000
	Coupon Recommendations Possible	2537.000000	811.000000	399.000000	489.000000	534.000000	304.000000
	Ad Revenue	15473.000000	1815.000000	1875.000000	4185.000000	3468.000000	4130.000000
	Ad Spend	4215.483585	531.141629	740.552239	928.495076	841.871329	1173.423313
	ROAS	367.051601	341.716767	253.189431	450.729370	411.939436	351.961645
Uplift	Coupon Acceptance Rate	23.279151	27.385101	37.621698	9.608844	15.322890	16.071548
	Percentage of Coupon Acceptances Captured	1.518288	-2.810304	-4.487179	13.223140	10.054348	-32.592593
	Coupon Acceptances	22.000000	-12.000000	-7.000000	48.000000	37.000000	-44.000000
	Coupon Acceptances Possible	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
	Coupon Recommendations	-560.000000	-229.000000	-176.000000	14.000000	-34.000000	-135.000000
	Coupon Recommendations Possible	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
	Ad Revenue	-547.000000	-66.000000	-105.000000	720.000000	444.000000	-1540.000000
	Ad Spend	-1244.640270	-194.299414	-391.402985	34.388707	-67.191608	-626.134969
	ROAS	135.364663	177.517439	253.757091	58.677902	93.043449	121.280665
Treatment	Average Coupon Recommendation Cost Estimated	NaN	0.848469	2.223881	2.456336	1.976224	4.638037
Treatment	Average Sale Estimated	NaN	5.500000	15.000000	15.000000	12.000000	35.000000
Control	Average Coupon Recommendation Cost Estimated	NaN	0.541538	1.244129	2.201155	1.707282	3.081650
Control	Average Sale Estimated	NaN	5.500000	15.000000	15.000000	12.000000	35.000000

time: 62.4 ms (started: 2023-09-28 17:20:11 -07:00)

In [ ]:

In [100]:

#Dot Plot for Coupon Acceptance Rate, Percentage of Coupon Acceptances Captured, and Coupon Acceptances

#initialize variables
column_name_list=['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant']


#get Coupon Acceptance Rate, Percentage of Coupon Acceptances Captured and Coupon Acceptances DataFrame
multiindex_metrics_list=[('Treatment', 'Coupon Acceptance Rate'), ('Treatment', 'Percentage of Coupon Acceptances Captured'), ('Treatment', 'Coupon Acceptances')]
df_test_random_forest_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances=df_test_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[multiindex_metrics_list, column_name_list]

#build the Coupon Acceptance Group DataFrame
df_test_random_forest_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances_coupon_acceptances_group=\
icr.extract_and_add_metric_coupon_acceptances_group(df=df_test_random_forest_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances)

df_test_random_forest_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances_coupon_acceptances_group

Out[100]:

		Overall	Coffee House	Bar	Takeout	Low-Cost Restaurant	Mid-Range Restaurant
Treatment	Coupon Acceptance Rate	90.604027	89.795918	92.307692	91.194969	90.116279	100.000000
	Percentage of Coupon Acceptances Captured	27.950311	20.608899	7.692308	39.944904	42.119565	3.703704
	Coupon Acceptances	405.000000	88.000000	12.000000	145.000000	155.000000	5.000000
	Coupon Acceptances Group	400.000000	140.000000	40.000000	140.000000	140.000000	40.000000

time: 9.13 ms (started: 2023-09-28 17:20:11 -07:00)

In [ ]:

In [101]:

xlabel_string = 'Percentage of Coupon Acceptances Captured'
ylabel_string = 'Coupon Acceptance Rate'
title_string = 'Pilot Campaign Model Metrics'

model_type='random_forest'
figure_filename='../reports/figures/figure_'+str(model_type)+'_'+title_string.replace(' ','_').lower()+'_in_plot_dot_labels'+'_v'+'_B_'+filename_version+'.png'


campaign_model_metric_label = ['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant',]

x = df_test_random_forest_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances_coupon_acceptances_group.loc[('Treatment','Percentage of Coupon Acceptances Captured')].values
y = df_test_random_forest_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances_coupon_acceptances_group.loc[('Treatment','Coupon Acceptance Rate')].values
sizes = df_test_random_forest_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances_coupon_acceptances_group.loc[('Treatment','Coupon Acceptances Group')].values

#color_list = ['red', 'green', 'blue', 'orange', 'purple', 'pink', 'gray']
#color_list = ['#1f78b4','#1f78b4','#a6cee3','#1f78b4','#1f78b4','#a6cee3',]
color_list = ['#a6cee3',]*6


plt.rcParams.update({'font.size': 18})

#coupon_acceptances_group_legend_location = (0.6922, 0.06)
#coupon_acceptances_group_legend_location = (1.0222, 0.5)
#campaign_model_legend_location = (1.0222, 0.8)



# Create the Plot
figure, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 9))
scatter = axes.scatter(x, y, c=color_list, s=sizes, alpha=0.7, edgecolors='k', linewidths=1)

# Add Dot Labels
for index, string_label in enumerate(campaign_model_metric_label):
    if index in [0,]:
        #axes.annotate(string_label, (x[index]+0.5, y[index]+1)) # Overall
        axes.annotate(string_label, (x[index]-4.5, y[index]+1.5)) # Overall
    elif index in [1,]:
        axes.annotate(string_label, (x[index]-10, y[index]-1.8)) # Coffee House
        #axes.annotate(string_label, (x[index]-18.5, y[index]-1.4)) # Coffee House
    elif index in [2,]:
        axes.annotate(string_label, (x[index]+1, y[index])) # Bar
    elif index in [3,]:
        axes.annotate(string_label, (x[index]+1, y[index]+0.2)) # Takeout
    elif index in [4,]:
        axes.annotate(string_label, (x[index]+1, y[index]-1)) # Low-Cost Restuarant
    elif index in [5,]:
        axes.annotate(string_label, (x[index]+1, y[index]-1)) # Mid-Range Restaurant

# Add the "Coupon Acceptances" legend
legend_coupon_acceptances_group = plt.legend(handles=scatter.legend_elements(prop='sizes')[0], title="Coupon Acceptances", labels=['1-20', '21-200', '201-500', '501-1500'], loc='lower right')
plt.gca().add_artist(legend_coupon_acceptances_group)


# Add plot lebels
plt.xlabel(xlabel_string)
plt.ylabel(ylabel_string)
plt.title(title_string)
plt.xticks([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
plt.yticks([60 ,70, 80, 90, 100])
plt.gca().set_xticklabels(['{:.0f}%'.format(x) for x in plt.gca().get_xticks()])
plt.gca().set_yticklabels(['{:.0f}%'.format(y) for y in plt.gca().get_yticks()])

# Save it
plt.savefig(figure_filename, bbox_inches='tight', dpi=dpi)

plt.show()

time: 215 ms (started: 2023-09-28 17:20:11 -07:00)

In [ ]:

In [102]:

#Dot Plot for Coupon Acceptance Rate, Percentage of Coupon Acceptances Captured, and Coupon Acceptances

#initialize variables
column_name_list=['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant']

#get Coupon Acceptance Rate, Percentage of Coupon Acceptances Captured and Coupon Acceptances DataFrame
multiindex_metrics_list=[('Treatment', 'Coupon Acceptance Rate'), ('Treatment', 'Percentage of Coupon Acceptances Captured'), ('Treatment', 'Coupon Acceptances')]
df_test_gradient_boosting_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances=df_test_gradient_boosting_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[multiindex_metrics_list, column_name_list]

#build the Coupon Acceptance Group DataFrame
df_test_gradient_boosting_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances_coupon_acceptances_group=\
icr.extract_and_add_metric_coupon_acceptances_group(df=df_test_gradient_boosting_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances)

df_test_gradient_boosting_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances_coupon_acceptances_group

Out[102]:

		Overall	Coffee House	Bar	Takeout	Low-Cost Restaurant	Mid-Range Restaurant
Treatment	Coupon Acceptance Rate	79.876374	80.100756	75.159236	83.418367	83.163265	62.711864
	Percentage of Coupon Acceptances Captured	80.262250	74.473068	75.641026	90.082645	88.586957	54.814815
	Coupon Acceptances	1163.000000	318.000000	118.000000	327.000000	326.000000	74.000000
	Coupon Acceptances Group	750.000000	400.000000	140.000000	400.000000	400.000000	140.000000

time: 10.4 ms (started: 2023-09-28 17:20:11 -07:00)

In [103]:

xlabel_string = 'Percentage of Coupon Acceptances Captured'
ylabel_string = 'Coupon Acceptance Rate'
title_string = 'Drive Sales Campaign Model Metrics'

model_type='gradient_boosting'
figure_filename='../reports/figures/figure_'+str(model_type)+'_'+title_string.replace(' ','_').lower()+'_in_plot_dot_labels'+'_v'+filename_version+'.png'


campaign_model_metric_label = ['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant',]

x = df_test_gradient_boosting_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances_coupon_acceptances_group.loc[('Treatment','Percentage of Coupon Acceptances Captured')].values
y = df_test_gradient_boosting_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances_coupon_acceptances_group.loc[('Treatment','Coupon Acceptance Rate')].values
sizes = df_test_gradient_boosting_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances_coupon_acceptances_group.loc[('Treatment','Coupon Acceptances Group')].values

#color_list = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c',]
color_list = ['#a6cee3',]*6

plt.rcParams.update({'font.size': 18})


# Create the Plot
figure, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 9))
scatter = axes.scatter(x, y, c=color_list, s=sizes, alpha=0.7, edgecolors='k', linewidths=1)

# Add Dot Labels
for index, string_label in enumerate(campaign_model_metric_label):
    if index in []:
        axes.annotate(string_label, (x[index], y[index]))
    elif index in [0,]:
        axes.annotate(string_label, (x[index]+2.5, y[index]-0.7)) # Overall
    elif index in [1,]:
        axes.annotate(string_label, (x[index]-20.2, y[index]-.4)) # Coffee House
    elif index in [2,]:
        axes.annotate(string_label, (x[index]+1, y[index])) # Bar
    elif index in [3,]:
        axes.annotate(string_label, (x[index]-.9, y[index]+1.2)) # Takeout
    elif index in [4,]:
        axes.annotate(string_label, (x[index]-30, y[index]+1)) # Low-Cost Restuarant
    elif index in [5,]:
        axes.annotate(string_label, (x[index]-31, y[index]-0)) # Mid-Range Restuarant

# Add the "Coupon Acceptances" legend
legend_coupon_acceptances_group = plt.legend(handles=scatter.legend_elements(prop='sizes')[0], title="Coupon Acceptances", labels=['21-200', '201-500', '501-1500'], loc='lower right')
plt.gca().add_artist(legend_coupon_acceptances_group)


# Add plot lebels
plt.xlabel(xlabel_string)
plt.ylabel(ylabel_string)
plt.title(title_string)
plt.xticks([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
plt.yticks([60 ,70, 80, 90, 100])
plt.gca().set_xticklabels(['{:.0f}%'.format(x) for x in plt.gca().get_xticks()])
plt.gca().set_yticklabels(['{:.0f}%'.format(y) for y in plt.gca().get_yticks()])

# Save it
plt.savefig(figure_filename, bbox_inches='tight', dpi=dpi)

plt.show()

time: 211 ms (started: 2023-09-28 17:20:11 -07:00)

In [ ]:

In [104]:

#Dot Plot for Coupon Acceptance Rate, Percentage of Coupon Acceptances Captured, and Coupon Acceptances

#initialize variables
column_name_list=['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant']

#get Coupon Acceptance Rate, Percentage of Coupon Acceptances Captured and Coupon Acceptances DataFrame
multiindex_metrics_list=[('Treatment', 'Coupon Acceptance Rate'), ('Treatment', 'Percentage of Coupon Acceptances Captured'), ('Treatment', 'Coupon Acceptances')]
df_test_random_forest_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances=df_test_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[multiindex_metrics_list, column_name_list]


column_name_list=['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant']
column_name_list_rename=['Drive Sales Campaign ' + column_name for column_name in column_name_list]
column_name_list_dictionary=dict(zip(column_name_list, column_name_list_rename))
df_drive_sales_campaign_model_overall=df_test_gradient_boosting_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[multiindex_metrics_list, column_name_list].rename(columns=column_name_list_dictionary)

df_campaign_model_metrics=pd.concat([df_test_random_forest_metrics_coupon_acceptance_rate_percentage_of_coupon_acceptances_captured_coupon_acceptances, df_drive_sales_campaign_model_overall], axis=1)


icr.extract_and_add_metric_coupon_acceptances_group(df=df_campaign_model_metrics)

#build the Coupon Acceptance Group DataFrame
df_campaign_model_metrics_coupon_acceptances_group=icr.extract_and_add_metric_coupon_acceptances_group(df=df_campaign_model_metrics)
del(df_campaign_model_metrics)

df_campaign_model_metrics_coupon_acceptances_group_all = df_campaign_model_metrics_coupon_acceptances_group.copy()

df_campaign_model_metrics_coupon_acceptances_group=df_campaign_model_metrics_coupon_acceptances_group.iloc[:, 0:7]

df_campaign_model_metrics_coupon_acceptances_group

Out[104]:

		Overall	Coffee House	Bar	Takeout	Low-Cost Restaurant	Mid-Range Restaurant	Drive Sales Campaign Overall
Treatment	Coupon Acceptance Rate	90.604027	89.795918	92.307692	91.194969	90.116279	100.000000	79.876374
	Percentage of Coupon Acceptances Captured	27.950311	20.608899	7.692308	39.944904	42.119565	3.703704	80.262250
	Coupon Acceptances	405.000000	88.000000	12.000000	145.000000	155.000000	5.000000	1163.000000
	Coupon Acceptances Group	400.000000	140.000000	40.000000	140.000000	140.000000	40.000000	750.000000

time: 14.7 ms (started: 2023-09-28 17:20:12 -07:00)

In [105]:

xlabel_string = 'Percentage of Coupon Acceptances Captured'
ylabel_string = 'Coupon Acceptance Rate'
title_string = 'Campaign Model Metrics'

model_types='random_forest_gradient_boosting'
figure_filename='../reports/figures/figure_'+str(model_types)+'_'+title_string.replace(' ','_').lower()+'_v'+filename_version+'.png'


campaign_model_metric_label = ['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant', 'Overall']

x = df_campaign_model_metrics_coupon_acceptances_group.loc[('Treatment','Percentage of Coupon Acceptances Captured')].values
y = df_campaign_model_metrics_coupon_acceptances_group.loc[('Treatment','Coupon Acceptance Rate')].values
sizes = df_campaign_model_metrics_coupon_acceptances_group.loc[('Treatment','Coupon Acceptances Group')].values

#color_list = ['red', 'green', 'blue', 'orange', 'purple', 'pink', 'gray']
#color_list = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f']
color_list = ['#a6cee3',]*7

#pattern_list = ['///', 'ooo', '\\\\', 'xxx', '||||']
pattern_list = [' ','xxx',]
pattern_list_2 = [' ', ' ', ' ', ' ', ' ', ' ','xxx',]

plt.rcParams.update({'font.size': 18})

#coupon_acceptances_group_legend_location = (0.6922, 0.06)
coupon_acceptances_group_legend_location = (1.0222, 0.5)
campaign_model_legend_location = (1.0222, 0.8)



# Create the Plot
figure, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 9))
scatter = axes.scatter(x, y, c=color_list, s=sizes, alpha=0.7, edgecolors='k', linewidths=1)

# Add Dot Labels
for index, string_label in enumerate(campaign_model_metric_label):
    if index in [0,]:
        axes.annotate(string_label, (x[index]-4.5, y[index]+1.5)) # Overall
    elif index in [1,]:
        axes.annotate(string_label, (x[index]-10, y[index]-1.8)) # Coffee House
    elif index in [2,]:
        axes.annotate(string_label, (x[index]+1, y[index])) # Bar
    elif index in [3,]:
        axes.annotate(string_label, (x[index]+1, y[index]+0.2)) # Takeout
    elif index in [4,]:
        axes.annotate(string_label, (x[index]+1, y[index]-1)) # Low-Cost Restuarant
    elif index in [5,]:
        axes.annotate(string_label, (x[index]+1, y[index]-1)) # Mid-Range Restaurant
    elif index in [6,]:
        axes.annotate(string_label, (x[index]+1, y[index]+1))

# Add Dot Campaign Model Type Hatch
for index, color in enumerate(color_list):
    axes.scatter(x[index], y[index], c=color, s=sizes[index], alpha=0.7, edgecolors='k', linewidths=1, hatch=pattern_list_2[index])

# Add the "Campaign Model" Legend
legend_elements_patterns = [mpatches.Patch(facecolor='white', edgecolor='black', hatch=pattern, label=['Pilot', 'Drive Sales'][i]) for i, pattern in enumerate(pattern_list)]
legend_patterns = axes.legend(handles=legend_elements_patterns, loc=campaign_model_legend_location, title='Campaign Model')
axes.add_artist(legend_patterns)

# Add the "Coupon Acceptances" legend
legend_coupon_acceptances_group = plt.legend(handles=scatter.legend_elements(prop='sizes')[0], title="Coupon Acceptances", labels=['1-20', '21-200', '201-500', '501-1500'], loc=coupon_acceptances_group_legend_location)
plt.gca().add_artist(legend_coupon_acceptances_group)


# Add plot lebels
plt.xlabel(xlabel_string)
plt.ylabel(ylabel_string)
plt.title(title_string)
plt.xticks([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
plt.yticks([60 ,70, 80, 90, 100])
plt.gca().set_xticklabels(['{:.0f}%'.format(x) for x in plt.gca().get_xticks()])
plt.gca().set_yticklabels(['{:.0f}%'.format(y) for y in plt.gca().get_yticks()])

# Save it
plt.savefig(figure_filename, bbox_inches='tight', dpi=dpi)
        
plt.show()

time: 261 ms (started: 2023-09-28 17:20:12 -07:00)

In [106]:

xlabel_string = 'Percentage of Coupon Acceptances Captured'
ylabel_string = 'Coupon Acceptance Rate'
title_string = 'Campaign Model Metrics'

model_types='random_forest_gradient_boosting'
figure_filename='../reports/figures/figure_'+str(model_types)+'_'+title_string.replace(' ','_').lower()+'_v'+filename_version+'.png'


campaign_model_metric_label = ['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant', 'Overall']

x = df_campaign_model_metrics_coupon_acceptances_group.loc[('Treatment','Percentage of Coupon Acceptances Captured')].values
y = df_campaign_model_metrics_coupon_acceptances_group.loc[('Treatment','Coupon Acceptance Rate')].values
sizes = df_campaign_model_metrics_coupon_acceptances_group.loc[('Treatment','Coupon Acceptances Group')].values

color_list = ['#a6cee3',]*7

#pattern_list = ['///', 'ooo', '\\\\', 'xxx', '||||']
pattern_list = [' ','xxx',]
pattern_list_2 = [' ', ' ', ' ', ' ', ' ', ' ','xxx',]

plt.rcParams.update({'font.size': 18})

#coupon_acceptances_group_legend_location = (0.6922, 0.06)
coupon_acceptances_group_legend_location = (0.0222, 0.06)
campaign_model_legend_location = (0.0222, 0.36)



# Create the Plot
figure, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 9))
scatter = axes.scatter(x, y, c=color_list, s=sizes, alpha=0.7, edgecolors='k', linewidths=1)

# Add Dot Labels
for index, string_label in enumerate(campaign_model_metric_label):
    if index in [0,]:
        axes.annotate(string_label, (x[index]-4.5, y[index]+1.5)) # Overall
    elif index in [1,]:
        axes.annotate(string_label, (x[index]-10, y[index]-1.8)) # Coffee House
    elif index in [2,]:
        axes.annotate(string_label, (x[index]+1, y[index])) # Bar
    elif index in [3,]:
        axes.annotate(string_label, (x[index]+1, y[index]+0.2)) # Takeout
    elif index in [4,]:
        axes.annotate(string_label, (x[index]+1, y[index]-1)) # Low-Cost Restuarant
    elif index in [5,]:
        axes.annotate(string_label, (x[index]+1, y[index]-1)) # Mid-Range Restaurant
    elif index in [6,]:
        axes.annotate(string_label, (x[index]+1, y[index]+1))

# Add Dot Campaign Model Type Hatch
for index, color in enumerate(color_list):
    axes.scatter(x[index], y[index], c=color, s=sizes[index], alpha=0.7, edgecolors='k', linewidths=1, hatch=pattern_list_2[index])

# Add the "Campaign Model" Legend
legend_elements_patterns = [mpatches.Patch(facecolor='white', edgecolor='black', hatch=pattern, label=['Pilot', 'Drive Sales'][i]) for i, pattern in enumerate(pattern_list)]
legend_patterns = axes.legend(handles=legend_elements_patterns, loc=campaign_model_legend_location, title='Campaign Model')
axes.add_artist(legend_patterns)

# Add the "Coupon Acceptances" legend
legend_coupon_acceptances_group = plt.legend(handles=scatter.legend_elements(prop='sizes')[0], title="Coupon Acceptances", labels=['1-20', '21-200', '201-500', '501-1500'], loc=coupon_acceptances_group_legend_location)
plt.gca().add_artist(legend_coupon_acceptances_group)


# Add plot lebels
plt.xlabel(xlabel_string)
plt.ylabel(ylabel_string)
plt.title(title_string)
plt.xticks([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
plt.yticks([60 ,70, 80, 90, 100])
plt.gca().set_xticklabels(['{:.0f}%'.format(x) for x in plt.gca().get_xticks()])
plt.gca().set_yticklabels(['{:.0f}%'.format(y) for y in plt.gca().get_yticks()])


# Save it
plt.savefig(figure_filename, bbox_inches='tight', dpi=dpi)

plt.show()

time: 248 ms (started: 2023-09-28 17:20:12 -07:00)

In [ ]:

In [115]:

xlabel_string = 'Percentage of Coupon Acceptances Captured'
ylabel_string = 'Coupon Acceptance Rate'
title_string = 'Campaign Model Metrics'

model_types='random_forest_gradient_boosting'
figure_filename='../reports/figures/figure_'+str(model_types)+'_'+title_string.replace(' ','_').lower()+'_v'+filename_version+'.png'


campaign_model_metric_label = ['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant',]*2
#campaign_model_metric_label = ['overall', 'coffee house', 'bar', 'takeout', 'low-cost restaurant', 'mid-range restaurant',]*2
#campaign_model_metric_label = ['Overall', 'Coffee house', 'Bar', 'Takeout', 'Low-cost restaurant', 'Mid-range restaurant',]*2

x = df_campaign_model_metrics_coupon_acceptances_group_all.loc[('Treatment','Percentage of Coupon Acceptances Captured')].values
y = df_campaign_model_metrics_coupon_acceptances_group_all.loc[('Treatment','Coupon Acceptance Rate')].values
sizes = df_campaign_model_metrics_coupon_acceptances_group_all.loc[('Treatment','Coupon Acceptances Group')].values

#color_list = ['red', 'green', 'blue', 'orange', 'purple', 'pink', 'gray']
#color_list = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f']
color = '#a6cee3'
dot_count = 12
color_list = [color,]*dot_count

#pattern_list = ['///', 'ooo', '\\\\', 'xxx', '||||']
pattern_list = [' ','xxx',]
pattern_list_2 = [' ',]*6 + ['xxx',]*6

plt.rcParams.update({'font.size': 18})

#coupon_acceptances_group_legend_location = (0.6922, 0.06)
coupon_acceptances_group_legend_location = (0.0222, 0.06)
campaign_model_legend_location = (0.0222, 0.36)



# Create the Plot
figure, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 9))
scatter = axes.scatter(x, y, c=color_list, s=sizes, alpha=0.7, edgecolors='k', linewidths=1)

# Add Dot Labels
for index, string_label in enumerate(campaign_model_metric_label):
    if index in [0,]:
        axes.annotate(string_label, (x[index]-4.5, y[index]+3)) # Overall
    elif index in [1,]:
        axes.annotate(string_label, (x[index]-10, y[index]-4.8)) # Coffee House
    elif index in [2,]:
        axes.annotate(string_label, (x[index]+1, y[index])) # Bar
    elif index in [3,]:
        axes.annotate(string_label, (x[index]+0.0, y[index]+1.7)) # Takeout
    elif index in [4,]:
        axes.annotate(string_label, (x[index]+1.5, y[index]-1.5)) # Low-Cost Restaurant
    elif index in [5,]:
        axes.annotate(string_label, (x[index]+1, y[index]-1.8)) # Mid-Range Restaurant
    elif index in [6,]:
        axes.annotate(string_label, (x[index]+2.3, y[index]-2.9)) # Overall
    elif index in [7,]:
        axes.annotate(string_label, (x[index]-20.2, y[index]-.4)) # Coffee House
    elif index in [8,]:
        axes.annotate(string_label, (x[index]+1, y[index]-3)) # Bar
    elif index in [9,]:
        axes.annotate(string_label, (x[index]-.9, y[index]+3.0)) # Takeout
    elif index in [10,]:
        axes.annotate(string_label, (x[index]-29.5, y[index]+1)) # Low-Cost Restaurant
    elif index in [11,]:
        axes.annotate(string_label, (x[index]-32.0, y[index]-0)) # Mid-Range Restaurant

# Add Dot Campaign Model Type Hatch
for index, color in enumerate(color_list):
    axes.scatter(x[index], y[index], c=color, s=sizes[index], alpha=0.7, edgecolors='k', linewidths=1, hatch=pattern_list_2[index])

# Add the "Campaign Model" Legend
legend_elements_patterns = [mpatches.Patch(facecolor=color, edgecolor='black', hatch=pattern, label=['Pilot', 'Drive Sales'][i]) for i, pattern in enumerate(pattern_list)]
legend_patterns = axes.legend(handles=legend_elements_patterns, loc=campaign_model_legend_location, title='Campaign Model')
axes.add_artist(legend_patterns)

# Add the "Coupon Acceptances" legend
legend_coupon_acceptances_group = plt.legend(handles=scatter.legend_elements(prop='sizes')[0], title="Coupon Acceptances", labels=['1-20', '21-200', '201-500', '501-1500'], loc=coupon_acceptances_group_legend_location)
plt.gca().add_artist(legend_coupon_acceptances_group)


# Add plot lebels
plt.xlabel(xlabel_string)
plt.ylabel(ylabel_string)
plt.title(title_string)
plt.xticks([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
#plt.yticks([60 ,70, 80, 90, 100])
plt.yticks([0, 10, 20, 30, 40, 50, 60 ,70, 80, 90, 100])
plt.gca().set_xticklabels(['{:.0f}%'.format(x) for x in plt.gca().get_xticks()])
plt.gca().set_yticklabels(['{:.0f}%'.format(y) for y in plt.gca().get_yticks()])


# Save it
plt.savefig(figure_filename, bbox_inches='tight', dpi=dpi)

plt.show()

time: 336 ms (started: 2023-09-28 17:24:18 -07:00)

In [ ]:

In [108]:

#show ROI table
icr.profit_spend_roi_number_table(df=df_test_gradient_boosting_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI)

Out[108]:

Additional Production Cost	200			2000			20000
Metric	Profit	Spend	ROI	Profit	Spend	ROI	Profit	Spend	ROI
Group
Control	11057.516415	4415.483585	250.425943	9257.516415	6215.483585	148.942818	-8742.483585	24215.483585	-36.102866
Treatment	11755.156685	3170.843315	370.726508	9955.156685	4970.843315	200.270981	-8044.843315	22970.843315	-35.021976
Uplift	697.640270	-1244.640270	120.300564	697.640270	-1244.640270	51.328163	697.640270	-1244.640270	1.080890

time: 9.78 ms (started: 2023-09-28 17:20:12 -07:00)

In [109]:

#V-- add second legend with distinction between the Pilot Campaign Model and Drive Sales Campaign Model

time: 287 µs (started: 2023-09-28 17:20:12 -07:00)

In [ ]:

Get Gradient Boosting 80% Recall Estimated 95% Confidence Interval Metrics, Ad Revenue, Ad Spend, ROAS (Per Coupon Venue Type) Table¶

In [ ]:

In [110]:

########################################################################################################################################################################################################
filename='df_test_gradient_boosting_model_survey_95_confidence_interval_metric_feature_column_name_filter_value_v'+filename_version+'.pkl'

df_readback=icr.return_processed_data_file_if_it_exists_v2(filename=filename, column_name_row_integer_location_list=[0, 1], index_column_integer_location_list=[0, 1])
if df_readback.empty == False:
    df_test_gradient_boosting_model_survey_95_confidence_interval_metric_feature_column_name_filter_value=df_readback
else:
    
    model_type='gradient_boosting'
    survey_number_recall_estimated_y_predicted_column_name='Y_test_survey_80_recall_estimate_predicted'

    if st != 'yes':
        number_of_replicates=10000

    quantile_lower_upper_list=[0.025, 0.975]

    feature_column_name_filter='coupon_venue_type'

    save_metric_replicates_feature_column_name_filter_value_list_dictionary_key_list=['Overall', 'Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant']

    gradient_boosting_model_survey_95_confidence_interval_metric_collection={}
    df_gradient_boosting_80_recall_estimated_feature_filter_number_bootstrap_replicates_metrics_collection={}

    for feature_column_name_filter_value_list_dictionary_key in feature_column_name_filter_value_list_dictionary_key_list:

        gradient_boosting_model_survey_95_confidence_interval_metric_collection[feature_column_name_filter_value_list_dictionary_key],\
        df_gradient_boosting_80_recall_estimated_feature_filter_number_bootstrap_replicates_metrics_collection[feature_column_name_filter_value_list_dictionary_key]=\
        icr.get_metric_confidence_interval_table_by_feature_column_name_filter_value_list_dictionary_key(df_y_train_test_model_name_predicted_y_train_test_survey_recall_estimate_predicted_y_actual_feature_column_name_filter=df_y_test_model_name_predicted_y_test_survey_recall_estimate_predicted_y_actual_coupon_venue_type.copy(),
                                                                                            feature_column_name_filter=feature_column_name_filter,
                                                                                            feature_column_name_filter_value_list_dictionary_key=feature_column_name_filter_value_list_dictionary_key,
                                                                                            feature_column_name_filter_value_list_dictionary=feature_column_name_filter_value_list_dictionary,
                                                                                            multiple_index=multiple_index,
                                                                                            number_of_replicates=number_of_replicates,
                                                                                            quantile_lower_upper_list=quantile_lower_upper_list,
                                                                                            model_type=model_type,
                                                                                            survey_number_recall_estimated_y_predicted_column_name=survey_number_recall_estimated_y_predicted_column_name,
                                                                                            save_metric_replicates_feature_column_name_filter_value_list_dictionary_key_list=save_metric_replicates_feature_column_name_filter_value_list_dictionary_key_list,
                                                                                            filename_version=filename_version,
                                                                                            sample_size=None)

    df_test_gradient_boosting_model_survey_95_confidence_interval_metric_feature_column_name_filter_value=\
    icr.convert_collection_to_data_frame_and_drop_top_column_level(gradient_boosting_model_survey_95_confidence_interval_metric_collection)
    
    

    #save it
    df_test_gradient_boosting_model_survey_95_confidence_interval_metric_feature_column_name_filter_value=\
    icr.save_and_return_data_frame_v2(df_test_gradient_boosting_model_survey_95_confidence_interval_metric_feature_column_name_filter_value, filename=filename)

    
    filename='df_gradient_boosting_80_recall_estimated_feature_filter_number_bootstrap_replicates_metrics_collection_v'+filename_version+'.pkl'
    #save it
    df_gradient_boosting_80_recall_estimated_feature_filter_number_bootstrap_replicates_metrics_collection=\
    icr.save_and_return_collection(df_gradient_boosting_80_recall_estimated_feature_filter_number_bootstrap_replicates_metrics_collection, filename=filename)

    
########################################################################################################################################################################################################





########################################################################################################################################################################################################
### Get Gradient Boosting 80% Recall Estimated 95% Confidence Interval Ad Revenue, Ad Spend, ROAS, Profit, Spend, and ROI (Per Coupon Venue Type) Table
filename='df_gradient_boosting_95_confidence_interval_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_v'+filename_version+'.pkl'

df_readback=icr.return_processed_data_file_if_it_exists_v2(filename=filename, column_name_row_integer_location_list=[0, 1], index_column_integer_location_list=[0, 1])
if df_readback.empty == False:
    df_gradient_boosting_95_confidence_interval_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI=df_readback
else:
    #get Random Forest Model and Survey Coupon Recommendation Cost Estimated and Sale Estimated Replicate Collection by Venue Type
    df_gradient_boosting_model_survey_coupon_recommendation_cost_estimated_sale_estimated_replicate_collection=\
    icr.get_model_survey_coupon_recommendation_cost_estimated_and_sale_estimated_replicate_collection_venue_type(df=df_train_gradient_boosting_80_recall_survey_100_recall_coupon_recommendation_cost_estimated_sale_estimated, 
                                                                                                             column_name_list=['Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant'], 
                                                                                                             column_name_drop_list=['Overall'],
                                                                                                             number_of_replicates=number_of_replicates)


    #get filename list of gradient boosting metric replicates by venue type
    column_name_list=['Coffee House', 'Bar', 'Takeout', 'Low-Cost Restaurant', 'Mid-Range Restaurant']
    model_name='gradient_boosting'

    test_gradient_boosting_metric_replicate_filename_collection={}
    for column_name in column_name_list:
        test_gradient_boosting_metric_replicate_filename_collection[column_name]='df_test_'+str(model_name)+'_number_metric_estimated_'+str(number_of_replicates)+'_metric_replicates_from_'+str(number_of_replicates)+'_nonparametric_subsamples_'+str(column_name.lower().replace(' ','_'))+'_v'+str(filename_version)+'.csv'
        print(test_gradient_boosting_metric_replicate_filename_collection[column_name])



    #calculate 95% confidence interval for Ad Revenue, Ad Spend, ROAS, Profit, Spend and ROI
    if st!='yes':
        number_of_replicates=10000

    df_gradient_boosting_95_confidence_interval_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI=\
    icr.calculate_Overall_and_Coupon_Venue_Type_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI_95_Confidence_Intervals_from_metric_replicates_and_append_to_metric_confidence_interval_table(
        df_model_name_model_survey_coupon_recommendation_cost_estimated_sale_estimated_replicate_collection=df_gradient_boosting_model_survey_coupon_recommendation_cost_estimated_sale_estimated_replicate_collection,
        df_test_model_name_model_survey_95_confidence_interval_metric_feature_column_name_filter_value=df_test_gradient_boosting_model_survey_95_confidence_interval_metric_feature_column_name_filter_value,
        test_model_name_metric_replicate_filename_collection=test_gradient_boosting_metric_replicate_filename_collection,
        model_type=model_name,
        filename_version=filename_version,
        number_of_replicates=number_of_replicates)
    
    #save it
    df_gradient_boosting_95_confidence_interval_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI=\
    icr.save_and_return_data_frame_v2(df_gradient_boosting_95_confidence_interval_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI, filename=filename)

    

#select and reorder basic metrics, recommendation cost estimated, and average sale estimate
multiindex_basic_metrics=icr.get_the_multiindex_object_with_basic_metrics()
multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated=icr.get_the_multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated()

multiindex_basic_metrics_coupon_recommendation_cost_estimate_sale_estimated=pd.MultiIndex.from_tuples(list(multiindex_basic_metrics)+list(multiindex_metrics_coupon_recommendation_cost_estimate_sale_estimated))

#display combined metrics
df_gradient_boosting_95_confidence_interval_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[multiindex_basic_metrics_coupon_recommendation_cost_estimate_sale_estimated,:]



########################################################################################################################################################################################################

This file already exists
This file already exists

Out[110]:

		95% Confidence Interval
		Overall	Coffee House	Bar	Takeout	Low-Cost Restaurant	Mid-Range Restaurant
Treatment	Coupon Acceptance Rate	(77%, 81%)	(76%, 83%)	(68%, 81%)	(79%, 87%)	(79%, 86%)	(53%, 71%)
	Percentage of Coupon Acceptances Captured	(78%, 82%)	(70%, 78%)	(68%, 82%)	(86%, 93%)	(85%, 91%)	(46%, 63%)
	Coupon Acceptances	(1115, 1213)	(286, 351)	(98, 140)	(294, 361)	(294, 360)	(58, 91)
	Coupon Acceptances Possible	(1401, 1498)	(390, 464)	(133, 181)	(328, 398)	(334, 403)	(113, 158)
	Coupon Recommendations	(1407, 1505)	(361, 433)	(133, 182)	(356, 428)	(357, 429)	(98, 139)
	Coupon Recommendations Possible	(2537, 2537)	(765, 858)	(362, 435)	(450, 529)	(494, 575)	(271, 337)
	Ad Revenue	(\$14132.98, \$15725.56)	(\$1573.0, \$1930.5)	(\$1470.0, \$2100.0)	(\$4410.0, \$5415.0)	(\$3528.0, \$4320.0)	(\$2030.0, \$3185.0)
	Ad Spend	(\$2846.93, \$3098.58)	(\$306.3, \$367.39)	(\$295.78, \$404.75)	(\$874.46, \$1051.31)	(\$705.51, \$847.8)	(\$454.53, \$644.69)
	ROAS	(485.46%, 518.55%)	(493.8%, 544.38%)	(460.09%, 553.01%)	(486.65%, 531.53%)	(481.97%, 526.35%)	(405.85%, 538.14%)
Control	Coupon Acceptance Rate	(54%, 58%)	(48%, 56%)	(32%, 42%)	(69%, 78%)	(63%, 72%)	(40%, 52%)
	Percentage of Coupon Acceptances Captured	(76%, 80%)	(73%, 81%)	(73%, 86%)	(72%, 81%)	(74%, 82%)	(81%, 92%)
	Coupon Acceptances	(1093, 1190)	(297, 364)	(104, 147)	(248, 310)	(258, 321)	(98, 140)
	Coupon Acceptances Possible	(1401, 1498)	(390, 464)	(133, 181)	(328, 398)	(334, 403)	(113, 158)
	Coupon Recommendations	(1976, 2055)	(584, 669)	(299, 366)	(343, 414)	(389, 463)	(223, 284)
	Coupon Recommendations Possible	(2537, 2537)	(765, 858)	(362, 435)	(450, 529)	(494, 575)	(271, 337)
	Ad Revenue	(\$14602.48, \$16338.54)	(\$1633.5, \$2002.0)	(\$1560.0, \$2205.0)	(\$3720.0, \$4650.0)	(\$3096.0, \$3852.0)	(\$3430.0, \$4900.0)
	Ad Spend	(\$4083.83, \$4349.14)	(\$495.51, \$567.63)	(\$664.94, \$813.94)	(\$842.52, \$1016.92)	(\$768.75, \$914.99)	(\$1034.28, \$1317.2)
	ROAS	(350.06%, 384.03%)	(316.86%, 367.33%)	(218.47%, 289.66%)	(423.55%, 477.54%)	(385.1%, 438.71%)	(304.81%, 398.62%)
Uplift	Coupon Acceptance Rate	(21%, 25%)	(23%, 31%)	(31%, 43%)	(6%, 12%)	(11%, 18%)	(9%, 23%)
	Percentage of Coupon Acceptances Captured	(-1%, 4%)	(-8%, 3%)	(-13%, 4%)	(8%, 18%)	(4%, 15%)	(-42%, -23%)
	Coupon Acceptances	(-20, 64)	(-37, 13)	(-21, 7)	(29, 68)	(18, 57)	(-59, -30)
	Coupon Acceptances Possible	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)
	Coupon Recommendations	(-624, -497)	(-268, -190)	(-204, -148)	(-10, 39)	(-61, -7)	(-159, -111)
	Coupon Recommendations Possible	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)	(0.0, 0.0)
	Ad Revenue	(\$-1256.5, \$139.54)	(\$-203.5, \$71.5)	(\$-315.0, \$105.0)	(\$435.0, \$1020.0)	(\$216.0, \$684.0)	(\$-2065.0, \$-1050.0)
	Ad Spend	(\$-1398.25, \$-1097.03)	(\$-227.39, \$-161.21)	(\$-453.67, \$-329.13)	(\$-24.56, \$95.8)	(\$-120.55, \$-13.83)	(\$-737.45, \$-514.82)
	ROAS	(120.31%, 150.38%)	(151.62%, 203.11%)	(214.2%, 294.6%)	(39.23%, 78.83%)	(71.26%, 114.84%)	(68.5%, 174.18%)
Treatment	Average Coupon Recommendation Cost Estimated	NaN	(\$0.85, \$0.85)	(\$2.22, \$2.22)	(\$2.46, \$2.46)	(\$1.98, \$1.98)	(\$4.64, \$4.64)
Treatment	Average Sale Estimated	NaN	(\$5.5, \$5.5)	(\$15.0, \$15.0)	(\$15.0, \$15.0)	(\$12.0, \$12.0)	(\$35.0, \$35.0)
Control	Average Coupon Recommendation Cost Estimated	NaN	(\$0.54, \$0.54)	(\$1.24, \$1.24)	(\$2.2, \$2.2)	(\$1.71, \$1.71)	(\$3.08, \$3.08)
Control	Average Sale Estimated	NaN	(\$5.5, \$5.5)	(\$15.0, \$15.0)	(\$15.0, \$15.0)	(\$12.0, \$12.0)	(\$35.0, \$35.0)

time: 20.6 ms (started: 2023-09-28 17:20:12 -07:00)

Get Gradient Boosting 80% Recall Estimated 95% Confidence Interval Profit, Spend, and ROI Per Additional Production Cost (Overall) Table¶

In [111]:

#show ROI table
icr.profit_spend_roi_number_table(df=df_gradient_boosting_95_confidence_interval_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI)

Out[111]:

	95% Confidence Interval
	$200 Additional Production Cost			$2,000 Additional Production Cost			$20,000 Additional Production Cost
	Profit	Spend	ROI	Profit	Spend	ROI	Profit	Spend	ROI
Group
Treatment	(\$11062.41, \$12460.41)	(\$3046.93, \$3298.58)	(354.69%, 385.84%)	(\$9262.41, \$10660.41)	(\$4846.93, \$5098.58)	(189.07%, 211.21%)	(\$-8737.59, \$-7339.59)	(\$22846.93, \$23098.58)	(-38.21%, -31.83%)
Control	(\$10263.34, \$11855.73)	(\$4283.83, \$4549.14)	(234.3%, 266.61%)	(\$8463.34, \$10055.73)	(\$6083.83, \$6349.14)	(137.16%, 160.91%)	(\$-9536.66, \$-7944.27)	(\$24083.83, \$24349.14)	(-39.5%, -32.73%)
Uplift	(\$70.49, \$1295.19)	(\$-1398.25, \$-1097.03)	(106.33%, 134.08%)	(\$70.49, \$1295.19)	(\$-1398.25, \$-1097.03)	(42.23%, 60.22%)	(\$70.49, \$1295.19)	(\$-1398.25, \$-1097.03)	(-1.67%, 3.71%)

time: 15.8 ms (started: 2023-09-28 17:20:12 -07:00)

In [112]:

model_types='random_forest_gradient_boosting'
xlabel_string='Additional Production Cost'
ylabel_string='ROI'
title_string='Campaign '+str(ylabel_string)+' Per '+str(xlabel_string)

figure_filename='../reports/figures/figure_'+str(model_types)+'_'+title_string.replace(' ','_').lower()+'_v'+filename_version+'.png'


additional_production_cost = np.linspace(0, 20000, 1000)


#pilot campaign roi equation
pilot_campaign_model_ad_revenue=df_test_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[('Treatment', 'Ad Revenue'), 'Overall']
pilot_campaign_model_ad_spend=df_test_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[('Treatment', 'Ad Spend'), 'Overall']
pilot_campaign_model_roi=(pilot_campaign_model_ad_revenue-pilot_campaign_model_ad_spend-additional_production_cost)/(pilot_campaign_model_ad_spend+additional_production_cost)


#drive sales campaign roi equation
drive_sales_campaign_model_ad_revenue=df_test_gradient_boosting_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[('Treatment', 'Ad Revenue'), 'Overall']
drive_sales_campaign_model_ad_spend=df_test_gradient_boosting_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[('Treatment', 'Ad Spend'), 'Overall']
drive_sales_campaign_model_roi=(drive_sales_campaign_model_ad_revenue-drive_sales_campaign_model_ad_spend-additional_production_cost)/(drive_sales_campaign_model_ad_spend+additional_production_cost)


plt.rcParams.update({'font.size': 16})

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))

axes.plot(additional_production_cost, pilot_campaign_model_roi*100)

axes.plot(additional_production_cost, drive_sales_campaign_model_roi*100)


axes.axhline(y=0, color='k', linewidth=.6)
axes.axvline(x=0, color='k')

axes.set_xlabel(xlabel_string)
axes.set_ylabel(ylabel_string)
axes.set_title(title_string)
axes.xaxis.set_major_formatter('${x:1.0f}')
axes.yaxis.set_major_formatter(mtick.PercentFormatter())
axes.legend(['Pilot Campaign','Drive Sales Campaign'])

plt.tick_params(axis='both', which='both', bottom=True, left=True, direction='out', length=6, width=1,)
axes.set_xlim(-0,20000)
axes.set_ylim(-100,480)

axes.set_xticks([0, 5000, 10000, 15000, 20000])
axes.set_yticks([-100, 0, 100, 200, 300, 400])
axes.set_xticklabels(['$0', '$5K', '$10K', '$15K', '$20K'])
axes.set_yticklabels(['-100%','0%', '100%', '200%', '300%', '400%'])

#save it
plt.savefig(figure_filename, bbox_inches='tight', dpi=dpi)

plt.show()

time: 137 ms (started: 2023-09-28 17:20:13 -07:00)

In [113]:

model_types='random_forest_gradient_boosting'
xlabel_string='Additional Production Cost'
ylabel_string='ROI Uplift Estimate'
title_string='Campaign '+str(ylabel_string)+' Per '+str(xlabel_string)

figure_filename='../reports/figures/figure_'+str(model_types)+'_'+title_string.replace(' ','_').lower()+'_v'+filename_version+'.png'


#get pilot campaign model ad revenue and ad spend,
pilot_campaign_model_ad_revenue=df_test_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[('Treatment', 'Ad Revenue'), 'Overall']
pilot_campaign_model_ad_spend=df_test_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[('Treatment', 'Ad Spend'), 'Overall']

#get pilot campaig survey ad revenue and ad spend
pilot_campaign_survey_ad_revenue=df_test_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[('Control', 'Ad Revenue'), 'Overall']
pilot_campaign_survey_ad_spend=df_test_random_forest_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[('Control', 'Ad Spend'), 'Overall']

#get drive sales campaign model ad revenue and ad spend
drive_sales_campaign_model_ad_revenue=df_test_gradient_boosting_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[('Treatment', 'Ad Revenue'), 'Overall']
drive_sales_campaign_model_ad_spend=df_test_gradient_boosting_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[('Treatment', 'Ad Spend'), 'Overall']

#get drive sales campaign survey ad revenue and ad spend
drive_sales_campaign_survey_ad_revenue=df_test_gradient_boosting_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[('Control', 'Ad Revenue'), 'Overall']
drive_sales_campaign_survey_ad_spend=df_test_gradient_boosting_metrics_Ad_Revenue_Ad_Spend_ROAS_Profit_Spend_ROI.loc[('Control', 'Ad Spend'), 'Overall']

#additional production cost values
additional_production_cost=np.linspace(0, 10000, 1000)




#calculate pilot campaign model/survey roi per addtional production cost values
pilot_campaign_model_roi=icr.get_campaign_roi_from_ad_revenue_ad_spend_additional_production_cost(ad_revenue=pilot_campaign_model_ad_revenue, ad_spend=pilot_campaign_model_ad_spend, additional_production_cost=additional_production_cost)
pilot_campaign_survey_roi=icr.get_campaign_roi_from_ad_revenue_ad_spend_additional_production_cost(ad_revenue=pilot_campaign_survey_ad_revenue, ad_spend=pilot_campaign_survey_ad_spend, additional_production_cost=additional_production_cost)

#calculate pilot campaign roi uplift
pilot_campaign_model_survey_difference_roi=pilot_campaign_model_roi-pilot_campaign_survey_roi


#calculate drive sales campaign model/survey roi per addtional production cost values
drive_sales_campaign_model_roi=icr.get_campaign_roi_from_ad_revenue_ad_spend_additional_production_cost(ad_revenue=drive_sales_campaign_model_ad_revenue, ad_spend=drive_sales_campaign_model_ad_spend, additional_production_cost=additional_production_cost)
drive_sales_campaign_survey_roi=icr.get_campaign_roi_from_ad_revenue_ad_spend_additional_production_cost(ad_revenue=drive_sales_campaign_survey_ad_revenue, ad_spend=drive_sales_campaign_survey_ad_spend, additional_production_cost=additional_production_cost)

#calculate drive sales campaign model roi uplift
drive_sales_campaign_model_survey_difference_roi=drive_sales_campaign_model_roi-drive_sales_campaign_survey_roi


plt.rcParams.update({'font.size': 16})
fig, axes=plt.subplots(nrows=1, ncols=1, figsize=(10, 5))


#plot pilot campaign and drive sales campaign curves
axes.plot(additional_production_cost, pilot_campaign_model_survey_difference_roi*100, label='Pilot Campaign')
axes.plot(additional_production_cost, drive_sales_campaign_model_survey_difference_roi*100, label='Drive Sales Campaign')


pilot_campaign_ROAS_two_tuple=(int(additional_production_cost[0]), int(pilot_campaign_model_survey_difference_roi[0]*100))
drive_sales_campaign_ROAS_two_tuple=(int(additional_production_cost[0]), int(drive_sales_campaign_model_survey_difference_roi[0]*100))
axes.plot(0, pilot_campaign_ROAS_two_tuple[1], 'ro', label=f'ROAS Uplift Estimate (${pilot_campaign_ROAS_two_tuple[0]}, {pilot_campaign_ROAS_two_tuple[1]}%)')
axes.plot(0, drive_sales_campaign_ROAS_two_tuple[1], 'bo', label=f'ROAS Uplift Estimate (${drive_sales_campaign_ROAS_two_tuple[0]}, {drive_sales_campaign_ROAS_two_tuple[1]}%)')




axes.set_xlabel(xlabel_string)
axes.set_ylabel(ylabel_string)
axes.set_title(title_string)
axes.xaxis.set_major_formatter('${x:1.0f}')
axes.yaxis.set_major_formatter(mtick.PercentFormatter())
axes.legend()

axes.tick_params(axis='both', which='both', bottom=True, left=True, direction='out', length=6, width=1,)
axes.set_xlim(-0,10000)
axes.set_ylim(-0,220)

axes.set_xticks([0, 5000, 10000,])
axes.set_yticks([0, 100, 200,])
axes.set_xticklabels(['$0', '$5K', '$10K',])
axes.set_yticklabels(['0%', '100%', '200%',])



#save it
plt.savefig(figure_filename, bbox_inches='tight', dpi=dpi)

plt.show()

time: 138 ms (started: 2023-09-28 17:20:13 -07:00)

In [114]:

%watermark -a "Paul Jacob" -d -t -v -p numpy,pandas -g

Author: Paul Jacob

Python implementation: CPython
Python version       : 3.9.12
IPython version      : 8.2.0

numpy : 1.21.5
pandas: 1.4.2

Git hash: 964b5d89b7a257f3efd946cda8d59619eb1f33f2

time: 91.5 ms (started: 2023-09-28 17:20:13 -07:00)