In this notebook we compute 95% Confidence Intervals for 10-Fold Cross Validation as a comparison to the Standard Error Method
The standard error is the standard deviation of the sampling distribution of a statistic (here, the mean accuracy). Because our sample is small and the population standard deviation is unknown, we use the Student t-distribution, which has heavier tails than the Gaussian and varies depending on the size of the sample.
!pip install git+https://github.com/pattersonconsulting/ml_tools.git
Collecting git+https://github.com/pattersonconsulting/ml_tools.git Cloning https://github.com/pattersonconsulting/ml_tools.git to /tmp/pip-req-build-nm_jnpmr Running command git clone -q https://github.com/pattersonconsulting/ml_tools.git /tmp/pip-req-build-nm_jnpmr Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (1.1.5) Requirement already satisfied: sklearn in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (0.0) Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (3.2.2) Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (1.19.5) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (1.3.2) Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (2.8.2) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (2.4.7) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (0.11.0) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.1->matplotlib->ml-valuation==0.0.1) (1.15.0) Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->ml-valuation==0.0.1) (2018.9) Requirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (from sklearn->ml-valuation==0.0.1) (0.22.2.post1) Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->sklearn->ml-valuation==0.0.1) (1.4.1) Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from 
scikit-learn->sklearn->ml-valuation==0.0.1) (1.1.0)
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import roc_curve, auc, confusion_matrix
from matplotlib import pyplot
import ml_valuation
from ml_valuation import model_valuation
from ml_valuation import model_visualization
# Load the breast-cancer dataset as raw (features, labels) numpy arrays.
arr_X, arr_y = load_breast_cancer(return_X_y=True)
print(f"X: {arr_X.shape}")
X: (569, 30)
# Peek at the raw feature matrix (569 samples x 30 features, per the shape printed above).
print(arr_X)
[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01] [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02] [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02] ... [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02] [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01] [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]
# Class balance check: map each label to how often it occurs.
labels, freqs = np.unique(arr_y, return_counts=True)
dict(zip(labels, freqs))
{0: 212, 1: 357}
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV, StratifiedKFold
# Model + cross-validation setup: a logistic-regression classifier evaluated
# with stratified 10-fold CV; per-fold accuracies accumulate in `stats`.
classifier_kfold_LR = LogisticRegression(solver='newton-cg')

k = 10
cv = StratifiedKFold(n_splits=k)
stats = []

# Wrap the raw arrays in DataFrames so folds can be sliced with .iloc below.
X = pd.DataFrame(arr_X)
y = pd.DataFrame(arr_y)
# Run stratified k-fold cross validation: fit on each training split,
# score accuracy on the held-out split, and collect the per-fold scores.
for fold, (idx_train, idx_test) in enumerate(cv.split(X, y)):
    # Turn the fold's row indexes into concrete train/test partitions.
    Xtrain, Xtest = X.iloc[idx_train], X.iloc[idx_test]
    ytrain, ytest = y.iloc[idx_train], y.iloc[idx_test]

    print(f"Running CV Fold-{fold}")

    # Fit on this fold's training data; ravel() flattens the single-column
    # label DataFrame into the 1-D array sklearn expects.
    classifier_kfold_LR.fit(Xtrain, ytrain.values.ravel())

    # Score hard predictions on the held-out split.
    fold_accuracy = accuracy_score(ytest, classifier_kfold_LR.predict(Xtest))
    stats.append(fold_accuracy)

    print(f"Accuracy: {fold_accuracy}")
    print("-----")

# Point estimate: mean accuracy across all k folds.
mean_score = np.mean(stats)
print(f"\n\nAverage Accuracy Across All Folds: {mean_score:.4f}")
Running CV Fold-0 Accuracy: 0.9824561403508771 ----- Running CV Fold-1
/usr/local/lib/python3.7/dist-packages/scipy/optimize/linesearch.py:314: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) /usr/local/lib/python3.7/dist-packages/sklearn/utils/optimize.py:204: UserWarning: Line Search failed warnings.warn('Line Search failed')
Accuracy: 0.9122807017543859 ----- Running CV Fold-2 Accuracy: 0.9298245614035088 ----- Running CV Fold-3 Accuracy: 0.9473684210526315 ----- Running CV Fold-4 Accuracy: 0.9824561403508771 ----- Running CV Fold-5 Accuracy: 0.9824561403508771 ----- Running CV Fold-6 Accuracy: 0.9298245614035088 ----- Running CV Fold-7 Accuracy: 0.9473684210526315 ----- Running CV Fold-8 Accuracy: 0.9649122807017544 ----- Running CV Fold-9 Accuracy: 0.9642857142857143 ----- Average Accuracy Across All Folds: 0.9543
(1) determine whether to use the t-distribution or the normal distribution (we use t here, since n is small and the population standard deviation is unknown)
(2) if we have the sample standard deviation
(3) sample size as another indicator
(4) computing confidence intervals for the population mean
CIs = sample mean +/- t (v, alpha/2) * s / sqrt(n)
v = n - 1
alpha / 2 = (1 - CL) / 2 = (1 - 0.95) / 2 = 0.05 / 2 = 0.025
t = look up in student's t-distribution table with v and alpha/2 values
t (9, 0.025) = 2.262
EBM: "margin of error"
ebm = (t) (std / sqrt(n))
CI = mean +/- (t) (std / sqrt(n))
CI = mean +/- ebm
What does this confidence interval mean?
This confidence interval means that we are 95% sure that the average accuracy of this model over the full population of breast cancer data is somewhere between 93.7% and 97.1%.
"A confidence interval is a range of values, bounded above and below the statistic's mean, that likely would contain an unknown population parameter. Confidence level refers to the percentage of probability, or certainty, that the confidence interval would contain the true population parameter when you draw a random sample many times."
if confidence level refers to "percentage of probability" of certainty, then we can assume that 95% of the time our accuracy should be between the lower and upper bound of the estimate.
# 95% confidence interval for the population-mean accuracy:
#   CI = mean +/- t(v, alpha/2) * s / sqrt(n),   v = n - 1
t = 2.262  # t(9, 0.025) from Student's t table: v = k - 1 = 9, alpha/2 = 0.025

# BUG FIX: the CI formula requires the *sample* standard deviation
# (Bessel-corrected, ddof=1). np.std defaults to the population form
# (ddof=0), which underestimates the spread for small n.
std_dev_sample = np.std(stats, ddof=1)
print("\n\nSample STD DEV: " + str(std_dev_sample))

# EBM ("error bound for the mean" / margin of error) = t * s / sqrt(n).
ebm = (1 / np.sqrt(k)) * std_dev_sample * t
print("\n\nEBM (Accuracy) Across All Folds: ( " + str("{:.4f}".format(ebm)) + ")")

print("CI Ranges 95%:")
low_end_range = mean_score - ebm
high_end_range = mean_score + ebm
print("High: " + str(high_end_range))
print("Low : " + str(low_end_range))
Sample STD DEV: 0.023770661464399972 EBM (Accuracy) Across All Folds: ( 0.0170) CI Ranges 95%: High: 0.9713266337249031 Low : 0.9373199828164502
These results are comparable to bootstrap632 results, for reference, on the same dataset / classifier combination
Notable Links