#!/usr/bin/env python # coding: utf-8 # ## BreezeGen - Imbalanced Classification # # Analyze an imbalanced dataset. Train models using the imbalanced-learn library for over- and under-sampling, and compare model performance. # # BreezeGen is a maintenance company specializing in wind turbines. They have supplied ciphered sensor data. Generator maintenance is costly, so preventing a full replacement is paramount. # # * An inspection costs \$5,000 regardless of whether any repairs end up being necessary. # # * A repair costs \$15,000. # # * A complete device replacement costs \$40,000. We obviously want to minimize the frequency of replacement. # # The target variable encodes the failure state, with 0 indicating no failure and 1 signalling failure. # ## Importing libraries # First update scikit-learn and imbalanced-learn libraries. # In[ ]: get_ipython().system(' pip install scikit-learn -U') # In[ ]: get_ipython().system(' pip install imbalanced-learn -U') # Then we can import the necessary libraries. # In[ ]: import numpy as np import pandas as pd from matplotlib import pyplot as plt import seaborn as sns # data processing from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.impute import KNNImputer from imblearn.over_sampling import SMOTE from imblearn.under_sampling import RandomUnderSampler # models building from sklearn.model_selection import StratifiedKFold, cross_val_score from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import BaggingClassifier, RandomForestClassifier from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier from sklearn.ensemble import VotingClassifier from xgboost import XGBClassifier # model assessment and production from sklearn import metrics from sklearn.model_selection import RandomizedSearchCV, GridSearchCV from sklearn.pipeline import Pipeline, make_pipeline from sklearn.ensemble import StackingClassifier # In[ ]: # display setups # plotting theme sns.set_theme() # dataframe display all columns pd.set_option('display.max_columns',None) # ## Loading Data # In[ ]: df=pd.read_csv('dataset_train.csv') # In[ ]: data=df.copy() # ## EDA # In[ ]: data.sample(10,random_state=1) # * The ciphered data consists of floating point numbers with values near 0. There are both positive and negative values. # # * The target variable is categorical, Boolean in fact. There seems, at first glance, to be imbalance in the target classes. # In[ ]: data['Target'].value_counts(normalize=True) # Indeed, around 95% of the target observations are 0. From the data dictionary, we learn this indicates 'No Failure', which is ideal. Thus, only around 5% of observations are 'Failure' cases. # In[ ]: data.info() # Just to double-check the integrity of our data, we confirm that the forty ciphered features are all stored as floats. This indicates to me that there aren't any errant entries, such as '?', which would force the column to be string data type. # # We see some columns have missing entries. # In[ ]: data.isna().sum().loc[data.isna().sum()>0].index.tolist() # Columns 'V1' and 'V2' have missing values. # In[ ]: data.describe().T # Looking at the table of statistics, it is challenging to glean much insight. From the statistics, it appears that most of the distributions are fairly symmetric.
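# One quick way to back up the symmetry observation numerically is to look at the skewness of each ciphered feature (values near 0 suggest a roughly symmetric distribution). A minimal, optional sketch using only the `data` frame loaded above:

# In[ ]:

# per-feature skewness of the ciphered columns; the Target column is excluded
feature_skew=data.drop('Target',axis=1).skew()
feature_skew.sort_values()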
# In[ ]: plt.figure(figsize=(8,5)) plt.title('Target Distribution',fontsize=20) sns.countplot(data=data,x='Target'); # We find that cases requiring the replacement of equipment are fairly rare. # # In the next section, we will process the data for modeling. This includes scaling the features so they have approximately mean 0 and standard deviation 1. # ## Data Pre-processing # In[ ]: data['Target']=pd.Categorical(data['Target']) # To start, we change the target variable type to categorical. # In[ ]: X=data.drop('Target',axis=1) y=data['Target'] # In[ ]: X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.25,stratify=y,random_state=57) # We separate our data set. First, we break off the target variable. Then we split X and y into training and validation sets. Recall that we have a separate CSV file with test data for final model production, so we need not apportion a test set here. # In[ ]: pre=make_pipeline( StandardScaler(), KNNImputer() ).set_output(transform='pandas') # We next make a simple pre-processing pipeline that outputs pandas DataFrames. We will use this to scale our data and impute missing values: # * We will scale columns so they have mean 0 and standard deviation 1. # * We impute missing data using the K Nearest Neighbors method. By default, we use 5 neighbors. # In[ ]: X_train=pre.fit_transform(X_train) # We fit the pipeline on the training data and transform it. # In[ ]: X_val=pre.transform(X_val) # By fitting the pipeline on the _training_ data and then transforming the validation data using this pipeline, we avoid data leakage: The data in the validation set does not influence the means, standard deviations, or nearest neighbors calculation. # ## Model Building # ### Functions # # The following functions will assist with model building and assessment. Some are adapted from my previous projects. # In[ ]: def confusion_heatmap(model,show_scores=True): '''Heatmap of confusion matrix for model performance on validation data.''' actual=y_val predicted=model.predict(X_val) # generate confusion matrix cm=metrics.confusion_matrix(actual,predicted) cm=np.flip(cm).T # heatmap labels labels=['TP','FP','FN','TN'] cm_labels=np.array(cm).flatten() cm_percents=np.round((cm_labels/np.sum(cm))*100,3) annot_labels=[] for i in range(4): annot_labels.append(str(labels[i])+'\nCount:'+str(cm_labels[i])+'\n'+str(cm_percents[i])+'%') annot_labels=np.array(annot_labels).reshape(2,2) # print figure plt.figure(figsize=(8,5)) plt.title('Confusion Matrix',fontsize=20) sns.heatmap(data=cm, annot=annot_labels, annot_kws={'fontsize':'x-large'}, xticklabels=[1,0], yticklabels=[1,0], cmap='Greens', fmt='s') plt.xlabel('Actual',fontsize=14) plt.ylabel('Predicted',fontsize=14) plt.tight_layout(); # scores if show_scores==True: scores=['Accuracy','Precision','Recall','F1'] score_list=[metrics.accuracy_score(actual,predicted), metrics.precision_score(actual,predicted), metrics.recall_score(actual,predicted), metrics.f1_score(actual,predicted)] df=pd.DataFrame(index=scores) df['Scores']=score_list return df return # alias function name to something shorter ch=confusion_heatmap # The function above prints a confusion matrix of model performance on validation data. It also prints a table with validation accuracy, precision, recall, and F1 scores. 
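# As an optional cross-check of the custom heatmap helper, scikit-learn ships built-in reporting utilities that summarize the same information. A minimal sketch (not used in the analysis below), where `model` stands for any fitted classifier:

# In[ ]:

# built-in alternatives: a per-class text report and a ready-made confusion matrix plot
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

def quick_report(model):
    '''Print precision/recall/F1 per class and plot a confusion matrix on validation data.'''
    y_pred=model.predict(X_val)
    print(classification_report(y_val,y_pred))
    ConfusionMatrixDisplay.from_predictions(y_val,y_pred,cmap='Greens');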
# In[ ]: def cv_recall(estimator,sample_strategy=None): '''Compute a recall score using stratified k-fold cross-validation.''' # define data based on sampling strategy if sample_strategy=='over': X_data=X_train_over y_data=y_train_over elif sample_strategy=='under': X_data=X_train_under y_data=y_train_under else: X_data=X_train y_data=y_train # cv strategy e=estimator kfold=StratifiedKFold(n_splits=5, shuffle=True, random_state=2) # run cv cvs=cross_val_score(estimator=e, X=X_data, y=y_data, scoring='recall', cv=kfold, n_jobs=-1) return cvs.mean() # The function above returns the mean cross-validated recall for a given model. # In[ ]: model_table=pd.DataFrame(columns=['Train Acc', 'Val Acc', 'Train Recall', 'CV Recall', 'Val Recall']) def tabulate(model,name,sample=None,cvs=None): '''Compute train/val accuracy and recall for a given model. Add to table.''' # run predictions with model X_val_pred=model.predict(X_val) if sample is None: y_tr=y_train y_pred=model.predict(X_train) elif sample=='over': y_tr=y_train_over y_pred=model.predict(X_train_over) elif sample=='under': y_tr=y_train_under y_pred=model.predict(X_train_under) else: raise ValueError("Sample parameter takes values in {None,'over','under'}.") # cross validation recall if cvs is None: m=cv_recall(model,sample_strategy=sample) else: m=cvs # collect data for new table row model_table.loc[name]=[metrics.accuracy_score(y_tr,y_pred), metrics.accuracy_score(y_val,X_val_pred), metrics.recall_score(y_tr,y_pred), m, metrics.recall_score(y_val,X_val_pred)] return model_table # The function above collects various metrics for evaluating model performance into a comparison table. # ### Model Building with Original Data # #### Decision Tree # In[ ]: dtree=DecisionTreeClassifier(random_state=1) # In[ ]: m=cv_recall(dtree) print(f'Cross-validated recall is {m}.') # A plain decision tree classifier yields a cross-validated recall of 0.72, decent performance for the first attempt. # In[ ]: dtree.fit(X_train,y_train) # In[ ]: ch(dtree) # After fitting the model to the whole training set, we find high accuracy (around 97%), while precision and recall are closer to 75%. # # **Note:** All confusion matrices in this project are compiled using the validation data. # In[ ]: tabulate(dtree,'dtree',cvs=m) # Moreover, we see clear evidence of overfitting when we compare training and validation set results. # #### Logistic Regression # In[ ]: lr=LogisticRegression() # In[ ]: m=cv_recall(lr) print(f'Cross-validated recall is {m}.') # The plain logistic regression model performs far worse than the decision tree, with a cross-validated mean recall of under 50%. In other words, this model misses more than half of the actual failure cases. # In[ ]: lr.fit(X_train,y_train) # In[ ]: ch(lr) # As the negative class (0) is by far the majority class, this logistic regression scores better on precision because it predicts few false positives. # In[ ]: tabulate(lr,'Logistic Regr',cvs=m) # While this model's performance is poor, at least it isn't overfit. # #### Bagging Classifier # In[ ]: bag=BaggingClassifier(random_state=1) m=cv_recall(bag) print(f'Cross-validated recall is {m}.') # A CV recall of 71% is much better than the logistic regression, similar to the decision tree score. # In[ ]: bag.fit(X_train,y_train) ch(bag) # In addition to good recall, this model produces high accuracy and precision!
# In[ ]: tabulate(bag,'Bagging Clfr',cvs=m) # Unfortunately, when comparing training and validation scores, we find that this model is also overfitting (note recall scores). # #### Random Forest # In[ ]: rf=RandomForestClassifier(random_state=1) m=cv_recall(rf) print(f'Cross-validated recall is {m}.') # Our second bagging model, random forest, scores a bit better, with a CV recall of around 76%. # In[ ]: rf.fit(X_train,y_train) ch(rf) # Again, precision and accuracy are stellar on the validation data. Note in the confusion matrix above that there are **SIX** false positives in a data set of 10,000! This translates to a specificity (true negative rate) of over 99.9%. # In[ ]: tabulate(rf,'Random Forest',cvs=m) # Our model unfortunately suffers from overfitting. # #### AdaBoost # In[ ]: abc=AdaBoostClassifier(random_state=1) m=cv_recall(abc) print(f'Cross-validated recall is {m}.') # While AdaBoost yields a lower CV recall score, at only around 61%, we are hopeful that boosting methods will be less susceptible to overfitting. # In[ ]: abc.fit(X_train,y_train) ch(abc) # An accuracy of 97% and good precision is promising, but the surprising result in the table above is the recall, which is higher than the CV recall calculated above. We find 65% recall on validation data with this model. # In[ ]: tabulate(abc,'AdaBoost',cvs=m) # While there is not much evidence of overfitting here, the model performance is appreciably lower than other models. # #### Gradient Boosting # In[ ]: gbc=GradientBoostingClassifier(random_state=1) m=cv_recall(gbc) print(f'Cross-validated recall is {m}.') # Gradient boosting does much better than AdaBoost, with a CV recall of 72%. # In[ ]: gbc.fit(X_train,y_train) ch(gbc) # As with many other models, recall is actually the lowest score of the four metrics above, with accuracy and precision being much higher. # In[ ]: tabulate(gbc,'Grad Boost',cvs=m) # So far, the gradient boosting model is one of the better models. The only issue is evidence of overfitting (note the recall). # #### XGBoost # In[ ]: xgb=XGBClassifier(random_state=1) m=cv_recall(xgb) print(f'Cross-validated recall is {m}.') # A score of 74% is comparable to the better performing models above. # In[ ]: xgb.fit(X_train,y_train) ch(xgb) # Recall at 77% is one of the best we've seen so far, and both precision and accuracy scores are quite high. # In[ ]: tabulate(xgb,'XGBoost',cvs=m) # Note how much lower the CV recall score is than the training and validation recall scores. The latter two are comparable, so I do not fear overfitting here. # ### Model Building with Oversampled Data # # In[ ]: sm=SMOTE( k_neighbors=5, sampling_strategy=1.0, random_state=1 ) X_train_over,y_train_over=sm.fit_resample(X_train,y_train) # Now we oversample the minority class of the target variable with SMOTE, so that the two classes are balanced 1:1 in the training data. # #### Decision Tree [Oversampled] # In[ ]: dtree_over=DecisionTreeClassifier(random_state=1) m=cv_recall(dtree_over,sample_strategy='over') print(f'Cross-validated recall is {m}.') # With such a high CV recall score (97%!), I am already skeptical of this model. # In[ ]: dtree_over.fit(X_train_over,y_train_over) ch(dtree_over) # Unlike most of the previous models, recall is higher here than precision. This is certainly due to the oversampling. # In[ ]: tabulate(dtree_over,'dtree (over)',sample='over',cvs=m) # Unsurprisingly though, this model is exceptionally overfit.
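# Before working through the remaining estimators, it is worth sanity-checking what SMOTE actually produced. A minimal, illustrative check of the class counts before and after resampling (with `sampling_strategy=1.0` the resampled classes should sit at a 1:1 ratio):

# In[ ]:

# class counts before and after SMOTE oversampling
print(y_train.value_counts())
print(y_train_over.value_counts())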
# #### Logistic Regression [Oversampled] # In[ ]: lr_over=LogisticRegression() m=cv_recall(lr_over,sample_strategy='over') print(f'Cross-validated recall is {m}.') # A CV recall score of 87% is one of the highest we've seen so far, but isn't so high as to immediately suggest overfitting. # In[ ]: lr_over.fit(X_train_over,y_train_over) ch(lr_over) # Both accuracy and recall are quite good with this logistic regression model. The cost, however, is disastrous precision: 28%! Barely one in four of its failure predictions corresponds to an actual failure. # In[ ]: tabulate(lr_over,'Logistic Regr (over)',sample='over',cvs=m) # There is no real evidence of overfitting, so this model looks solid. Great accuracy and recall in both training and validation. Its only downside is terrible precision. # #### Bagging Classifier [Oversampled] # In[ ]: bag_over=BaggingClassifier(random_state=1) m=cv_recall(bag_over,sample_strategy='over') print(f'Cross-validated recall is {m}.') # As with the last decision tree, this CV recall is high enough (97%) to raise suspicion of overfitting. # In[ ]: bag_over.fit(X_train_over,y_train_over) ch(bag_over) # Great scores across the board, while promising, could be symptomatic of overfitting. # In[ ]: tabulate(bag_over,'Bagging (over)',sample='over',cvs=m) # Sure enough, this model is overfit (see recall). # #### Random Forest [Oversampled] # In[ ]: rf_over=RandomForestClassifier(random_state=1) m=cv_recall(rf_over,sample_strategy='over') print(f'Cross-validated recall is {m}.') # Another questionably high score here: 98%. # In[ ]: rf_over.fit(X_train_over,y_train_over) ch(rf_over) # Again, it would be lovely if these scores were believable, but we should first look at the table below to assess whether this model is overfit. # In[ ]: tabulate(rf_over,'Rand Forest (over)',sample='over',cvs=m) # Indeed, the difference in training and validation recall is enough to confirm overfitting. # #### AdaBoost [Oversampled] # In[ ]: abc_over=AdaBoostClassifier(random_state=1) m=cv_recall(abc_over,sample_strategy='over') print(f'Cross-validated recall is {m}.') # We have seen good results with boosting before, so a score of 88% is promising. # In[ ]: abc_over.fit(X_train_over,y_train_over) ch(abc_over) # Here we see that the cost of good recall is poor precision. Indeed, the percentage of false positives is nearly double that of true positives predicted by this AdaBoost model (see confusion matrix). # In[ ]: tabulate(abc_over,'AdaBoost (over)',sample='over',cvs=m) # This boosting model offers one of the higher recall scores we have seen without overfitting. # #### Gradient Boosting [Oversampled] # In[ ]: gbc_over=GradientBoostingClassifier(random_state=1) m=cv_recall(gbc_over,sample_strategy='over') print(f'Cross-validated recall is {m}.') # As with the boosting models trained on the original data, the gradient boosting model here has a better CV recall score than the previous AdaBoost model, at around 91%. # In[ ]: gbc_over.fit(X_train_over,y_train_over) ch(gbc_over) # Here too we have good performance on all four metrics. In contrast to the AdaBoost model above, this model offers much better precision, in particular, a score above 50%, meaning more of its failure predictions are right than wrong. # In[ ]: tabulate(gbc_over,'Grad Boost (over)',sample='over',cvs=m) # As we have seen with boosting models, this model does not have much issue with overfitting.
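# As an aside, resampling is not the only way to handle imbalance: several of these estimators expose weighting parameters that penalize mistakes on the minority class instead of altering the data. This cost-sensitive approach is a different technique from SMOTE and is not pursued further in this notebook; a minimal sketch of what it might look like on the original training data:

# In[ ]:

# cost-sensitive alternatives to resampling (illustrative only, not tuned)
lr_weighted=LogisticRegression(class_weight='balanced')                 # reweight classes inversely to their frequency
rf_weighted=RandomForestClassifier(class_weight='balanced',random_state=1)
xgb_weighted=XGBClassifier(scale_pos_weight=19,random_state=1)          # ~95/5 class split suggests a positive-class weight near 19

print(cv_recall(lr_weighted),cv_recall(rf_weighted),cv_recall(xgb_weighted))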
# #### XGBoost [Oversampled] # In[ ]: xgb_over=XGBClassifier(random_state=1) m=cv_recall(xgb_over,sample_strategy='over') print(f'Cross-validated recall is {m}.') # With comparable performance to gradient boosting, a score of about 91% for XGBoost is unsurprising. # In[ ]: xgb_over.fit(X_train_over,y_train_over) ch(xgb_over) # Good or great performance on all metrics, and more true positives than either false positives or false negatives. # In[ ]: tabulate(xgb_over,'XGBoost (over)',sample='over',cvs=m) # Looking back over this table so far, the gradient boosting and XGBoost models trained on oversampled data have performed best. Random forest boasts promising precision if only we could curtail overfitting. # ### Model Building with Undersampled Data # In[ ]: rus=RandomUnderSampler( sampling_strategy=1.0, random_state=1 ) X_train_under,y_train_under=rus.fit_resample(X_train,y_train) # Now we undersample the majority class of the target variable. This is another way to balance the classes in the target. # #### Decision Tree [Undersampled] # In[ ]: dtree_under=DecisionTreeClassifier(random_state=1) m=cv_recall(dtree_under,sample_strategy='under') print(f'Cross-validated recall is {m}.') # A score of 85% is good for this estimator, but since previous decision trees had a tendency to overfit, I will wait to assess performance until I see the other metrics. # In[ ]: dtree_under.fit(X_train_under,y_train_under) ch(dtree_under) # This decision tree is the worst on precision (see the concerning rate of false positives). Recall and accuracy, however, are good. # In[ ]: tabulate(dtree_under,'dtree (under)',sample='under',cvs=m) # As with the other decision trees, this model overfits disastrously. # #### Logistic Regression [Undersampled] # In[ ]: lr_under=LogisticRegression() m=cv_recall(lr_under,sample_strategy='under') print(f'Cross-validated recall is {m}.') # A comparable score to the last model, 84% is promising! # In[ ]: lr_under.fit(X_train_under,y_train_under) ch(lr_under) # Again, the false positive rate really cuts into precision (and thus F1) score. Accuracy and recall are good though. # In[ ]: tabulate(lr_under,'Logistic Regr (under)',sample='under',cvs=m) # Thankfully, this model doesn't overfit. Decent recall would make this a top contender, were it not for the very poor precision, which would end up costing BreezeGen greatly in inspection costs. # #### Bagging Classifier [Undersampled] # In[ ]: bag_under=BaggingClassifier(random_state=1) m=cv_recall(bag_under,sample_strategy='under') print(f'Cross-validated recall is {m}.') # Bagging has been overfitting, so I'm curious to see if the model trained on undersampled data avoids this issue. The cross-validated recall score is 87%. # In[ ]: bag_under.fit(X_train_under,y_train_under) ch(bag_under) # With about as many false positives as true positives (see confusion matrix), precision lands at roughly 50%, so only half of its failure predictions are correct. That being said, accuracy and recall are both great. # In[ ]: tabulate(bag_under,'Bagging (under)',sample='under',cvs=m) # Unfortunately overfitting plagues this model too. Note especially the disparity in recall. # #### Random Forest [Undersampled] # In[ ]: rf_under=RandomForestClassifier(random_state=1) m=cv_recall(rf_under,sample_strategy='under') print(f'Cross-validated recall is {m}.') # This model scores 98%, just like the random forest trained on oversampled data.
# In[ ]: rf_under.fit(X_train_under,y_train_under) ch(rf_under) # As with the random forest model trained on oversampled data, the precision is good. Accuracy and recall are high. # In[ ]: tabulate(rf_under,'Rand Forest (under)',sample='under',cvs=m) # While this model is surely overfit, tuning might just curtail the issue. And with a decent precision score, this model might be worth tuning to hang onto the cost savings good precision affords. # #### AdaBoost [Undersampled] # In[ ]: abc_under=AdaBoostClassifier(random_state=1) m=cv_recall(abc_under,sample_strategy='under') print(f'Cross-validated recall is {m}.') # The cross-validated recall score for this AdaBoost classifier is 86%. # In[ ]: abc_under.fit(X_train_under,y_train_under) ch(abc_under) # While both accuracy and recall are good on the validation set, precision is really quite poor. # In[ ]: tabulate(abc_under,'AdaBoost (under)',sample='under',cvs=m) # This model performs well for recall, but insufficient precision would cost BreezeGen in the long run. # #### Gradient Boosting [Undersampled] # In[ ]: gbc_under=GradientBoostingClassifier(random_state=1) m=cv_recall(gbc_under,sample_strategy='under') print(f'Cross-validated recall is {m}.') # A CV recall score of 88% is slightly less than that of the Gradient Boosting classifier trained on oversampled data. # In[ ]: gbc_under.fit(X_train_under,y_train_under) ch(gbc_under) # Accuracy and recall are both great, and as this is a boosting model, I do not fear overfitting. Precision, however, sits near 50%: compare the true positives and false positives in the confusion matrix above. # In[ ]: tabulate(gbc_under,'Grad Boost (under)',sample='under',cvs=m) # This is one of the better models we've seen. Great recall and accuracy, at around 91% and 95% respectively. # #### XGBoost [Undersampled] # In[ ]: xgb_under=XGBClassifier(random_state=1) m=cv_recall(xgb_under,sample_strategy='under') print(f'Cross-validated recall is {m}.') # A CV recall score of 88% is good, comparable with the previous gradient boosting model. # In[ ]: xgb_under.fit(X_train_under,y_train_under) ch(xgb_under) # This model scores comparably on accuracy and recall to the previous model, but improves on precision! (Compare 53% to 59%.) # In[ ]: tabulate(xgb_under,'XGBoost (under)',sample='under',cvs=m) # Another promising model. Great recall and accuracy, without excessive detriment to precision. # ## Hyperparameter Tuning # ### Model Finalists # We'll assemble some candidate finalists and compare them in a table. The following function computes many performance metrics for a given model.
# In[ ]: def model_scores(model,*,sample): X_val_pred=model.predict(X_val) if sample is None: y_tr=y_train y_pred=model.predict(X_train) elif sample=='over': y_tr=y_train_over y_pred=model.predict(X_train_over) elif sample=='under': y_tr=y_train_under y_pred=model.predict(X_train_under) else: raise ValueError("Sample parameter takes values in {None,'over','under'}.") ser=pd.Series(dtype=float) # accuracy ser.loc['Train Accuracy']=metrics.accuracy_score(y_tr,y_pred) ser.loc['Validation Accuracy']=metrics.accuracy_score(y_val,X_val_pred) # recall ser.loc['Train Recall']=metrics.recall_score(y_tr,y_pred) ser.loc['Validation Recall']=metrics.recall_score(y_val,X_val_pred) # validation precision and f1 ser.loc['Validation Precision']=metrics.precision_score(y_val,X_val_pred) ser.loc['Validation F1']=metrics.f1_score(y_val,X_val_pred) return ser # In[ ]: finalists=pd.DataFrame() finalists['Bag_over']=model_scores(bag_over,sample='over') finalists['Bag_under']=model_scores(bag_under,sample='under') finalists['RF']=model_scores(rf,sample=None) finalists['RF_over']=model_scores(rf_over,sample='over') finalists['RF_under']=model_scores(rf_under,sample='under') finalists['GB_over']=model_scores(gbc_over,sample='over') finalists['GB_under']=model_scores(gbc_under,sample='under') finalists['XGB_over']=model_scores(xgb_over,sample='over') finalists['XGB_under']=model_scores(xgb_under,sample='under') # From the model building process, we compile a list of candidates for tuning. # In[ ]: finalists # * The bagging classifiers are outperformed by the random forest classifiers in every metric. Accordingly, we would rather just study the random forest models. # # * I am curious to tune at least one model with the original data. Random forest performed best in this category. # # * The random forest models trained on both oversampled and undersampled data are strong. While the undersampled model shows higher validation recall, the oversampled model boasts impressive precision and F1, without much sacrifice in recall. A model with both stellar recall _and_ good precision will further cut down on operating costs for BreezeGen. The only issue with the random forest models is their overfitting. We will tune both to eliminate overfitting and find out which model comes out on top. # # * Both gradient boosting and XGBoost performed exceptionally. Additionally, these models did not show as much overfitting as the random forest models. The models trained on oversampled data and undersampled data boast great accuracy and recall. We will favor the oversampled ones, since their precision beats that of the undersampled versions, and we'll tune both `GB_over` and `XGB_over`. # ### Random Forest # In[ ]: params={'n_estimators':np.arange(100,250,50), 'max_depth':np.arange(3,10), 'class_weight':[None,'balanced']} # We will start by tuning the random forest trained on the original data. We vary `n_estimators` in an attempt to improve performance. We use `max_depth` to control overfitting, and balancing `class_weight` should help with recall on our highly unbalanced data set. # In[ ]: rf_tuned=RandomForestClassifier(random_state=2) search=RandomizedSearchCV(estimator=rf_tuned, param_distributions=params, n_iter=20, scoring='recall', n_jobs=-1, cv=5, verbose=1, random_state=1) search.fit(X_train,y_train) # In[ ]: search.best_params_ # It turns out fewer estimators yielded better recall. A depth of 5, roughly in the middle of our proposed range, is better, and balanced class weights won out over unweighted classes.
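# Beyond `best_params_`, the fitted search object also keeps the full cross-validation results, which can show how sensitive recall is to each setting. A minimal, optional sketch using the `search` object fitted above:

# In[ ]:

# top-ranked parameter combinations by mean cross-validated recall
cv_results=pd.DataFrame(search.cv_results_)
cv_results[['params','mean_test_score','std_test_score','rank_test_score']].sort_values('rank_test_score').head()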
# In[ ]: best_rf=search.best_params_ # fit model with best params rf_tuned=RandomForestClassifier( random_state=2, n_jobs=-1, **best_rf ) rf_tuned.fit(X_train,y_train) # The trained estimator has the following performance on validation data. # In[ ]: ch(rf_tuned) # In[ ]: tuned_models=pd.DataFrame() tuned_models['RF']=model_scores(rf_tuned,sample=None) tuned_models # Accuracy is stellar and certainly **not** overfit. Recall is good, at around 87%. Precision is a bit lower, at not much more than 50%, so nearly half of the flagged failures would be false alarms. While precision is not our number one priority, other finalist models demonstrate good precision too, which would further reduce costs for BreezeGen. # ### Random Forest [Oversampled] # In[ ]: params={'n_estimators':np.arange(250,350,25), 'max_depth':np.arange(4,9), 'max_features':['sqrt',0.5]} # Again, we vary the number of estimators with the goal of improving performance. We will curtail overfitting with the `max_depth` and `max_features` parameters. The latter should also aid performance. (After many trials, I discovered that low values of `n_estimators` caused some overfitting, so I set 250 as the minimum value in the parameter distribution.) # # Additionally, we instantiate the estimator with `min_samples_leaf=2` to further prevent overfitting: a leaf cannot consist of a single datum. This reduces occurrences of the model memorizing noise in the training data. # In[ ]: rf_over_tuned=RandomForestClassifier(random_state=2,min_samples_leaf=2) search=RandomizedSearchCV(estimator=rf_over_tuned, param_distributions=params, n_iter=15, scoring='recall', n_jobs=-1, cv=5, verbose=1, random_state=1) search.fit(X_train_over,y_train_over) # In[ ]: search.best_params_ # We find that greater depth and more estimators increase recall. Additionally, taking 50% of features (greater than $\sqrt{\text{num\_features}}$) yielded a higher score. # In[ ]: best_rf_over=search.best_params_ # fit model with best params rf_over_tuned=RandomForestClassifier( random_state=2, min_samples_leaf=2, n_jobs=-1, **best_rf_over ) rf_over_tuned.fit(X_train_over,y_train_over) # In[ ]: ch(rf_over_tuned) # The metrics for validation data are great: 91% recall and around 98% accuracy. Precision is good too, at around 75%, yielding an F1 score of 82%. Note especially the rarity of false negatives in the confusion matrix above. # In[ ]: tuned_models['RF_over']=model_scores(rf_over_tuned,sample='over') tuned_models # Comparing with training data, there's not much concern for overfitting here. Recall is locked in around 91%, and precision is a good improvement on the previous random forest model. # ### Random Forest [Undersampled] # In[ ]: params={'n_estimators':np.arange(150,300,50), 'max_depth':np.arange(3,10), 'max_features':['sqrt',0.5]} # As with the last model, we will test values for `n_estimators`, `max_depth`, and `max_features`. Mostly, we are looking to prevent overfitting. # In[ ]: rf_under_tuned=RandomForestClassifier(random_state=2,min_samples_leaf=2) search=RandomizedSearchCV(estimator=rf_under_tuned, param_distributions=params, n_iter=30, scoring='recall', n_jobs=-1, cv=5, verbose=1, random_state=1) search.fit(X_train_under,y_train_under) # In[ ]: search.best_params_ # In this case, fewer estimators and fewer features yielded better results. Like the previous model, a greater depth was preferable.
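# As a side note, `RandomizedSearchCV` refits the best configuration on the full training data by default (`refit=True`), so the tuned model can also be pulled straight from the search object rather than re-instantiated by hand. A minimal, equivalent sketch (the explicit refit in the next cell is kept for clarity; `rf_under_best` is just an illustrative name):

# In[ ]:

# illustrative shortcut: the refitted best model is already stored on the search object
rf_under_best=search.best_estimator_
ch(rf_under_best)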
# In[ ]: best_rf_under=search.best_params_ # fit model with best params rf_under_tuned=RandomForestClassifier( random_state=2, min_samples_leaf=2, n_jobs=-1, **best_rf_under ) rf_under_tuned.fit(X_train_under,y_train_under) # In[ ]: ch(rf_under_tuned) # In[ ]: tuned_models['RF_under']=model_scores(rf_under_tuned,sample='under') tuned_models # Performance here is quite similar to the last model. What's different, however, is the consistency across data sets: the difference between training and validation scores is less than that of the previous model. Recall is reliably 91%. It only falls short of the previous model in precision. # ### Gradient Boosting [Oversampled] # In[ ]: params={'n_estimators':np.arange(50,125,25), 'subsample':[0.5,0.75], 'max_depth':[3,4]} # After many trials, the main issue with tuned gradient boosting was overfitting. Keeping `n_estimators` low helped curtail this issue, as did taking values for `subsample` less than 1. Adjusting `max_depth` assisted with increasing precision without much impact on recall. # In[ ]: gbc_over_tuned=GradientBoostingClassifier(random_state=2,min_samples_leaf=4) search=RandomizedSearchCV(estimator=gbc_over_tuned, param_distributions=params, n_iter=8, scoring='recall', n_jobs=-1, cv=5, verbose=1, random_state=1) search.fit(X_train_over,y_train_over) # In[ ]: search.best_params_ # The model certainly leaned toward the high end of `n_estimators`, but previous trials revealed that a value any higher than 100 introduced overfitting issues. Better results were observed with lower `subsample` and higher `max_depth`. # In[ ]: best_gbc_over=search.best_params_ # fit model with best params gbc_over_tuned=GradientBoostingClassifier( random_state=2, min_samples_leaf=4, **best_gbc_over ) gbc_over_tuned.fit(X_train_over,y_train_over) # In[ ]: ch(gbc_over_tuned) # In[ ]: tuned_models['GBC_over']=model_scores(gbc_over_tuned,sample='over') tuned_models # This model performs comparably to `RF_over`, which is to say, fantastically! Accuracy is solidly in the 95-98% range, with recall at 91%. Precision clocks in at 74%, one of our top scores for these finalist models. # ### XGBoost [Oversampled] # In[ ]: params={'eta':[0.05,0.1,0.15], 'colsample_bytree':[0.5,0.75,1.0], 'max_depth':np.arange(2,5)} # After much experimentation, I narrowed down the parameter space to `eta` (=`learning_rate`), `colsample_bytree`, and `max_depth`. All three parameters will be used to control overfitting and improve performance on secondary metrics, like precision, since recall is solidly around 90% across a large swath of the parameter space. # # What's more, by instantiating the classifier with `tree_method='gpu_hist'`, training time is lightning fast! So fast, in fact, that I am able to run exhaustive parameter searches with `GridSearchCV` nearly instantly. (The full grid here is only 3 × 3 × 3 = 27 combinations, or 135 fits with 5-fold cross-validation.) # In[ ]: xgb_over_tuned=XGBClassifier(random_state=1, tree_method='gpu_hist') go=GridSearchCV(estimator=xgb_over_tuned, param_grid=params, scoring='recall', cv=5, n_jobs=-1, verbose=1) go.fit(X_train_over,y_train_over) # In[ ]: go.best_params_ # The best setting for `colsample_bytree` turns out to be the default. I found that `eta` wanted to be as low as possible, but any lower than 0.05 threatened overfitting. Reducing `max_depth` to 4 (from the default of 6) certainly did prevent overfitting.
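# One version-dependent note, stated as an assumption rather than something exercised in this notebook: newer XGBoost releases (2.0 and later) deprecate `tree_method='gpu_hist'` in favor of `tree_method='hist'` combined with a `device` argument. A minimal sketch of the newer spelling, which could be dropped into the same `GridSearchCV` call:

# In[ ]:

# newer-style GPU configuration for XGBoost (assumes xgboost >= 2.0 and an available CUDA device)
xgb_over_gpu=XGBClassifier(random_state=1,tree_method='hist',device='cuda')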
# In[ ]: best_xgb_over=go.best_params_ # fit model with best params xgb_over_tuned=XGBClassifier( random_state=1, tree_method='gpu_hist', **best_xgb_over ) xgb_over_tuned.fit(X_train_over,y_train_over) # In[ ]: ch(xgb_over_tuned) # In[ ]: tuned_models['XGB_over']=model_scores(xgb_over_tuned,sample='over') tuned_models # Performance here is nearly identical to `RF_over` and `GBC_over`. Great recall will surely be a cost savings for BreezeGen, and good precision will help too! # ## Model performance comparison and choosing the final model # In[ ]: tuned_models # * All models boast accuracy in the 95-97% range without overfitting. # # * The random forest trained on the original data scores worst on recall, so it will not be the final model. Every other model has a recall score squarely in the 91-92% range. # # * Precision is an important secondary metric, as has been discussed throughout. While recall will cut down on BreezeGen's greatest expense, namely repair and replacement costs, precision reduces the instances where an inspection is unnecessary. This reduces money wasted on needless inspections. Of the remaining models, the random forest trained on undersampled data scores worst on precision, so it will not be the final model. # ### Test set final performance # To choose the best model, we'll look at performance on completely unseen data. # In[ ]: test_data=pd.read_csv('dataset_test.csv') # In[ ]: X_test=test_data.drop('Target',axis=1) y_test=test_data['Target'] # X_test=pre.transform(X_test) # After splitting the data into predictor features and target, we run `X_test` through the preprocessing pipeline defined earlier. This pipeline includes a scaler and an imputer. # In[ ]: test_perf=pd.DataFrame() def test_scores(model): y_pred=model.predict(X_test) ser=pd.Series(dtype=float) ser.loc['Test Accuracy']=metrics.accuracy_score(y_test,y_pred) ser.loc['Test Recall']=metrics.recall_score(y_test,y_pred) ser.loc['Test Precision']=metrics.precision_score(y_test,y_pred) return ser # This function will compile the test metrics. # In[ ]: test_perf['RF']=test_scores(rf_over_tuned) test_perf['GBC']=test_scores(gbc_over_tuned) test_perf['XGB']=test_scores(xgb_over_tuned) test_perf # Interesting! Recall is lower on the testing data than on training and validation data. All three models score around 97% on accuracy, and around 87% on recall. What sets apart our winner is its precision score. The random forest model has noticeably better precision (76%), which will cut down on inspection costs for BreezeGen. # # The best model is the tuned random forest trained on the oversampled data. # ## Pipelines to build the final model # # Our preprocessing pipeline was already most of what we needed, so we will add to what was already built above. # In[ ]: steps=[ ('Scaler',StandardScaler()), ('Imputer',KNNImputer()), ('Predictor',rf_over_tuned) ] pipe=Pipeline(steps) # The three steps in our pipeline are scaling, imputing, and predicting. # In[ ]: pipe.fit(X_train_over,y_train_over) # This pipeline can be used to quickly process new data and subsequently make predictions for possible inspections. # ## Business Insights and Conclusions # In[ ]: ch(pipe,show_scores=False) # BreezeGen incurs the following maintenance costs: # * \$40,000 - generator replacement, # * \$15,000 - generator repair, # * \$5,000 - generator inspection. # # To save money on maintenance, BreezeGen must reduce occurrences of replacement first.
The company incurs its largest costs when sensors do not alert technicians that a component has failed: the fault leads to degradation or outright failure of the generator, necessitating a \$40,000 replacement instead of a \$15,000 repair. Such situations are coded as false negatives, and we built a model that avoids them. Our model correctly predicts component failures 86% of the time. Note how few instances of false negatives (FN) are present in the confusion matrix above. # # While repair costs are inevitable, unnecessary inspections add expense without any operational benefit. Cutting down on inspection costs means reducing false positives; a false positive is predicting failure in a generator whose components are all functioning. For every three superfluous inspections, BreezeGen can afford another generator repair, so these savings translate into serious business gains. About 76% of the failures our model flags are genuine (its precision), allowing technicians to spend more work hours on generators in genuine need of repair. The additional savings our model offers by reducing unnecessary inspections can be put toward repair expenses, meaning BreezeGen's expenses are going toward maintaining their generator infrastructure without much overhead. # ***
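# As a closing illustration (not part of the analysis above), the confusion-matrix counts can be translated directly into maintenance spend using the quoted costs: every flagged generator is inspected at \$5,000, true failures among them are repaired at \$15,000, and failures the model misses end in \$40,000 replacements. A minimal sketch with placeholder counts; substitute the actual counts from a confusion matrix to estimate real spend:

# In[ ]:

def maintenance_cost(tp,fp,fn,inspection=5_000,repair=15_000,replacement=40_000):
    '''Rough cost of acting on a model's predictions: inspect every flagged generator,
    repair the true failures among them, replace the failures the model missed.'''
    return (tp+fp)*inspection+tp*repair+fn*replacement

# placeholder counts for illustration only
print(maintenance_cost(tp=120,fp=40,fn=15))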