import multiprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.random import sample_without_replacement
from skopt import BayesSearchCV
from xgboost import XGBClassifier, plot_importance
from roaf import visualization
%matplotlib inline
plt.style.use("dark_background")
plt.set_cmap("Dark2")
sns.set_palette("Dark2")
<Figure size 640x480 with 0 Axes>
These parameters can be overwritten with papermill parameterization, namely for testing purposes.
Parameter | Description |
---|---|
fast_execution | Should be set to True if the focus is on testing and not on prediction quality |
plot_dir | The output directory for the plots |
max_sample_size | The number of data points used for machine learning and validation. If set to None, all the data (after undersampling) will be used |
n_plot | The number of data points to plot in certain figures |
n_cv | Parameter for k-fold cross-validation used in parameter optimization |
n_permutation_repetitions | The number of permutations to be performed to find the importance of features in trained models |
n_random_forest_estimators | The number of estimators in the random forest model |
FAST_EXECUTION = False
PLOT_DIR = "../images/"
PLOT_FILE_FORMATS = ["png"]
MAX_SAMPLE_SIZE = None
N_PLOT = 15
N_CV = 5
N_PERMUTATION_REPETITIONS = 10
N_RANDOM_FOREST_ESTIMATORS = 100
VERBOSE = 0
df = pd.read_parquet("../data/processed/df_by_person.parquet")
The data was already cleaned in notebook 1, but this cleaning was meant for general purpose, so that more data was retained for visualizations in notebook 2.
In this notebook, the data will be prepared for the application of machine learning models. This preparation will be done in these steps:
df_ml = (
df.select_dtypes(include=np.number)
.drop(columns=["accident_id", "accident_id_y"])
.dropna(axis=1, how="any")
)
df_ml = pd.get_dummies(
data=df_ml,
columns=[
"daylight",
"built_up_area",
"intersection_category",
"weather",
"collision_category",
"road_admin_category",
"traffic_regime",
"reserved_lane",
"plane_layout",
"surface_condition",
"infrastructure",
"location",
"is_weekend",
"role",
],
)
features = df_ml.drop(columns="severity")
feature_columns = features.columns
target = df_ml["severity"]
random_under_sampler = RandomUnderSampler()
features, target = random_under_sampler.fit_resample(X=features, y=target)
sample_size = len(target)
if MAX_SAMPLE_SIZE is not None:
if sample_size > MAX_SAMPLE_SIZE:
sample_idx = sample_without_replacement(
n_population=sample_size,
n_samples=MAX_SAMPLE_SIZE,
random_state=0,
)
features = features.iloc[sample_idx]
target = target.iloc[sample_idx]
print(sample_size)
28491
scaler = StandardScaler()
features = scaler.fit_transform(features)
X_train, X_test, y_train, y_test = train_test_split(
features,
target,
test_size=0.2,
random_state=0,
)
# Reconvert the features to DataFrames in order to keep the feature names
X_train = pd.DataFrame(data=X_train)
X_test = pd.DataFrame(data=X_test)
X_train.columns = feature_columns
X_test.columns = feature_columns
The preprocessed and splitted dataset will be exported to parquet so that it can be used in notebook 4 (artificial neural networks). Parquet is used again as the file format for its low requirements regarding disk space.
TRAIN_FILENAME = "Xy_train"
TEST_FILENAME = "Xy_test"
if FAST_EXECUTION:
TRAIN_FILENAME = "TESTING_" + TRAIN_FILENAME
TEST_FILENAME = "TESTING_" + TEST_FILENAME
Xy_train = pd.concat([X_train, y_train.reset_index(drop=True)], axis=1)
Xy_test = pd.concat([X_test, y_test.reset_index(drop=True)], axis=1)
Xy_train.to_parquet("../data/processed/" + TRAIN_FILENAME + ".parquet")
Xy_test.to_parquet("../data/processed/" + TEST_FILENAME + ".parquet")
xgb_clf = XGBClassifier(n_jobs=multiprocessing.cpu_count() // 2)
param_spaces = {
"max_depth": [2, 4, 6],
"n_estimators": [100, 200],
"learning_rate": [0.05, 0.1],
}
bayes_search = BayesSearchCV(
xgb_clf, param_spaces, cv=N_CV, n_jobs=2, n_iter=4, verbose=0
)
bayes_search.fit(X_train, y_train)
BayesSearchCV(cv=5, estimator=XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None... max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=4, num_parallel_tree=None, predictor=None, random_state=None, ...), n_iter=4, n_jobs=2, search_spaces={'learning_rate': [0.05, 0.1], 'max_depth': [2, 4, 6], 'n_estimators': [100, 200]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
BayesSearchCV(cv=5, estimator=XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None... max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=4, num_parallel_tree=None, predictor=None, random_state=None, ...), n_iter=4, n_jobs=2, search_spaces={'learning_rate': [0.05, 0.1], 'max_depth': [2, 4, 6], 'n_estimators': [100, 200]})
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=4, num_parallel_tree=None, predictor=None, random_state=None, ...)
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=4, num_parallel_tree=None, predictor=None, random_state=None, ...)
best_xgb = bayes_search.best_estimator_
# Predict
y_pred = best_xgb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))
# Plot confusion matrix
visualization.plot_confusion_matrix(
y_test, y_pred, "XGBoost", normalize="true", figsize=(4, 4)
)
# Save figure
FILENAME = "rxgb_confusion_matrix"
visualization.savefig(basename=FILENAME, filepath=PLOT_DIR)
precision recall f1-score support 0 0.67 0.77 0.71 1869 1 0.62 0.47 0.53 1906 2 0.69 0.75 0.72 1924 accuracy 0.66 5699 macro avg 0.66 0.66 0.65 5699 weighted avg 0.66 0.66 0.65 5699
['../images/rxgb_confusion_matrix.png', '../images/rxgb_confusion_matrix.svg']
When interpreting precision and recall for machine learning models, it is important to keep in mind, that the test data is balanced while the real-world data is not. That means, that we would expect an accuracy of 0.33 for a random guess.
The model does a good job predicting deadly injuries. Unharmed persons are detected with a high recall but with lower precision. It shows a lower performance for injured persons. Here, especially the recall is very low.
p = plot_importance(best_xgb, max_num_features=N_PLOT, height=0.8, grid="off")
p.set_title("Feature Importance for XGBoost")
p.grid(False)
FILENAME = "rxgb_feat_importance"
visualization.savefig(basename=FILENAME, filepath=PLOT_DIR, formats=PLOT_FILE_FORMATS)
['../images/rxgb_feat_importance.png']
The feature importance plot enables us to identify the most important features used by XGBoost for the classification problem. It seems like the location (represented by longitude and latitude) has the highest importance in this case.
random_forest_clf = RandomForestClassifier(n_estimators=N_RANDOM_FOREST_ESTIMATORS)
random_forest_clf.fit(X_train, y_train)
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier()
# Predict test values
y_pred = random_forest_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))
# Plot confusion matrix
visualization.plot_confusion_matrix(
y_test, y_pred, "Random Forest", normalize="true", figsize=(4, 4)
)
# Save figure
FILENAME = "rf_confusion_matrix"
visualization.savefig(basename=FILENAME, filepath=PLOT_DIR, formats=PLOT_FILE_FORMATS)
precision recall f1-score support 0 0.67 0.75 0.71 1869 1 0.60 0.47 0.53 1906 2 0.68 0.75 0.71 1924 accuracy 0.66 5699 macro avg 0.65 0.66 0.65 5699 weighted avg 0.65 0.66 0.65 5699
['../images/rf_confusion_matrix.png']
Random Forests can be interpreted with impurity-based feature importance, but this approach has some downsides. I will therefore use permutation feature importance to analyze the model. For this, I will calculate the feature importance weights for both the training and the test set and compare them. Those features that show a high # difference # between the calculated values for training and test set are considered to be causal for overfitting.
# The permutation performance takes a while to compute.
r_train = permutation_importance(
random_forest_clf,
X_train,
y_train,
n_repeats=N_PERMUTATION_REPETITIONS,
random_state=0,
)
r_test = permutation_importance(
random_forest_clf,
X_test,
y_test,
n_repeats=N_PERMUTATION_REPETITIONS,
random_state=0,
)
importances_mean_df = pd.DataFrame(index=feature_columns)
importances_std_df = pd.DataFrame(index=feature_columns)
# Mean Values
importances_mean_df["train"] = r_train.importances_mean
importances_mean_df["test"] = r_test.importances_mean
importances_mean_df["train_test_diff"] = abs(
importances_mean_df["test"] - importances_mean_df["train"]
)
importances_mean_df.sort_values(by="train_test_diff", ascending=False, inplace=True)
importances_mean_df.drop(columns=["train_test_diff"], inplace=True)
# Standard Deviation
importances_std_df["train"] = r_train.importances_std
importances_std_df["test"] = r_test.importances_std
importances_std_df = importances_std_df.reindex_like(importances_mean_df)
importances_mean_df[["train", "test"]].head(N_PLOT).plot(
kind="barh", capsize=2, xerr=importances_std_df.head(N_PLOT)
)
plt.title("Features with High Difference in Importance between Train and Test Set")
plt.xlabel("")
plt.ylabel("feature")
visualization.savefig(
basename="rf_feature_importance", filepath=PLOT_DIR, formats=PLOT_FILE_FORMATS
)
['../images/rf_feature_importance.png']
importances_mean_df.sort_values("train", ascending=False, inplace=True)
importances_std_df = importances_std_df.reindex_like(importances_mean_df)
sns.barplot(
data=importances_mean_df.head(N_PLOT),
x="train",
y=importances_mean_df.head(N_PLOT).index.values,
xerr=importances_std_df["train"].head(N_PLOT),
capsize=1.0,
ecolor="white",
palette="Dark2",
)
plt.title(
"Mean Permutation Feature Importance\nfor the Random Forest Classifier on the Training Set"
)
plt.xlabel("Mean Feature Importance")
plt.ylabel("Feature")
visualization.savefig(
basename="Random Forest Permutation Feature Importance",
filepath=PLOT_DIR,
formats=PLOT_FILE_FORMATS,
)
['../images/Random Forest Permutation Feature Importance.png']