#!/usr/bin/env python
# coding: utf-8

# # XGBoost Feature Importance
#
# This notebook explains how to generate feature importance plots from `XGBoost` using tree-based feature importance, permutation importance and `shap`.
#
# This notebook will build and evaluate a model to predict arrival delay for flights in and out of NYC in 2013.

# ### Packages
#
# This tutorial uses:
# * [pandas](https://pandas.pydata.org/docs/)
# * [statsmodels](https://www.statsmodels.org/stable/index.html)
# * [statsmodels.api](https://www.statsmodels.org/stable/api.html#statsmodels-api)
# * [matplotlib](https://matplotlib.org/)
# * [matplotlib.pyplot](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.html)
# * [numpy](https://numpy.org/doc/stable/)
# * [scikit-learn](https://scikit-learn.org/stable/)
# * [sklearn.metrics](https://scikit-learn.org/stable/modules/model_evaluation.html)
# * [sklearn.model_selection](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)
# * [sklearn.inspection](https://scikit-learn.org/stable/inspection.html)
# * [shap](https://shap.readthedocs.io/en/latest/index.html)
# * [category_encoders](https://contrib.scikit-learn.org/category_encoders/)
# * [XGBoost](https://xgboost.readthedocs.io/en/latest/)

# In[1]:


import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
import shap
import category_encoders as ce
import xgboost as xgb


# ## Reading the data
#
# The data is from `rdatasets`, imported using the Python package `statsmodels`.

# In[2]:


df = sm.datasets.get_rdataset('flights', 'nycflights13').data
df.info()


# ## Feature Engineering
#
# ### Handle null values

# In[3]:


df.isnull().sum()


# As this model will predict arrival delay, the null values come from flights that were cancelled or diverted. These can be excluded from this analysis.

# In[4]:


df.dropna(inplace=True)


# ### Convert the times from floats or ints to hours and minutes

# In[5]:


# Times are stored as HHMM values (e.g. 1317 means 13:17), so split each into
# separate hour and minute columns.
df['arr_hour'] = df.arr_time.apply(lambda x: int(np.floor(x / 100)))
df['arr_minute'] = df.arr_time.apply(lambda x: int(x - np.floor(x / 100) * 100))
df['sched_arr_hour'] = df.sched_arr_time.apply(lambda x: int(np.floor(x / 100)))
df['sched_arr_minute'] = df.sched_arr_time.apply(lambda x: int(x - np.floor(x / 100) * 100))
df['sched_dep_hour'] = df.sched_dep_time.apply(lambda x: int(np.floor(x / 100)))
df['sched_dep_minute'] = df.sched_dep_time.apply(lambda x: int(x - np.floor(x / 100) * 100))
df.rename(columns={'hour': 'dep_hour', 'minute': 'dep_minute'}, inplace=True)


# ## Prepare data for modeling
#
# ### Set up train-test split

# In[6]:


target = 'arr_delay'
y = df[target]
X = df.drop(columns=[target, 'flight', 'tailnum', 'time_hour', 'year', 'dep_time',
                     'sched_dep_time', 'arr_time', 'sched_arr_time', 'dep_delay'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1066)


# ### Encode categorical variables
#
# We use a leave-one-out encoder because it creates a single column for each categorical variable instead of a column for each level, as one-hot encoding would. This makes it easier to interpret the impact of categorical variables in the feature importance plots. The toy sketch below makes the encoding concrete before we apply it to the flight data.
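# The cell below is a small illustration added here, not part of the original analysis: on hypothetical toy data, each row's category is replaced by the mean target of all *other* rows in that category (assuming `category_encoders`' documented fit-time behavior).

# In[ ]:


# Toy data, invented purely for illustration.
toy_X = pd.DataFrame({'carrier': ['AA', 'AA', 'UA', 'UA', 'UA']})
toy_y = pd.Series([10.0, 20.0, 5.0, 15.0, 25.0])

toy_encoder = ce.LeaveOneOutEncoder()
toy_encoder.fit_transform(toy_X, toy_y)
# The first 'AA' row is encoded as the other 'AA' row's target (20.0); the first
# 'UA' row as mean(15.0, 25.0) = 20.0, and so on. At transform time (unseen data),
# the full per-category mean from the training data is used instead.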
# In[7]:


encoder = ce.LeaveOneOutEncoder(return_df=True)
X_train_loo = encoder.fit_transform(X_train, y_train)
X_test_loo = encoder.transform(X_test)


# ## Fit the model

# In[8]:


model = xgb.XGBRegressor(n_estimators=500, max_depth=5, eta=0.05)
model.fit(X_train_loo, y_train)
rmse = np.sqrt(mean_squared_error(y_test, model.predict(np.ascontiguousarray(X_test_loo))))
rmse


# ## Feature Importance
#
# ### Plot the tree-based (or Gini) importance

# In[9]:


feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
fig = plt.figure(figsize=(12, 6))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), np.array(X_test.columns)[sorted_idx])
plt.title('Feature Importance')


# ### Plot the permutation importance

# In[10]:


perm_importance = permutation_importance(model, np.ascontiguousarray(X_test_loo), y_test,
                                         n_repeats=10, random_state=1066)
sorted_idx = perm_importance.importances_mean.argsort()
fig = plt.figure(figsize=(12, 6))
plt.barh(range(len(sorted_idx)), perm_importance.importances_mean[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), np.array(X_test.columns)[sorted_idx])
plt.title('Permutation Importance')


# ### Plot the mean absolute value of the SHAP values

# In[11]:


explainer = shap.Explainer(model)
shap_values = explainer(np.ascontiguousarray(X_test_loo))
shap_importance = shap_values.abs.mean(0).values
sorted_idx = shap_importance.argsort()
fig = plt.figure(figsize=(12, 6))
plt.barh(range(len(sorted_idx)), shap_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), np.array(X_test.columns)[sorted_idx])
plt.title('SHAP Importance')


# `SHAP` contains a function to plot this directly.

# In[12]:


# max_display is the number of features to show, so use the column count
# (shape[1]) rather than the row count.
shap.plots.bar(shap_values, max_display=X_test_loo.shape[1])
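# The three measures often rank features differently. As an optional follow-up (a sketch added here, not part of the original notebook), the cell below gathers all three into a single table for side-by-side comparison; it assumes the variables computed in the cells above are still in scope.

# In[ ]:


importance_df = pd.DataFrame({
    'feature': X_test_loo.columns,
    'tree_importance': feature_importance,
    'permutation_importance': perm_importance.importances_mean,
    'mean_abs_shap': shap_importance,
}).sort_values('mean_abs_shap', ascending=False)
importance_df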