Now that we have gone through a manual process of modeling our dataset, let's see if we can replicate this using an automated workflow. As a reminder, our plan of action was as follows:
# Optional: restrict PyCaret's logging to CRITICAL messages only.
import os

os.environ.update({"PYCARET_CUSTOM_LOGGING_LEVEL": "CRITICAL"})
def what_is_installed():
    """Print the versions of PyCaret and of the packages it depends on.

    PyCaret is imported lazily inside the function, so calling it raises
    ``ModuleNotFoundError`` when PyCaret is not installed.
    """
    from pycaret import show_versions as print_versions

    print_versions()
# Report the installed versions; if PyCaret is missing, install it and retry.
try:
    what_is_installed()
except ModuleNotFoundError:
    import importlib
    import subprocess
    import sys

    # Run pip through the interpreter that is executing this code so the
    # package lands in the active environment (a bare `pip` on PATH may belong
    # to a different one). This is also plain Python, so it works outside
    # IPython, unlike the `!pip` shell escape.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pycaret"])
    # Let the import system notice packages installed while the process runs.
    importlib.invalidate_caches()
    what_is_installed()
System: python: 3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)] executable: C:\Users\Nikhil\.conda\envs\pycaret_dev_sktime_0p11_2\python.exe machine: Windows-10-10.0.19044-SP0 PyCaret required dependencies:
C:\Users\Nikhil\.conda\envs\pycaret_dev_sktime_0p11_2\lib\site-packages\_distutils_hack\__init__.py:30: UserWarning: Setuptools is replacing distutils. warnings.warn("Setuptools is replacing distutils.")
pip: 21.2.2 setuptools: 61.2.0 pycaret: 3.0.0 ipython: Not installed ipywidgets: 7.7.0 numpy: 1.21.6 pandas: 1.4.2 jinja2: 3.1.2 scipy: 1.8.0 joblib: 1.1.0 sklearn: 1.0.2 pyod: Installed but version unavailable imblearn: 0.9.0 category_encoders: 2.4.1 lightgbm: 3.3.2 numba: 0.55.1 requests: 2.27.1 matplotlib: 3.5.2 scikitplot: 0.3.7 yellowbrick: 1.4 plotly: 5.8.0 kaleido: 0.2.1 statsmodels: 0.13.2 sktime: 0.11.4 tbats: Installed but version unavailable pmdarima: 1.8.5 PyCaret optional dependencies: shap: Not installed interpret: Not installed umap: Not installed pandas_profiling: Not installed explainerdashboard: Not installed autoviz: Not installed fairlearn: Not installed xgboost: Not installed catboost: Not installed kmodes: Not installed mlxtend: Not installed statsforecast: 0.5.5 tune_sklearn: Not installed ray: Not installed hyperopt: Not installed optuna: Not installed skopt: Not installed mlflow: 1.25.1 gradio: Not installed fastapi: Not installed uvicorn: Not installed m2cgen: Not installed evidently: Not installed nltk: Not installed pyLDAvis: Not installed gensim: Not installed spacy: Not installed wordcloud: Not installed textblob: Not installed psutil: 5.9.0 fugue: Not installed streamlit: Not installed prophet: Not installed
import numpy as np
import pandas as pd
from pycaret.datasets import get_data
from pycaret.time_series import TSForecastingExperiment
# Figure settings shared by the experiments in this notebook (passed as `fig_kwargs`) ----
global_fig_settings = dict(renderer="notebook", width=1000, height=600)
# Column to forecast and the exogenous columns kept alongside it.
target = "CO(GT)"
exog_vars = ["NOx(GT)", "PT08.S3(NOx)", "RH"]
include = [target, *exog_vars]

# Load the air-quality dataset, then in one pass:
#   * build a datetime column from the separate "Date" and "Time" columns,
#   * drop those two source columns,
#   * turn the -200 placeholder values into NaN,
#   * use the datetime column as the index.
data = get_data("airquality", verbose=False)
data = (
    data.assign(index=pd.to_datetime(data["Date"] + " " + data["Time"]))
    .drop(columns=["Date", "Time"])
    .replace(-200, np.nan)
    .set_index("index")
)

# Keep only the target and the selected exogenous variables.
data = data[include]
data.head()
CO(GT) | NOx(GT) | PT08.S3(NOx) | RH | |
---|---|---|---|---|
index | ||||
2004-03-10 18:00:00 | 2.6 | 166.0 | 1056.0 | 48.9 |
2004-03-10 19:00:00 | 2.0 | 103.0 | 1174.0 | 47.7 |
2004-03-10 20:00:00 | 2.2 | 131.0 | 1140.0 | 54.0 |
2004-03-10 21:00:00 | 2.2 | 172.0 | 1092.0 | 60.0 |
2004-03-10 22:00:00 | 1.6 | 131.0 | 1205.0 | 59.6 |
# Value passed as `fh` to every experiment setup in this notebook.
FH = 48

# Metric name passed as `sort` when comparing models.
metric = "mase"

# Model IDs passed as `exclude` when comparing models for the target.
exclude = [
    "auto_arima",
    "bats",
    "tbats",
    "lar_cds_dt",
    "par_cds_dt",
]
# Experiment for the automated workflow.
exp_auto = TSForecastingExperiment()

# enforce_exogenous=False --> Use multivariate forecasting when model supports it, else use univariate forecasting
# Missing values in both the target and the exogenous columns are imputed with "ffill".
exp_auto.setup(
    data=data,
    target=target,
    fh=FH,
    enforce_exogenous=False,
    numeric_imputation_target="ffill",
    numeric_imputation_exogenous="ffill",
    fig_kwargs=global_fig_settings,
    session_id=42,
)
Description | Value | |
---|---|---|
0 | session_id | 42 |
1 | Target | CO(GT) |
2 | Approach | Univariate |
3 | Exogenous Variables | Present |
4 | Original data shape | (9357, 4) |
5 | Transformed data shape | (9357, 4) |
6 | Transformed train set shape | (9309, 4) |
7 | Transformed test set shape | (48, 4) |
8 | Rows with missing values | 25.8% |
9 | Fold Generator | ExpandingWindowSplitter |
10 | Fold Number | 3 |
11 | Enforce Prediction Interval | False |
12 | Seasonal Period(s) Tested | 24 |
13 | Seasonality Present | True |
14 | Seasonalities Detected | [24] |
15 | Primary Seasonality | 24 |
16 | Target Strictly Positive | True |
17 | Target White Noise | No |
18 | Recommended d | 1 |
19 | Recommended Seasonal D | 0 |
20 | Preprocess | True |
21 | Numerical Imputation (Target) | ffill |
22 | Transformation (Target) | None |
23 | Scaling (Target) | None |
24 | Numerical Imputation (Exogenous) | ffill |
25 | Transformation (Exogenous) | None |
26 | Scaling (Exogenous) | None |
27 | CPU Jobs | -1 |
28 | Use GPU | False |
29 | Log Experiment | False |
30 | Experiment Name | ts-default-name |
31 | USI | aa6b |
<pycaret.time_series.forecasting.oop.TSForecastingExperiment at 0x1a6ea34f040>
# Uncomment to list the models available in this experiment ----
# exp_auto.models()

# turbo=False also includes slower models like Prophet; the model IDs in
# `exclude` are skipped. Results are sorted by `metric` ----
best = exp_auto.compare_models(
    exclude=exclude,
    turbo=False,
    sort=metric,
)
Model | MASE | RMSSE | MAE | RMSE | MAPE | SMAPE | R2 | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
arima | ARIMA | 0.1963 | 0.1758 | 0.1674 | 0.2301 | 0.1380 | 0.1512 | 0.8652 | 24.1733 |
br_cds_dt | Bayesian Ridge w/ Cond. Deseasonalize & Detrending | 0.5576 | 0.4841 | 0.4756 | 0.6338 | 0.3683 | 0.2969 | -0.0134 | 8.9533 |
ridge_cds_dt | Ridge w/ Cond. Deseasonalize & Detrending | 0.5639 | 0.4898 | 0.4809 | 0.6413 | 0.3725 | 0.2997 | -0.0367 | 8.8033 |
lr_cds_dt | Linear w/ Cond. Deseasonalize & Detrending | 0.5640 | 0.4899 | 0.4811 | 0.6414 | 0.3726 | 0.2998 | -0.0372 | 9.2867 |
omp_cds_dt | Orthogonal Matching Pursuit w/ Cond. Deseasonalize & Detrending | 0.6373 | 0.5616 | 0.5435 | 0.7353 | 0.4223 | 0.3325 | -0.4062 | 9.1567 |
exp_smooth | Exponential Smoothing | 0.6729 | 0.6150 | 0.5742 | 0.8055 | 0.3813 | 0.3681 | -0.0278 | 4.5233 |
theta | Theta Forecaster | 0.7012 | 0.5888 | 0.5986 | 0.7714 | 0.4009 | 0.4067 | 0.1467 | 5.2033 |
rf_cds_dt | Random Forest w/ Cond. Deseasonalize & Detrending | 0.7131 | 0.5803 | 0.6082 | 0.7599 | 0.4887 | 0.3689 | -0.2896 | 25.8700 |
gbr_cds_dt | Gradient Boosting w/ Cond. Deseasonalize & Detrending | 0.7144 | 0.5798 | 0.6092 | 0.7590 | 0.5195 | 0.3738 | -0.5342 | 18.5533 |
lightgbm_cds_dt | Light Gradient Boosting w/ Cond. Deseasonalize & Detrending | 0.7732 | 0.6165 | 0.6594 | 0.8072 | 0.5427 | 0.3930 | -0.3296 | 9.8100 |
snaive | Seasonal Naive Forecaster | 0.8175 | 0.7847 | 0.6972 | 1.0275 | 0.4645 | 0.3643 | -1.8616 | 5.9833 |
et_cds_dt | Extra Trees w/ Cond. Deseasonalize & Detrending | 0.8410 | 0.6729 | 0.7173 | 0.8810 | 0.5927 | 0.4188 | -1.0515 | 14.6700 |
huber_cds_dt | Huber w/ Cond. Deseasonalize & Detrending | 0.8675 | 0.6634 | 0.7400 | 0.8686 | 0.6135 | 0.4357 | -0.9304 | 9.2500 |
en_cds_dt | Elastic Net w/ Cond. Deseasonalize & Detrending | 0.8720 | 0.6712 | 0.7438 | 0.8789 | 0.6033 | 0.4312 | -1.0172 | 8.6667 |
lasso_cds_dt | Lasso w/ Cond. Deseasonalize & Detrending | 0.8764 | 0.6788 | 0.7475 | 0.8888 | 0.5934 | 0.4267 | -1.0696 | 8.9633 |
naive | Naive Forecaster | 0.9212 | 0.7554 | 0.7861 | 0.9895 | 0.6125 | 0.5160 | -0.3784 | 7.1367 |
croston | Croston | 0.9337 | 0.7464 | 0.7966 | 0.9775 | 0.7744 | 0.5053 | -0.6474 | 3.7600 |
llar_cds_dt | Lasso Least Angular Regressor w/ Cond. Deseasonalize & Detrending | 1.0551 | 0.7735 | 0.8999 | 1.0127 | 0.8315 | 0.5344 | -2.3112 | 8.7700 |
grand_means | Grand Means Forecaster | 1.1332 | 0.8380 | 0.9666 | 1.0973 | 1.0269 | 0.5836 | -1.8707 | 6.4433 |
knn_cds_dt | K Neighbors w/ Cond. Deseasonalize & Detrending | 1.1507 | 0.8952 | 0.9814 | 1.1721 | 0.8098 | 0.5266 | -3.0201 | 9.0300 |
dt_cds_dt | Decision Tree w/ Cond. Deseasonalize & Detrending | 1.2128 | 1.0815 | 1.0346 | 1.4163 | 0.8780 | 0.5389 | -2.8879 | 9.5533 |
polytrend | Polynomial Trend Forecaster | 1.2665 | 0.9168 | 1.0803 | 1.2003 | 1.1741 | 0.6261 | -2.6610 | 4.0433 |
ets | ETS | 1.8734 | 1.5672 | 1.5985 | 2.0528 | 1.0819 | 0.7912 | -4.8138 | 7.0567 |
ada_cds_dt | AdaBoost w/ Cond. Deseasonalize & Detrending | 1.9746 | 1.4070 | 1.6839 | 1.8418 | 1.7105 | 0.7690 | -10.9710 | 12.0200 |
# Plot the model selected by compare_models.
exp_auto.plot_model(estimator=best)

# Finalize the selected model; the result is what gets used for future predictions.
final_auto_model = exp_auto.finalize_model(estimator=best)
def safe_predict(exp, model):
    """Predict with ``model``, forecasting exogenous inputs first if required.

    Prediction wrapper for demo purposes. ``exp.predict_model(model)`` is
    tried first. If it raises ``ValueError`` and the experiment reports
    exogenous variables, each of those variables is modeled on its own in a
    separate experiment, its future values are predicted, and the combined
    predictions are passed as ``X`` to a second ``exp.predict_model`` call.

    The per-variable experiments rely on the module-level ``data``, ``FH``,
    ``metric`` and ``global_fig_settings``.

    Parameters
    ----------
    exp
        Experiment exposing ``predict_model`` and ``exogenous_variables``.
    model
        Model to predict with.

    Returns
    -------
    Whatever ``exp.predict_model`` returns.

    Raises
    ------
    ValueError
        Re-raised unchanged when the first prediction fails and the
        experiment has no exogenous variables, since forecasting exogenous
        inputs cannot help in that case.
    """
    try:
        return exp.predict_model(model)
    except ValueError as exception:
        exo_vars = exp.exogenous_variables
        if len(exo_vars) == 0:
            # Nothing to forecast: the error is not about missing X values.
            raise
        print(exception)
        print(f"{len(exo_vars)} exogenous variables (X) needed in order to make future predictions:\n{exo_vars}")

    # Step 1: Fit one model per exogenous variable ----
    # Iterate over the variables reported by `exp` itself, so the helper stays
    # correct for any experiment rather than only the one matching a global list.
    exog_exps = []
    exog_models = []
    for exog_var in exo_vars:
        exog_exp = TSForecastingExperiment()
        exog_exp.setup(
            data=data[exog_var],
            fh=FH,
            numeric_imputation_target="ffill",
            numeric_imputation_exogenous="ffill",
            fig_kwargs=global_fig_settings,
            session_id=42,
        )

        # Users can customize how to model future exogenous variables i.e. add
        # more steps and models to potentially get better models at the expense
        # of higher modeling time.
        best_exog = exog_exp.compare_models(
            sort=metric,
            include=["arima", "ets", "exp_smooth", "theta", "lightgbm_cds_dt"],
        )
        final_exog_model = exog_exp.finalize_model(best_exog)

        exog_exps.append(exog_exp)
        exog_models.append(final_exog_model)

    # Step 2: Get future predictions for exog variables ----
    future_exog = pd.concat(
        [
            exog_exp.predict_model(exog_model)
            for exog_exp, exog_model in zip(exog_exps, exog_models)
        ],
        axis=1,
    )
    future_exog.columns = exo_vars

    # Step 3: Predict the target using the forecast exogenous values ----
    return exp.predict_model(model, X=future_exog)
# Predict with the finalized model via the demo wrapper, then plot the result.
future_preds = safe_predict(exp=exp_auto, model=final_auto_model)
future_preds.plot()
Model | MASE | RMSSE | MAE | RMSE | MAPE | SMAPE | R2 | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
lightgbm_cds_dt | Light Gradient Boosting w/ Cond. Deseasonalize & Detrending | 1.1990 | 1.0486 | 11.0836 | 13.5879 | 0.2862 | 0.2304 | -0.9502 | 5.1967 |
exp_smooth | Exponential Smoothing | 1.4890 | 1.1904 | 13.7510 | 15.4140 | 0.3037 | 0.2478 | -0.4248 | 3.1167 |
arima | ARIMA | 1.5657 | 1.3693 | 14.4696 | 17.7412 | 0.3539 | 0.2739 | -2.1191 | 3.7867 |
theta | Theta Forecaster | 1.9102 | 1.4902 | 17.6420 | 19.2958 | 0.3713 | 0.2986 | -1.2629 | 3.2633 |
ets | ETS | 3.4504 | 2.6557 | 31.8652 | 34.3839 | 0.6295 | 0.4585 | -6.1733 | 10.4200 |
<AxesSubplot:>