#!/usr/bin/env python
# coding: utf-8

# # Overview
#
# In the 10x series of notebooks, we will look at Time Series modeling in
# pycaret using univariate data and no exogenous variables. We will use the
# famous airline dataset for illustration. Our plan of action is as follows:
#
# 1. Perform EDA on the dataset to extract valuable insight about the process
#    generating the time series. **(COMPLETED)**
# 2. Model the dataset based on exploratory analysis (univariate model
#    without exogenous variables). **(COMPLETED)**
# 3. Use an automated approach (AutoML) to improve the performance.
#    **(COMPLETED)**

# In[1]:


# Only enable critical logging (Optional).
# Kept ahead of the pycaret imports below, as in the original notebook.
import os

os.environ["PYCARET_CUSTOM_LOGGING_LEVEL"] = "CRITICAL"


# In[2]:


def what_is_installed():
    """Call pycaret's ``show_versions`` to report what is installed.

    The import is deferred to call time so that a missing pycaret
    installation surfaces as ``ModuleNotFoundError`` at the call site, where
    the caller can react to it.

    Raises:
        ModuleNotFoundError: If pycaret cannot be imported.
    """
    from pycaret import show_versions

    show_versions()


try:
    what_is_installed()
except ModuleNotFoundError:
    # pycaret is missing: install it for the interpreter running this file.
    # `sys.executable -m pip` replaces the notebook-only
    # `get_ipython().system(...)` call, which raises NameError outside
    # IPython, and guarantees the active environment is the install target.
    import importlib
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "pycaret"])
    # Let the import system see the package that was just installed.
    importlib.invalidate_caches()
    what_is_installed()


# In[3]:


import time
import numpy as np
import pandas as pd

from pycaret.datasets import get_data
from pycaret.time_series import TSForecastingExperiment


# In[4]:


y = get_data('airline', verbose=False)


# In[5]:


# We want to forecast the next 12 months of data and we will use 3 fold
# cross-validation to test the models.
fh = 12  # or alternatively fh = np.arange(1, 13)
fold = 3


# In[6]:


# Global Figure Settings for notebook ----
# Depending on whether you are using jupyter notebook, jupyter lab, or
# Google Colab, you may have to set the renderer appropriately.
# NOTE: Setting to a static renderer here so that the notebook saved size is
# reduced.
fig_kwargs = {
    # "renderer": "notebook",
    "renderer": "png",
    "width": 1000,
    "height": 400,
}


# # User Customizations
#
# Let's look at how users can customize various steps in the modeling process

# ## Prediction Customization
#
# ### Forecast Horizon
# Sometimes users may wish to customize the forecast horizon after the model
# has been created. This can be done as follows.
# In[7]:


# Fresh experiment for the prediction examples; session_id pins the seed.
exp = TSForecastingExperiment()
exp.setup(
    data=y,
    fh=fh,
    fold=fold,
    fig_kwargs=fig_kwargs,
    session_id=42,
    verbose=False,
)


# In[8]:


model = exp.create_model("arima")


# In[9]:


# Default prediction (uses the forecast horizon given to `setup`)
exp.predict_model(model)


# In[10]:


# Forecast horizon extended to 2 years instead of the original 1 year
exp.predict_model(model, fh=24)


# ### Prediction Interval
#
# #### NOTES:
# 1. **When prediction intervals are requested, the default coverage = 0.9
#    corresponding to 90% coverage.**
# 2. **Coverage is symmetrical around the median (alpha = 0.5). Hence a
#    coverage of 0.9 corresponds to lower interval = 0.05 and an upper
#    interval of 0.95 to give a total coverage between lower and upper
#    interval = 0.9.**

# In[11]:


# With Prediction Interval (default coverage = 0.9)
exp.predict_model(model, return_pred_int=True)


# In[12]:


# With Prediction Interval (custom coverage = 0.8, corresponding to lower and
# upper quantiles of 0.1 and 0.9 respectively).
# The point estimate remains the same as before, but the lower and upper
# intervals are now narrower since we are OK with a lower coverage.
exp.predict_model(model, return_pred_int=True, coverage=0.8)


# **Sometimes, users may wish to get the point estimates at values other than
# the mean/median. In such cases, they can specify the alpha (quantile) value
# for the point estimate directly.**
#
# **NOTE: Not all models support this feature. If this is used with models
# that do not support it, an error is raised. If you want to only use models
# that support this feature, you must set `point_alpha` to a floating point
# value in the `setup` stage (see below).**

# In[13]:


# With Custom Point Estimate (alpha = 0.7)
# The point estimate is now higher than before since we are asking for the
# 70th percentile as the point estimate (vs. the mean/median before).
exp.predict_model(model, alpha=0.7)


# In[14]:


# For models that do not produce a prediction interval --> returns NA values
model_no_pred_int = exp.create_model("lr_cds_dt")
exp.predict_model(model_no_pred_int, return_pred_int=True)


# ## Forecast Plotting Customization
#
# Similar to the prediction customization, we can customize the forecast
# plots as well.

# In[15]:


# Regular Plot
exp.plot_model(model)


# In[16]:


# Modified Plot (zoom into the plot to see differences between the 2 plots)
plot_overrides = {"alpha": 0.7, "coverage": 0.8}
exp.plot_model(model, data_kwargs=plot_overrides)


# ## Enforce Prediction Intervals
#
# In some use cases, it is important to have prediction intervals. Users may
# wish to restrict the modeling to only those models that support prediction
# intervals.
#
# * Specifying `point_alpha` to any floating point value restricts the models
#   to only those that provide a prediction interval. The value that is
#   specified corresponds to the quantile of the point prediction that is
#   returned.
# * This also adds an extra metric called `COVERAGE`.
# * `COVERAGE` gives the percentage of actuals that are within the prediction
#   interval.

# In[17]:


exp = TSForecastingExperiment()
# We can see that specifying a value for point_alpha enables
# `Enforce Prediction Interval` in the grid (and limits the models).
exp.setup(
    data=y,
    fh=fh,
    fold=fold,
    fig_kwargs=fig_kwargs,
    point_alpha=0.5,
)
exp.models()


# In[18]:


best_model = exp.compare_models()

# To enable slower models such as prophet, BATS and TBATS, add turbo=False:
# best_model = exp.compare_models(turbo=False)


# ## Types of Window Splitters
#
# Various window splitters are available for performing the cross validation.
#
# ### Sliding Window Splitter

# In[19]:


exp = TSForecastingExperiment()
exp.setup(
    data=y,
    fh=fh,
    fold=fold,
    fig_kwargs=fig_kwargs,
    fold_strategy='sliding',
    verbose=False,
)
exp.plot_model(plot="cv")


# ### Expanding/Rolling Window
#
# * They are identical

# In[20]:


exp = TSForecastingExperiment()
exp.setup(
    data=y,
    fh=fh,
    fold=fold,
    fig_kwargs=fig_kwargs,
    fold_strategy='expanding',
    verbose=False,
)
exp.plot_model(plot="cv")


# In[21]:


exp = TSForecastingExperiment()
exp.setup(
    data=y,
    fh=fh,
    fold=fold,
    fig_kwargs=fig_kwargs,
    fold_strategy='rolling',
    verbose=False,
)
exp.plot_model(plot="cv")


# ### Error Handling due to lack of data
#
# Sometimes, there are not enough data points available to perform the
# experiment. In such cases, pycaret will warn you accordingly.

# In[22]:


# Only the first 30 observations, with a 12-step horizon and 3 folds.
try:
    exp = TSForecastingExperiment()
    exp.setup(data=y[:30], fh=12, fold=3, fig_kwargs=fig_kwargs)
except ValueError as err:
    print(err)


# In[23]:


# Same 30 observations, but with the horizon shortened to 6 steps.
try:
    exp = TSForecastingExperiment()
    exp.setup(data=y[:30], fh=6, fold=3, fig_kwargs=fig_kwargs)
except ValueError as err:
    print(err)


# ## Tuning Customization

# In[24]:


exp = TSForecastingExperiment()
exp.setup(
    data=y,
    fh=fh,
    fold=fold,
    fig_kwargs=fig_kwargs,
    session_id=42,
)


# In[25]:


model = exp.create_model("lr_cds_dt")


# In[26]:


# Random Grid Search (default)
tuned_model = exp.tune_model(model)
print(model, tuned_model, sep="\n")


# In[27]:


# Plot the untuned and tuned models together, labelled for comparison.
comparison_labels = {"labels": ["Original", "Tuned"]}
exp.plot_model([model, tuned_model], data_kwargs=comparison_labels)


# In[28]:


# Fixed Grid Search
tuned_model = exp.tune_model(model, search_algorithm="grid")
print(model, tuned_model, sep="\n")


# **Observations:**
# * In this case, the tuning resulted in worse metrics than the original
#   model (this is possible).
# * Hence, pycaret returned the original model as the best one since
#   `choose_better=True` by default.
# * If the user does not want this behavior, they can set
#   `choose_better=False`

# In[29]:


tuned_model = exp.tune_model(
    model,
    search_algorithm="grid",
    choose_better=False,
)
print(model, tuned_model, sep="\n")


# Sometimes, there are time constraints on the tuning so users may wish to
# adjust the number of hyperparameters that are tried using the `n_iter`
# argument.

# In[30]:


tuned_model = exp.tune_model(model, n_iter=5)
print(model, tuned_model, sep="\n")


# More information about tuning in pycaret time series can be found here:
#
# 1. **[Basic Tuning](https://github.com/pycaret/pycaret/discussions/1791)**
# 2. **[Advanced Tuning](https://github.com/pycaret/pycaret/discussions/1795)**

# ## Setting Renderer
#
# Sometimes the plotly renderer is not detected correctly for the
# environment. In such cases, the users can manually specify the renderer in
# pycaret.

# In[31]:


exp = TSForecastingExperiment()
exp.setup(
    data=y,
    fh=fh,
    fold=fold,
    # Uncomment to force a specific renderer for this experiment:
    # fig_kwargs={'renderer': 'notebook'},
    verbose=False,
)
exp.plot_model(plot="cv")


# Users can also specify the renderer for specific plot types

# In[32]:


exp.plot_model(fig_kwargs={'renderer': 'png'})


# ## Seasonal Period
#
# * Setting the seasonal period for time series models is one of the most
#   important aspects that can dictate how accurate the models are.
# * By default, pycaret will try to derive the seasonal periods using the
#   data characteristics. This is the preferred approach since it is data
#   driven.
# * Users also have the option of deriving the seasonal period using the
#   index frequency. This is not preferred since it is based on assumptions
#   made from the data frequency.
# * Users also have the option of providing their own manual seasonal period
#   if they have done their due diligence and don't want to rely on pycaret's
#   algorithms.
#
# All these options are shown below

# ### Method 1: Auto Detection of Seasonal Period (Preferred)

# In[33]:


exp = TSForecastingExperiment()
exp.setup(
    data=y,
    fh=fh,
    fold=fold,
    fig_kwargs=fig_kwargs,
)


# **Observations:**
#
# * The Seasonal Period was derived from data properties as 12.
# * Other harmonics such as 24, 36, 48 are also detected, but 12 has the most
#   strength and hence is taken as the primary seasonal period.
#
# As specified above, users can change the seasonal period manually if they
# want based on their EDA, e.g. let's change it to 36 and see what happens.

# ### Method 2: Manually provided Seasonal Periods

# In[34]:


exp = TSForecastingExperiment()
exp.setup(
    data=y,
    fh=fh,
    fold=fold,
    seasonal_period=36,
    fig_kwargs=fig_kwargs,
)


# **Observations:**
#
# * In this case, the user specified a seasonal period of 36 and this also
#   passed pycaret's internal seasonality tests. Hence, it is used for
#   modeling.
# * If the user specified seasonal period does not pass the seasonality test,
#   pycaret switches to using no seasonality (see below).

# In[35]:


exp = TSForecastingExperiment()
exp.setup(
    data=y,
    fh=fh,
    fold=fold,
    seasonal_period=52,
    fig_kwargs=fig_kwargs,
)


# **Even then, if the user wants to force pycaret to use a certain seasonal
# period, they can do that by specifying the `ignore_seasonality_test`
# argument**

# In[36]:


# Re-runs `setup` on the experiment object created in the previous cell.
exp.setup(
    data=y,
    fh=fh,
    fold=fold,
    seasonal_period=52,
    ignore_seasonality_test=True,
    fig_kwargs=fig_kwargs,
)


# ### Method 3: Using the time series index (not preferred)

# In[37]:


exp = TSForecastingExperiment()
exp.setup(
    data=y,
    fh=fh,
    fold=fold,
    sp_detection="index",
    fig_kwargs=fig_kwargs,
)


# **Observations**:
# - PyCaret derives the seasonal period using the index frequency.
# - In this case, since we have monthly frequency, the seasonal period tested
#   and used is 12.
#
# In some cases, when the frequency cannot be derived from the index (see
# example below), the user needs to switch to one of the other methods (auto
# detection or manually specifying the period).

# In[38]:


# NOTE: this rebinds `y`, replacing the airline series used above.
y = get_data("1", folder="time_series/ar1")


# In[39]:


try:
    exp = TSForecastingExperiment()
    exp.setup(
        data=y,
        fh=fh,
        fold=fold,
        sp_detection="index",
        fig_kwargs=fig_kwargs,
    )
except ValueError as err:
    print(err)


# **Observations:**
# * The frequency/seasonal period could not be derived from the index.
# * The user needs to switch to one of the other methods (auto detection or
#   manually specifying the period).

# In[40]:


eda = TSForecastingExperiment()
eda.setup(
    data=y,
    fh=fh,
    fold=fold,
    sp_detection="auto",
    max_sp_to_consider=40,
    fig_kwargs=fig_kwargs,
)


# **That's it for this notebook. If you would like to see other
# demonstrations, feel free to open an issue on
# [GitHub](https://github.com/pycaret/pycaret/issues).**