#!/usr/bin/env python
# coding: utf-8

# # Ensembles
#
# [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/etna-team/etna/master?filepath=examples/203-ensembles.ipynb)

# This notebook shows simple examples of using ensemble models with the ETNA library.
#
# **Table of contents**
#
# * [Loading dataset](#chapter1)
# * [Building pipelines](#chapter2)
# * [Ensembles](#chapter3)
#    * [VotingEnsemble](#section_3_1)
#    * [StackingEnsemble](#section_3_2)
#    * [Results](#section_3_3)

# In[1]:

import warnings

warnings.filterwarnings("ignore")

# In[2]:

import pandas as pd

from etna.datasets import TSDataset
from etna.metrics import MAE, MAPE, MSE, SMAPE
from etna.models import CatBoostMultiSegmentModel, NaiveModel, SeasonalMovingAverageModel
from etna.pipeline import Pipeline
from etna.transforms import LagTransform

# ## 1. Loading dataset
#
# Here we work with a dataset containing a single segment of monthly wine sales;
# the workflow for a dataset with more segments is exactly the same.

# In[3]:

# Reshape the raw CSV into the (timestamp, target, segment) long format ETNA expects.
raw_df = pd.read_csv("data/monthly-australian-wine-sales.csv")
raw_df["timestamp"] = pd.to_datetime(raw_df["month"])
raw_df["target"] = raw_df["sales"]
raw_df.drop(columns=["month", "sales"], inplace=True)
raw_df["segment"] = "main"
raw_df.head()

# Build the wide-format TSDataset with monthly-start ("MS") frequency and plot it.
df = TSDataset.to_dataset(raw_df)
ts = TSDataset(df=df, freq="MS")
ts.plot()

# ## 2. Building pipelines
#
# Given the sales history, we want to select the best model (pipeline) to forecast future sales.
# In[4]:

HORIZON = 3
N_FOLDS = 5

# Let's build three pipelines using different models.

# In[5]:

naive_pipeline = Pipeline(model=NaiveModel(lag=12), transforms=[], horizon=HORIZON)
seasonalma_pipeline = Pipeline(
    model=SeasonalMovingAverageModel(window=5, seasonality=12),
    transforms=[],
    horizon=HORIZON,
)
catboost_pipeline = Pipeline(
    model=CatBoostMultiSegmentModel(),
    transforms=[LagTransform(lags=[6, 7, 8, 9, 10, 11, 12], in_column="target")],
    horizon=HORIZON,
)
pipeline_names = ["naive", "moving average", "catboost"]
pipelines = [naive_pipeline, seasonalma_pipeline, catboost_pipeline]

# And evaluate their performance on the backtest.

# In[6]:

# backtest(...) returns the aggregated metrics frame first; `.iloc[:, 1:]` drops
# its leading segment column so only the metric values remain.
per_pipeline_metrics = [
    pipeline.backtest(
        ts=ts,
        metrics=[MAE(), MSE(), SMAPE(), MAPE()],
        n_folds=N_FOLDS,
        aggregate_metrics=True,
        n_jobs=5,
    )[0].iloc[:, 1:]
    for pipeline in pipelines
]

# In[7]:

metrics = pd.concat(per_pipeline_metrics)
metrics.index = pipeline_names
metrics

# ## 3. Ensembles

# To improve on the individual models, we can combine them into ensembles.
# The library provides two ensembling methods, which we try below.

# ### 3.1 `VotingEnsemble`
#
# `VotingEnsemble` forecasts future values as a weighted average of its `pipelines` forecasts.

# In[8]:

from etna.ensembles import VotingEnsemble

# By default, `VotingEnsemble` uses **uniform** weights for the pipelines' forecasts.
# You can also set the weights manually via the `weights` parameter — the higher the
# weight, the more you trust that base model — or pass the literal `"auto"`, in which
# case the weights are taken from the `feature_importance_` property of the `regressor`.
#
# *Note*: the `weights` are automatically normalized.
# In[9]:

voting_ensemble = VotingEnsemble(pipelines=pipelines, weights=[1, 9, 4], n_jobs=4)

# In[10]:

# NOTE(review): identifiers below were previously misspelled "ensamble";
# renamed to the consistent "ensemble" spelling used everywhere else.
# `.iloc[:, 1:]` drops the leading segment column of the aggregated metrics frame.
voting_ensemble_metrics = voting_ensemble.backtest(
    ts=ts,
    metrics=[MAE(), MSE(), SMAPE(), MAPE()],
    n_folds=N_FOLDS,
    aggregate_metrics=True,
    n_jobs=2,
)[0].iloc[:, 1:]
voting_ensemble_metrics.index = ["voting ensemble"]
voting_ensemble_metrics

# ### 3.2 `StackingEnsemble`

# `StackingEnsemble` forecasts the future using a metamodel to combine the forecasts
# of its `pipelines`.

# In[11]:

from etna.ensembles import StackingEnsemble

# By default, `StackingEnsemble` uses only the pipelines' forecasts as features for
# the `final_model`. Additional features can be requested via `features_to_use`:
#
# + **None** - use only the pipelines' forecasts (default)
# + **List[str]** - use the pipelines' forecasts + features from the list
# + **"all"** - use all the available features
#
# *Note:* only the features available to the base models can be used.

# In[12]:

stacking_ensemble_unfeatured = StackingEnsemble(pipelines=pipelines, n_folds=10, n_jobs=4)

# In[13]:

stacking_ensemble_metrics = stacking_ensemble_unfeatured.backtest(
    ts=ts,
    metrics=[MAE(), MSE(), SMAPE(), MAPE()],
    n_folds=N_FOLDS,
    aggregate_metrics=True,
    n_jobs=2,
)[0].iloc[:, 1:]
stacking_ensemble_metrics.index = ["stacking ensemble"]
stacking_ensemble_metrics

# It is also possible to specify the `final_model`; any regression model with the
# sklearn interface can be used for this purpose.

# ### 3.3 Results
#
# Finally, let's take a look at the results of our experiments.

# In[14]:

metrics = pd.concat([metrics, voting_ensemble_metrics, stacking_ensemble_metrics])
metrics