#!/usr/bin/env python
# coding: utf-8

# # Regressors and exogenous data
#
# This notebook serves as a tutorial for:
#
# - Loading regressors into a TSDataset
# - Training and using a model with regressors
#
# **Table of Contents**
#
# * [What is a regressor?](#chapter1)
#     * [What is additional data?](#chapter1_1)
# * [Dataset](#chapter2)
#     * [Loading Dataset](#chapter2_1)
#     * [EDA](#chapter2_2)
# * [Forecast with regressors](#chapter3)

# ## 1. What is a regressor?
#
# In previous tutorials, we have shown how to work with **target time series**.
#
# > A target time series is a time series we want to forecast.
#
# But imagine that you have information about the future that can help the model forecast the target time series.
# It may be information about holidays, weather, recurring events, marketing campaigns, etc.
# We will call it a **regressor**.
#
# > A regressor is a time series that we are not interested in forecasting itself; however, it may help to forecast the target time series.
#
# To apply an ML model that uses regressors to make more accurate forecasts,
# we need to know how the regressors affected the target time series in the past,
# and we need their values in the future.
#
# ### What is additional data?
#
# There is also data that we don't know in advance.
# However, using it still allows us to make more accurate forecasts. We will call this data **additional data**.
# For example, if many users bought a new phone a few weeks ago, we should expect more support requests about this product.
#
# In order to use additional data in ML models, we should create regressors out of it.
# For example, this can be done with LagTransform or TrendTransform.
#
# In this tutorial we will not look at **additional data** and will focus on **regressors**.

# ## 2. Dataset
#
# ETNA makes working with regressors as convenient as working with the target time series.
#
# We are going to forecast the time series from [Tabular Playground Series - Jan 2022](https://www.kaggle.com/c/tabular-playground-series-jan-2022/overview).
# The dataset contains daily merchandise sales – mugs, hats, and stickers – at two imaginary
# store chains across three Scandinavian countries. As exogenous data, we will use the
# [Finland, Norway, and Sweden Weather Data 2015-2019](https://www.kaggle.com/adamwurdits/finland-norway-and-sweden-weather-data-20152019?select=nordics_weather.csv)
# dataset containing daily country-average precipitation, snow depth, and air temperature.
#
# ### 2.1 Loading Dataset
#
# First, let's load the data.

# In[1]:


import pandas as pd

import warnings

warnings.filterwarnings("ignore")

target_df = pd.read_csv("data/nordic_merch_sales.csv")
regressor_df = pd.read_csv("data/nordics_weather.csv")


# The next step is converting the data into the ETNA format.
# The code that does this is identical for the target time series and the exogenous data.

# In[2]:


from etna.datasets import TSDataset

target_df = TSDataset.to_dataset(target_df)
target_df.tail()


# As you can see, the target ends in 2018, and the exogenous data ends in 2019,
# so we have prior information a year ahead.
# This implies that our exogenous data contains only regressors.

# In[3]:


regressor_df = TSDataset.to_dataset(regressor_df)
regressor_df.tail()


# Then we create a TSDataset with both the target time series and the exogenous data.
# TSDataset expects the target time series in the `df` argument and the exogenous data in `df_exog`.
# We keep them separate because regressors contain information about the target's future,
# and TSDataset ensures we don't mix them.
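# Before building the TSDataset, it may help to see the expected layout on a toy example. The sketch below uses
# purely synthetic data (the segment name, feature name, and values are made up for illustration): both frames start
# in long format with `timestamp`, `segment`, and feature columns, and the exogenous frame extends further into the
# future than the target, just like the weather data above.

# In[ ]:


# Toy target: 5 daily points for one synthetic segment.
toy_target = pd.DataFrame(
    {
        "timestamp": pd.date_range("2020-01-01", periods=5, freq="D"),
        "segment": "toy_segment",
        "target": [10.0, 12.0, 11.0, 13.0, 12.0],
    }
)

# Toy regressor: known two days further into the future than the target.
toy_regressor = pd.DataFrame(
    {
        "timestamp": pd.date_range("2020-01-01", periods=7, freq="D"),
        "segment": "toy_segment",
        "temperature": [1.0, 0.5, -0.3, 0.0, 1.2, 2.0, 1.5],
    }
)

# Both frames are converted to the wide ETNA format in exactly the same way.
toy_df = TSDataset.to_dataset(toy_target)
toy_df_exog = TSDataset.to_dataset(toy_regressor)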
# In order to specify which columns of `df_exog` are regressors, we use the `known_future` parameter.
# It allows TSDataset to determine which columns are **regressors** and which columns are **additional data**.

# In[4]:


ts = TSDataset(df=target_df, freq="D", df_exog=regressor_df, known_future="all")
ts.head()


# ### 2.2 EDA
#
# TSDataset joins the exogenous data and the target time series,
# so we can visualize and analyze the exogenous data in the same way as the target time series.
# More details can be found in the [EDA notebook](https://github.com/tinkoff-ai/etna/blob/09a7938103c56e10bcdac7f13def9fa66c2c88dd/examples/EDA.ipynb).

# In[5]:


ts.plot(column="snow_depth", n_segments=2)


# In[6]:


ts.plot(column="precipitation", n_segments=2)


# In[7]:


ts.plot(column="target", n_segments=2)


# ## 3. Forecast with regressors
#
# We will use LinearPerSegmentModel. It is a simple model that works with regressors.
#
# > Note: some models do not work with regressors. In this case, they will warn you about it.
#
# We are going to forecast merchandise sales a year ahead using the weather regressors.

# In[8]:


from etna.models import LinearPerSegmentModel

HORIZON = 365
model = LinearPerSegmentModel()


# ETNA allows configuring transforms for exogenous data in the same way as for the target time series.
# In addition, transforms automatically update the information about regressors in TSDataset.

# In[9]:


from etna.transforms import FilterFeaturesTransform
from etna.transforms import MeanTransform  # math
from etna.transforms import DateFlagsTransform, HolidayTransform  # datetime
from etna.transforms import LagTransform  # lags

transforms = [
    LagTransform(
        in_column="target",
        lags=list(range(HORIZON, HORIZON + 28)),
        out_column="target_lag",
    ),
    LagTransform(in_column="tavg", lags=list(range(1, 3)), out_column="tavg_lag"),
    MeanTransform(in_column="tavg", window=7, out_column="tavg_mean"),
    MeanTransform(
        in_column="target_lag_365",
        out_column="target_mean",
        window=104,
        seasonality=7,
    ),
    DateFlagsTransform(
        day_number_in_week=True,
        day_number_in_month=True,
        is_weekend=True,
        special_days_in_week=[4],
        out_column="date_flag",
    ),
    HolidayTransform(iso_code="SWE", out_column="SWE_holidays"),
    HolidayTransform(iso_code="NOR", out_column="NOR_holidays"),
    HolidayTransform(iso_code="FIN", out_column="FIN_holidays"),
    LagTransform(
        in_column="SWE_holidays",
        lags=list(range(2, 6)),
        out_column="SWE_holidays_lag",
    ),
    LagTransform(
        in_column="NOR_holidays",
        lags=list(range(2, 6)),
        out_column="NOR_holidays_lag",
    ),
    LagTransform(
        in_column="FIN_holidays",
        lags=list(range(2, 6)),
        out_column="FIN_holidays_lag",
    ),
    FilterFeaturesTransform(exclude=["precipitation", "snow_depth", "tmin", "tmax"]),
]


# The next steps are identical to the case where we work with the target time series only.

# In[10]:


from etna.pipeline import Pipeline

pipeline = Pipeline(model=model, transforms=transforms, horizon=HORIZON)


# In[11]:


from etna.metrics import SMAPE

metrics, forecasts, _ = pipeline.backtest(ts, metrics=[SMAPE()], aggregate_metrics=True, n_folds=2)


# In[12]:


metrics


# In[13]:


from etna.analysis import plot_backtest

plot_backtest(forecasts, ts)


# Supporting more strategies for working with regressors and additional data is on the ETNA development roadmap.
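# As an optional last step, which is not part of the backtest above, the same pipeline can be fitted on the whole
# dataset to produce the actual year-ahead forecast. This is only a minimal sketch; it assumes the `ts` and
# `pipeline` objects defined earlier are still in scope, and it relies on the regressor values for the forecast
# horizon already being present in `df_exog`.

# In[ ]:


# Fit on the full history and forecast HORIZON steps ahead using the known future regressors.
pipeline.fit(ts)
future_forecast = pipeline.forecast()
future_forecast.head()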