import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import duckdb as db
import matplotlib.pyplot as plt
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import optuna
from IPython.display import display, Markdown

from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA, MSTL, MFLES, AutoMFLES, ConformalIntervals
from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

pio.renderers.default = "colab+notebook_connected+vscode"
optuna.logging.set_verbosity(optuna.logging.ERROR)
orig = pd.read_parquet("../../data/processed/train_enhanced.parquet")
train_agg = pd.read_parquet("../../data/processed/train_agg.parquet")
train_region_code_agg = pd.read_parquet("../../data/processed/train_region_code_agg.parquet")
holiday_df = pd.read_csv("../../data/processed/holidays.csv")
train_region_code_agg
| | Date | Region_Code | Total_Sales | Avg_Sales | Total_Orders | Avg_Orders | Num_Stores | Holiday | Total_Discounts |
|---|---|---|---|---|---|---|---|---|---|
0 | 2018-01-01 | R4 | 2286812 | 45736 | 2914 | 58 | 50 | 1 | 50 |
1 | 2018-01-01 | R2 | 4436859 | 42256 | 5644 | 54 | 105 | 1 | 105 |
2 | 2018-01-01 | R3 | 3527439 | 41017 | 4599 | 53 | 86 | 1 | 86 |
3 | 2018-01-01 | R1 | 5094374 | 41084 | 6509 | 52 | 124 | 1 | 124 |
4 | 2018-01-02 | R4 | 2545119 | 50902 | 3057 | 61 | 50 | 0 | 50 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2059 | 2019-05-30 | R4 | 1966320 | 39326 | 2829 | 57 | 50 | 0 | 4 |
2060 | 2019-05-31 | R2 | 4351299 | 41441 | 6411 | 61 | 105 | 1 | 11 |
2061 | 2019-05-31 | R4 | 1909319 | 38186 | 2746 | 55 | 50 | 1 | 1 |
2062 | 2019-05-31 | R1 | 5900798 | 47587 | 9433 | 76 | 124 | 1 | 18 |
2063 | 2019-05-31 | R3 | 3440408 | 40005 | 5012 | 58 | 86 | 1 | 9 |
2064 rows × 9 columns
df = train_region_code_agg[["Region_Code", "Date", "Total_Sales", "Holiday", "Total_Discounts"]].copy()
# Map each date to a 1-based integer index so the series can be split by position
date_mapping = {date: idx + 1 for idx, date in enumerate(sorted(df['Date'].unique()))}
df['idx'] = df['Date'].map(date_mapping)
# Rename to the schema statsforecast expects: unique_id, ds, y
df = df.rename(columns={"Total_Sales": "y", "Region_Code": "unique_id", "Date": "ds"})
df = df.sort_values(by='ds')
df
| | unique_id | ds | y | Holiday | Total_Discounts | idx |
|---|---|---|---|---|---|---|
0 | R4 | 2018-01-01 | 2286812 | 1 | 50 | 1 |
1 | R2 | 2018-01-01 | 4436859 | 1 | 105 | 1 |
2 | R3 | 2018-01-01 | 3527439 | 1 | 86 | 1 |
3 | R1 | 2018-01-01 | 5094374 | 1 | 124 | 1 |
4 | R4 | 2018-01-02 | 2545119 | 0 | 50 | 2 |
... | ... | ... | ... | ... | ... | ... |
2059 | R4 | 2019-05-30 | 1966320 | 0 | 4 | 515 |
2061 | R4 | 2019-05-31 | 1909319 | 1 | 1 | 516 |
2062 | R1 | 2019-05-31 | 5900798 | 1 | 18 | 516 |
2060 | R2 | 2019-05-31 | 4351299 | 1 | 11 | 516 |
2063 | R3 | 2019-05-31 | 3440408 | 1 | 9 | 516 |
2064 rows × 6 columns
# Time-ordered 80/20 split on the date index
threshold = 0.8
train = df[df['idx'] <= df['idx'].max() * threshold]
test = df[df['idx'] > df['idx'].max() * threshold]
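Since the split is positional on the date index, a quick leakage guard (an addition, not in the original flow) confirms every training date precedes every test date:

# Guard against temporal leakage between the splits
assert train['ds'].max() < test['ds'].min()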
train
| | unique_id | ds | y | Holiday | Total_Discounts | idx |
|---|---|---|---|---|---|---|
0 | R4 | 2018-01-01 | 2286812 | 1 | 50 | 1 |
1 | R2 | 2018-01-01 | 4436859 | 1 | 105 | 1 |
2 | R3 | 2018-01-01 | 3527439 | 1 | 86 | 1 |
3 | R1 | 2018-01-01 | 5094374 | 1 | 124 | 1 |
4 | R4 | 2018-01-02 | 2545119 | 0 | 50 | 2 |
... | ... | ... | ... | ... | ... | ... |
1640 | R3 | 2019-02-15 | 3300873 | 0 | 73 | 411 |
1644 | R2 | 2019-02-16 | 4485144 | 0 | 93 | 412 |
1645 | R4 | 2019-02-16 | 2120472 | 0 | 49 | 412 |
1646 | R1 | 2019-02-16 | 6426930 | 0 | 108 | 412 |
1647 | R3 | 2019-02-16 | 3960552 | 0 | 76 | 412 |
1648 rows × 6 columns
test
| | unique_id | ds | y | Holiday | Total_Discounts | idx |
|---|---|---|---|---|---|---|
1648 | R3 | 2019-02-17 | 4253736 | 0 | 83 | 413 |
1649 | R1 | 2019-02-17 | 6858420 | 0 | 120 | 413 |
1650 | R4 | 2019-02-17 | 2341383 | 0 | 49 | 413 |
1651 | R2 | 2019-02-17 | 4888986 | 0 | 99 | 413 |
1652 | R3 | 2019-02-18 | 3948027 | 0 | 78 | 414 |
... | ... | ... | ... | ... | ... | ... |
2059 | R4 | 2019-05-30 | 1966320 | 0 | 4 | 515 |
2061 | R4 | 2019-05-31 | 1909319 | 1 | 1 | 516 |
2062 | R1 | 2019-05-31 | 5900798 | 1 | 18 | 516 |
2060 | R2 | 2019-05-31 | 4351299 | 1 | 11 | 516 |
2063 | R3 | 2019-05-31 | 3440408 | 1 | 9 | 516 |
416 rows × 6 columns
# Forecast horizon in days: gap between the last training index and the last test index
prediction_window = test['idx'].max() - train['idx'].max()
prediction_window
104
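The 416 test rows correspond to 104 days for each of the four regions. A hedged sanity check (added here, not in the original notebook):

# Each region should contribute exactly prediction_window (104) test days
assert test.groupby('unique_id').size().eq(prediction_window).all()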
train = train.drop(columns=['idx'])
test = test.drop(columns=['idx'])
# Candidate seasonal periods (in days) shared by MSTL and AutoMFLES
seasonality = [7, 12, 30]
models = [
    AutoARIMA(
        season_length=12,
        d=0,
        approximation=True,
        stationary=True,
    ),
    MSTL(
        season_length=seasonality,
        trend_forecaster=AutoARIMA(
            d=0,
            approximation=True,
            stationary=True,
            prediction_intervals=ConformalIntervals(h=prediction_window),
        ),
    ),
    AutoMFLES(
        season_length=seasonality,
        test_size=20,
        prediction_intervals=ConformalIntervals(h=prediction_window),
    ),
]
fcst = StatsForecast(models=models, freq="D", n_jobs=-1)
fcst.fit(train);
# X_df carries the future exogenous regressors (Holiday, Total_Discounts)
forecast = fcst.predict(h=prediction_window, level=[95], X_df=test.drop(columns=['y']))
forecast
| | unique_id | ds | AutoARIMA | AutoARIMA-lo-95 | AutoARIMA-hi-95 | MSTL | MSTL-lo-95 | MSTL-hi-95 | AutoMFLES | AutoMFLES-lo-95 | AutoMFLES-hi-95 |
|---|---|---|---|---|---|---|---|---|---|---|---|
0 | R1 | 2019-02-17 | 6666489.000 | 4.967450e+06 | 8365528.000 | 7860963.500 | 7.173004e+06 | 8548923.000 | 6284394.000 | 5568465.000 | 7000323.000 |
1 | R1 | 2019-02-18 | 6439762.000 | 4.580992e+06 | 8298532.500 | 6418570.500 | 5.809900e+06 | 7027240.500 | 6035970.500 | 5626605.500 | 6445336.000 |
2 | R1 | 2019-02-19 | 4717097.500 | 2.828473e+06 | 6605722.500 | 3911237.750 | 1.795879e+06 | 6026596.500 | 4297116.000 | 1185046.375 | 7409185.500 |
3 | R1 | 2019-02-20 | 5539322.500 | 3.644876e+06 | 7433769.500 | 5014326.500 | 3.016612e+06 | 7012041.000 | 5055353.000 | 2069786.250 | 8040920.000 |
4 | R1 | 2019-02-21 | 5380298.000 | 3.484707e+06 | 7275888.500 | 4978095.000 | 2.719740e+06 | 7236449.000 | 4924604.000 | 1592243.250 | 8256965.000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
411 | R4 | 2019-05-27 | 2244195.000 | 1.554406e+06 | 2933983.750 | 2056336.500 | 1.980181e+06 | 2132491.750 | 1875758.875 | 1646278.875 | 2105238.750 |
412 | R4 | 2019-05-28 | 2234609.750 | 1.544821e+06 | 2924398.500 | 1933951.875 | 1.729963e+06 | 2137941.000 | 1820966.250 | 1588354.000 | 2053578.625 |
413 | R4 | 2019-05-29 | 2052491.625 | 1.362703e+06 | 2742280.250 | 1975758.500 | 1.717912e+06 | 2233604.500 | 1684806.250 | 1307494.750 | 2062117.875 |
414 | R4 | 2019-05-30 | 1860788.375 | 1.171000e+06 | 2550577.000 | 1572959.375 | 1.177561e+06 | 1968357.625 | 1647948.625 | 1221317.625 | 2074579.625 |
415 | R4 | 2019-05-31 | 1395954.375 | 7.061657e+05 | 2085743.125 | 1374726.000 | 9.185508e+05 | 1830901.250 | 1486565.500 | 1341881.250 | 1631249.875 |
416 rows × 11 columns
merged = test.merge(forecast, on=['unique_id', 'ds'], how='left')
print("MAPE using ARIMA: ", mean_absolute_percentage_error(merged['y'], merged['AutoARIMA']))
print("MAPE using MSTL: ", mean_absolute_percentage_error(merged['y'], merged['MSTL']))
print("MAPE using AutoMFLES: ", mean_absolute_percentage_error(merged['y'], merged['AutoMFLES']))
MAPE using ARIMA:  0.1742272055815889
MAPE using MSTL:  0.17424597681344803
MAPE using AutoMFLES:  0.19126523176303173
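A per-region breakdown (an added check, not in the original notebook) can reveal whether one region drives the overall error; this sketch reuses the merged frame above:

# Added sketch: MAPE per region for each fitted model
for model in ["AutoARIMA", "MSTL", "AutoMFLES"]:
    per_region = merged.groupby("unique_id").apply(
        lambda g: mean_absolute_percentage_error(g["y"], g[model])
    )
    print(model, per_region.round(3).to_dict())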
def plot(train, test, forecast, unique_id, model_name):
    filtered_train = train[train['unique_id'] == unique_id]
    filtered_test = test[test['unique_id'] == unique_id]
    filtered_forecast = forecast[forecast['unique_id'] == unique_id]

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=filtered_train['ds'], y=filtered_train['y'], mode='lines', name='Train'))
    fig.add_trace(go.Scatter(x=filtered_test['ds'], y=filtered_test['y'], mode='lines', name='Test'))
    fig.add_trace(go.Scatter(x=filtered_forecast['ds'], y=filtered_forecast[model_name], mode='lines', name='Forecast'))
    # Upper bound of the 95% interval (invisible line, used as the fill anchor)
    fig.add_trace(go.Scatter(
        x=filtered_forecast['ds'],
        y=filtered_forecast[f"{model_name}-hi-95"],
        mode='lines',
        name='Upper Bound',
        line=dict(width=0),
        showlegend=False
    ))
    # Lower bound, drawn as an area filled up to the upper-bound trace
    fig.add_trace(go.Scatter(
        x=filtered_forecast['ds'],
        y=filtered_forecast[f"{model_name}-lo-95"],
        mode='lines',
        name='Lower Bound',
        fill='tonexty',  # Fill area between this trace and the previous one
        fillcolor='rgba(0, 100, 80, 0.2)',  # Fill color with opacity
        line=dict(width=0),
        showlegend=False
    ))
    fig.update_layout(title=f'Total Sales Forecast for {unique_id}', xaxis_title='Date', yaxis_title='Total Sales')
    fig.show()
plot(train, test, forecast, "R1", "AutoARIMA")
plot(train, test, forecast, "R2", "AutoARIMA")
plot(train, test, forecast, "R3", "AutoARIMA")
plot(train, test, forecast, "R4", "AutoARIMA")
plot(train, test, forecast, "R1", "MSTL")
plot(train, test, forecast, "R2", "MSTL")
plot(train, test, forecast, "R3", "MSTL")
plot(train, test, forecast, "R4", "MSTL")
plot(train, test, forecast, "R1", "AutoMFLES")
plot(train, test, forecast, "R2", "AutoMFLES")
plot(train, test, forecast, "R3", "AutoMFLES")
plot(train, test, forecast, "R4", "AutoMFLES")
train_agg
| | Date | Total_Sales | Avg_Sales | Total_Orders | Avg_Orders | Holiday | Total_Discounts |
|---|---|---|---|---|---|---|---|
0 | 2018-01-01 | 15345484 | 42042 | 19666 | 54 | 1 | 365 |
1 | 2018-01-02 | 19592415 | 53678 | 25326 | 69 | 0 | 365 |
2 | 2018-01-03 | 18652527 | 51103 | 24047 | 66 | 0 | 365 |
3 | 2018-01-04 | 19956267 | 54675 | 25584 | 70 | 0 | 364 |
4 | 2018-01-05 | 22902651 | 62747 | 28436 | 78 | 0 | 364 |
... | ... | ... | ... | ... | ... | ... | ... |
511 | 2019-05-27 | 17197023 | 47115 | 25447 | 70 | 0 | 321 |
512 | 2019-05-28 | 18652065 | 51102 | 27184 | 74 | 0 | 319 |
513 | 2019-05-29 | 16213497 | 44421 | 24047 | 66 | 0 | 193 |
514 | 2019-05-30 | 16082139 | 44061 | 24318 | 67 | 0 | 76 |
515 | 2019-05-31 | 15601825 | 42745 | 23602 | 65 | 1 | 39 |
516 rows × 7 columns
df = train_agg[["Date", "Total_Sales", "Holiday", "Total_Discounts"]].copy()
date_mapping = {date: idx + 1 for idx, date in enumerate(sorted(df['Date'].unique()))}
df['idx'] = df['Date'].map(date_mapping)
df = df.rename(columns={"Total_Sales": "y", "Date": "ds"})
df = df.sort_values(by='ds')
# Single aggregate series, so one constant series identifier
df["unique_id"] = 1
df
| | ds | y | Holiday | Total_Discounts | idx | unique_id |
|---|---|---|---|---|---|---|
0 | 2018-01-01 | 15345484 | 1 | 365 | 1 | 1 |
1 | 2018-01-02 | 19592415 | 0 | 365 | 2 | 1 |
2 | 2018-01-03 | 18652527 | 0 | 365 | 3 | 1 |
3 | 2018-01-04 | 19956267 | 0 | 364 | 4 | 1 |
4 | 2018-01-05 | 22902651 | 0 | 364 | 5 | 1 |
... | ... | ... | ... | ... | ... | ... |
511 | 2019-05-27 | 17197023 | 0 | 321 | 512 | 1 |
512 | 2019-05-28 | 18652065 | 0 | 319 | 513 | 1 |
513 | 2019-05-29 | 16213497 | 0 | 193 | 514 | 1 |
514 | 2019-05-30 | 16082139 | 0 | 76 | 515 | 1 |
515 | 2019-05-31 | 15601825 | 1 | 39 | 516 | 1 |
516 rows × 6 columns
threshold = 0.8
train = df[df['idx'] <= df['idx'].max() * threshold]
test = df[df['idx'] > df['idx'].max() * threshold]
train
| | ds | y | Holiday | Total_Discounts | idx | unique_id |
|---|---|---|---|---|---|---|
0 | 2018-01-01 | 15345484 | 1 | 365 | 1 | 1 |
1 | 2018-01-02 | 19592415 | 0 | 365 | 2 | 1 |
2 | 2018-01-03 | 18652527 | 0 | 365 | 3 | 1 |
3 | 2018-01-04 | 19956267 | 0 | 364 | 4 | 1 |
4 | 2018-01-05 | 22902651 | 0 | 364 | 5 | 1 |
... | ... | ... | ... | ... | ... | ... |
407 | 2019-02-12 | 12384495 | 0 | 7 | 408 | 1 |
408 | 2019-02-13 | 12457656 | 0 | 39 | 409 | 1 |
409 | 2019-02-14 | 13301838 | 0 | 139 | 410 | 1 |
410 | 2019-02-15 | 14306637 | 0 | 307 | 411 | 1 |
411 | 2019-02-16 | 16993098 | 0 | 326 | 412 | 1 |
412 rows × 6 columns
test
| | ds | y | Holiday | Total_Discounts | idx | unique_id |
|---|---|---|---|---|---|---|
412 | 2019-02-17 | 18342525 | 0 | 351 | 413 | 1 |
413 | 2019-02-18 | 16319103 | 0 | 319 | 414 | 1 |
414 | 2019-02-19 | 15816166 | 1 | 222 | 415 | 1 |
415 | 2019-02-20 | 12800094 | 0 | 59 | 416 | 1 |
416 | 2019-02-21 | 12383694 | 0 | 37 | 417 | 1 |
... | ... | ... | ... | ... | ... | ... |
511 | 2019-05-27 | 17197023 | 0 | 321 | 512 | 1 |
512 | 2019-05-28 | 18652065 | 0 | 319 | 513 | 1 |
513 | 2019-05-29 | 16213497 | 0 | 193 | 514 | 1 |
514 | 2019-05-30 | 16082139 | 0 | 76 | 515 | 1 |
515 | 2019-05-31 | 15601825 | 1 | 39 | 516 | 1 |
104 rows × 6 columns
prediction_window = test['idx'].max() - train['idx'].max()
prediction_window
104
train = train.drop(columns=['idx'])
test = test.drop(columns=['idx'])
seasonality = [7, 12, 30]
models = [
    AutoARIMA(
        season_length=12,
        d=0,
        approximation=True,
        stationary=True,
    ),
    MSTL(
        season_length=seasonality,
        trend_forecaster=AutoARIMA(
            d=0,
            approximation=True,
            stationary=True,
            prediction_intervals=ConformalIntervals(h=prediction_window),
        ),
    ),
    AutoMFLES(
        season_length=seasonality,
        test_size=20,
        prediction_intervals=ConformalIntervals(h=prediction_window),
    ),
]
fcst = StatsForecast(models=models, freq="D", n_jobs=-1)
fcst.fit(train);
forecast = fcst.predict(h=prediction_window, level=[95], X_df=test.drop(columns=['y']))
forecast
| | unique_id | ds | AutoARIMA | AutoARIMA-lo-95 | AutoARIMA-hi-95 | MSTL | MSTL-lo-95 | MSTL-hi-95 | AutoMFLES | AutoMFLES-lo-95 | AutoMFLES-hi-95 |
|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2019-02-17 | 17897164.0 | 13301840.0 | 22492488.0 | 20815704.0 | 18865470.0 | 22765938.0 | 16443518.0 | 15116372.00 | 17770664.0 |
1 | 1 | 2019-02-18 | 17645540.0 | 12640227.0 | 22650854.0 | 17287290.0 | 15781578.0 | 18793000.0 | 16055208.0 | 15050656.00 | 17059760.0 |
2 | 1 | 2019-02-19 | 13183467.0 | 8105391.5 | 18261542.0 | 10942386.0 | 5298980.0 | 16585792.0 | 11509044.0 | 2970222.25 | 20047866.0 |
3 | 1 | 2019-02-20 | 14814651.0 | 9723128.0 | 19906174.0 | 13533622.0 | 8225240.0 | 18842004.0 | 12900191.0 | 4727806.50 | 21072574.0 |
4 | 1 | 2019-02-21 | 14426889.0 | 9332863.0 | 19520914.0 | 13593423.0 | 7848842.5 | 19338004.0 | 12633228.0 | 3807427.25 | 21459028.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
99 | 1 | 2019-05-27 | 17838166.0 | 12724844.0 | 22951490.0 | 16683699.0 | 16239398.0 | 17128000.0 | 16079478.0 | 15046672.00 | 17112284.0 |
100 | 1 | 2019-05-28 | 17814770.0 | 12701448.0 | 22928094.0 | 15773848.0 | 14561388.0 | 16986308.0 | 16055208.0 | 14594145.00 | 17516272.0 |
101 | 1 | 2019-05-29 | 16340833.0 | 11227510.0 | 21454156.0 | 15926136.0 | 13938983.0 | 17913290.0 | 14526238.0 | 11550098.00 | 17502378.0 |
102 | 1 | 2019-05-30 | 14972177.0 | 9858854.0 | 20085500.0 | 12472893.0 | 9664525.0 | 15281260.0 | 13106480.0 | 10257496.00 | 15955465.0 |
103 | 1 | 2019-05-31 | 11072638.0 | 5959315.0 | 16185961.0 | 10983630.0 | 7566960.5 | 14400299.0 | 9288397.0 | 8639927.00 | 9936868.0 |
104 rows × 11 columns
merged = test.merge(forecast, on=['unique_id', 'ds'], how='left')
print("MAPE using ARIMA: ", mean_absolute_percentage_error(merged['y'], merged['AutoARIMA']))
print("MAPE using MSTL: ", mean_absolute_percentage_error(merged['y'], merged['MSTL']))
print("MAPE using AutoMFLES: ", mean_absolute_percentage_error(merged['y'], merged['AutoMFLES']))
MAPE using ARIMA:  0.17022523364912714
MAPE using MSTL:  0.1693179254887603
MAPE using AutoMFLES:  0.18515452838892862
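At the aggregate level MSTL comes out ahead (≈16.9% MAPE), narrowly beating AutoARIMA (≈17.0%); both improve slightly on their ≈17.4% region-level scores, while AutoMFLES trails in both setups.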
plot(train, test, forecast, 1, "AutoARIMA")
plot(train, test, forecast, 1, "MSTL")
plot(train, test, forecast, 1, "AutoMFLES")
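Both experiments score the models on a single 104-day holdout. statsforecast's built-in cross_validation could replace the manual split with rolling-origin evaluation; a minimal sketch, with illustrative (untuned) window settings:

# Sketch: rolling-origin cross-validation instead of a single holdout.
# n_windows and step_size are illustrative assumptions, not tuned values.
cv = StatsForecast(models=models, freq="D", n_jobs=-1)
cv_df = cv.cross_validation(df=df.drop(columns=['idx']), h=prediction_window,
                            step_size=prediction_window, n_windows=2)
for model in ["AutoARIMA", "MSTL", "AutoMFLES"]:
    print(model, mean_absolute_percentage_error(cv_df['y'], cv_df[model]))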