import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import duckdb as db
import matplotlib.pyplot as plt
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import optuna
from IPython.display import display, Markdown

from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA, MSTL, MFLES, AutoMFLES, ConformalIntervals
from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

pio.renderers.default = "colab+notebook_connected+vscode"
optuna.logging.set_verbosity(optuna.logging.ERROR)
orig = pd.read_parquet("../../data/processed/train_enhanced.parquet")
train_agg = pd.read_parquet("../../data/processed/train_agg.parquet")
train_region_code_agg = pd.read_parquet("../../data/processed/train_region_code_agg.parquet")
holiday_df = pd.read_csv("../../data/processed/holidays.csv")
train_region_code_agg
| | Date | Region_Code | Total_Sales | Avg_Sales | Total_Orders | Avg_Orders | Num_Stores | Holiday | Total_Discounts |
|---|---|---|---|---|---|---|---|---|---|
0 | 2018-01-01 | R4 | 2286812 | 45736 | 2914 | 58 | 50 | 1 | 50 |
1 | 2018-01-01 | R2 | 4436859 | 42256 | 5644 | 54 | 105 | 1 | 105 |
2 | 2018-01-01 | R3 | 3527439 | 41017 | 4599 | 53 | 86 | 1 | 86 |
3 | 2018-01-01 | R1 | 5094374 | 41084 | 6509 | 52 | 124 | 1 | 124 |
4 | 2018-01-02 | R4 | 2545119 | 50902 | 3057 | 61 | 50 | 0 | 50 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2059 | 2019-05-30 | R4 | 1966320 | 39326 | 2829 | 57 | 50 | 0 | 4 |
2060 | 2019-05-31 | R2 | 4351299 | 41441 | 6411 | 61 | 105 | 1 | 11 |
2061 | 2019-05-31 | R4 | 1909319 | 38186 | 2746 | 55 | 50 | 1 | 1 |
2062 | 2019-05-31 | R1 | 5900798 | 47587 | 9433 | 76 | 124 | 1 | 18 |
2063 | 2019-05-31 | R3 | 3440408 | 40005 | 5012 | 58 | 86 | 1 | 9 |
2064 rows × 9 columns
df = train_region_code_agg[["Region_Code", "Date", "Total_Sales", "Holiday", "Total_Discounts"]].copy()
# Map each date to a 1-based integer index so the series can be split by position
date_mapping = {date: idx + 1 for idx, date in enumerate(sorted(df['Date'].unique()))}
df['idx'] = df['Date'].map(date_mapping)
# Rename to the schema statsforecast expects: unique_id, ds, y
df = df.rename(columns={"Total_Sales": "y", "Region_Code": "unique_id", "Date": "ds"})
df = df.sort_values(by='ds')
df
| | unique_id | ds | y | Holiday | Total_Discounts | idx |
|---|---|---|---|---|---|---|
0 | R4 | 2018-01-01 | 2286812 | 1 | 50 | 1 |
1 | R2 | 2018-01-01 | 4436859 | 1 | 105 | 1 |
2 | R3 | 2018-01-01 | 3527439 | 1 | 86 | 1 |
3 | R1 | 2018-01-01 | 5094374 | 1 | 124 | 1 |
4 | R4 | 2018-01-02 | 2545119 | 0 | 50 | 2 |
... | ... | ... | ... | ... | ... | ... |
2059 | R4 | 2019-05-30 | 1966320 | 0 | 4 | 515 |
2061 | R4 | 2019-05-31 | 1909319 | 1 | 1 | 516 |
2062 | R1 | 2019-05-31 | 5900798 | 1 | 18 | 516 |
2060 | R2 | 2019-05-31 | 4351299 | 1 | 11 | 516 |
2063 | R3 | 2019-05-31 | 3440408 | 1 | 9 | 516 |
2064 rows × 6 columns
# Time-ordered 80/20 split on the date index
threshold = 0.8
train = df[df['idx'] <= df['idx'].max() * threshold]
test = df[df['idx'] > df['idx'].max() * threshold]
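Since the split is positional on the date index, a quick leakage guard (an addition, not in the original flow) confirms every training date precedes every test date:

# Guard against temporal leakage between the splits
assert train['ds'].max() < test['ds'].min()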
train
| | unique_id | ds | y | Holiday | Total_Discounts | idx |
|---|---|---|---|---|---|---|
0 | R4 | 2018-01-01 | 2286812 | 1 | 50 | 1 |
1 | R2 | 2018-01-01 | 4436859 | 1 | 105 | 1 |
2 | R3 | 2018-01-01 | 3527439 | 1 | 86 | 1 |
3 | R1 | 2018-01-01 | 5094374 | 1 | 124 | 1 |
4 | R4 | 2018-01-02 | 2545119 | 0 | 50 | 2 |
... | ... | ... | ... | ... | ... | ... |
1640 | R3 | 2019-02-15 | 3300873 | 0 | 73 | 411 |
1644 | R2 | 2019-02-16 | 4485144 | 0 | 93 | 412 |
1645 | R4 | 2019-02-16 | 2120472 | 0 | 49 | 412 |
1646 | R1 | 2019-02-16 | 6426930 | 0 | 108 | 412 |
1647 | R3 | 2019-02-16 | 3960552 | 0 | 76 | 412 |
1648 rows × 6 columns
test
| | unique_id | ds | y | Holiday | Total_Discounts | idx |
|---|---|---|---|---|---|---|
1648 | R3 | 2019-02-17 | 4253736 | 0 | 83 | 413 |
1649 | R1 | 2019-02-17 | 6858420 | 0 | 120 | 413 |
1650 | R4 | 2019-02-17 | 2341383 | 0 | 49 | 413 |
1651 | R2 | 2019-02-17 | 4888986 | 0 | 99 | 413 |
1652 | R3 | 2019-02-18 | 3948027 | 0 | 78 | 414 |
... | ... | ... | ... | ... | ... | ... |
2059 | R4 | 2019-05-30 | 1966320 | 0 | 4 | 515 |
2061 | R4 | 2019-05-31 | 1909319 | 1 | 1 | 516 |
2062 | R1 | 2019-05-31 | 5900798 | 1 | 18 | 516 |
2060 | R2 | 2019-05-31 | 4351299 | 1 | 11 | 516 |
2063 | R3 | 2019-05-31 | 3440408 | 1 | 9 | 516 |
416 rows × 6 columns
# Forecast horizon in days: gap between the last training index and the last test index
prediction_window = test['idx'].max() - train['idx'].max()
prediction_window
104
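The 416 test rows correspond to 104 days for each of the four regions. A hedged sanity check (added here, not in the original notebook):

# Each region should contribute exactly prediction_window (104) test days
assert test.groupby('unique_id').size().eq(prediction_window).all()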
train = train.drop(columns=['idx'])
test = test.drop(columns=['idx'])
# Candidate seasonal periods (in days) shared by MSTL and AutoMFLES
seasonality = [7, 12, 30]
models = [
    AutoARIMA(
        season_length=12,
        d=0,
        approximation=True,
        stationary=True,
    ),
    MSTL(
        season_length=seasonality,
        trend_forecaster=AutoARIMA(
            d=0,
            approximation=True,
            stationary=True,
            prediction_intervals=ConformalIntervals(h=prediction_window),
        ),
    ),
    AutoMFLES(
        season_length=seasonality,
        test_size=20,
        prediction_intervals=ConformalIntervals(h=prediction_window),
    ),
]
fcst = StatsForecast(models=models, freq="D", n_jobs=-1)
fcst.fit(train);
# X_df carries the future exogenous regressors (Holiday, Total_Discounts)
forecast = fcst.predict(h=prediction_window, level=[95], X_df=test.drop(columns=['y']))
forecast
| | unique_id | ds | AutoARIMA | AutoARIMA-lo-95 | AutoARIMA-hi-95 | MSTL | MSTL-lo-95 | MSTL-hi-95 | AutoMFLES | AutoMFLES-lo-95 | AutoMFLES-hi-95 |
|---|---|---|---|---|---|---|---|---|---|---|---|
0 | R1 | 2019-02-17 | 6666489.000 | 4.967450e+06 | 8365528.000 | 7860963.500 | 7.173004e+06 | 8548923.000 | 6284394.000 | 5568465.000 | 7000323.000 |
1 | R1 | 2019-02-18 | 6439762.000 | 4.580992e+06 | 8298532.500 | 6418570.500 | 5.809900e+06 | 7027240.500 | 6035970.500 | 5626605.500 | 6445336.000 |
2 | R1 | 2019-02-19 | 4717097.500 | 2.828473e+06 | 6605722.500 | 3911237.750 | 1.795879e+06 | 6026596.500 | 4297116.000 | 1185046.375 | 7409185.500 |
3 | R1 | 2019-02-20 | 5539322.500 | 3.644876e+06 | 7433769.500 | 5014326.500 | 3.016612e+06 | 7012041.000 | 5055353.000 | 2069786.250 | 8040920.000 |
4 | R1 | 2019-02-21 | 5380298.000 | 3.484707e+06 | 7275888.500 | 4978095.000 | 2.719740e+06 | 7236449.000 | 4924604.000 | 1592243.250 | 8256965.000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
411 | R4 | 2019-05-27 | 2244195.000 | 1.554406e+06 | 2933983.750 | 2056336.500 | 1.980181e+06 | 2132491.750 | 1875758.875 | 1646278.875 | 2105238.750 |
412 | R4 | 2019-05-28 | 2234609.750 | 1.544821e+06 | 2924398.500 | 1933951.875 | 1.729963e+06 | 2137941.000 | 1820966.250 | 1588354.000 | 2053578.625 |
413 | R4 | 2019-05-29 | 2052491.625 | 1.362703e+06 | 2742280.250 | 1975758.500 | 1.717912e+06 | 2233604.500 | 1684806.250 | 1307494.750 | 2062117.875 |
414 | R4 | 2019-05-30 | 1860788.375 | 1.171000e+06 | 2550577.000 | 1572959.375 | 1.177561e+06 | 1968357.625 | 1647948.625 | 1221317.625 | 2074579.625 |
415 | R4 | 2019-05-31 | 1395954.375 | 7.061657e+05 | 2085743.125 | 1374726.000 | 9.185508e+05 | 1830901.250 | 1486565.500 | 1341881.250 | 1631249.875 |
416 rows × 11 columns
merged = test.merge(forecast, on=['unique_id', 'ds'], how='left')
print("MAPE using ARIMA: ", mean_absolute_percentage_error(merged['y'], merged['AutoARIMA']))
print("MAPE using MSTL: ", mean_absolute_percentage_error(merged['y'], merged['MSTL']))
print("MAPE using AutoMFLES: ", mean_absolute_percentage_error(merged['y'], merged['AutoMFLES']))
MAPE using ARIMA:  0.1742272055815889
MAPE using MSTL:  0.17424597681344803
MAPE using AutoMFLES:  0.19126523176303173
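A per-region breakdown (an added check, not in the original notebook) can reveal whether one region drives the overall error; this sketch reuses the merged frame above:

# Added sketch: MAPE per region for each fitted model
for model in ["AutoARIMA", "MSTL", "AutoMFLES"]:
    per_region = merged.groupby("unique_id").apply(
        lambda g: mean_absolute_percentage_error(g["y"], g[model])
    )
    print(model, per_region.round(3).to_dict())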
def plot(train, test, forecast, unique_id, model_name):
    filtered_train = train[train['unique_id'] == unique_id]
    filtered_test = test[test['unique_id'] == unique_id]
    filtered_forecast = forecast[forecast['unique_id'] == unique_id]

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=filtered_train['ds'], y=filtered_train['y'], mode='lines', name='Train'))
    fig.add_trace(go.Scatter(x=filtered_test['ds'], y=filtered_test['y'], mode='lines', name='Test'))
    fig.add_trace(go.Scatter(x=filtered_forecast['ds'], y=filtered_forecast[model_name], mode='lines', name='Forecast'))
    # Upper bound of the 95% interval (invisible line, used as the fill anchor)
    fig.add_trace(go.Scatter(
        x=filtered_forecast['ds'],
        y=filtered_forecast[f"{model_name}-hi-95"],
        mode='lines',
        name='Upper Bound',
        line=dict(width=0),
        showlegend=False
    ))
    # Lower bound, drawn as an area filled up to the upper-bound trace
    fig.add_trace(go.Scatter(
        x=filtered_forecast['ds'],
        y=filtered_forecast[f"{model_name}-lo-95"],
        mode='lines',
        name='Lower Bound',
        fill='tonexty',  # Fill area between this trace and the previous one
        fillcolor='rgba(0, 100, 80, 0.2)',  # Fill color with opacity
        line=dict(width=0),
        showlegend=False
    ))
    fig.update_layout(title=f'Total Sales Forecast for {unique_id}', xaxis_title='Date', yaxis_title='Total Sales')
    fig.show()
plot(train, test, forecast, "R1", "AutoARIMA")
plot(train, test, forecast, "R2", "AutoARIMA")
plot(train, test, forecast, "R3", "AutoARIMA")
plot(train, test, forecast, "R4", "AutoARIMA")
plot(train, test, forecast, "R1", "MSTL")
plot(train, test, forecast, "R2", "MSTL")
plot(train, test, forecast, "R3", "MSTL")
plot(train, test, forecast, "R4", "MSTL")
plot(train, test, forecast, "R1", "AutoMFLES")
plot(train, test, forecast, "R2", "AutoMFLES")
plot(train, test, forecast, "R3", "AutoMFLES")
plot(train, test, forecast, "R4", "AutoMFLES")
train_agg
| | Date | Total_Sales | Avg_Sales | Total_Orders | Avg_Orders | Holiday | Total_Discounts |
|---|---|---|---|---|---|---|---|
0 | 2018-01-01 | 15345484 | 42042 | 19666 | 54 | 1 | 365 |
1 | 2018-01-02 | 19592415 | 53678 | 25326 | 69 | 0 | 365 |
2 | 2018-01-03 | 18652527 | 51103 | 24047 | 66 | 0 | 365 |
3 | 2018-01-04 | 19956267 | 54675 | 25584 | 70 | 0 | 364 |
4 | 2018-01-05 | 22902651 | 62747 | 28436 | 78 | 0 | 364 |
... | ... | ... | ... | ... | ... | ... | ... |
511 | 2019-05-27 | 17197023 | 47115 | 25447 | 70 | 0 | 321 |
512 | 2019-05-28 | 18652065 | 51102 | 27184 | 74 | 0 | 319 |
513 | 2019-05-29 | 16213497 | 44421 | 24047 | 66 | 0 | 193 |
514 | 2019-05-30 | 16082139 | 44061 | 24318 | 67 | 0 | 76 |
515 | 2019-05-31 | 15601825 | 42745 | 23602 | 65 | 1 | 39 |
516 rows × 7 columns
df = train_agg[["Date", "Total_Sales", "Holiday", "Total_Discounts"]].copy()
date_mapping = {date: idx + 1 for idx, date in enumerate(sorted(df['Date'].unique()))}
df['idx'] = df['Date'].map(date_mapping)
df = df.rename(columns={"Total_Sales": "y", "Date": "ds"})
df = df.sort_values(by='ds')
# Single aggregate series, so one constant series identifier
df["unique_id"] = 1
df
| | ds | y | Holiday | Total_Discounts | idx | unique_id |
|---|---|---|---|---|---|---|
0 | 2018-01-01 | 15345484 | 1 | 365 | 1 | 1 |
1 | 2018-01-02 | 19592415 | 0 | 365 | 2 | 1 |
2 | 2018-01-03 | 18652527 | 0 | 365 | 3 | 1 |
3 | 2018-01-04 | 19956267 | 0 | 364 | 4 | 1 |
4 | 2018-01-05 | 22902651 | 0 | 364 | 5 | 1 |
... | ... | ... | ... | ... | ... | ... |
511 | 2019-05-27 | 17197023 | 0 | 321 | 512 | 1 |
512 | 2019-05-28 | 18652065 | 0 | 319 | 513 | 1 |
513 | 2019-05-29 | 16213497 | 0 | 193 | 514 | 1 |
514 | 2019-05-30 | 16082139 | 0 | 76 | 515 | 1 |
515 | 2019-05-31 | 15601825 | 1 | 39 | 516 | 1 |
516 rows × 6 columns
threshold = 0.8
train = df[df['idx'] <= df['idx'].max() * threshold]
test = df[df['idx'] > df['idx'].max() * threshold]
train
| | ds | y | Holiday | Total_Discounts | idx | unique_id |
|---|---|---|---|---|---|---|
0 | 2018-01-01 | 15345484 | 1 | 365 | 1 | 1 |
1 | 2018-01-02 | 19592415 | 0 | 365 | 2 | 1 |
2 | 2018-01-03 | 18652527 | 0 | 365 | 3 | 1 |
3 | 2018-01-04 | 19956267 | 0 | 364 | 4 | 1 |
4 | 2018-01-05 | 22902651 | 0 | 364 | 5 | 1 |
... | ... | ... | ... | ... | ... | ... |
407 | 2019-02-12 | 12384495 | 0 | 7 | 408 | 1 |
408 | 2019-02-13 | 12457656 | 0 | 39 | 409 | 1 |
409 | 2019-02-14 | 13301838 | 0 | 139 | 410 | 1 |
410 | 2019-02-15 | 14306637 | 0 | 307 | 411 | 1 |
411 | 2019-02-16 | 16993098 | 0 | 326 | 412 | 1 |
412 rows × 6 columns
test
| | ds | y | Holiday | Total_Discounts | idx | unique_id |
|---|---|---|---|---|---|---|
412 | 2019-02-17 | 18342525 | 0 | 351 | 413 | 1 |
413 | 2019-02-18 | 16319103 | 0 | 319 | 414 | 1 |
414 | 2019-02-19 | 15816166 | 1 | 222 | 415 | 1 |
415 | 2019-02-20 | 12800094 | 0 | 59 | 416 | 1 |
416 | 2019-02-21 | 12383694 | 0 | 37 | 417 | 1 |
... | ... | ... | ... | ... | ... | ... |
511 | 2019-05-27 | 17197023 | 0 | 321 | 512 | 1 |
512 | 2019-05-28 | 18652065 | 0 | 319 | 513 | 1 |
513 | 2019-05-29 | 16213497 | 0 | 193 | 514 | 1 |
514 | 2019-05-30 | 16082139 | 0 | 76 | 515 | 1 |
515 | 2019-05-31 | 15601825 | 1 | 39 | 516 | 1 |
104 rows × 6 columns
prediction_window = test['idx'].max() - train['idx'].max()
prediction_window
104
train = train.drop(columns=['idx'])
test = test.drop(columns=['idx'])
seasonality = [7, 12, 30]
models = [
    AutoARIMA(
        season_length=12,
        d=0,
        approximation=True,
        stationary=True,
    ),
    MSTL(
        season_length=seasonality,
        trend_forecaster=AutoARIMA(
            d=0,
            approximation=True,
            stationary=True,
            prediction_intervals=ConformalIntervals(h=prediction_window),
        ),
    ),
    AutoMFLES(
        season_length=seasonality,
        test_size=20,
        prediction_intervals=ConformalIntervals(h=prediction_window),
    ),
]
fcst = StatsForecast(models=models, freq="D", n_jobs=-1)
fcst.fit(train);
forecast = fcst.predict(h=prediction_window, level=[95], X_df=test.drop(columns=['y']))
forecast
| | unique_id | ds | AutoARIMA | AutoARIMA-lo-95 | AutoARIMA-hi-95 | MSTL | MSTL-lo-95 | MSTL-hi-95 | AutoMFLES | AutoMFLES-lo-95 | AutoMFLES-hi-95 |
|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2019-02-17 | 17897164.0 | 13301840.0 | 22492488.0 | 20815704.0 | 18865470.0 | 22765938.0 | 16443518.0 | 15116372.00 | 17770664.0 |
1 | 1 | 2019-02-18 | 17645540.0 | 12640227.0 | 22650854.0 | 17287290.0 | 15781578.0 | 18793000.0 | 16055208.0 | 15050656.00 | 17059760.0 |
2 | 1 | 2019-02-19 | 13183467.0 | 8105391.5 | 18261542.0 | 10942386.0 | 5298980.0 | 16585792.0 | 11509044.0 | 2970222.25 | 20047866.0 |
3 | 1 | 2019-02-20 | 14814651.0 | 9723128.0 | 19906174.0 | 13533622.0 | 8225240.0 | 18842004.0 | 12900191.0 | 4727806.50 | 21072574.0 |
4 | 1 | 2019-02-21 | 14426889.0 | 9332863.0 | 19520914.0 | 13593423.0 | 7848842.5 | 19338004.0 | 12633228.0 | 3807427.25 | 21459028.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
99 | 1 | 2019-05-27 | 17838166.0 | 12724844.0 | 22951490.0 | 16683699.0 | 16239398.0 | 17128000.0 | 16079478.0 | 15046672.00 | 17112284.0 |
100 | 1 | 2019-05-28 | 17814770.0 | 12701448.0 | 22928094.0 | 15773848.0 | 14561388.0 | 16986308.0 | 16055208.0 | 14594145.00 | 17516272.0 |
101 | 1 | 2019-05-29 | 16340833.0 | 11227510.0 | 21454156.0 | 15926136.0 | 13938983.0 | 17913290.0 | 14526238.0 | 11550098.00 | 17502378.0 |
102 | 1 | 2019-05-30 | 14972177.0 | 9858854.0 | 20085500.0 | 12472893.0 | 9664525.0 | 15281260.0 | 13106480.0 | 10257496.00 | 15955465.0 |
103 | 1 | 2019-05-31 | 11072638.0 | 5959315.0 | 16185961.0 | 10983630.0 | 7566960.5 | 14400299.0 | 9288397.0 | 8639927.00 | 9936868.0 |
104 rows × 11 columns
merged = test.merge(forecast, on=['unique_id', 'ds'], how='left')
print("MAPE using ARIMA: ", mean_absolute_percentage_error(merged['y'], merged['AutoARIMA']))
print("MAPE using MSTL: ", mean_absolute_percentage_error(merged['y'], merged['MSTL']))
print("MAPE using AutoMFLES: ", mean_absolute_percentage_error(merged['y'], merged['AutoMFLES']))
MAPE using ARIMA:  0.17022523364912714
MAPE using MSTL:  0.1693179254887603
MAPE using AutoMFLES:  0.18515452838892862
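At the aggregate level MSTL comes out ahead (≈16.9% MAPE), narrowly beating AutoARIMA (≈17.0%); both improve slightly on their ≈17.4% region-level scores, while AutoMFLES trails in both setups.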
plot(train, test, forecast, 1, "AutoARIMA")
plot(train, test, forecast, 1, "MSTL")
plot(train, test, forecast, 1, "AutoMFLES")
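Both experiments score the models on a single 104-day holdout. statsforecast's built-in cross_validation could replace the manual split with rolling-origin evaluation; a minimal sketch, with illustrative (untuned) window settings:

# Sketch: rolling-origin cross-validation instead of a single holdout.
# n_windows and step_size are illustrative assumptions, not tuned values.
cv = StatsForecast(models=models, freq="D", n_jobs=-1)
cv_df = cv.cross_validation(df=df.drop(columns=['idx']), h=prediction_window,
                            step_size=prediction_window, n_windows=2)
for model in ["AutoARIMA", "MSTL", "AutoMFLES"]:
    print(model, mean_absolute_percentage_error(cv_df['y'], cv_df[model]))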