import plotly.io as pio
pio.renderers.default = "colab+notebook_connected+vscode"
import pandas as pd
import numpy as np
import duckdb as db
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import optuna
import warnings
warnings.filterwarnings('ignore')
# exponential smoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.exponential_smoothing.ets import ETSModel
from IPython.display import display, Markdown
# mape
from sklearn.metrics import mean_absolute_percentage_error
optuna.logging.set_verbosity(optuna.logging.ERROR)
df = pd.read_parquet("../../data/processed/train_enhanced.parquet")
train_agg = pd.read_parquet("../../data/processed/train_agg.parquet")
train_region_code_agg = pd.read_parquet("../../data/processed/train_region_code_agg.parquet")
holiday_df = pd.read_csv("../../data/processed/holidays.csv")
def split_train_test(df, test_size=0.2):
    # chronological split: first (1 - test_size) share of rows for training, remainder for testing
    split_idx = int(len(df) * (1 - test_size))
    return df.iloc[:split_idx], df.iloc[split_idx:]
train_region_code_agg_R1 = train_region_code_agg[train_region_code_agg['Region_Code'] == 'R1'].set_index('Date')
train_region_code_agg_R1.head()
| Date | Region_Code | Total_Sales | Avg_Sales | Total_Orders | Avg_Orders | Num_Stores | Holiday | Total_Discounts |
|---|---|---|---|---|---|---|---|---|
| 2018-01-01 | R1 | 5094374 | 41084 | 6509 | 52 | 124 | 1 | 124 |
| 2018-01-02 | R1 | 7050675 | 56860 | 9738 | 79 | 124 | 0 | 124 |
| 2018-01-03 | R1 | 6851526 | 55254 | 9473 | 76 | 124 | 0 | 124 |
| 2018-01-04 | R1 | 7362648 | 59376 | 10132 | 82 | 124 | 0 | 124 |
| 2018-01-05 | R1 | 8153604 | 65755 | 10883 | 88 | 124 | 0 | 124 |
train_region_code_agg_R1_train, train_region_code_agg_R1_test = split_train_test(train_region_code_agg_R1)
train_region_code_agg_R1_train.shape, train_region_code_agg_R1_test.shape
((412, 8), (104, 8))
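A quick sanity check (an added sketch, not part of the original run) that the split is chronological with no overlap:

# the last training date must precede the first test date
assert train_region_code_agg_R1_train.index.max() < train_region_code_agg_R1_test.index.min()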
# Holt-Winters exponential smoothing with an additive seasonal component
# (named `ses` here, but this is seasonal smoothing, not simple exponential smoothing)
ses = ExponentialSmoothing(train_region_code_agg_R1_train['Total_Sales'], seasonal='add', seasonal_periods=12, freq='D').fit()
ses.summary()
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_region_code_agg_R1_train.index, y=train_region_code_agg_R1_train['Total_Sales'], mode='lines', name='Train'))
fig.add_trace(go.Scatter(x=train_region_code_agg_R1_test.index, y=train_region_code_agg_R1_test['Total_Sales'], mode='lines', name='Test'))
fig.add_trace(go.Scatter(x=train_region_code_agg_R1_test.index, y=ses.forecast(len(train_region_code_agg_R1_test)), mode='lines', name='Forecast'))
fig.show()
mape = mean_absolute_percentage_error(train_region_code_agg_R1_test['Total_Sales'], ses.forecast(len(train_region_code_agg_R1_test)))
mape
0.19351601851390085
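For reference, sklearn's MAPE here is equivalent to this hand-rolled computation (a minimal sketch using numpy, already imported above):

# MAPE = mean(|y_true - y_pred| / |y_true|), matching sklearn's definition
y_true = train_region_code_agg_R1_test['Total_Sales'].to_numpy()
y_pred = ses.forecast(len(train_region_code_agg_R1_test)).to_numpy()
print(np.mean(np.abs(y_true - y_pred) / np.abs(y_true)))  # should reproduce the value above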
def objective(trial):
    # note: 'add'/'additive' and 'mul'/'multiplicative' are statsmodels aliases,
    # so this search space lists each seasonal mode twice
    seasonal = trial.suggest_categorical('seasonal', ['add', 'mul', 'additive', 'multiplicative'])
    seasonal_periods = trial.suggest_int('seasonal_periods', 7, 100)
    model = ExponentialSmoothing(train_region_code_agg_R1_train['Total_Sales'], seasonal=seasonal, seasonal_periods=seasonal_periods, freq='D').fit()
    forecast = model.forecast(len(train_region_code_agg_R1_test))
    mape = mean_absolute_percentage_error(train_region_code_agg_R1_test['Total_Sales'], forecast)
    return mape
study = optuna.create_study(direction='minimize', study_name='ses')
study.optimize(objective, n_trials=300)
study.best_params
{'seasonal': 'multiplicative', 'seasonal_periods': 12}
ses = ExponentialSmoothing(train_region_code_agg_R1_train['Total_Sales'], seasonal='multiplicative', seasonal_periods=12, freq='D').fit()
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_region_code_agg_R1_train.index, y=train_region_code_agg_R1_train['Total_Sales'], mode='lines', name='Train'))
fig.add_trace(go.Scatter(x=train_region_code_agg_R1_test.index, y=train_region_code_agg_R1_test['Total_Sales'], mode='lines', name='Test'))
fig.add_trace(go.Scatter(x=train_region_code_agg_R1_test.index, y=ses.forecast(len(train_region_code_agg_R1_test)), mode='lines', name='Forecast'))
fig.show()
mape = mean_absolute_percentage_error(train_region_code_agg_R1_test['Total_Sales'], ses.forecast(len(train_region_code_agg_R1_test)))
mape
0.19180792258357673
def plot_forecast(model, train, test):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=train.index, y=train['Total_Sales'], mode='lines', name='Train'))
    fig.add_trace(go.Scatter(x=test.index, y=test['Total_Sales'], mode='lines', name='Test'))
    fig.add_trace(go.Scatter(x=test.index, y=model.forecast(len(test)), mode='lines', name='Forecast'))
    fig.show()
def tune_parameters(train, test):
    def objective(trial):
        seasonal = trial.suggest_categorical('seasonal', ['add', 'mul', 'additive', 'multiplicative'])
        seasonal_periods = trial.suggest_int('seasonal_periods', 7, 100)
        model = ExponentialSmoothing(train['Total_Sales'], seasonal=seasonal, seasonal_periods=seasonal_periods, freq='D').fit()
        forecast = model.forecast(len(test))
        mape = mean_absolute_percentage_error(test['Total_Sales'], forecast)
        return mape
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=500)
    return study.best_params
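Multiplicative seasonality requires strictly positive data, and the Holt-Winters fit can occasionally fail to converge; one possible hardening (an assumption, not part of the original run) is to catch fit errors inside the objective so Optuna simply discards those trials:

def tune_parameters_robust(train, test):
    # hypothetical variant of tune_parameters that survives failed fits
    def objective(trial):
        seasonal = trial.suggest_categorical('seasonal', ['add', 'mul'])  # aliases dropped
        seasonal_periods = trial.suggest_int('seasonal_periods', 7, 100)
        try:
            model = ExponentialSmoothing(train['Total_Sales'], seasonal=seasonal,
                                         seasonal_periods=seasonal_periods, freq='D').fit()
        except ValueError:
            return float('inf')  # e.g. non-positive values with multiplicative seasonality
        forecast = model.forecast(len(test))
        return mean_absolute_percentage_error(test['Total_Sales'], forecast)
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=500)
    return study.best_params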
train = train_region_code_agg[train_region_code_agg['Region_Code'] == 'R2'].set_index('Date')
train_train, train_test = split_train_test(train)
params = tune_parameters(train_train, train_test)
print(params)
{'seasonal': 'multiplicative', 'seasonal_periods': 12}
ses = ExponentialSmoothing(train_train['Total_Sales'], seasonal=params['seasonal'], seasonal_periods=params['seasonal_periods'], freq='D').fit()
plot_forecast(ses, train_train, train_test)
print("MAPE: ",mean_absolute_percentage_error(train_test['Total_Sales'], ses.forecast(len(train_test))))
MAPE: 0.17683126754833497
train = train_region_code_agg[train_region_code_agg['Region_Code'] == 'R3'].set_index('Date')
train_train, train_test = split_train_test(train)
params = tune_parameters(train_train, train_test)
print(params)
{'seasonal': 'mul', 'seasonal_periods': 12}
ses = ExponentialSmoothing(train_train['Total_Sales'], seasonal=params['seasonal'], seasonal_periods=params['seasonal_periods'], freq='D').fit()
plot_forecast(ses, train_train, train_test)
print("MAPE: ",mean_absolute_percentage_error(train_test['Total_Sales'], ses.forecast(len(train_test))))
MAPE: 0.20489387241933427
train = train_region_code_agg[train_region_code_agg['Region_Code'] == 'R4'].set_index('Date')
train_train, train_test = split_train_test(train)
params = tune_parameters(train_train, train_test)
print(params)
{'seasonal': 'multiplicative', 'seasonal_periods': 12}
ses = ExponentialSmoothing(train_train['Total_Sales'], seasonal=params['seasonal'], seasonal_periods=params['seasonal_periods'], freq='D').fit()
plot_forecast(ses, train_train, train_test)
print("MAPE: ",mean_absolute_percentage_error(train_test['Total_Sales'], ses.forecast(len(train_test))))
MAPE: 0.198372394232825
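The per-region blocks above repeat the same tune/fit/evaluate cycle; a compact loop (a hypothetical refactor of the same steps, not run here) would be:

# run the tune/fit/evaluate cycle for every region in one pass
for region in ['R1', 'R2', 'R3', 'R4']:
    region_df = train_region_code_agg[train_region_code_agg['Region_Code'] == region].set_index('Date')
    r_train, r_test = split_train_test(region_df)
    r_params = tune_parameters(r_train, r_test)
    model = ExponentialSmoothing(r_train['Total_Sales'], seasonal=r_params['seasonal'],
                                 seasonal_periods=r_params['seasonal_periods'], freq='D').fit()
    r_mape = mean_absolute_percentage_error(r_test['Total_Sales'], model.forecast(len(r_test)))
    print(region, r_params, f"MAPE: {r_mape:.4f}")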
train_agg
|  | Date | Total_Sales | Avg_Sales | Total_Orders | Avg_Orders | Holiday | Total_Discounts |
|---|---|---|---|---|---|---|---|
| 0 | 2018-01-01 | 15345484 | 42042 | 19666 | 54 | 1 | 365 |
| 1 | 2018-01-02 | 19592415 | 53678 | 25326 | 69 | 0 | 365 |
| 2 | 2018-01-03 | 18652527 | 51103 | 24047 | 66 | 0 | 365 |
| 3 | 2018-01-04 | 19956267 | 54675 | 25584 | 70 | 0 | 364 |
| 4 | 2018-01-05 | 22902651 | 62747 | 28436 | 78 | 0 | 364 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 511 | 2019-05-27 | 17197023 | 47115 | 25447 | 70 | 0 | 321 |
| 512 | 2019-05-28 | 18652065 | 51102 | 27184 | 74 | 0 | 319 |
| 513 | 2019-05-29 | 16213497 | 44421 | 24047 | 66 | 0 | 193 |
| 514 | 2019-05-30 | 16082139 | 44061 | 24318 | 67 | 0 | 76 |
| 515 | 2019-05-31 | 15601825 | 42745 | 23602 | 65 | 1 | 39 |

516 rows × 7 columns
train = train_agg.set_index('Date')
train_train, train_test = split_train_test(train)
params = tune_parameters(train_train, train_test)
print(params)
{'seasonal': 'multiplicative', 'seasonal_periods': 63}
# note: this overrides the tuned seasonal_periods (63) with the 12 used for the regional models
ses = ExponentialSmoothing(train_train['Total_Sales'], seasonal=params['seasonal'], seasonal_periods=12, freq='D').fit()
plot_forecast(ses, train_train, train_test)
print("MAPE: ",mean_absolute_percentage_error(train_test['Total_Sales'], ses.forecast(len(train_test))))
MAPE: 0.19000567422948877
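Since the tuned seasonal_periods (63) was not used above, a quick comparison fit (an added sketch, not part of the original run) would show whether it actually helps on the holdout:

# fit with the period the Optuna study actually selected and compare holdout MAPE
ses_tuned = ExponentialSmoothing(train_train['Total_Sales'], seasonal=params['seasonal'],
                                 seasonal_periods=params['seasonal_periods'], freq='D').fit()
print("MAPE (tuned period):", mean_absolute_percentage_error(
    train_test['Total_Sales'], ses_tuned.forecast(len(train_test))))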
db.sql("""
    select
        Store_id,
        round(sum(Sales), 0) as t_sales,
        round(sum(Sales) * 100, 0) / (select sum(Sales) from df where Location_Type = 'L1') as pct_sales
    from df
    where Location_Type = 'L1'
    group by Store_id
    order by pct_sales desc
""")
┌──────────┬────────────┬─────────────────────┐
│ Store_id │  t_sales   │      pct_sales      │
│  int64   │   double   │       double        │
├──────────┼────────────┼─────────────────────┤
│      298 │ 32572688.0 │  0.9229065277013243 │
│      346 │ 29651962.0 │  0.8401514188664002 │
│       39 │ 29234458.0 │  0.8283219444027082 │
│       49 │ 28886187.0 │  0.8184541324856239 │
│      278 │ 28776251.0 │  0.8153392263539905 │
│      195 │ 28151748.0 │  0.7976447046967527 │
│      357 │ 28143722.0 │  0.7974173050680229 │
│      201 │ 28051720.0 │  0.7948105364863556 │
│       97 │ 27748247.0 │  0.7862120033783618 │
│       18 │ 27613532.0 │  0.7823950225152212 │
│        · │          · │                   · │
│        · │          · │                   · │
│        · │          · │                   · │
│       51 │ 17388121.0 │ 0.49267074395857163 │
│      177 │ 17261487.0 │   0.489082734397513 │
│      187 │ 17158548.0 │ 0.48616608946166706 │
│      238 │ 17059578.0 │ 0.48336188674650066 │
│      148 │ 16884632.0 │ 0.47840502024680814 │
│       46 │ 16698801.0 │  0.4731397190143569 │
│      360 │ 16221385.0 │  0.4596127378789156 │
│      159 │ 15963333.0 │  0.4523011680927709 │
│      142 │ 15932457.0 │  0.4514263299943258 │
│      365 │ 15867221.0 │ 0.44957795203091344 │
├──────────┴────────────┴─────────────────────┤
│ 165 rows (20 shown)                3 columns │
└──────────────────────────────────────────────┘
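The same per-store breakdown can be reproduced in pandas (a sketch; `df` and its `Store_id`, `Sales`, and `Location_Type` columns are those used in the query above):

# per-store L1 sales and each store's share (%) of total L1 sales, highest first
l1 = df[df['Location_Type'] == 'L1']
t_sales = l1.groupby('Store_id')['Sales'].sum()
result = pd.DataFrame({
    't_sales': t_sales.round(0),
    'pct_sales': (t_sales * 100).round(0) / t_sales.sum(),
}).sort_values('pct_sales', ascending=False)
result.head(10)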