import re
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
# Show all DataFrame columns when printing.
pd.options.display.max_columns = None
import os
# NOTE(review): set AFTER `import tensorflow` above — TF reads this env var at
# import time, so it likely has no effect here; move before the import to confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' # TF INFO not printed

# Input/output file locations.
data_input_dir = "data_input/"
train_csv = data_input_dir + "train_2.csv"
test_key_csv = data_input_dir + "key_2.csv"
submission_csv = "./submission.csv"

pred_horizon = 62    # number of future days to predict
lookback_weeks = 16  # The look back weeks for computing the median features.

# BUG FIX: `np.random.seed = 0` replaced the seed FUNCTION with the int 0
# instead of seeding the generator. Call it.
np.random.seed(0)
tf.random.set_seed(42)
def load_data(train_csv: str) -> tuple:
    """
    Load the training CSV and separate the Page identifiers from the
    daily views time series (all remaining columns).
    """
    frame = pd.read_csv(train_csv)
    pages = frame['Page'].copy()
    series = frame.iloc[:, 1:]
    return pages, series
def prepare_data(page: pd.DataFrame, views: pd.DataFrame, lookback_weeks: int) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, int]:
    """
    Split the Page column into meta features and carve the daily views into
    train/test windows. Modeling exploits yearly seasonality: 2016 windows
    train the model, 2017 windows feed the test prediction.

    X_train: 2016-03-14 ~ 2016-09-10 (log1p of visits, newest day first)
    y_train: 2016-09-13 ~ 2016-11-14, 9 weeks (1 more day than required)
    X_test:  2017-03-14 ~ 2017-09-10
    y_test:  2017-09-13 ~ 2017-11-13

    Uses the module-level `pred_horizon` for the label length.
    """
    # Page format: <title>_<site>_<access>_<agent>; rsplit keeps underscores in the title.
    parts = page.str.rsplit('_', n=3, expand=True)
    parts.columns = ['title', 'site', 'access', 'agent']
    access_agent = parts[['access', 'agent']].agg('_'.join, axis=1)
    meta = pd.concat([page, parts['site'], access_agent], axis=1)
    meta.columns = ['Page', 'Site', 'AccessAgent']

    n_days = 7 * lookback_weeks
    # Cut off at 09/10: train_2 data stops at 2017/09/10, so the 2016 window
    # mirrors the 2017 one and yearly seasonality lines up.
    train_end = views.columns.get_loc('2016-09-10') + 1
    X_train = np.log1p(views.iloc[:, train_end - n_days : train_end].iloc[:, ::-1])
    label_start = views.columns.get_loc('2016-09-13')
    y_train = views.iloc[:, label_start : label_start + pred_horizon + 1].fillna(0)
    X_test = np.log1p(views.iloc[:, -n_days:].iloc[:, ::-1])
    return meta, X_train, y_train, X_test
# Load the raw data and build the seasonal train/test windows
# (2016 windows for training, 2017 windows for the test prediction).
page, views = load_data(train_csv)
meta, X_visits_train, y_visits_train, X_visits_test = prepare_data(page, views, lookback_weeks)
def add_median(X, y=None, n_weeks: int = 16):
    """
    Build median-difference features and (optionally) labels.

    Features: per-week median of log1p(visits) minus the overall n_weeks
    median, for the latest n_weeks // 2 weeks (X is newest-day-first).
    Labels: log1p(visits) minus the overall n_weeks median.

    Returns (X_medians, y_medians_or_None, overall_median_column).
    """
    # Overall median over the full lookback window, one value per row.
    overall = X.iloc[:, : 7 * n_weeks].median(axis=1).fillna(0).values.reshape(-1, 1)
    half = n_weeks // 2
    weekly = [
        X.iloc[:, 7 * w : 7 * (w + 1)].median(axis=1, skipna=True).values
        for w in range(half)
    ]
    stacked = np.stack(weekly, axis=1) if weekly else np.empty((X.shape[0], 0))
    # All-NaN weeks produce NaN medians; treat them as "no deviation".
    X_medians = np.nan_to_num(stacked - overall, nan=0.)
    y_medians = None
    if y is not None:
        y_medians = np.nan_to_num(np.log1p(y.values) - overall, nan=0.)
    return X_medians, y_medians, overall
def one_hot_encode(valid):
    """
    One-hot encode the Site and AccessAgent categorical columns.
    Returns a dense (n_rows, n_categories) numpy array.
    """
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(valid[['Site', 'AccessAgent']])
    return encoded.toarray()
# add median
# Median-difference features and labels for the train window; same features
# (without labels) for the test window.
X_medians_train, y_train, X_medians_all_train = add_median(X_visits_train, y_visits_train, lookback_weeks)
X_medians_test, _, X_medians_all_test = add_median(X_visits_test, None, lookback_weeks)
# one-hot encode category variables
X_cat = one_hot_encode(meta)
# Combine numerical and categorical features
# The categorical matrix is per-page, so the same X_cat is reused for train and test.
X_train = np.c_[X_medians_train, X_cat]
X_test = np.c_[X_medians_test, X_cat]
def smape(y_true, y_pred):
    """
    Compute the SMAPE metric. Input could be >1D (it is flattened).

    Pairs where both values are 0 contribute 0 error; NaN inputs are
    ignored by the final nanmean.
    """
    y_true, y_pred = np.ravel(y_true), np.ravel(y_pred)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred)
    # FIX: divide only where denom != 0 instead of dividing everywhere and
    # patching afterwards — same result, but no "invalid value encountered
    # in true_divide" RuntimeWarning (seen in the training log).
    ape = np.divide(diff, denom, out=np.zeros_like(diff, dtype=float), where=denom != 0.0)
    return np.nanmean(ape)
def clipped_mae_loss(y_true, y_pred):
    """
    Clipped mean absolute error, used as the training loss.
    Each absolute error is capped at 1.0 before averaging.
    """
    abs_err = K.abs(y_true - y_pred)
    return K.mean(K.clip(abs_err, 0., 1.), axis=-1)
def build_one_dnn(dim_X, dim_y, dropout_rate=0.5, C=0.00004):
    """
    Build one DNN model with a skip connection.

    Architecture: input -> Dense(200) -> Dropout -> concat with input (wide
    skip) -> Dense(200) -> BatchNorm -> Dropout -> Dense(100) -> Dropout ->
    Dense(200) -> Dropout -> linear Dense(dim_y) output.

    All kernels use lecun_uniform init and L2(C) regularization; the model
    compiles with the clipped MAE loss and Adam.
    """
    def relu_dense(units):
        # Shared configuration for every hidden Dense layer.
        return keras.layers.Dense(
            units, activation='relu',
            kernel_initializer='lecun_uniform',
            kernel_regularizer=keras.regularizers.L2(C))

    inputs = keras.Input(shape=(dim_X,))
    x = relu_dense(200)(inputs)
    x = keras.layers.Dropout(dropout_rate)(x)
    # Wide skip connection: feed the raw input alongside hidden layer 1.
    x = keras.layers.Concatenate()([inputs, x])
    x = relu_dense(200)(x)
    x = keras.layers.BatchNormalization(
        beta_regularizer=keras.regularizers.L2(C),
        gamma_regularizer=keras.regularizers.L2(C))(x)
    x = keras.layers.Dropout(dropout_rate)(x)
    x = relu_dense(100)(x)
    x = keras.layers.Dropout(dropout_rate)(x)
    x = relu_dense(200)(x)
    x = keras.layers.Dropout(dropout_rate)(x)
    outputs = keras.layers.Dense(
        dim_y, activation='linear',
        kernel_initializer='lecun_uniform',
        kernel_regularizer=keras.regularizers.L2(C))(x)

    model = keras.Model(inputs=inputs, outputs=[outputs])
    model.compile(loss=clipped_mae_loss, optimizer='adam')
    return model
def fit_predict(
        X_train, y_train, median_n_weeks_train, y_visits_train,
        X_test, median_n_weeks_test,
        n_bagging, n_rounds, epochs, batch_size):
    """
    Train a bag of DNN models on K-fold train splits for several rounds.

    Each round continues training every fold's model for `epochs` epochs,
    scores the bag with out-of-fold SMAPE on the training period (after
    undoing the median/log1p transform), and refreshes the test prediction
    (median across the bag) whenever the validation score improves.

    Returns the matrix of predicted visit counts for the test period.

    BUG FIX: `min_valid_loss` was never updated, so the "if validation
    improves" guard was always true and the returned prediction came from
    the LAST round rather than the best one. Also removed a per-fold
    `model.predict(X_test)` whose result was discarded (dead work).
    """
    model_bag = [build_one_dnn(X_train.shape[1], y_train.shape[1]) for _ in range(n_bagging)]
    kfold = KFold(n_splits=n_bagging)
    # y_test has the same horizon (shape[1]) as y_train.
    y_test_pred = np.zeros((n_bagging, *y_train.shape))
    visits_test_pred = np.zeros(y_train.shape)
    min_valid_loss = float('inf')
    for i in range(n_rounds):
        print(f"Round {i}", end=' ')
        y_pred = np.zeros(y_train.shape)
        for k, (train_index, test_index) in tqdm(
                enumerate(kfold.split(X_train, y_train)), total=n_bagging):
            X_train_k, y_train_k = X_train[train_index, :], y_train[train_index, :]
            X_test_k = X_train[test_index, :]
            model = model_bag[k]
            model.fit(X_train_k, y_train_k,
                      epochs=epochs, batch_size=batch_size, verbose=0, shuffle=True)
            # Out-of-fold prediction: each model is validated on the fold it
            # did not train on this round.
            y_pred[test_index, :] = model.predict(X_test_k, batch_size=batch_size)
        # Undo the transform: predictions are log1p(visits) - median.
        visits_pred = np.expm1(y_pred + median_n_weeks_train)
        visits_pred[visits_pred < 0.5] = 0
        valid_loss = smape(y_visits_train, visits_pred)
        print(f"{valid_loss = :.5f}")
        # Update the test prediction only when validation improves.
        if valid_loss < min_valid_loss:
            min_valid_loss = valid_loss
            for k in range(n_bagging):
                y_test_pred[k, :, :] = model_bag[k].predict(X_test, batch_size=batch_size)
            visits_test_pred = np.expm1(np.nanmedian(y_test_pred, axis=0) + median_n_weeks_test)
            visits_test_pred[visits_test_pred < 0.5] = 0
    return visits_test_pred
# Training configuration.
n_bagging = 20   # number of folds / bagged models
n_rounds = 20    # rounds; each round continues training every model
epochs=10        # epochs per model per round
batch_size = 4096
# Train the bag and produce test-period visit predictions.
visits_test_pred = fit_predict(
X_train, y_train, X_medians_all_train, y_visits_train,
X_test, X_medians_all_test,
n_bagging, n_rounds=n_rounds, epochs=epochs, batch_size=batch_size)
2022-01-01 22:53:52.269441: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory 2022-01-01 22:53:52.269460: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
Round 0
100%|██████████| 20/20 [04:38<00:00, 13.90s/it] /tmp/ipykernel_85912/1704954077.py:7: RuntimeWarning: invalid value encountered in true_divide ape = np.abs(y_true - y_pred) / denom # absolute percentage error before averaging
valid_loss = 0.48138 Round 1
100%|██████████| 20/20 [04:24<00:00, 13.23s/it]
valid_loss = 0.47140 Round 2
100%|██████████| 20/20 [02:41<00:00, 8.08s/it]
valid_loss = 0.45864 Round 3
100%|██████████| 20/20 [02:22<00:00, 7.14s/it]
valid_loss = 0.45100 Round 4
100%|██████████| 20/20 [02:17<00:00, 6.88s/it]
valid_loss = 0.44859 Round 5
100%|██████████| 20/20 [02:16<00:00, 6.83s/it]
valid_loss = 0.44747 Round 6
100%|██████████| 20/20 [26:45<00:00, 80.27s/it]
valid_loss = 0.44657 Round 7
100%|██████████| 20/20 [03:09<00:00, 9.49s/it]
valid_loss = 0.44559 Round 8
100%|██████████| 20/20 [02:22<00:00, 7.10s/it]
valid_loss = 0.44567 Round 9
100%|██████████| 20/20 [02:18<00:00, 6.93s/it]
valid_loss = 0.44559 Round 10
100%|██████████| 20/20 [02:16<00:00, 6.82s/it]
valid_loss = 0.44496 Round 11
100%|██████████| 20/20 [02:16<00:00, 6.83s/it]
valid_loss = 0.44496 Round 12
100%|██████████| 20/20 [02:26<00:00, 7.34s/it]
valid_loss = 0.44466 Round 13
100%|██████████| 20/20 [02:21<00:00, 7.10s/it]
valid_loss = 0.44460 Round 14
100%|██████████| 20/20 [02:23<00:00, 7.16s/it]
valid_loss = 0.44462 Round 15
100%|██████████| 20/20 [02:23<00:00, 7.16s/it]
valid_loss = 0.44425 Round 16
100%|██████████| 20/20 [02:23<00:00, 7.15s/it]
valid_loss = 0.44424 Round 17
100%|██████████| 20/20 [02:17<00:00, 6.90s/it]
valid_loss = 0.44412 Round 18
100%|██████████| 20/20 [02:17<00:00, 6.88s/it]
valid_loss = 0.44441 Round 19
100%|██████████| 20/20 [02:17<00:00, 6.88s/it]
valid_loss = 0.44431
# make submission
# Map predictions onto the submission key: key_2.csv rows are "<Page>_<Date>,Id".
test_key = pd.read_csv(test_key_csv)
test_key_split = test_key['Page'].str.rsplit('_', n=1, expand=True)
test_key_split.columns = ['Page', 'Date']
test_key = pd.concat([test_key_split, test_key[['Id']]], axis=1)
test_dates = sorted(test_key['Date'].unique())
# NOTE(review): dropping column 0 shifts predictions by one day relative to the
# 2016 label window — presumably to align 2016 weekdays with the 2017 test
# dates; confirm against the date ranges in prepare_data's docstring.
visits_test_pred_df = pd.DataFrame(visits_test_pred[:, 1:], columns=test_dates)
visits_test_pred_df = pd.concat([visits_test_pred_df, meta[['Page']]], axis=1)
# Wide (one column per date) -> long (Page, Date, Visits) for merging with the key.
visits_test_pred_df = pd.melt(visits_test_pred_df, id_vars=['Page'], var_name='Date', value_name='Visits')
submission_df = visits_test_pred_df.merge(test_key, how='left', on=['Page', 'Date'])[['Id', 'Visits']]
submission_df.to_csv(submission_csv, index=False)
Score: 37.73212 (at 11th place)