In [1]:
import re
from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold

import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K

pd.options.display.max_columns = None

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'  # suppress TensorFlow INFO messages
In [2]:
data_input_dir = "data_input/"
train_csv = data_input_dir + "train_2.csv"
test_key_csv = data_input_dir + "key_2.csv"
submission_csv = "./submission.csv"

pred_horizon = 62
lookback_weeks = 16  # look-back window, in weeks, for computing the median features
In [3]:
np.random.seed(0)
tf.random.set_seed(42)

I. Prepare Data

In [4]:
def load_data(train_csv: str) -> tuple:
    """
    Read data from CSV and split page column from views time series.
    """
    train = pd.read_csv(train_csv)
    page = train['Page'].copy()
    views = train.iloc[:, 1:]
    return page, views

def prepare_data(page: pd.Series, views: pd.DataFrame, lookback_weeks: int) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    - Split the Page column into meta features.
    - Split the daily columns into train and test windows for 2016 and 2017. The model exploits
      yearly seasonality: the 2016 windows are used for training, the 2017 windows for test prediction.
        X_train: the lookback_weeks * 7 days ending 2016-09-10, reversed so the most recent day comes first
        y_train: 2016-09-13 ~ 2016-11-14, i.e. 9 weeks (one day more than the 62 required)
        X_test:  the last lookback_weeks * 7 days of the data, ending 2017-09-10, reversed likewise
        y_test (to be predicted): 2017-09-13 ~ 2017-11-13
      The window arithmetic is sanity-checked in the short sketch after this cell.
    """
    meta = page.str.rsplit('_', n=3, expand=True)
    meta.columns = ['title', 'site', 'access', 'agent']
    meta = pd.concat([page, meta['site'], meta[['access', 'agent']].agg('_'.join, axis=1)], axis=1)
    meta.columns = ['Page', 'Site', 'AccessAgent']

    lookback_days = lookback_weeks * 7
    X_train_end_idx = views.columns.get_loc('2016-09-10') + 1  # cut off at 09-10: train_2 ends on 2017-09-10, so the 2016 and 2017 windows line up for yearly seasonality
    X_train = views.iloc[:, (X_train_end_idx - lookback_days) : X_train_end_idx].iloc[:, ::-1]
    X_train = np.log1p(X_train)

    y_train_start_idx = views.columns.get_loc('2016-09-13')
    y_train = views.iloc[:, y_train_start_idx : (y_train_start_idx + pred_horizon + 1)].fillna(0)  # pred_horizon + 1 = 63 columns

    X_test = views.iloc[:, -lookback_days:].iloc[:, ::-1]
    X_test = np.log1p(X_test)

    return meta, X_train, y_train, X_test
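
A quick, optional check of the window arithmetic described in the docstring above (a standalone sketch assuming only pandas; the dates follow from lookback_weeks = 16 and pred_horizon = 62):

import pandas as pd

lookback_weeks, pred_horizon = 16, 62
x_end = pd.Timestamp("2016-09-10")
print((x_end - pd.Timedelta(days=lookback_weeks * 7 - 1)).date())  # 2016-05-22: 112 look-back days, inclusive
y_start = pd.Timestamp("2016-09-13")
print((y_start + pd.Timedelta(days=pred_horizon - 1)).date())      # 2016-11-13: the 62nd required day
print((y_start + pd.Timedelta(days=pred_horizon)).date())          # 2016-11-14: the extra 63rd day kept in y_train
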
In [5]:
page, views = load_data(train_csv)
meta, X_visits_train, y_visits_train, X_visits_test = prepare_data(page, views, lookback_weeks)

II. Feature Engineering

In [6]:
def add_median(X, y=None, n_weeks: int=16):
    """
    X is already log1p-transformed and reversed, so column 0 is the most recent day.
    Features: weekly medians of log1p(visits) for the latest n_weeks // 2 weeks, each minus the n_weeks-wide median of log1p(visits).
    Labels: log1p(visits) minus the n_weeks-wide median of log1p(visits).
    A toy run is shown after this cell.
    """
    X_median_all = X.iloc[:, : 7 * n_weeks].median(axis=1).fillna(0).values.reshape(-1, 1)

    n_features = n_weeks // 2
    X_medians = np.empty((X.shape[0], n_features))
    for i in range(n_features):
        X_medians[:, i] = X.iloc[:, i*7 : (i+1)*7].median(axis=1, skipna=True).values
    X_medians = np.nan_to_num(X_medians - X_median_all, nan=0.)

    if y is not None:
        y_medians = np.nan_to_num(np.log1p(y.values) - X_median_all, nan=0.)
    else:
        y_medians = None

    return X_medians, y_medians, X_median_all

def one_hot_encode(meta):
    onehot_encoder = OneHotEncoder()
    site_access_enc = onehot_encoder.fit_transform(meta[['Site', 'AccessAgent']]).toarray()
    return site_access_enc
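
To see what add_median produces, here is a small illustrative run on synthetic data (a sketch in the same session as the cell above, not part of the original pipeline); with n_weeks = 4 it yields two weekly-median features per row, each relative to the 28-day median:

rng = np.random.default_rng(0)
# 3 pages x 28 days of fake traffic, already log1p-transformed and ordered newest-first, as add_median expects
toy = pd.DataFrame(np.log1p(rng.poisson(lam=20.0, size=(3, 28))))
toy_medians, _, toy_median_all = add_median(toy, y=None, n_weeks=4)
print(toy_medians.shape)     # (3, 2): n_weeks // 2 weekly medians, each minus the overall median
print(toy_median_all.shape)  # (3, 1): the 28-day median of log1p(visits)
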
In [7]:
# add median
X_medians_train, y_train, X_medians_all_train = add_median(X_visits_train, y_visits_train, lookback_weeks)
X_medians_test, _, X_medians_all_test = add_median(X_visits_test, None, lookback_weeks)

# one-hot encode category variables
X_cat = one_hot_encode(meta)

# Combine numerical and categorical features
X_train = np.c_[X_medians_train, X_cat]
X_test = np.c_[X_medians_test, X_cat]

III. Model Training and Prediction

In [8]:
def smape(y_true, y_pred):
    """
    Compute SMAPE as a fraction (multiply by 100 for the percentage form). Inputs may be >1D.
    """
    y_true, y_pred = np.ravel(y_true), np.ravel(y_pred)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    ape = np.abs(y_true - y_pred) / denom  # per-point absolute percentage error; NaN where denom == 0
    ape[denom == 0.0] = 0.0  # define the error as 0 when both true and predicted values are 0
    return np.nanmean(ape)
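
# Worked example for smape (illustrative, not from the original run): with y_true = [100, 0]
# and y_pred = [110, 0], denom = [105, 0]; the zero-denominator entry is masked to 0 and the
# other is 10 / 105 ≈ 0.0952, so smape returns ≈ 0.0476. The leaderboard score is this
# fraction multiplied by 100.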

def clipped_mae_loss(y_true, y_pred):
    """
    Mean absolute error clipped, used as training loss.
    """
    return K.mean(K.clip(K.abs(y_pred - y_true), 0., 1.), axis=-1)

def build_one_dnn(dim_X, dim_y, dropout_rate=0.5, C=0.00004):
    """
    Build one DNN model with a skip connection.
    """
    # Input tensor
    input_tensor = keras.Input(shape=(dim_X,))

    # hidden layer 1
    hidden1 = keras.layers.Dense(
        200, activation='relu',
        kernel_initializer='lecun_uniform',
        kernel_regularizer=keras.regularizers.L2(C)
    )(input_tensor)
    hidden1 = keras.layers.Dropout(dropout_rate)(hidden1)

    # Wide concatenation
    concat = keras.layers.Concatenate()([input_tensor, hidden1])

    # hidden layer 2 with batch normalization
    hidden2 = keras.layers.Dense(
        200, activation='relu',
        kernel_initializer='lecun_uniform',
        kernel_regularizer=keras.regularizers.L2(C)
        )(concat)
    hidden2 = keras.layers.BatchNormalization(
        beta_regularizer=keras.regularizers.L2(C),
        gamma_regularizer=keras.regularizers.L2(C)
        )(hidden2)
    hidden2 = keras.layers.Dropout(dropout_rate)(hidden2)

    # hidden layer 3
    hidden3 = keras.layers.Dense(
        100, activation='relu',
        kernel_initializer='lecun_uniform',
        kernel_regularizer=keras.regularizers.L2(C)
    )(hidden2)
    hidden3 = keras.layers.Dropout(dropout_rate)(hidden3)

    # hidden layer 4
    hidden4 = keras.layers.Dense(
        200, activation='relu',
        kernel_initializer='lecun_uniform',
        kernel_regularizer=keras.regularizers.L2(C)
    )(hidden3)
    hidden4 = keras.layers.Dropout(dropout_rate)(hidden4)

    # output layer
    output = keras.layers.Dense(
        dim_y, activation='linear',
        kernel_initializer='lecun_uniform',
        kernel_regularizer=keras.regularizers.L2(C)
    )(hidden4)
    
    # generate model
    model = keras.Model(inputs=input_tensor, outputs=[output])
    model.compile(loss=clipped_mae_loss, optimizer='adam')
    return model

def fit_predict(
    X_train, y_train, median_n_weeks_train, y_visits_train,
    X_test, median_n_weeks_test,
    n_bagging, n_rounds, epochs, batch_size):
    """
    Train a bag of DNN models on K-fold train splits. The test predictions are the medians of individual model predictions.
    """
    model_bag = [build_one_dnn(X_train.shape[1], y_train.shape[1]) for _ in range(n_bagging)]
    kfold = KFold(n_splits=n_bagging)

    y_test_pred = np.zeros((n_bagging, *y_train.shape))  # per-model test predictions (X_test has the same pages and horizon as y_train)
    visits_test_pred = np.zeros(y_train.shape)  # bagged test prediction, in visit counts
    min_valid_loss = float('inf')
    for i in range(n_rounds):
        print(f"Round {i}", end=' ')        
        y_pred = np.zeros(y_train.shape)
        for k, (train_index, test_index) in tqdm(
            enumerate(kfold.split(X_train, y_train)), total=n_bagging):
            X_train_k, y_train_k = X_train[train_index, :], y_train[train_index, :]
            X_test_k = X_train[test_index, :]
            model = model_bag[k]
            model.fit(X_train_k, y_train_k,
                epochs=epochs, batch_size=batch_size, verbose=0, shuffle=True)
            y_pred[test_index, :] = model.predict(X_test_k, batch_size=batch_size)  # out-of-fold prediction for validation
            _ = model.predict(X_test)
        
        visits_pred = np.expm1(y_pred + median_n_weeks_train)
        visits_pred[visits_pred < 0.5] = 0
        valid_loss = smape(y_visits_train, visits_pred)
        print(f"{valid_loss = :.5f}")

        # update test prediction if validation improves
        if valid_loss < min_valid_loss:
            for k in range(n_bagging):
                y_test_pred[k, :, :] = model_bag[k].predict(X_test, batch_size=batch_size)
            visits_test_pred = np.expm1(np.nanmedian(y_test_pred, axis=0) + median_n_weeks_test)
            visits_test_pred[visits_test_pred < 0.5] = 0
            
    return visits_test_pred
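
The DNNs predict residuals in log space (log1p(visits) minus the look-back median), so fit_predict maps them back to visit counts by adding the median and applying expm1. A minimal standalone illustration of that inverse transform, with made-up numbers:

median_log_visits = np.array([[3.0], [0.0]])          # per-page median of log1p(visits)
predicted_residual = np.array([[0.2, -0.1],           # model output: residuals in log1p space
                               [0.0,  0.4]])
visits = np.expm1(predicted_residual + median_log_visits)
visits[visits < 0.5] = 0                              # same round-to-zero rule as in fit_predict
print(visits.round(2))                                # [[23.53 17.17] [ 0.    0.  ]]
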
In [9]:
n_bagging = 20
n_rounds = 20
epochs = 10
batch_size = 4096

visits_test_pred = fit_predict(
    X_train, y_train, X_medians_all_train, y_visits_train,
    X_test, X_medians_all_test,
    n_bagging, n_rounds=n_rounds, epochs=epochs, batch_size=batch_size)
2022-01-01 22:53:52.269441: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-01-01 22:53:52.269460: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
Round 0 
100%|██████████| 20/20 [04:38<00:00, 13.90s/it]
/tmp/ipykernel_85912/1704954077.py:7: RuntimeWarning: invalid value encountered in true_divide
  ape = np.abs(y_true - y_pred) / denom  # absolute percentage error before averaging
valid_loss = 0.48138
Round 1 
100%|██████████| 20/20 [04:24<00:00, 13.23s/it]
valid_loss = 0.47140
Round 2 
100%|██████████| 20/20 [02:41<00:00,  8.08s/it]
valid_loss = 0.45864
Round 3 
100%|██████████| 20/20 [02:22<00:00,  7.14s/it]
valid_loss = 0.45100
Round 4 
100%|██████████| 20/20 [02:17<00:00,  6.88s/it]
valid_loss = 0.44859
Round 5 
100%|██████████| 20/20 [02:16<00:00,  6.83s/it]
valid_loss = 0.44747
Round 6 
100%|██████████| 20/20 [26:45<00:00, 80.27s/it]  
valid_loss = 0.44657
Round 7 
100%|██████████| 20/20 [03:09<00:00,  9.49s/it]
valid_loss = 0.44559
Round 8 
100%|██████████| 20/20 [02:22<00:00,  7.10s/it]
valid_loss = 0.44567
Round 9 
100%|██████████| 20/20 [02:18<00:00,  6.93s/it]
valid_loss = 0.44559
Round 10 
100%|██████████| 20/20 [02:16<00:00,  6.82s/it]
valid_loss = 0.44496
Round 11 
100%|██████████| 20/20 [02:16<00:00,  6.83s/it]
valid_loss = 0.44496
Round 12 
100%|██████████| 20/20 [02:26<00:00,  7.34s/it]
valid_loss = 0.44466
Round 13 
100%|██████████| 20/20 [02:21<00:00,  7.10s/it]
valid_loss = 0.44460
Round 14 
100%|██████████| 20/20 [02:23<00:00,  7.16s/it]
valid_loss = 0.44462
Round 15 
100%|██████████| 20/20 [02:23<00:00,  7.16s/it]
valid_loss = 0.44425
Round 16 
100%|██████████| 20/20 [02:23<00:00,  7.15s/it]
valid_loss = 0.44424
Round 17 
100%|██████████| 20/20 [02:17<00:00,  6.90s/it]
valid_loss = 0.44412
Round 18 
100%|██████████| 20/20 [02:17<00:00,  6.88s/it]
valid_loss = 0.44441
Round 19 
100%|██████████| 20/20 [02:17<00:00,  6.88s/it]
valid_loss = 0.44431
In [10]:
# make submission
test_key = pd.read_csv(test_key_csv)
test_key_split = test_key['Page'].str.rsplit('_', n=1, expand=True)
test_key_split.columns = ['Page', 'Date']
test_key = pd.concat([test_key_split, test_key[['Id']]], axis=1)

test_dates = sorted(test_key['Date'].unique())
visits_test_pred_df = pd.DataFrame(visits_test_pred[:, 1:], columns=test_dates)  # keep 62 of the 63 predicted days; dropping the first also aligns day-of-week with the 2017 test dates
visits_test_pred_df = pd.concat([visits_test_pred_df, meta[['Page']]], axis=1)
visits_test_pred_df = pd.melt(visits_test_pred_df, id_vars=['Page'], var_name='Date', value_name='Visits')
submission_df = visits_test_pred_df.merge(test_key, how='left', on=['Page', 'Date'])[['Id', 'Visits']]
submission_df.to_csv(submission_csv, index=False)
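
As an optional check before uploading, the following sketch (run in the same session; it assumes key_2.csv has exactly one Id per page and date) verifies the submission is complete:

assert len(submission_df) == len(test_key)    # one prediction per Id in the key file
assert submission_df['Id'].notna().all()      # every (Page, Date) pair matched a key row
assert submission_df['Visits'].notna().all()  # no missing predictions
print(submission_df.shape)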

Score: 37.73212 (11th place)