Notebook

In [1]:

%%time
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from functools import reduce
from tqdm import tqdm
import lightgbm as lgbm
import mlb
import os

import gc


# Locate the competition data; prefer the refreshed training file when it exists.
BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')
if (BASE_DIR / 'train_updated.csv').is_file():
    train = pd.read_csv(BASE_DIR / 'train_updated.csv')
    print(10*'=','train_updated.csv','load',10*'=')
else:
    train = pd.read_csv(BASE_DIR / 'train.csv')
    print(10*'=','train.csv','load',10*'=')

# The nested columns are evaluated as Python expressions below. These aliases let
# JSON-style literals (null / true / false) resolve during that evaluation.
null = np.nan
true = True
false = False

# Unnest each listed column into its own table: one row per nested record, tagged
# with the source row index and date, then written out as both CSV and pickle.
for col in ['rosters','nextDayPlayerEngagement','playerBoxScores']:
    print(10*'*','this is',col,10*'*')

    _index = train[col].notnull()
    # NOTE(review): eval() executes whatever expression the cell contains. That is
    # only acceptable while the CSV is a trusted file; prefer a real parser otherwise.
    # eval() is wrapped in a lambda defined here so that names are resolved against
    # this notebook's globals, which is where the null/true/false aliases live.
    train.loc[_index, col] = train.loc[_index, col].apply(lambda x: eval(x))

    outputs = []
    for index, date, record in train.loc[_index, ['date', col]].itertuples():
        _df = pd.DataFrame(record)
        _df['index'] = index
        _df['date'] = date
        outputs.append(_df)

    outputs = pd.concat(outputs).reset_index(drop=True)

    outputs.to_csv(f'{col}_train.csv', index=False)
    outputs.to_pickle(f'{col}_train.pkl')

    # Release the flattened table and the raw nested column before the next pass.
    del outputs
    del train[col]
    gc.collect()

========== train_updated.csv load ==========
********** this is rosters **********
********** this is nextDayPlayerEngagement **********
********** this is playerBoxScores **********
CPU times: user 4min 38s, sys: 14.7 s, total: 4min 52s
Wall time: 5min 31s

In [2]:

# Input locations: competition files, and the working directory holding the
# flattened tables written by the first cell.
BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')
TRAIN_DIR = Path('./')

players = pd.read_csv(BASE_DIR / 'players.csv')

rosters = pd.read_pickle(TRAIN_DIR / 'rosters_train.pkl')
targets = pd.read_pickle(TRAIN_DIR / 'nextDayPlayerEngagement_train.pkl')

# Collapse box scores to a single summed row per (playerId, date).
scores = (
    pd.read_pickle(TRAIN_DIR / 'playerBoxScores_train.pkl')
    .groupby(['playerId', 'date'])
    .sum()
    .reset_index()
)

In [3]:

# Box-score statistics. Defined once and reused by the merge column list and by
# both model feature lists, so the three can no longer drift apart.
_BOX_SCORE_STATS = [
    'battingOrder', 'gamesPlayedBatting', 'flyOuts',
    'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
    'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
    'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
    'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
    'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
    'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
    'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
    'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
    'groundOutsPitching', 'runsPitching', 'doublesPitching',
    'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
    'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
    'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
    'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
    'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
    'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
    'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
    'inheritedRunnersScored', 'catchersInterferencePitching',
    'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
    'assists', 'putOuts', 'errors', 'chances',
]

# Six per-player summary statistics for each of the four targets, ordered
# target1_mean ... target1_prob, target2_mean ... target4_prob.
_TARGET_STATS = [
    f'target{i}_{stat}'
    for i in range(1, 5)
    for stat in ('mean', 'median', 'std', 'min', 'max', 'prob')
]

# Integer-encoded categorical columns (created in the dataset-assembly cell).
_LABEL_COLS = ['label_playerId', 'label_primaryPositionName', 'label_teamId',
               'label_status']

# Columns pulled from each source table when assembling the training frame.
targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']
players_cols = ['playerId', 'primaryPositionName','heightInches','weight']
rosters_cols = ['playerId', 'teamId', 'status', 'date']
scores_cols = ['playerId'] + _BOX_SCORE_STATS + ['date']

# Feature set for the models that do not see target1.
feature_cols = _LABEL_COLS + _BOX_SCORE_STATS + _TARGET_STATS
# Same features plus 'target1' itself as an extra input column.
feature_cols2 = feature_cols + ['target1']

In [4]:

# Precomputed per-player target statistics; show the column names for reference.
player_target_stats = pd.read_csv("../input/my-player-target-stat/player_target_stats.csv")
data_names = list(player_target_stats.columns)
data_names

Out[4]:

['playerId',
 'target1_mean',
 'target1_median',
 'target1_std',
 'target1_min',
 'target1_max',
 'target1_prob',
 'target2_mean',
 'target2_median',
 'target2_std',
 'target2_min',
 'target2_max',
 'target2_prob',
 'target3_mean',
 'target3_median',
 'target3_std',
 'target3_min',
 'target3_max',
 'target3_prob',
 'target4_mean',
 'target4_median',
 'target4_std',
 'target4_min',
 'target4_max',
 'target4_prob']

In [5]:

# Create the dataset: start from the targets, left-join player, roster and
# box-score columns, then keep only players present in player_target_stats.
train = (
    targets[targets_cols]
    .merge(players[players_cols], on=['playerId'], how='left')
    .merge(rosters[rosters_cols], on=['playerId', 'date'], how='left')
    .merge(scores[scores_cols], on=['playerId', 'date'], how='left')
    .merge(player_target_stats, how='inner', left_on=["playerId"], right_on=["playerId"])
)

# Label encoding: each distinct value gets its first-appearance position as code.
# The mappings are kept because the prediction loop reuses them.
player2num, position2num, teamid2num, status2num = (
    {value: code for code, value in enumerate(train[source_col].unique())}
    for source_col in ('playerId', 'primaryPositionName', 'teamId', 'status')
)

for label_col, source_col, mapping in (
    ('label_playerId', 'playerId', player2num),
    ('label_primaryPositionName', 'primaryPositionName', position2num),
    ('label_teamId', 'teamId', teamid2num),
    ('label_status', 'status', status2num),
):
    train[label_col] = train[source_col].map(mapping)

In [6]:

train_X = train[feature_cols]
train_y = train[['target1', 'target2', 'target3', 'target4']]

# Validation rows: dates after 0529 up to and including 0831 in 2018, 2019 and
# 2020. Everything else is used for training.
_index = (
    ((train['date'] > 20180529) & (train['date'] <= 20180831))
    | ((train['date'] > 20190529) & (train['date'] <= 20190831))
    | ((train['date'] > 20200529) & (train['date'] <= 20200831))
)

x_train1, x_valid1 = (
    train_X.loc[mask].reset_index(drop=True) for mask in (~_index, _index)
)
y_train1, y_valid1 = (
    train_y.loc[mask].reset_index(drop=True) for mask in (~_index, _index)
)

In [7]:

# Second split: same rows as the first (reuses `_index` from the previous cell),
# but with the feature list that also contains 'target1'.
train_X = train[feature_cols2]
train_y = train[['target1', 'target2', 'target3', 'target4']]

x_train2, x_valid2 = (
    train_X.loc[mask].reset_index(drop=True) for mask in (~_index, _index)
)
y_train2, y_valid2 = (
    train_y.loc[mask].reset_index(drop=True) for mask in (~_index, _index)
)

LightGBM

In [8]:

def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100,
             early_stopping_rounds=None):
    """Fit one LightGBM regressor and score it on the validation split.

    Parameters
    ----------
    x_train, y_train : training features and target.
    x_valid, y_valid : validation features and target; also used as the
        early-stopping evaluation set.
    params : keyword arguments for ``lgbm.LGBMRegressor``. ``None`` means
        library defaults.
    verbose : logging period (in boosting rounds) passed to ``fit``.
    early_stopping_rounds : patience for early stopping. ``None`` (default)
        reuses ``verbose``, which is what this function always did.

    Returns
    -------
    (oof_pred, model, score) : validation predictions, the fitted model and the
        validation mean absolute error.
    """
    if early_stopping_rounds is None:
        early_stopping_rounds = verbose

    # `params or {}` keeps the documented default (None) from crashing on `**`.
    model = lgbm.LGBMRegressor(**(params or {}))
    # NOTE(review): LightGBM 4.x dropped these two fit() keywords in favour of
    # callbacks; confirm the pinned LightGBM version before upgrading.
    model.fit(x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        early_stopping_rounds=early_stopping_rounds,
        verbose=verbose)
    oof_pred = model.predict(x_valid)
    # scikit-learn's signature is (y_true, y_pred); MAE is symmetric, so the
    # value is the same as before, but the call now reads correctly.
    score = mean_absolute_error(y_valid, oof_pred)
    print('mae:', score)
    return oof_pred, model, score


# Hyper-parameter sets for the four LightGBM regressors trained below.
# All of them optimise the 'mae' objective.

# Used for the target1 model.
params1 = {'objective':'mae',
           'reg_alpha': 0.14547461820098767, 
           'reg_lambda': 0.10185644384043743, 
           'n_estimators': 3333, 
           'learning_rate': 0.1046301304430488, 
           'num_leaves': 674, 
           'feature_fraction': 0.8101240539122566, 
           'bagging_fraction': 0.8884451442950513, 
           'bagging_freq': 8, 
           'min_child_samples': 51}

# Used for the target2 model.
params2 = {
 'objective':'mae',
           'reg_alpha': 0.14947461820098767, 
           'reg_lambda': 0.10185644384043743, 
           'n_estimators': 3633, 
           'learning_rate': 0.08046301304430488, 
           'num_leaves': 64, 
           'feature_fraction': 0.9101240539122566, 
           'bagging_fraction': 0.9884451442950513, 
           'bagging_freq': 3, 
           'min_child_samples': 15
}

# Used for the target4 model.
params4 = {'objective':'mae',
           'reg_alpha': 0.016468100279441976, 
           'reg_lambda': 0.09128335764019105, 
           'n_estimators': 9868, 
           'learning_rate': 0.10528150510326864, 
           'num_leaves': 157, 
           'feature_fraction': 0.5419185713426886, 
           'bagging_fraction': 0.2637405128936662, 
           'bagging_freq': 19, 
           'min_child_samples': 71}


# Used for the target3 model. This is the only set that fixes random_state.
# NOTE: the name `params` is reassigned in the later model-loading cell, so this
# dict is only valid until that cell runs.
params = {
 'objective':'mae',
#  'reg_alpha': 0.1,
#  'reg_lambda': 0.1, 
 'n_estimators': 10000,
 'learning_rate': 0.1,
 'random_state': 2021,
 "num_leaves": 127,
 'feature_fraction': 0.5419185713426886, 
 'bagging_fraction': 0.5637405128936662, 
 'bagging_freq': 15, 
}



# One LightGBM model per target. target1 is trained on the first feature split;
# targets 2-4 use the second split, which also carries 'target1' as a feature.
oof1, model1, score1 = fit_lgbm(
    x_train1, y_train1['target1'], x_valid1, y_valid1['target1'], params1)

oof2, model2, score2 = fit_lgbm(
    x_train2, y_train2['target2'], x_valid2, y_valid2['target2'], params2)

oof3, model3, score3 = fit_lgbm(
    x_train2, y_train2['target3'], x_valid2, y_valid2['target3'], params)

oof4, model4, score4 = fit_lgbm(
    x_train2, y_train2['target4'], x_valid2, y_valid2['target4'], params4)

# Unweighted mean of the four validation MAEs.
score = sum((score1, score2, score3, score4)) / 4
print(f'score: {score}')

[LightGBM] [Warning] feature_fraction is set=0.8101240539122566, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8101240539122566
[LightGBM] [Warning] bagging_freq is set=8, subsample_freq=0 will be ignored. Current value: bagging_freq=8
[LightGBM] [Warning] bagging_fraction is set=0.8884451442950513, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8884451442950513
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.603681
[200]	valid_0's l1: 0.602494
[300]	valid_0's l1: 0.602064
[400]	valid_0's l1: 0.602072
[500]	valid_0's l1: 0.601976
[600]	valid_0's l1: 0.601959
[700]	valid_0's l1: 0.601676
Early stopping, best iteration is:
[673]	valid_0's l1: 0.601666
mae: 0.6016661242531858
[LightGBM] [Warning] feature_fraction is set=0.9101240539122566, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.9101240539122566
[LightGBM] [Warning] bagging_freq is set=3, subsample_freq=0 will be ignored. Current value: bagging_freq=3
[LightGBM] [Warning] bagging_fraction is set=0.9884451442950513, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9884451442950513
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 1.72755
[200]	valid_0's l1: 1.71505
[300]	valid_0's l1: 1.71076
[400]	valid_0's l1: 1.70847
[500]	valid_0's l1: 1.70694
[600]	valid_0's l1: 1.70668
[700]	valid_0's l1: 1.7052
[800]	valid_0's l1: 1.70479
[900]	valid_0's l1: 1.70432
[1000]	valid_0's l1: 1.7037
[1100]	valid_0's l1: 1.703
[1200]	valid_0's l1: 1.70253
[1300]	valid_0's l1: 1.70207
[1400]	valid_0's l1: 1.702
[1500]	valid_0's l1: 1.70185
[1600]	valid_0's l1: 1.7011
[1700]	valid_0's l1: 1.70071
[1800]	valid_0's l1: 1.70073
[1900]	valid_0's l1: 1.70063
[2000]	valid_0's l1: 1.7004
[2100]	valid_0's l1: 1.70023
[2200]	valid_0's l1: 1.69993
Early stopping, best iteration is:
[2192]	valid_0's l1: 1.69993
mae: 1.699927207171509
[LightGBM] [Warning] bagging_fraction is set=0.5637405128936662, subsample=1.0 will be ignored. Current value: bagging_fraction=0.5637405128936662
[LightGBM] [Warning] feature_fraction is set=0.5419185713426886, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.5419185713426886
[LightGBM] [Warning] bagging_freq is set=15, subsample_freq=0 will be ignored. Current value: bagging_freq=15
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.715969
[200]	valid_0's l1: 0.714891
[300]	valid_0's l1: 0.714887
[400]	valid_0's l1: 0.714884
[500]	valid_0's l1: 0.714883
[600]	valid_0's l1: 0.714882
[700]	valid_0's l1: 0.714879
[800]	valid_0's l1: 0.714878
[900]	valid_0's l1: 0.714714
[1000]	valid_0's l1: 0.714713
[1100]	valid_0's l1: 0.714712
[1200]	valid_0's l1: 0.714711
[1300]	valid_0's l1: 0.714711
[1400]	valid_0's l1: 0.71471
[1500]	valid_0's l1: 0.714709
[1600]	valid_0's l1: 0.714709
[1700]	valid_0's l1: 0.71455
[1800]	valid_0's l1: 0.71455
[1900]	valid_0's l1: 0.714549
[2000]	valid_0's l1: 0.714549
[2100]	valid_0's l1: 0.714548
[2200]	valid_0's l1: 0.714548
[2300]	valid_0's l1: 0.714547
[2400]	valid_0's l1: 0.714547
[2500]	valid_0's l1: 0.714546
[2600]	valid_0's l1: 0.714546
[2700]	valid_0's l1: 0.714545
[2800]	valid_0's l1: 0.714545
[2900]	valid_0's l1: 0.714544
[3000]	valid_0's l1: 0.714543
[3100]	valid_0's l1: 0.714543
[3200]	valid_0's l1: 0.714543
[3300]	valid_0's l1: 0.714542
[3400]	valid_0's l1: 0.714542
[3500]	valid_0's l1: 0.714543
Early stopping, best iteration is:
[3473]	valid_0's l1: 0.714542
mae: 0.7145421805738932
[LightGBM] [Warning] feature_fraction is set=0.5419185713426886, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.5419185713426886
[LightGBM] [Warning] bagging_freq is set=19, subsample_freq=0 will be ignored. Current value: bagging_freq=19
[LightGBM] [Warning] bagging_fraction is set=0.2637405128936662, subsample=1.0 will be ignored. Current value: bagging_fraction=0.2637405128936662
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.82029
[200]	valid_0's l1: 0.817858
[300]	valid_0's l1: 0.816503
[400]	valid_0's l1: 0.815647
[500]	valid_0's l1: 0.815143
[600]	valid_0's l1: 0.814731
[700]	valid_0's l1: 0.814472
[800]	valid_0's l1: 0.814144
[900]	valid_0's l1: 0.813819
[1000]	valid_0's l1: 0.813664
[1100]	valid_0's l1: 0.813584
Early stopping, best iteration is:
[1053]	valid_0's l1: 0.813545
mae: 0.8135446889692893
score: 0.9574200502419693

CatBoost

In [9]:

import pickle
from catboost import CatBoostRegressor

def fit_lgbm(x_train, y_train, x_valid, y_valid, target, params: dict=None, verbose=100):
    """Load, or train when no saved copy exists, the LightGBM and CatBoost
    models for one target, and score both on the validation split.

    Despite its name (kept for existing callers) this handles both libraries.

    Parameters
    ----------
    x_train, y_train : training data; only used when a model has to be trained.
    x_valid, y_valid : validation data used for scoring (and as the evaluation
        set when training).
    target : target number, used to build the model file names.
    params : keyword arguments for ``lgbm.LGBMRegressor`` when training;
        ``None`` means library defaults.
    verbose : logging period when training; also reused as the early-stopping
        patience, mirroring the LightGBM training helper in this notebook.

    Returns
    -------
    (oof_pred_lgb, model, oof_pred_cat, model_cb, score_lgb, score_cat)
    """
    # ---- LightGBM ----
    lgb_path = f'../input/mlb-lightgbm-training/mymodel_lgb_{target}.pkl'
    if os.path.isfile(lgb_path):
        # NOTE(review): pickle.load can execute code stored in the file; only
        # point this at artefacts you produced yourself.
        with open(lgb_path, 'rb') as fin:
            model = pickle.load(fin)
            print('*'*10,fin,'*'*10)
    else:
        # Previously this branch tried to pickle a model that was never
        # created. Train one first, then save it for later upload.
        model = lgbm.LGBMRegressor(**(params or {}))
        model.fit(x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            early_stopping_rounds=verbose,
            verbose=verbose)
        with open(f'mymodel_lgb_{target}.pkl', 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    oof_pred_lgb = model.predict(x_valid)
    score_lgb = mean_absolute_error(y_valid, oof_pred_lgb)
    print('mae:', score_lgb)

    # ---- CatBoost ----
    cb_path = f'../input/mlb-catboost-training/mymodel_cb_{target}.pkl'
    if os.path.isfile(cb_path):
        with open(cb_path, 'rb') as fin:
            model_cb = pickle.load(fin)
            print('*'*10,fin,'*'*10)
    else:
        # TODO(review): the tuned CatBoost configuration is not recorded in this
        # notebook; this fallback uses library defaults with an MAE loss.
        model_cb = CatBoostRegressor(loss_function='MAE', verbose=verbose)
        model_cb.fit(x_train, y_train,
            eval_set=(x_valid, y_valid),
            early_stopping_rounds=verbose)
        # Saved under the same base name the loader above looks for
        # (the old code wrote 'model_cb_*.pkl', which would never be found).
        with open(f'mymodel_cb_{target}.pkl', 'wb') as handle:
            pickle.dump(model_cb, handle, protocol=pickle.HIGHEST_PROTOCOL)
    oof_pred_cat = model_cb.predict(x_valid)
    score_cat = mean_absolute_error(y_valid, oof_pred_cat)
    print('mae:', score_cat)

    return oof_pred_lgb, model, oof_pred_cat, model_cb, score_lgb, score_cat


# LightGBM settings handed to fit_lgbm for every target in this cell.
params = {
    'boosting_type': 'gbdt',
    'objective':'mae',
    'subsample': 0.6,
    'subsample_freq': 1,
    'learning_rate': 0.03,
    'num_leaves': 2**11-1,
    'min_data_in_leaf': 2**12-1,
    'feature_fraction': 0.6,
    'max_bin': 100,
    'n_estimators': 2500,
    'boost_from_average': False,
    "random_seed":2021,
}

# Targets are processed in the order 2, 1, 3, 4 so the printed log keeps its
# original sequence. All four use the first feature split.
_fits = {
    t: fit_lgbm(
        x_train1, y_train1[f'target{t}'],
        x_valid1, y_valid1[f'target{t}'],
        t, params
    )
    for t in (2, 1, 3, 4)
}

oof_pred_lgb2, model_lgb2, oof_pred_cat2, model_cb2, score_lgb2, score_cat2 = _fits[2]
oof_pred_lgb1, model_lgb1, oof_pred_cat1, model_cb1, score_lgb1, score_cat1 = _fits[1]
oof_pred_lgb3, model_lgb3, oof_pred_cat3, model_cb3, score_lgb3, score_cat3 = _fits[3]
oof_pred_lgb4, model_lgb4, oof_pred_cat4, model_cb4, score_lgb4, score_cat4 = _fits[4]

# Unweighted mean validation MAE per library.
score = (score_lgb1+score_lgb2+score_lgb3+score_lgb4) / 4
print(f'LightGBM score: {score}')

score = (score_cat1+score_cat2+score_cat3+score_cat4) / 4
print(f'Catboost score: {score}')

********** <_io.BufferedReader name='../input/mlb-lightgbm-training/mymodel_lgb_2.pkl'> **********
mae: 1.7495870137280762
********** <_io.BufferedReader name='../input/mlb-catboost-training/mymodel_cb_2.pkl'> **********
mae: 1.8092167805205939
********** <_io.BufferedReader name='../input/mlb-lightgbm-training/mymodel_lgb_1.pkl'> **********
mae: 0.613446416791378
********** <_io.BufferedReader name='../input/mlb-catboost-training/mymodel_cb_1.pkl'> **********
mae: 0.6265251914020156
********** <_io.BufferedReader name='../input/mlb-lightgbm-training/mymodel_lgb_3.pkl'> **********
mae: 0.7371395237381603
********** <_io.BufferedReader name='../input/mlb-catboost-training/mymodel_cb_3.pkl'> **********
mae: 0.7406471371942968
********** <_io.BufferedReader name='../input/mlb-lightgbm-training/mymodel_lgb_4.pkl'> **********
mae: 0.8178008821616221
********** <_io.BufferedReader name='../input/mlb-catboost-training/mymodel_cb_4.pkl'> **********
mae: 0.846635209540955
LightGBM score: 0.9794934591048092
Catboost score: 1.0057560796644653

ANN

In [10]:

# Column lists for inference. These deliberately REPLACE the lists defined for
# training: `players_cols` no longer includes heightInches/weight, and
# `rosters_cols` / `scores_cols` no longer include 'date'. The prediction loop
# selects with these names and merges on 'playerId' only.
players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status']
scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances']

# Aliases so that JSON-style literals resolve when the prediction loop eval()s
# the nested payload strings (same definitions as in the first cell).
null = np.nan
true = True
false = False

In [11]:

import pandas as pd
import numpy as np
from datetime import timedelta
from tqdm import tqdm
import gc
from functools import reduce
from sklearn.model_selection import StratifiedKFold

# Competition data directory. Not referenced by any other visible cell.
ROOT_DIR = "../input/mlb-player-digital-engagement-forecasting"

#=======================#
def flatten(df, col):
    """Pivot one value column from long to wide format.

    Returns a frame with one row per ``playerId`` and one column per distinct
    ``EvalDate`` value, named ``f"{col}_{EvalDate}"``; ``playerId`` is a regular
    column and the columns axis carries no name.
    """
    wide = df.pivot(index="playerId", columns="EvalDate", values=col)
    wide = wide.add_prefix(f"{col}_")
    wide = wide.rename_axis(None, axis=1)
    return wide.reset_index()
#============================#
def reducer(left, right):
    """Join two frames on ``playerId`` (default inner join); the binary step
    used with ``functools.reduce`` to combine the per-target wide tables."""
    return pd.merge(left, right, on="playerId")
#========================

TGTCOLS = ["target1","target2","target3","target4"]
def train_lag(df, lag=1):
    """Append the target values from ``lag`` days earlier to every row.

    The four target columns are copied, their ``EvalDate`` is moved forward by
    ``lag`` days, and the copy is left-joined back on (playerId, EvalDate). The
    new columns are named ``target<N>_<lag>``; rows with no observation ``lag``
    days earlier get NaN.
    """
    shifted = df[["playerId", "EvalDate", *TGTCOLS]].copy()
    shifted["EvalDate"] = shifted["EvalDate"] + timedelta(days=lag)
    return df.merge(
        shifted,
        how="left",
        on=["playerId", "EvalDate"],
        suffixes=("", f"_{lag}"),
    )
#=================================
def test_lag(sub):
    """Build the lagged-target feature table for one evaluation date.

    ``sub`` must hold a single date. As a side effect a ``playerId`` column,
    parsed from ``date_playerId``, is added to ``sub`` in place.

    History is read from the module-level ``LAST`` frame, restricted to the
    dates ``LAGS[-1]`` .. ``LAGS[0]`` days before the evaluation date. Each date
    is relabelled with its lag number and every target is pivoted to one column
    per lag.

    Returns ``(features, eval_dt)``: the per-player wide table and the
    evaluation date as a timestamp.
    """
    sub["playerId"] = sub["date_playerId"].apply(lambda s: int(s.split("_")[1]))
    assert sub.date.nunique() == 1
    eval_dt = pd.to_datetime(sub["date"].unique()[0], format="%Y%m%d")

    # date -> how many days before the evaluation date it lies
    lag_by_date = {eval_dt + timedelta(days=-k): k for k in LAGS}
    oldest = eval_dt + timedelta(days=-LAGS[-1])
    newest = eval_dt + timedelta(days=-LAGS[0])

    in_window = LAST.EvalDate.between(oldest, newest)
    history = LAST.loc[in_window, ["EvalDate", "playerId"] + TGTCOLS].copy()
    history["EvalDate"] = history["EvalDate"].map(lag_by_date)

    per_target = [flatten(history, col) for col in TGTCOLS]
    return reduce(reducer, per_target), eval_dt
    #
#===============

# Long-format target history: one row per player and EvalDate (see printed shape).
tr = pd.read_csv("../input/my-mlb-data/target.csv")
print(tr.shape)
gc.collect()

# Parse the dates, move every EvalDate back by one day, and derive the year.
# NOTE(review): the reason for the one-day shift is not stated here; presumably
# it aligns EvalDate with the competition's date convention. Confirm.
tr["EvalDate"] = pd.to_datetime(tr["EvalDate"])
tr["EvalDate"] = tr["EvalDate"] + timedelta(days=-1)
tr["EvalYear"] = tr["EvalDate"].dt.year

# Per-player, per-year median of each target, renamed to tgt<N>_med.
# NOTE(review): these medians are computed over all rows of the year, including
# the rows later held out in each fold, so the out-of-fold scores below are
# likely optimistic. Verify whether that is intended.
MED_DF = tr.groupby(["playerId","EvalYear"])[TGTCOLS].median().reset_index()
MEDCOLS = ["tgt1_med","tgt2_med", "tgt3_med", "tgt4_med"]
MED_DF.columns = ["playerId","EvalYear"] + MEDCOLS

# Lags 1..20 days. FECOLS lists the lag columns from the largest lag to the
# smallest, four targets per lag (80 columns).
LAGS = list(range(1,21))
FECOLS = [f"{col}_{lag}" for lag in reversed(LAGS) for col in TGTCOLS]

# Add the lagged target columns one lag at a time.
for lag in tqdm(LAGS):
    tr = train_lag(tr, lag=lag)
    gc.collect()
#===========
tr = tr.sort_values(by=["playerId", "EvalDate"])
print(tr.shape)
# Drops every row containing a missing value, which includes rows that lack a
# complete set of lag columns.
tr = tr.dropna()
print(tr.shape)
# Attach the yearly medians (default inner join on playerId and EvalYear).
tr = tr.merge(MED_DF, on=["playerId","EvalYear"])
gc.collect()

# Model inputs: lag features followed by the medians; all four targets as outputs.
X = tr[FECOLS+MEDCOLS].values
y = tr[TGTCOLS].values
cl = tr["playerId"].values

# Five folds stratified on playerId; no shuffle flag or seed is passed.
NFOLDS = 5
skf = StratifiedKFold(n_splits=NFOLDS)
folds = skf.split(X, cl)
folds = list(folds)

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

tf.random.set_seed(2021)

def make_model(n_in):
    """Build and compile the feed-forward network.

    Architecture: ``n_in`` inputs -> Dense(50, relu) "d1" -> Dense(50, relu)
    "d2" -> Dense(4, linear) "preds". Compiled with mean-absolute-error loss
    and the Adam optimizer.
    """
    inputs = L.Input(name="inputs", shape=(n_in,))

    hidden = inputs
    for layer_name in ("d1", "d2"):
        hidden = L.Dense(50, activation="relu", name=layer_name)(hidden)

    outputs = L.Dense(4, activation="linear", name="preds")(hidden)

    model = M.Model(inputs, outputs, name="ANN")
    model.compile(loss="mean_absolute_error", optimizer="adam")
    return model

# Throwaway instance, built only to display the architecture.
net = make_model(X.shape[1])
# summary() prints by itself and returns None; wrapping it in print() added a
# stray "None" line to the output.
net.summary()

# Out-of-fold predictions: one network per fold, each restored from saved
# weights and applied to that fold's validation rows. The restored networks are
# collected in `nets` for use at prediction time.
oof = np.zeros(y.shape)
nets = []
for idx in range(NFOLDS):
    print("FOLD:", idx)
    tr_idx, val_idx = folds[idx]
    reg = make_model(X.shape[1])
    # Training is disabled in this notebook; weights are loaded from an input
    # dataset instead. Recipe that was used to produce them, kept for reference
    # (the callbacks are only needed if fit() is re-enabled):
    #     ckpt = ModelCheckpoint(f"../input/mlb-ann-training/w{idx}.h5", monitor='val_loss', verbose=1, save_best_only=True,mode='min')
    #     reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=3, min_lr=0.0001)
    #     es = EarlyStopping(monitor='val_loss', patience=5)
    #     reg.fit(X[tr_idx], y[tr_idx], epochs=10, batch_size=30_000, validation_data=(X[val_idx], y[val_idx]),
    #             verbose=1, callbacks=[ckpt, reduce_lr, es])
    reg.load_weights(f"../input/mlb-ann-training/w{idx}.h5")
    oof[val_idx] = reg.predict(X[val_idx], batch_size=50_000, verbose=1)
    nets.append(reg)
    gc.collect()

mae = mean_absolute_error(y, oof)
# squared=False returns the ROOT mean squared error, so label it as such.
rmse = mean_squared_error(y, oof, squared=False)
print("mae:", mae)
print("rmse:", rmse)

# State carried into prediction time: target history after 2021-01-01 (source
# of the lag features) and the 2021 per-player medians without the year column.
bound_dt = pd.to_datetime("2021-01-01")
LAST = tr.loc[tr.EvalDate>bound_dt].copy()

LAST_MED_DF = MED_DF.loc[MED_DF.EvalYear==2021].drop("EvalYear", axis=1)

# The full training frame is no longer needed.
del tr

import mlb
FE, SUB = [], []

(2506176, 6)

100%|██████████| 20/20 [01:08<00:00,  3.43s/it]

(2506176, 87)
(2464956, 87)
Model: "ANN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
inputs (InputLayer)          [(None, 84)]              0         
_________________________________________________________________
d1 (Dense)                   (None, 50)                4250      
_________________________________________________________________
d2 (Dense)                   (None, 50)                2550      
_________________________________________________________________
preds (Dense)                (None, 4)                 204       
=================================================================
Total params: 7,004
Trainable params: 7,004
Non-trainable params: 0
_________________________________________________________________
None
FOLD: 0
10/10 [==============================] - 0s 17ms/step
FOLD: 1
10/10 [==============================] - 0s 14ms/step
FOLD: 2
10/10 [==============================] - 0s 15ms/step
FOLD: 3
10/10 [==============================] - 0s 14ms/step
FOLD: 4
10/10 [==============================] - 0s 16ms/step
mae: 0.7727517316297969
mse: 3.9314386784209567

Predict

In [12]:

import copy

env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

for (test_df, sample_prediction_df) in iter_test: # make predictions here

    # `sub` keeps the original index as a column (test_lag reads `sub.date`) and
    # receives the ANN predictions; `sample_prediction_df` drops the index and
    # receives the tree-model predictions.
    sub = copy.deepcopy(sample_prediction_df.reset_index())
    sample_prediction_df = copy.deepcopy(sample_prediction_df.reset_index(drop=True))

    # ---- Tree models (LightGBM / CatBoost) ----
    # Create the dataset.
    sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\
                                        .map(lambda x: int(x.split('_')[1]))

    # NOTE(review): eval() executes whatever expression the payload string
    # contains; acceptable only because the payload comes from the competition
    # API. The null/true/false aliases defined earlier must be in scope.
    rosters_raw = test_df['rosters'].iloc[0]
    if pd.notnull(rosters_raw):
        test_rosters = pd.DataFrame(eval(rosters_raw))
    else:
        # No roster payload for this date: every roster field is missing.
        test_rosters = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in rosters.columns:
            if col == 'playerId': continue
            test_rosters[col] = np.nan

    scores_raw = test_df['playerBoxScores'].iloc[0]
    if pd.notnull(scores_raw):
        test_scores = pd.DataFrame(eval(scores_raw))
        # One summed row per player, as in the training table.
        test_scores = test_scores.groupby('playerId').sum().reset_index()
    else:
        # No box scores for this date. Leave the statistics as NaN: summing the
        # all-NaN placeholder would turn them into 0, whereas the training rows
        # for dates without box scores carry NaN after the left merge.
        test_scores = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in scores.columns:
            if col == 'playerId': continue
            test_scores[col] = np.nan

    test = sample_prediction_df[['playerId']].copy()
    test = test.merge(players[players_cols], on='playerId', how='left')
    test = test.merge(test_rosters[rosters_cols], on='playerId', how='left')
    test = test.merge(test_scores[scores_cols], on='playerId', how='left')
    # Left join (was inner): a player absent from player_target_stats must stay
    # in `test`, otherwise the prediction arrays come out shorter than
    # sample_prediction_df and the column assignments below fail.
    test = test.merge(player_target_stats, how='left', left_on=["playerId"],right_on=["playerId"])

    # Values unseen during training map to NaN.
    test['label_playerId'] = test['playerId'].map(player2num)
    test['label_primaryPositionName'] = test['primaryPositionName'].map(position2num)
    test['label_teamId'] = test['teamId'].map(teamid2num)
    test['label_status'] = test['status'].map(status2num)

    test_X = test[feature_cols]
    # predict
    pred1 = model1.predict(test_X)

    # predict
    pred_lgd1 = model_lgb1.predict(test_X)
    pred_lgd2 = model_lgb2.predict(test_X)
    pred_lgd3 = model_lgb3.predict(test_X)
    pred_lgd4 = model_lgb4.predict(test_X)

    pred_cat1 = model_cb1.predict(test_X)
    pred_cat2 = model_cb2.predict(test_X)
    pred_cat3 = model_cb3.predict(test_X)
    pred_cat4 = model_cb4.predict(test_X)

    # model2-4 were trained with 'target1' as an input column; at prediction
    # time the clipped target1 prediction stands in for it.
    test['target1'] = np.clip(pred1,0,100)
    test_X = test[feature_cols2]

    pred2 = model2.predict(test_X)
    pred3 = model3.predict(test_X)
    pred4 = model4.predict(test_X)

    # Merge the tree-model predictions with fixed per-target weights.
    # NOTE(review): the target2 weights sum to 0.995 rather than 1; confirm
    # whether that is intentional.
    sample_prediction_df['target1'] = 1.00*np.clip(pred1, 0, 100)+0.00*np.clip(pred_lgd1, 0, 100)+0.00*np.clip(pred_cat1, 0, 100)
    sample_prediction_df['target2'] = 0.05*np.clip(pred2, 0, 100)+0.54*np.clip(pred_lgd2, 0, 100)+0.405*np.clip(pred_cat2, 0, 100)
    sample_prediction_df['target3'] = 0.76*np.clip(pred3, 0, 100)+0.14*np.clip(pred_lgd3, 0, 100)+0.10*np.clip(pred_cat3, 0, 100)
    sample_prediction_df['target4'] = 0.77*np.clip(pred4, 0, 100)+0.13*np.clip(pred_lgd4, 0, 100)+0.10*np.clip(pred_cat4, 0, 100)
    sample_prediction_df = sample_prediction_df.fillna(0.)
    del sample_prediction_df['playerId']

    # ---- ANN ----
    # Lag features at the evaluation date (test_lag also adds sub['playerId']).
    sub_fe, eval_dt = test_lag(sub)
    sub_fe = sub_fe.merge(LAST_MED_DF, on="playerId", how="left")
    sub_fe = sub_fe.fillna(0.)

    # Average the predictions of the fold networks.
    _preds = 0.
    for reg in nets:
        _preds += reg.predict(sub_fe[FECOLS + MEDCOLS]) / NFOLDS
    sub_fe[TGTCOLS] = np.clip(_preds, 0, 100)
    sub.drop(["date"]+TGTCOLS, axis=1, inplace=True)
    sub = sub.merge(sub_fe[["playerId"]+TGTCOLS], on="playerId", how="left")
    sub.drop("playerId", axis=1, inplace=True)
    sub = sub.fillna(0.)

    # ---- Blending: 22% ANN, 78% tree models ----
    blend = pd.concat(
        [sub[['date_playerId']],
        (0.22*sub.drop('date_playerId', axis=1) + 0.78*sample_prediction_df.drop('date_playerId', axis=1))],
        axis=1
    )
    env.predict(blend)

    # Update the available history: the ANN predictions for this date are
    # appended to LAST and therefore feed the lag features of later dates.
    sub_fe["EvalDate"] = eval_dt
    #sub_fe.drop(MEDCOLS, axis=1, inplace=True)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    LAST = pd.concat([LAST, sub_fe])
    LAST = LAST.drop_duplicates(subset=["EvalDate","playerId"], keep="last")

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.

In [13]:

# Inspection only: an unweighted (50/50) average of the ANN frame and the
# tree-model frame left over from the last loop iteration. This is NOT the
# submitted blend, which weights them 0.22 / 0.78 inside the prediction loop.
pd.concat(
    [sub[['date_playerId']],
    (sub.drop('date_playerId', axis=1) + sample_prediction_df.drop('date_playerId', axis=1)) / 2],
    axis=1
)

Out[13]:

	date_playerId	target1	target2	target3	target4
0	20210501_488726	1.417833	5.650557	6.859939e-02	1.995048
1	20210501_605218	0.003536	0.396355	1.702673e-03	0.868064
2	20210501_621563	0.099256	2.386864	7.695223e-02	0.771513
3	20210501_670084	0.022301	0.878644	6.095328e-04	0.277289
4	20210501_670970	0.010307	0.251098	2.952593e-02	0.118300
...	...	...	...	...	...
1182	20210501_596049	0.000276	0.009644	2.608818e-12	0.035421
1183	20210501_642851	0.000176	0.041358	1.818041e-07	0.079906
1184	20210501_596071	0.000451	0.083687	1.183390e-04	0.070381
1185	20210501_664901	0.003308	0.309083	3.068393e-02	0.199449
1186	20210501_605525	0.002655	0.555847	4.771943e-04	0.116648

1187 rows × 5 columns

In [14]:

# Inspection only: tree-model predictions from the last loop iteration, before
# they are mixed with the ANN output.
sample_prediction_df

Out[14]:

	date_playerId	target1	target2	target3	target4
0	20210501_488726	2.728486e+00	8.022509	1.070538e-01	2.425154
1	20210501_605218	2.152436e-03	0.566561	3.182109e-03	0.828016
2	20210501_621563	1.422361e-01	2.763233	1.348766e-02	0.880154
3	20210501_670084	2.031692e-03	0.647363	1.219066e-03	0.119984
4	20210501_670970	5.941349e-04	0.162923	9.765327e-03	0.048072
...	...	...	...	...	...
1182	20210501_596049	9.109051e-15	0.016895	5.217635e-12	0.032081
1183	20210501_642851	0.000000e+00	0.072513	3.636082e-07	0.081339
1184	20210501_596071	1.820810e-04	0.109463	1.856178e-05	0.085482
1185	20210501_664901	6.615386e-03	0.358820	3.113339e-03	0.202974
1186	20210501_605525	8.827960e-04	0.593431	4.494436e-10	0.107011

1187 rows × 5 columns