#!/usr/bin/env python
# coding: utf-8

# # Supervised Emphasized Denoising AutoEncoder
# > A tutorial of the supervised emphasized Denoising AutoEncoder (DAE)
#
# - toc: true
# - badges: true
# - comments: true
# - categories: [notebook, kaggle]

# This notebook was originally published [here](https://www.kaggle.com/jeongyoonlee/supervised-emphasized-denoising-autoencoder) at Kaggle.
#
# ---
#
# In this notebook, I will show how to build a supervised emphasized Denoising AutoEncoder (DAE) with Keras. With pseudo labels, we can train the classifier and the DAE together instead of training them separately, as was done in previous TPS competitions.
#
# If you're interested in how the different components of a DAE (denoising, stacked layers, emphasis, etc.) contribute to its performance, please check out [Vincent et al. (2010) "Stacked Denoising Autoencoders: Learning Useful Representations in a Deep Network with a Local Denoising Criterion", JMLR](https://www.jmlr.org/papers/volume11/vincent10a/vincent10a.pdf).
#
# This notebook is built on top of my previous notebook, [AutoEncoder + Pseudo Label + AutoLGB](https://www.kaggle.com/jeongyoonlee/autoencoder-pseudo-label-autolgb/). The first part (sections 1, 2, 3, and 5) is the same as in the previous one.
#
# The contents of the notebook are as follows:
# 1. **Package Installation**: Installing the latest version of `Kaggler` with `pip`.
# 2. **Feature Engineering**: [code](https://www.kaggle.com/udbhavpangotra/tps-apr21-eda-model) by @udbhavpangotra
# 3. **Feature Transformation**: Using `kaggler.preprocessing.LabelEncoder` to impute missing values and group rare categories automatically.
# 4. **Stacked Emphasized Denoising AutoEncoder (DAE)**: Adding a random noise mask and an **emphasized** reconstruction loss to an AutoEncoder, turning it into an "Emphasized Denoising AutoEncoder".
# 5. **LightGBM Model Training**: 5-fold CV + pseudo labels from @hiro5299834's [data](https://www.kaggle.com/hiro5299834/tps-apr-2021-voting-pseudo-labeling) + `kaggler.model.AutoLGB`'s feature selection and hyperparameter optimization.
# 6. **Supervised DAE**: Training the classifier and the DAE simultaneously.
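# Briefly, a Denoising AutoEncoder corrupts its inputs (here, by zeroing out a random subset of them) and is trained to reconstruct the *clean* inputs, which forces the hidden layers to learn feature representations that are robust to noise. *Emphasis* (Vincent et al., 2010) means weighting the reconstruction error on the corrupted inputs more heavily than on the untouched ones. We then use the hidden activations as features for a downstream classifier (Part 1), or train the classifier jointly with the DAE (Part 2).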
# # Part 1: DAE + AutoLGB

# ## Load Libraries and Install `Kaggler`

# In[ ]:

# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load.

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory.

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All".
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session.

# In[ ]:

import lightgbm as lgb
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.losses import mean_squared_error
from tensorflow.keras.metrics import AUC
from tensorflow.python.keras.utils import control_flow_util

import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix
import warnings

# In[ ]:

get_ipython().system('pip install kaggler')

# In[ ]:

import kaggler
from kaggler.model import AutoLGB
from kaggler.preprocessing import LabelEncoder

print(f'Kaggler: {kaggler.__version__}')
print(f'TensorFlow: {tf.__version__}')

# In[ ]:

warnings.simplefilter('ignore')
plt.style.use('fivethirtyeight')
pd.set_option('display.max_columns', 100)

# ## Feature Engineering (ref: [code](https://www.kaggle.com/udbhavpangotra/tps-apr21-eda-model) by @udbhavpangotra)

# In[ ]:

data_dir = Path('/kaggle/input/tabular-playground-series-apr-2021/')
trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'
pseudo_label_file = '/kaggle/input/tps-apr-2021-label/voting_submission_from_5_best.csv'

target_col = 'Survived'
id_col = 'PassengerId'

feature_name = 'dae'
algo_name = 'lgb'
model_name = f'{algo_name}_{feature_name}'

feature_file = f'{feature_name}.csv'
predict_val_file = f'{model_name}.val.txt'
predict_tst_file = f'{model_name}.tst.txt'
submission_file = f'{model_name}.sub.csv'

# In[ ]:

trn = pd.read_csv(trn_file, index_col=id_col)
tst = pd.read_csv(tst_file, index_col=id_col)
sub = pd.read_csv(sample_file, index_col=id_col)
pseudo_label = pd.read_csv(pseudo_label_file, index_col=id_col)
print(trn.shape, tst.shape, sub.shape, pseudo_label.shape)

# In[ ]:

# Assign pseudo labels to the test set, then stack train and test so that
# feature engineering and the DAE see all rows.
tst[target_col] = pseudo_label[target_col]
n_trn = trn.shape[0]
df = pd.concat([trn, tst], axis=0)
df.head()

# In[ ]:

# Feature engineering code from https://www.kaggle.com/udbhavpangotra/tps-apr21-eda-model

df['Embarked'] = df['Embarked'].fillna('No')
df['Cabin'] = df['Cabin'].fillna('_')
df['CabinType'] = df['Cabin'].apply(lambda x: x[0])
df.Ticket = df.Ticket.map(lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'X')

df['Age'].fillna(round(df['Age'].median()), inplace=True)
df['Age'] = df['Age'].apply(round).astype(int)
df['Fare'].fillna(round(df['Fare'].median()), inplace=True)

df['FirstName'] = df['Name'].str.split(', ').str[0]
df['SecondName'] = df['Name'].str.split(', ').str[1]

# Frequency counts: how many passengers share the same first/second name
df['n'] = 1
gb = df.groupby('FirstName')
df_names = gb['n'].sum()
df['SameFirstName'] = df['FirstName'].map(df_names)
gb = df.groupby('SecondName')
df_names = gb['n'].sum()
df['SameSecondName'] = df['SecondName'].map(df_names)

df['Sex'] = (df['Sex'] == 'male').astype(int)
df['FamilySize'] = df.SibSp + df.Parch + 1

feature_cols = ['Pclass', 'Age', 'Embarked', 'Parch', 'SibSp', 'Fare', 'CabinType', 'Ticket',
                'SameFirstName', 'SameSecondName', 'Sex', 'FamilySize', 'FirstName', 'SecondName']
cat_cols = ['Pclass', 'Embarked', 'CabinType', 'Ticket', 'FirstName', 'SecondName']
num_cols = [x for x in feature_cols if x not in cat_cols]
print(len(feature_cols), len(cat_cols), len(num_cols))

# ## Feature Transformation Using `Kaggler`

# In[ ]:

# Log-transform skewed count features, standardize numeric features, and
# label-encode categorical features. LabelEncoder(min_obs=50) groups categories
# that occur fewer than 50 times (as well as missing values) automatically.
for col in ['SameFirstName', 'SameSecondName', 'Fare', 'FamilySize', 'Parch', 'SibSp']:
    df[col] = np.log2(1 + df[col])

scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

lbe = LabelEncoder(min_obs=50)
df[cat_cols] = lbe.fit_transform(df[cat_cols]).astype(int)
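# To see what `Kaggler`'s `LabelEncoder` does, here is a minimal sketch on toy data. The exact integer codes are an implementation detail that may differ across `Kaggler` versions; the point is that frequent categories keep their own labels while rare categories and missing values are grouped automatically.

# In[ ]:

# Toy illustration only (not part of the pipeline): with min_obs=2,
# 'a' and 'b' occur often enough to keep their own labels, while 'c'
# and the missing value are rare and get grouped.
toy = pd.DataFrame({'cat': ['a', 'a', 'b', 'b', 'c', np.nan]})
print(LabelEncoder(min_obs=2).fit_transform(toy))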
# ## Emphasized Denoising AutoEncoder (DAE) Using `Keras`

# In[ ]:

encoding_dim = 128
masking_prob = .2
emphasis_ratio = 2.
seed = 42

def get_dae(encoding_dim, dropout=.2):
    num_dim = len(num_cols)
    num_input = keras.layers.Input((num_dim,), name='num_input')
    cat_inputs = []
    cat_embs = []
    emb_dims = 0
    for col in cat_cols:
        cat_input = keras.layers.Input((1,), name=f'{col}_input')
        emb_dim = max(8, int(np.log2(1 + df[col].nunique()) * 4))
        cat_emb = keras.layers.Embedding(input_dim=df[col].max() + 1, output_dim=emb_dim)(cat_input)
        cat_emb = keras.layers.Dropout(dropout)(cat_emb)
        cat_emb = keras.layers.Reshape((emb_dim,))(cat_emb)
        cat_inputs.append(cat_input)
        cat_embs.append(cat_emb)
        emb_dims += emb_dim

    merged_inputs = keras.layers.Concatenate()([num_input] + cat_embs)
    merged_inputs_dim = merged_inputs.get_shape()[-1]
    training = K.learning_phase()

    def mask_inputs():
        # Zero out each element independently with probability `masking_prob`.
        # Stateless random ops need a shape-[2] seed, and the batch dimension
        # is only known at run time, hence tf.shape().
        mask = tf.random.stateless_binomial(shape=tf.shape(merged_inputs),
                                            seed=[seed, seed],
                                            counts=tf.ones((merged_inputs_dim,)),
                                            probs=[masking_prob] * merged_inputs_dim)
        return tf.where(mask == 1, tf.zeros_like(merged_inputs), merged_inputs)

    # Apply the noise mask only during training.
    masked_inputs = control_flow_util.smart_cond(training, mask_inputs, lambda: merged_inputs)

    encoded = keras.layers.Dense(encoding_dim, activation='relu')(masked_inputs)
    encoded = keras.layers.Dropout(dropout)(encoded)
    encoded = keras.layers.Dense(encoding_dim, activation='relu')(encoded)
    encoded = keras.layers.Dropout(dropout)(encoded)
    encoded = keras.layers.Dense(encoding_dim, activation='relu')(encoded)

    decoded = keras.layers.Dense(encoding_dim, activation='relu')(encoded)
    decoded = keras.layers.Dropout(dropout)(decoded)
    decoded = keras.layers.Dense(encoding_dim, activation='relu')(decoded)
    decoded = keras.layers.Dropout(dropout)(decoded)
    decoded = keras.layers.Dense(num_dim + emb_dims, activation='linear')(decoded)

    encoder = keras.Model([num_input] + cat_inputs, encoded)
    ae = keras.Model([num_input] + cat_inputs, decoded, name='ae')
    # Emphasized reconstruction loss: up-weight the error on the masked
    # (corrupted) inputs and down-weight the error on the untouched ones.
    reconstruction_loss = K.mean(
        # masked inputs
        mean_squared_error(merged_inputs,
                           tf.where(merged_inputs != masked_inputs, decoded, merged_inputs)) / masking_prob * emphasis_ratio
        # original inputs
        + mean_squared_error(merged_inputs,
                             tf.where(merged_inputs == masked_inputs, decoded, merged_inputs)) / (1. - masking_prob)
    )
    ae.add_loss(reconstruction_loss)
    ae.compile(optimizer='adam')
    return ae, encoder
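# In math, with masking probability $p$ (`masking_prob`) and emphasis weight $\alpha$ (`emphasis_ratio`), the reconstruction loss above is
#
# $$\mathcal{L}_{rec} = \frac{\alpha}{p}\,\mathrm{MSE}\big(x, \hat{x}\big)\Big|_{masked} + \frac{1}{1 - p}\,\mathrm{MSE}\big(x, \hat{x}\big)\Big|_{unmasked},$$
#
# where the `tf.where` calls zero out the error on the complementary subset, so the first term only sees reconstruction error on the masked dimensions and the second only on the untouched ones. With $p = 0.2$ and $\alpha = 2$, errors on masked inputs get weight $\alpha / p = 10$ while errors on untouched inputs get weight $1 / (1 - p) = 1.25$; that asymmetry is the emphasis.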
# In[ ]:

ae, encoder = get_dae(encoding_dim)
ae.summary()

# In[ ]:

inputs = [df[num_cols].values] + [df[x].values for x in cat_cols]
ae.fit(inputs, inputs, epochs=30, batch_size=16384, shuffle=True, validation_split=.2)

# In[ ]:

encoding = encoder.predict(inputs)
print(encoding.shape)
np.savetxt(feature_file, encoding, fmt='%.6f', delimiter=',')

# ## Model Training + Feature Selection + HPO Using `Kaggler`'s `AutoLGB`

# In[ ]:

n_fold = 5

# Original features + the 128-dim DAE encoding as model inputs
X = pd.concat((df[feature_cols],
               pd.DataFrame(encoding, columns=[f'enc_{x}' for x in range(encoding_dim)])), axis=1)
y = df[target_col]
X_tst = X.iloc[n_trn:]

cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
p = np.zeros_like(y, dtype=float)
p_tst = np.zeros((tst.shape[0],))
for i, (i_trn, i_val) in enumerate(cv.split(X, y)):
    if i == 0:
        # Run feature selection and hyperparameter optimization once, on the first fold.
        clf = AutoLGB(objective='binary', metric='auc', random_state=seed)
        clf.tune(X.iloc[i_trn], y[i_trn])
        features = clf.features
        params = clf.params
        n_best = clf.n_best
        print(f'{n_best}')
        print(f'{params}')
        print(f'{features}')

    trn_data = lgb.Dataset(X.iloc[i_trn], y[i_trn])
    val_data = lgb.Dataset(X.iloc[i_val], y[i_val])
    clf = lgb.train(params, trn_data, num_boost_round=n_best, valid_sets=val_data, verbose_eval=100)
    p[i_val] = clf.predict(X.iloc[i_val])
    p_tst += clf.predict(X_tst) / n_fold
    print(f'CV #{i + 1} AUC: {roc_auc_score(y[i_val], p[i_val]):.6f}')

np.savetxt(predict_val_file, p, fmt='%.6f')
np.savetxt(predict_tst_file, p_tst, fmt='%.6f')

# In[ ]:

print(f'  CV AUC: {roc_auc_score(y, p):.6f}')
print(f'Test AUC: {roc_auc_score(pseudo_label[target_col], p_tst):.6f}')

# ## Submission File for DAE + AutoLGB

# In[ ]:

# Convert predicted probabilities into hard labels by thresholding at the
# assumed positive rate (~34.911%) of the test set.
n_pos = int(0.34911 * tst.shape[0])
th = sorted(p_tst, reverse=True)[n_pos]
print(th)
confusion_matrix(pseudo_label[target_col], (p_tst > th).astype(int))

# In[ ]:

sub[target_col] = (p_tst > th).astype(int)
sub.to_csv(submission_file)

# # Part 2: Supervised DAE

# In[ ]:

feature_name = 'dae'
algo_name = 'sdae'
model_name = f'{algo_name}_{feature_name}'

feature_file = f'{feature_name}.csv'
predict_val_file = f'{model_name}.val.txt'
predict_tst_file = f'{model_name}.tst.txt'
submission_file = f'{model_name}.sub.csv'

# ## Supervised DAE with `Keras`

# We are adding a classifier **head** to the DAE network, which requires an additional loss (binary cross-entropy) and metric (AUC) for the classifier on top of the `reconstruction_loss` for the DAE.
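# Because `reconstruction_loss` is attached via `add_loss` while the classifier loss is specified in `compile`, Keras sums the two, so the model below minimizes
#
# $$\mathcal{L} = \mathcal{L}_{rec} + \mathrm{BCE}\big(y, \hat{y}\big),$$
#
# training the DAE and the classifier jointly on the same (pseudo-labeled) data.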
# In[ ]:

def get_sdae(encoding_dim, dropout=.2):
    num_dim = len(num_cols)
    num_input = keras.layers.Input((num_dim,), name='num_input')
    cat_inputs = []
    cat_embs = []
    emb_dims = 0
    for col in cat_cols:
        cat_input = keras.layers.Input((1,), name=f'{col}_input')
        emb_dim = max(8, int(np.log2(1 + df[col].nunique()) * 4))
        cat_emb = keras.layers.Embedding(input_dim=df[col].max() + 1, output_dim=emb_dim)(cat_input)
        cat_emb = keras.layers.Dropout(dropout)(cat_emb)
        cat_emb = keras.layers.Reshape((emb_dim,))(cat_emb)
        cat_inputs.append(cat_input)
        cat_embs.append(cat_emb)
        emb_dims += emb_dim

    inputs = [num_input] + cat_inputs
    merged_inputs = keras.layers.Concatenate()([num_input] + cat_embs)

    # masking
    merged_inputs_dim = merged_inputs.get_shape()[-1]
    training = K.learning_phase()

    def mask_inputs():
        # Same zero-masking noise as in get_dae().
        mask = tf.random.stateless_binomial(shape=tf.shape(merged_inputs),
                                            seed=[seed, seed],
                                            counts=tf.ones((merged_inputs_dim,)),
                                            probs=[masking_prob] * merged_inputs_dim)
        return tf.where(mask == 1, tf.zeros_like(merged_inputs), merged_inputs)

    masked_inputs = control_flow_util.smart_cond(training, mask_inputs, lambda: merged_inputs)

    # encoder
    encoded_1 = keras.layers.Dense(encoding_dim, activation='relu')(masked_inputs)
    encoded_1 = keras.layers.Dropout(dropout)(encoded_1)
    encoded_2 = keras.layers.Dense(encoding_dim, activation='relu')(encoded_1)
    encoded_2 = keras.layers.Dropout(dropout)(encoded_2)
    encoded_3 = keras.layers.Dense(encoding_dim, activation='relu')(encoded_2)
    encoded_concat = keras.layers.Concatenate()([encoded_1, encoded_2, encoded_3])
    encoder = keras.Model(inputs, encoded_concat)

    # decoder
    decoded = keras.layers.Dense(encoding_dim, activation='relu')(encoded_3)
    decoded = keras.layers.Dropout(dropout)(decoded)
    decoded = keras.layers.Dense(encoding_dim, activation='relu')(decoded)
    decoded = keras.layers.Dropout(dropout)(decoded)
    decoded = keras.layers.Dense(num_dim + emb_dims, activation='linear')(decoded)
    ae = keras.Model([num_input] + cat_inputs, decoded, name='ae')

    # classifier head on top of the concatenated encoder output
    clf_encoded_input = keras.Input((encoding_dim * 3,))
    x = keras.layers.Dense(encoding_dim, activation='relu')(clf_encoded_input)
    x = keras.layers.Dropout(dropout)(x)
    clf_output = keras.layers.Dense(1, activation='sigmoid')(x)
    clf = keras.Model(inputs=clf_encoded_input, outputs=clf_output, name='clf')

    outputs = [ae(inputs), clf(encoder(inputs))]
    model = keras.Model(inputs, outputs, name='sdae')
    reconstruction_loss = K.mean(
        # masked inputs
        mean_squared_error(merged_inputs,
                           tf.where(merged_inputs != masked_inputs, decoded, merged_inputs)) / masking_prob * emphasis_ratio
        # original inputs
        + mean_squared_error(merged_inputs,
                             tf.where(merged_inputs == masked_inputs, decoded, merged_inputs)) / (1. - masking_prob)
    )
    model.add_loss(reconstruction_loss)
    model.compile(optimizer='adam',
                  loss={'clf': 'binary_crossentropy'},
                  metrics={'clf': [AUC()]})
    return model, encoder
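# Note that, unlike in Part 1, the encoder here returns the concatenation of all three hidden layers ($3 \times 128 = 384$ dimensions), and it is this concatenated representation that the classifier head consumes.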
# In[ ]:

sdae, encoder = get_sdae(encoding_dim)
sdae.summary()

# ## Model Training: Supervised DAE with 5-CV

# In[ ]:

n_fold = 5

X = df[feature_cols]
y = df[target_col]
X_tst = X.iloc[n_trn:]
inputs_tst = [X_tst[num_cols].values] + [X_tst[x].values for x in cat_cols]

cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
p = np.zeros_like(y, dtype=float)
p_tst = np.zeros((tst.shape[0],))
for i, (i_trn, i_val) in enumerate(cv.split(X, y)):
    X_trn = X.iloc[i_trn]
    X_val = X.iloc[i_val]
    inputs_trn = [X[num_cols].values[i_trn]] + [X[x].values[i_trn] for x in cat_cols]
    inputs_val = [X[num_cols].values[i_val]] + [X[x].values[i_val] for x in cat_cols]

    # Train a fresh supervised DAE on each fold.
    sdae, _ = get_sdae(encoding_dim)
    sdae.fit(inputs_trn, y[i_trn],
             epochs=20,
             batch_size=16384,
             shuffle=True,
             validation_data=(inputs_val, y[i_val]))

    # The second output of the model is the classifier head.
    p[i_val] = sdae.predict(inputs_val)[1].flatten()
    p_tst += sdae.predict(inputs_tst)[1].flatten() / n_fold
    print(f'CV #{i + 1} AUC: {roc_auc_score(y[i_val], p[i_val]):.6f}')

np.savetxt(predict_val_file, p, fmt='%.6f')
np.savetxt(predict_tst_file, p_tst, fmt='%.6f')

# In[ ]:

print(f'  CV AUC: {roc_auc_score(y, p):.6f}')
print(f'Test AUC: {roc_auc_score(pseudo_label[target_col], p_tst):.6f}')

# In[ ]:

n_pos = int(0.34911 * tst.shape[0])
th = sorted(p_tst, reverse=True)[n_pos]
print(th)
confusion_matrix(pseudo_label[target_col], (p_tst > th).astype(int))

# In[ ]:

sub[target_col] = (p_tst > th).astype(int)
sub.to_csv(submission_file)

# # Part 3: Simple Ensemble

# In[ ]:

submission_file = 'simple_ensemble_dae.csv'

model_names = ['lgb_dae', 'sdae_dae']
predict_val_files = [f'{x}.val.txt' for x in model_names]
predict_tst_files = [f'{x}.tst.txt' for x in model_names]

dict_val_predict = {}
dict_tst_predict = {}
for name, val_file, tst_file in zip(model_names, predict_val_files, predict_tst_files):
    dict_val_predict[name] = np.loadtxt(val_file)
    dict_tst_predict[name] = np.loadtxt(tst_file)

# Simple average of the two models' predictions
p = pd.DataFrame(dict_val_predict).mean(axis=1).values
p_tst = pd.DataFrame(dict_tst_predict).mean(axis=1).values
print(f'  CV AUC: {roc_auc_score(y, p):.6f}')
print(f'Test AUC: {roc_auc_score(pseudo_label[target_col], p_tst):.6f}')

# In[ ]:

n_pos = int(0.34911 * tst.shape[0])
th = sorted(p_tst, reverse=True)[n_pos]
print(th)
confusion_matrix(pseudo_label[target_col], (p_tst > th).astype(int))

# In[ ]:

sub[target_col] = (p_tst > th).astype(int)
sub.to_csv(submission_file)

# If you find this notebook helpful, please upvote it and give a star to [Kaggler](http://github.com/jeongyoonlee/Kaggler). If you have questions and/or feature requests for Kaggler, please post them as an issue in the Kaggler GitHub repository.
#
# Happy Kaggling!

# In[ ]: