#!/usr/bin/env python
# coding: utf-8

# # DAE with 2 Lines of Code with Kaggler
# > A tutorial on Kaggler's new DAE feature transformation
#
# - toc: true
# - badges: true
# - comments: true
# - categories: [notebook, kaggle]
#
# **UPDATE on 5/1/2021**
#
# Today, [`Kaggler`](https://github.com/jeongyoonlee/Kaggler) v0.9.4 is released with additional features for DAE as follows:
# * In addition to the swap noise (`swap_prob`), the Gaussian noise (`noise_std`) and zero masking (`mask_prob`) have been added to DAE to overcome overfitting.
# * Stacked DAE is available through the `n_layer` input argument (see Figure 3 in [Vincent et al. (2010), "Stacked Denoising Autoencoders"](https://www.jmlr.org/papers/volume11/vincent10a/vincent10a.pdf) for reference).
#
# For example, to build a stacked DAE with 3 encoder/decoder pairs and all three types of noise, you can do:
# ```python
# from kaggler.preprocessing import DAE
#
# dae = DAE(cat_cols=cat_cols, num_cols=num_cols, n_layer=3, noise_std=.05, swap_prob=.2, masking_prob=.1)
# X = dae.fit_transform(pd.concat([trn, tst], axis=0))
# ```
#
# If you're using a previous version, please upgrade `Kaggler` with `pip install -U kaggler`.
#
# ---
#
# Today I released a new version (v0.9.0) of the `Kaggler` package, which adds a Denoising AutoEncoder (DAE) with swap noise.
#
# Now you can train a DAE with only 2 lines of code as follows:
#
# ```python
# dae = DAE(cat_cols=cat_cols, num_cols=num_cols, encoding_dim=encoding_dim)
# X = dae.fit_transform(df[feature_cols])
# ```
#
# In addition to the new DAE feature encoder, `Kaggler` supports many of the feature transformations commonly used in Kaggle competitions, including:
# * `TargetEncoder`: with smoothing and cross-validation to avoid overfitting
# * `FrequencyEncoder`
# * `LabelEncoder`: imputes missing values and groups rare categories
# * `OneHotEncoder`: imputes missing values and groups rare categories
# * `EmbeddingEncoder`: transforms categorical features into embeddings
# * `QuantileEncoder`: transforms numerical features into quantiles
#
# In the notebook below, I will show how to use `Kaggler`'s `LabelEncoder`, `TargetEncoder`, and `DAE` for feature engineering, then use `Kaggler`'s `AutoLGB` to do feature selection and hyperparameter optimization.
#
# This notebook was originally published [here](https://www.kaggle.com/jeongyoonlee/dae-with-2-lines-of-code-with-kaggler) at Kaggle.
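#
# Before loading the data, here is a quick intuition for the swap noise mentioned above: with some probability, each cell of the input is replaced by the value of the same column taken from a randomly chosen row, and the DAE is trained to reconstruct the original, uncorrupted input from this corrupted copy. The snippet below is a minimal sketch of just the corruption step in NumPy; `swap_noise` is a hypothetical helper name for illustration, not Kaggler's internal implementation.
#
# ```python
# import numpy as np
#
# def swap_noise(X, swap_prob=0.2, seed=42):
#     """Replace each cell, with probability swap_prob, by the value of the
#     same column from a randomly chosen row (illustrative sketch only)."""
#     rng = np.random.default_rng(seed)
#     X = np.asarray(X)
#     mask = rng.random(X.shape) < swap_prob              # cells to corrupt
#     rows = rng.integers(0, X.shape[0], size=X.shape)    # donor row for every cell
#     cols = np.tile(np.arange(X.shape[1]), (X.shape[0], 1))
#     X_noisy = X.copy()
#     X_noisy[mask] = X[rows[mask], cols[mask]]
#     return X_noisy
# ```
#
# A DAE is then trained to map `swap_noise(X)` back to `X`, and the activations of its hidden (encoding) layers are used as features for downstream models.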
# # Part 1: Data Loading & Feature Engineering

# In[ ]:

import lightgbm as lgb
import numpy as np
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix
import warnings


# In[ ]:

get_ipython().system('pip install kaggler')


# In[ ]:

import kaggler
from kaggler.model import AutoLGB
from kaggler.preprocessing import DAE, TargetEncoder, LabelEncoder

print(f'Kaggler: {kaggler.__version__}')


# In[ ]:

warnings.simplefilter('ignore')
pd.set_option('max_columns', 100)


# In[ ]:

feature_name = 'dae'
algo_name = 'lgb'
model_name = f'{algo_name}_{feature_name}'

data_dir = Path('/kaggle/input/tabular-playground-series-apr-2021/')
trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'
pseudo_label_file = '../input/tps-apr-2021-pseudo-label-dae/tps04-sub-006.csv'

feature_file = f'{feature_name}.csv'
predict_val_file = f'{model_name}.val.txt'
predict_tst_file = f'{model_name}.tst.txt'
submission_file = f'{model_name}.sub.csv'

target_col = 'Survived'
id_col = 'PassengerId'


# In[ ]:

n_fold = 5
seed = 42
encoding_dim = 64


# In[ ]:

trn = pd.read_csv(trn_file, index_col=id_col)
tst = pd.read_csv(tst_file, index_col=id_col)
sub = pd.read_csv(sample_file, index_col=id_col)
pseudo_label = pd.read_csv(pseudo_label_file, index_col=id_col)
print(trn.shape, tst.shape, sub.shape, pseudo_label.shape)


# In[ ]:

# Use pseudo labels as the target for the test set, then combine train and test
tst[target_col] = pseudo_label[target_col]
n_trn = trn.shape[0]
df = pd.concat([trn, tst], axis=0)
df.head()


# In[ ]:

# Feature engineering code from https://www.kaggle.com/udbhavpangotra/tps-apr21-eda-model

df['Embarked'] = df['Embarked'].fillna('No')
df['Cabin'] = df['Cabin'].fillna('_')
df['CabinType'] = df['Cabin'].apply(lambda x: x[0])
df.Ticket = df.Ticket.map(lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'X')

df['Age'].fillna(round(df['Age'].median()), inplace=True)
df['Age'] = df['Age'].apply(round).astype(int)

# Fare: fill missing values with the median fare of the passenger's class
fare_map = df[['Fare', 'Pclass']].dropna().groupby('Pclass').median().to_dict()
df['Fare'] = df['Fare'].fillna(df['Pclass'].map(fare_map['Fare']))

df['FirstName'] = df['Name'].str.split(', ').str[0]
df['SecondName'] = df['Name'].str.split(', ').str[1]

# Count how many passengers share the same first/second name
df['n'] = 1
gb = df.groupby('FirstName')
df_names = gb['n'].sum()
df['SameFirstName'] = df['FirstName'].apply(lambda x: df_names[x]).fillna(1)

gb = df.groupby('SecondName')
df_names = gb['n'].sum()
df['SameSecondName'] = df['SecondName'].apply(lambda x: df_names[x]).fillna(1)

df['Sex'] = (df['Sex'] == 'male').astype(int)
df['FamilySize'] = df.SibSp + df.Parch + 1

feature_cols = ['Pclass', 'Age', 'Embarked', 'Parch', 'SibSp', 'Fare', 'CabinType', 'Ticket',
                'SameFirstName', 'SameSecondName', 'Sex', 'FamilySize', 'FirstName', 'SecondName']
cat_cols = ['Pclass', 'Embarked', 'CabinType', 'Ticket', 'FirstName', 'SecondName']
num_cols = [x for x in feature_cols if x not in cat_cols]
print(len(feature_cols), len(cat_cols), len(num_cols))


# In[ ]:

# Log-transform skewed count/fare features, then standardize all numerical features
for col in ['SameFirstName', 'SameSecondName', 'Fare', 'FamilySize', 'Parch', 'SibSp']:
    df[col] = np.log2(1 + df[col])

scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])


# ## Label encoding with rare category grouping and missing value imputation

# In[ ]:

lbe = LabelEncoder(min_obs=50)
df[cat_cols] = lbe.fit_transform(df[cat_cols]).astype(int)


# ## Target encoding with smoothing and 5-fold cross-validation

# In[ ]:

cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
te = TargetEncoder(cv=cv)
df_te = te.fit_transform(df[cat_cols], df[target_col])
df_te.columns = [f'te_{col}' for col in cat_cols]
df_te.head()
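# For intuition on the smoothing and cross-validation mentioned in the heading above: each category is replaced by a blend of its observed target mean and the global prior, weighted by the category count, and each row is encoded using only out-of-fold statistics so that its own target never leaks into its feature. The sketch below shows this idea for a single categorical column; the helper name `target_encode_oof` and the `count / (count + smoothing)` weighting are my own illustrative choices, not necessarily the exact formula used inside `kaggler.preprocessing.TargetEncoder`.
#
# ```python
# import numpy as np
# import pandas as pd
#
# def target_encode_oof(x_cat, y, cv, smoothing=20.0):
#     """Illustrative out-of-fold target encoding with count-based smoothing."""
#     prior = y.mean()
#     encoded = pd.Series(np.full(len(y), prior), index=x_cat.index)
#     for i_trn, i_val in cv.split(x_cat, y):
#         trn_cat, trn_y = x_cat.iloc[i_trn], y.iloc[i_trn]
#         stats = trn_y.groupby(trn_cat).agg(['mean', 'count'])
#         # Shrink the means of small categories toward the global prior
#         w = stats['count'] / (stats['count'] + smoothing)
#         mapping = prior * (1 - w) + stats['mean'] * w
#         # Unseen categories in the validation fold fall back to the prior
#         encoded.iloc[i_val] = x_cat.iloc[i_val].map(mapping).fillna(prior).values
#     return encoded
#
# # e.g. te_ticket = target_encode_oof(df['Ticket'], df[target_col], cv)
# ```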
# ## DAE

# In[ ]:

dae = DAE(cat_cols=cat_cols, num_cols=num_cols, encoding_dim=encoding_dim)
X = dae.fit_transform(df[feature_cols])


# In[ ]:

df_dae = pd.DataFrame(X, columns=[f'dae_{i}' for i in range(encoding_dim)])
print(df_dae.shape)


# # Part 2: Model Training
# ## AutoLGB for Feature Selection and Hyperparameter Optimization

# In[ ]:

X = pd.concat([df[feature_cols], df_te, df_dae], axis=1)
y = df[target_col]
X_tst = X.iloc[n_trn:]

p = np.zeros_like(y, dtype=float)
p_tst = np.zeros((tst.shape[0],))
print('Training a LightGBM model with 5-fold CV:')
for i, (i_trn, i_val) in enumerate(cv.split(X, y)):
    # Run AutoLGB feature selection and hyperparameter optimization on the first fold only
    if i == 0:
        clf = AutoLGB(objective='binary', metric='auc', random_state=seed)
        clf.tune(X.iloc[i_trn], y[i_trn])
        features = clf.features
        params = clf.params
        n_best = clf.n_best
        print(f'{n_best}')
        print(f'{params}')
        print(f'{features}')

    trn_data = lgb.Dataset(X.iloc[i_trn], y[i_trn])
    val_data = lgb.Dataset(X.iloc[i_val], y[i_val])
    clf = lgb.train(params, trn_data, n_best, val_data, verbose_eval=100)
    p[i_val] = clf.predict(X.iloc[i_val])
    p_tst += clf.predict(X_tst) / n_fold
    print(f'CV #{i + 1} AUC: {roc_auc_score(y[i_val], p[i_val]):.6f}')


# In[ ]:

print(f' CV AUC: {roc_auc_score(y, p):.6f}')
# Test AUC is measured against the pseudo labels, not the true (hidden) test labels
print(f'Test AUC: {roc_auc_score(pseudo_label[target_col], p_tst):.6f}')


# ## Submission

# In[ ]:

# Pick the threshold so that the number of predicted positives matches the expected positive rate (~0.34911)
n_pos = int(0.34911 * tst.shape[0])
th = sorted(p_tst, reverse=True)[n_pos]
print(th)
confusion_matrix(pseudo_label[target_col], (p_tst > th).astype(int))


# In[ ]:

sub[target_col] = (p_tst > th).astype(int)
sub.to_csv(submission_file)


# If you find this notebook useful, please upvote it and leave your feedback. It will be greatly appreciated!
#
# Please check out my previous notebooks as well:
# * [AutoEncoder + Pseudo Label + AutoLGB](https://www.kaggle.com/jeongyoonlee/autoencoder-pseudo-label-autolgb): shows how to build a basic AutoEncoder using Keras, and how to perform automated feature selection and hyperparameter optimization with Kaggler's AutoLGB.
# * [Supervised Emphasized Denoising AutoEncoder](https://www.kaggle.com/jeongyoonlee/supervised-emphasized-denoising-autoencoder): shows how to build a more sophisticated version of an AutoEncoder, called a supervised emphasized Denoising AutoEncoder (DAE), which trains the DAE and a classifier simultaneously.
# * [Stacking Ensemble](https://www.kaggle.com/jeongyoonlee/stacking-ensemble): shows how to build a stacking ensemble.