#!/usr/bin/env python
# coding: utf-8

# In[14]:


import math

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from tensorflow.keras.callbacks import EarlyStopping

from tabtransformertf.models.tabtransformer import TabTransformer
from tabtransformertf.utils.preprocessing import df_to_dataset, build_categorical_prep


# ## Download Data

# In[2]:


CSV_HEADER = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

train_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
)
train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER)

test_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
)
test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)

print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")


# In[3]:


train_data.head()


# ## Preprocess

# In[4]:


# Column information
NUMERIC_FEATURES = train_data.select_dtypes(include=np.number).columns
CATEGORICAL_FEATURES = train_data.select_dtypes(exclude=np.number).columns[:-1]  # exclude the label column

FEATURES = list(NUMERIC_FEATURES) + list(CATEGORICAL_FEATURES)
LABEL = 'income_bracket'


# In[5]:


# Encode the label as a binary target (the test file uses ' >50K.' with a trailing dot)
train_data[LABEL] = train_data[LABEL].apply(lambda x: int(x == ' >50K'))
test_data[LABEL] = test_data[LABEL].apply(lambda x: int(x == ' >50K.'))

train_data[LABEL].mean(), test_data[LABEL].mean()


# In[6]:


test_data = test_data.iloc[1:, :]  # drop the invalid first row of adult.test


# In[7]:


# Set data types
train_data[CATEGORICAL_FEATURES] = train_data[CATEGORICAL_FEATURES].astype(str)
test_data[CATEGORICAL_FEATURES] = test_data[CATEGORICAL_FEATURES].astype(str)

train_data[NUMERIC_FEATURES] = train_data[NUMERIC_FEATURES].astype(float)
test_data[NUMERIC_FEATURES] = test_data[NUMERIC_FEATURES].astype(float)


# In[8]:


# Train/validation split
X_train, X_val = train_test_split(train_data, test_size=0.2)


# ## Modelling Prep

# In[9]:


# Category preprocessing layers (lookup layers for each categorical feature)
category_prep_layers = build_categorical_prep(X_train, CATEGORICAL_FEATURES)


# In[10]:


# To TF Dataset
train_dataset = df_to_dataset(X_train[FEATURES + [LABEL]], LABEL)
val_dataset = df_to_dataset(X_val[FEATURES + [LABEL]], LABEL, shuffle=False)  # No shuffle
test_dataset = df_to_dataset(test_data[FEATURES + [LABEL]], shuffle=False)  # No target passed, no shuffle


# ## TabTransformer

# In[11]:


tabtransformer = TabTransformer(
    numerical_features=NUMERIC_FEATURES,
    categorical_features=CATEGORICAL_FEATURES,
    categorical_lookup=category_prep_layers,
    embedding_dim=32,
    out_dim=1,
    out_activation='sigmoid',
    depth=4,
    heads=8,
    attn_dropout=0.2,
    ff_dropout=0.2,
    mlp_hidden_factors=[2, 4],
    use_column_embedding=True,
)


# In[12]:


LEARNING_RATE = 0.0001
WEIGHT_DECAY = 0.0001
NUM_EPOCHS = 1000

optimizer = tfa.optimizers.AdamW(
    learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY
)

tabtransformer.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.AUC(name="PR AUC", curve='PR')],
)


# In[13]:


early = EarlyStopping(monitor="val_loss", mode="min", patience=10, restore_best_weights=True)
callback_list = [early]
history = tabtransformer.fit(
    train_dataset,
    epochs=NUM_EPOCHS,
    validation_data=val_dataset,
    callbacks=callback_list,
)


# In[15]:


test_preds = tabtransformer.predict(test_dataset)


# In[16]:


print("Test ROC AUC:", np.round(roc_auc_score(test_data[LABEL], test_preds.ravel()), 4))
print("Test PR AUC:", np.round(average_precision_score(test_data[LABEL], test_preds.ravel()), 4))
print("Test Accuracy:", np.round(accuracy_score(test_data[LABEL], test_preds.ravel() > 0.5), 4))
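

# In[ ]:


# Optional sanity check: plot the training and validation loss recorded in `history`
# (matplotlib is already imported above). This is only a sketch and assumes the
# default Keras history keys 'loss' and 'val_loss' produced by the compile/fit calls.
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel('epoch')
plt.ylabel('binary cross-entropy')
plt.legend()
plt.show()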