import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import norm
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import tensorflow as tf
from tensorflow import keras
Random forest and GBM options.
N_ESTIMATORS = 200
Keras and TensorFlow options.
EPOCHS = 20
BATCH_SIZE = 32
UNITS = 64
!wget https://github.com/MaxGhenis/random/raw/master/Roboto-Regular.ttf -P /usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf
mpl.font_manager._rebuild()
sns.set_style('white')
DPI = 200
mpl.rc('savefig', dpi=DPI)
mpl.rcParams['figure.dpi'] = DPI
mpl.rcParams['figure.figsize'] = 6.4, 4.8 # Default.
mpl.rcParams['font.sans-serif'] = 'Roboto'
mpl.rcParams['font.family'] = 'sans-serif'
# Set title text color to dark gray (https://material.io/color) not black.
TITLE_COLOR = '#212121'
mpl.rcParams['text.color'] = TITLE_COLOR
# Axis titles and tick marks are medium gray.
AXIS_COLOR = '#757575'
mpl.rcParams['axes.labelcolor'] = AXIS_COLOR
mpl.rcParams['xtick.color'] = AXIS_COLOR
mpl.rcParams['ytick.color'] = AXIS_COLOR
!wget -N https://github.com/open-source-economics/taxdata/raw/master/cps_data/cps.csv.gz
!gunzip -f cps.csv.gz
raw = pd.read_csv('cps.csv')
Select only common variables.
TARGET = 'e00900'
dat = raw.copy(deep=True)
Feature engineering.
# CPS data has MARS values of 1, 2, and 4.
dat[['MARS2', 'MARS4']] = pd.get_dummies(dat.MARS, drop_first=True)
dat['e19800_e20100'] = dat.e19800 + dat.e20100
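To make the drop_first behavior concrete, a minimal sketch with made-up values (not the CPS data): the indicator for the first category (MARS == 1) is dropped, leaving columns for 2 and 4, which map to the MARS2 and MARS4 columns assigned above.
pd.get_dummies(pd.Series([1, 2, 4, 2]), drop_first=True)
#    2  4
# 0  0  0
# 1  1  0
# 2  0  1
# 3  1  0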
PREDICTORS = ['DSI', 'EIC', 'MARS2', 'MARS4', 'XTOT',
'e00200', 'e00300', 'e00400', 'e00600', 'e00800',
'e01400', 'e01500', 'e01700', 'e02100', 'e02300', 'e02400',
'e03150', 'e03210', 'e03240', 'e03270', 'e03300', 'e17500',
'e18400', 'e18500', 'e19200', 'e19800_e20100', 'e20400',
'e32800', 'f2441', 'n24', 'e01100']
dat = dat[PREDICTORS + [TARGET]]
Create log-transformed versions of the predictors. The log-transformed y is calculated later.
LOG_PREDICTORS = ['e00200', 'e00300', 'e00400', 'e00600', 'e00800',
'e01400', 'e01500', 'e01700', 'e02100', 'e02300', 'e02400',
'e03150', 'e03210', 'e03240', 'e03270', 'e03300', 'e17500',
'e18400', 'e18500', 'e19200', 'e19800_e20100', 'e20400',
'e32800', 'e01100']
for i in LOG_PREDICTORS:
dat[i + '_log'] = np.log(dat[i] - dat[i].min() + 1)
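The shift by -min + 1 maps the smallest value to 1, so the log is defined even for zero or negative amounts; a minimal sketch with made-up values:
s = pd.Series([-50, 0, 100])
np.log(s - s.min() + 1)  # log(1), log(51), log(151) -> 0.00, 3.93, 5.02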
The initial row order may or may not be random; train_test_split shuffles by default, so the split is random either way.
X_train_full, X_test_full, y_train, y_test = train_test_split(
dat.drop(TARGET, axis=1), dat[TARGET], test_size=0.2, random_state=42)
mean = X_train_full.mean(axis=0)
std = X_train_full.std(axis=0)
X_train_full_std = (X_train_full - mean) / std
X_test_full_std = (X_test_full - mean) / std
Separate into standard and log versions.
X_train = X_train_full_std[PREDICTORS]
X_test = X_test_full_std[PREDICTORS]
X_train_log = X_train_full_std.drop(PREDICTORS, axis=1)
X_test_log = X_test_full_std.drop(PREDICTORS, axis=1)
Reformat data for statsmodels.
X_train_w_constant = sm.add_constant(X_train)
X_test_w_constant = sm.add_constant(X_test)
X_train_log_w_constant = sm.add_constant(X_train_log)
X_test_log_w_constant = sm.add_constant(X_test_log)
Reformat 1-column data for TensorFlow.
y_train_expanded = np.expand_dims(y_train, 1)
Log features.
y_train_sign = np.sign(y_train)
y_train_pos = y_train[y_train > 0]
y_train_pos_log = np.log(y_train_pos)
X_train_log_w_constant_pos = X_train_log_w_constant[y_train > 0]
y_train_neg = y_train[y_train < 0]
y_train_neg_log = np.log(-y_train_neg)
X_train_log_w_constant_neg = X_train_log_w_constant[y_train < 0]
Dataset per method, quantile, and x value.
METHODS = ['OLS', 'LogitOLS', 'QuantReg', 'Random forests', 'Gradient boosting', 'Keras',
'TensorFlow']
QUANTILES = [0.1, 0.3, 0.5, 0.7, 0.9]
quantiles_legend = [str(int(q * 100)) + 'th percentile' for q in QUANTILES]
# sns.set_palette(sns.color_palette('Blues', len(QUANTILES)))
sns.set_palette(sns.color_palette('Blues'))
# Set dots to a light gray
dot_color = sns.color_palette('coolwarm', 3)[1]
preds = np.array([(method, q, ix)
for method in METHODS
for q in QUANTILES
for ix in y_test.index])
preds = pd.DataFrame(preds)
preds.columns = ['method', 'q', 'ix']
preds['label'] = np.resize(y_test, preds.shape[0])  # Tile y_test across methods and quantiles.
preds = preds.apply(lambda x: pd.to_numeric(x, errors='ignore'))
ols = sm.OLS(y_train, X_train_w_constant).fit()
def ols_quantile(m, X, q):
# m: OLS model.
# X: X matrix.
# q: Quantile.
#
    # Assumes normal residuals: shift the mean prediction by norm.ppf(q)
    # times the residual standard error. Vectorized for different values of q.
mean_pred = m.predict(X)
se = np.sqrt(m.scale)
return mean_pred + norm.ppf(q) * se
preds.loc[preds.method == 'OLS', 'pred'] = np.concatenate(
[ols_quantile(ols, X_test_w_constant, q) for q in QUANTILES])
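A quick sanity check on ols_quantile (a sketch, not part of the original pipeline): at q = 0.5, norm.ppf(0.5) is 0, so the quantile prediction collapses to the ordinary OLS mean prediction.
assert np.allclose(ols_quantile(ols, X_test_w_constant, 0.5),
                   ols.predict(X_test_w_constant))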
Predict the sign (negative/zero/positive), then apply log regressions.
See taxdata #221, taxdata #267, taxdata #275, my notebook, and Avi Leventhal's notebook.
Segment into positive and negative.
mult = linear_model.LogisticRegression(
multi_class='multinomial', solver='newton-cg', random_state=3)
mult.fit(X_train_log, y_train_sign)
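The columns of predict_proba follow mult.classes_, which scikit-learn sorts ascending (-1, 0, 1); the neg_prob/zero_prob indexing below relies on that order. A quick check:
mult.classes_  # Expected: array([-1., 0., 1.]).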
ols_pos = sm.OLS(y_train_pos_log, X_train_log_w_constant_pos).fit()
ols_neg = sm.OLS(y_train_neg_log, X_train_log_w_constant_neg).fit()
mult_probs = mult.predict_proba(X_test_log)
test_logit_ols = pd.DataFrame({
'neg_prob': mult_probs[:, 0],
'zero_prob': mult_probs[:, 1],
'neg_pred': ols_neg.predict(X_test_log_w_constant),
'neg_se': np.sqrt(ols_neg.scale),
'pos_pred': ols_pos.predict(X_test_log_w_constant),
'pos_se': np.sqrt(ols_pos.scale)
})
def ols_q(mean, q, se):
return mean + norm.ppf(q) * se
def mult_ols_quantile(df, q):
# df: DataFrame with columns for neg_prob, zero_prob, neg_pred, neg_se,
# pos_pred, and pos_se.
# q: Quantile.
#
# Calculate the quantile within each OLS model.
neg_q = q / df.neg_prob
pos_q = (q - df.neg_prob - df.zero_prob) / (1 - df.neg_prob - df.zero_prob)
sign = np.where(q < df.neg_prob, -1,
np.where(q < (df.neg_prob + df.zero_prob), 0, 1))
return np.where(sign == -1, -np.exp(ols_q(df.neg_pred, neg_q, df.neg_se)),
np.where(sign == 1,
np.exp(ols_q(df.pos_pred, pos_q, df.pos_se)),
0))
preds.loc[preds.method == 'LogitOLS', 'pred'] = np.concatenate(
[mult_ols_quantile(test_logit_ols, q) for q in QUANTILES])
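A small hand-checkable example of the mixture logic (made-up probabilities and log-scale means, not fitted values): with P(neg) = 0.2 and P(zero) = 0.3, q = 0.1 falls in the negative segment, q = 0.4 in the zero segment, and q = 0.9 in the positive segment.
toy = pd.DataFrame({'neg_prob': [0.2], 'zero_prob': [0.3],
                    'neg_pred': [1.0], 'neg_se': [0.5],
                    'pos_pred': [2.0], 'pos_se': [0.5]})
mult_ols_quantile(toy, 0.1)  # -exp(ols_q(1.0, 0.1 / 0.2, 0.5)) = -e ~= -2.72
mult_ols_quantile(toy, 0.4)  # Zero segment: 0.
mult_ols_quantile(toy, 0.9)  # exp(2 + norm.ppf(0.8) * 0.5) ~= 11.3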
# Don't fit yet, since we'll fit once per quantile.
quantreg = sm.QuantReg(y_train, X_train_w_constant)
preds.loc[preds.method == 'QuantReg', 'pred'] = np.concatenate(
[quantreg.fit(q=q).predict(X_test_w_constant) for q in QUANTILES])
rf = ensemble.RandomForestRegressor(n_estimators=N_ESTIMATORS,
min_samples_leaf=1, random_state=3,
verbose=True,
n_jobs=-1) # Use maximum number of cores.
rf.fit(X_train, y_train)
def rf_quantile(m, X, q):
    # m: Random forests model.
    # X: X matrix.
    # q: Quantile.
    rf_preds = []
    for estimator in m.estimators_:
        rf_preds.append(estimator.predict(X))
    rf_preds = np.array(rf_preds).transpose()  # One row per record.
    # Take the q-quantile of the empirical distribution of per-tree predictions.
    return np.percentile(rf_preds, q * 100, axis=1)
preds.loc[preds.method == 'Random forests', 'pred'] = np.concatenate(
[rf_quantile(rf, X_test, q) for q in QUANTILES])
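For intuition, rf_quantile's q-quantile for a single record is just a percentile over the N_ESTIMATORS per-tree predictions; a quick sketch using the fitted forest:
tree_preds = np.array([est.predict(X_test[:1]) for est in rf.estimators_]).ravel()
np.percentile(tree_preds, 50)  # Should match rf_quantile(rf, X_test[:1], 0.5).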
def gb_quantile(X_train, train_labels, X, q):
print(q)
gbf = ensemble.GradientBoostingRegressor(loss='quantile', alpha=q,
n_estimators=N_ESTIMATORS,
max_depth=3,
verbose=True,
learning_rate=0.1,
min_samples_leaf=9,
min_samples_split=9)
gbf.fit(X_train, train_labels)
return gbf.predict(X)
preds.loc[preds.method == 'Gradient boosting', 'pred'] = np.concatenate(
[gb_quantile(X_train, y_train, X_test, q)
for q in QUANTILES])
From https://github.com/sachinruk/KerasQuantileModel/blob/master/Keras%20Quantile%20Model.ipynb
One area that deep learning has not explored extensively is uncertainty in estimates. As far as decision making goes, though, most people actually require quantiles rather than true uncertainty in an estimate. For example, for a given age, an individual's weight will vary; what would be interesting is (for argument's sake) the 10th and 90th percentiles. The uncertainty of the estimate of an individual's weight is less interesting.
def tilted_loss(q, y, f):
e = (y - f)
return keras.backend.mean(keras.backend.maximum(q * e, (q - 1) * e),
axis=-1)
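To see the asymmetry of this pinball loss concretely, the same formula in plain NumPy: at q = 0.9, under-predicting by 10 costs 9 while over-predicting by 10 costs only 1.
e = np.array([10., -10.])  # y - f: an under-prediction, then an over-prediction.
np.maximum(0.9 * e, (0.9 - 1) * e)  # -> array([9., 1.])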
optimizer = tf.train.AdamOptimizer(0.001)
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
sess = tf.Session()
#tf.reset_default_graph()
def keras_pred(x_train, train_labels, x_test, q):
print(q)
# optimizer = tf.train.AdamOptimizer(0.001)
# early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
# Set input_dim for the number of features.
if len(x_train.shape) == 1:
input_dim = 1
else:
input_dim = x_train.shape[1]
model = keras.Sequential([
keras.layers.Dense(UNITS, activation=tf.nn.relu,
input_dim=input_dim),
keras.layers.Dense(UNITS, activation=tf.nn.relu),
keras.layers.Dense(1)
])
model.compile(loss=lambda y, f: tilted_loss(q, y, f), optimizer=optimizer)
model.fit(x_train, train_labels, epochs=EPOCHS, batch_size=BATCH_SIZE,
verbose=0, validation_split=0.2, callbacks=[early_stop])
# Predict the quantile
return model.predict(x_test)
preds.loc[preds.method == 'Keras', 'pred'] = np.concatenate(
[keras_pred(X_train, y_train, X_test, q)
for q in QUANTILES])
# Initialize session
tf.reset_default_graph()
sess = tf.Session()
# Create network
class q_model:
def __init__(self,
sess,
quantiles,
in_shape=1,
out_shape=1,
batch_size=32):
# tf.reset_default_graph()
self.sess = sess
self.quantiles = quantiles
self.num_quantiles = len(quantiles)
self.in_shape = in_shape
self.out_shape = out_shape
self.batch_size = batch_size
self.outputs = []
self.losses = []
self.loss_history = []
self.build_model()
def build_model(self, scope='q_model', reuse=tf.AUTO_REUSE):
with tf.variable_scope(scope, reuse=reuse) as scope:
self.x = tf.placeholder(tf.float32, shape=(None, self.in_shape))
self.y = tf.placeholder(tf.float32, shape=(None, self.out_shape))
self.layer0 = tf.layers.dense(self.x,
units=UNITS,
activation=tf.nn.relu)
self.layer1 = tf.layers.dense(self.layer0,
units=UNITS,
activation=tf.nn.relu)
# Create outputs and losses for all quantiles
for i, q in enumerate(self.quantiles):
# Get output layers
output = tf.layers.dense(self.layer1, self.out_shape,
name="{}_q{}".format(i, int(q * 100)))
self.outputs.append(output)
# Create losses
error = tf.subtract(self.y, output)
loss = tf.reduce_mean(tf.maximum(q * error, (q - 1) * error),
axis=-1)
self.losses.append(loss)
# Create combined loss
self.combined_loss = tf.reduce_mean(tf.add_n(self.losses))
self.train_step = tf.train.AdamOptimizer().minimize(
self.combined_loss)
def fit(self, x, y, epochs=EPOCHS):
for epoch in range(epochs):
epoch_losses = []
for idx in range(0, x.shape[0], self.batch_size):
batch_x = x[idx : min(idx + self.batch_size, x.shape[0]), :]
batch_y = y[idx : min(idx + self.batch_size, y.shape[0]), :]
feed_dict = {self.x: batch_x,
self.y: batch_y}
_, c_loss = self.sess.run([self.train_step, self.combined_loss],
feed_dict)
epoch_losses.append(c_loss)
epoch_loss = np.mean(epoch_losses)
self.loss_history.append(epoch_loss)
if epoch % 10 == 0:
print("Epoch {}: {}".format(epoch, epoch_loss))
def predict(self, x):
# Run model to get outputs
feed_dict = {self.x: x}
        predictions = self.sess.run(self.outputs, feed_dict)
return predictions
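Note the design difference from the Keras approach above: a single shared two-layer trunk feeds one output head per quantile, and the five pinball losses are summed into one combined loss, so all quantiles are fit jointly instead of training one model per quantile.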
# Instantiate model
tf_model = q_model(sess, quantiles=QUANTILES, in_shape=X_train.shape[1],
out_shape=1, batch_size=BATCH_SIZE)
# Initialize all variables
init_op = tf.global_variables_initializer()
sess.run(init_op)
# Run training
tf_model.fit(np.array(X_train), y_train_expanded, EPOCHS)
preds.loc[preds.method == 'TensorFlow', 'pred'] = \
np.array([item for sublist in tf_model.predict(X_test)
for item in sublist])
# NumPy/pandas version of the tilted loss, rather than the Keras backend one.
def quantile_loss(q, y, f):
e = (y - f)
return np.maximum(q * e, (q - 1) * e)
preds['quantile_loss'] = quantile_loss(preds.q, preds.label, preds.pred)
def plot_loss_comparison(preds):
overall_loss_comparison = preds[~preds.quantile_loss.isnull()].\
pivot_table(index='method', values='quantile_loss').\
sort_values('quantile_loss')
# Show overall table.
print(overall_loss_comparison)
# Plot overall.
with sns.color_palette('Blues', 1):
ax = overall_loss_comparison.plot.barh()
plt.title('Total quantile loss', loc='left')
sns.despine(left=True, bottom=True)
plt.xlabel('Quantile loss')
plt.ylabel('')
ax.legend_.remove()
# Per quantile.
per_quantile_loss_comparison = preds[~preds.quantile_loss.isnull()].\
pivot_table(index='q', columns='method', values='quantile_loss')
# Sort by overall quantile loss.
per_quantile_loss_comparison = \
per_quantile_loss_comparison[overall_loss_comparison.index]
print(per_quantile_loss_comparison)
# Plot per quantile.
with sns.color_palette('Blues', 7):
ax = per_quantile_loss_comparison.plot.barh()
plt.title('Quantile loss per quantile', loc='left')
sns.despine(left=True, bottom=True)
handles, labels = ax.get_legend_handles_labels()
plt.xlabel('Quantile loss')
plt.ylabel('Quantile')
# Reverse legend.
ax.legend(reversed(handles), reversed(labels));
plot_loss_comparison(preds)
                   quantile_loss
method
Random forests       1780.631185
Keras                2902.365968
TensorFlow           2947.832552
Gradient boosting    3714.276605
LogitOLS             4143.752395
QuantReg             4344.568016
OLS                 11349.309620

method  Random forests        Keras   TensorFlow  Gradient boosting     LogitOLS     QuantReg           OLS
q
0.1         733.596181  1001.585328   988.429851        1001.582703  1035.972781  1001.582703   9589.040988
0.3        1582.886857  2314.623860  2355.565532        2796.654666  2931.184005  2836.598634  12109.529109
0.5        2109.426515  3460.318307  3408.195883        4502.006050  4460.544050  4664.776094   7560.337541
0.7        2404.227426  4078.920450  4150.995083        5600.271240  5826.730336  6360.021823  15555.600464
0.9        2073.018947  3656.381897  3835.976413        4670.868368  6464.330806  6859.860824  11932.040000
Examine individual predictions.
worst_logitols = preds[preds.method == 'LogitOLS'] \
.sort_values('quantile_loss', ascending=False).head(1)
worst_logitols_ix = worst_logitols['ix'].values[0]
worst_logitols_label = worst_logitols['label'].values[0]
example = preds[(preds['ix'] == worst_logitols_ix) &
(preds.method.isin(['OLS', 'LogitOLS', 'Random forests']))]
example_pivot = example.pivot_table('pred', 'q', 'method')
with sns.color_palette('Blues', 3):
example_pivot.plot()
sns.despine(left=True, bottom=True)
plt.title('Inverse CDF by method: RECID ' + worst_logitols_ix.astype(str),
loc='left', y=1.04)
plt.axhline(worst_logitols_label, color='gray', ls='dashed')