# Imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score
# You can download the data from Kaggle:
# https://www.kaggle.com/datasets/zalando-research/fashionmnist?resource=download
raw_train_data = pd.read_csv('fashion-mnist_train.csv')
raw_test_data = pd.read_csv('fashion-mnist_test.csv')
raw_train_data.head
<bound method NDFrame.head of label pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 \ 0 2 0 0 0 0 0 0 0 0 1 9 0 0 0 0 0 0 0 0 2 6 0 0 0 0 0 0 0 5 3 0 0 0 0 1 2 0 0 0 4 3 0 0 0 0 0 0 0 0 ... ... ... ... ... ... ... ... ... ... 59995 9 0 0 0 0 0 0 0 0 59996 1 0 0 0 0 0 0 0 0 59997 8 0 0 0 0 0 0 0 0 59998 8 0 0 0 0 0 0 0 0 59999 7 0 0 0 0 0 0 0 0 pixel9 ... pixel775 pixel776 pixel777 pixel778 pixel779 \ 0 0 ... 0 0 0 0 0 1 0 ... 0 0 0 0 0 2 0 ... 0 0 0 30 43 3 0 ... 3 0 0 0 0 4 0 ... 0 0 0 0 0 ... ... ... ... ... ... ... ... 59995 0 ... 0 0 0 0 0 59996 0 ... 73 0 0 0 0 59997 0 ... 160 162 163 135 94 59998 0 ... 0 0 0 0 0 59999 0 ... 0 0 0 0 0 pixel780 pixel781 pixel782 pixel783 pixel784 0 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 3 1 0 0 0 0 4 0 0 0 0 0 ... ... ... ... ... ... 59995 0 0 0 0 0 59996 0 0 0 0 0 59997 0 0 0 0 0 59998 0 0 0 0 0 59999 0 0 0 0 0 [60000 rows x 785 columns]>
raw_test_data.head
<bound method NDFrame.head of label pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 \ 0 0 0 0 0 0 0 0 0 9 1 1 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 14 53 3 2 0 0 0 0 0 0 0 0 4 3 0 0 0 0 0 0 0 0 ... ... ... ... ... ... ... ... ... ... 9995 0 0 0 0 0 0 0 0 0 9996 6 0 0 0 0 0 0 0 0 9997 8 0 0 0 0 0 0 0 0 9998 8 0 1 3 0 0 0 0 0 9999 1 0 0 0 0 0 0 0 140 pixel9 ... pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 \ 0 8 ... 103 87 56 0 0 0 1 0 ... 34 0 0 0 0 0 2 99 ... 0 0 0 0 63 53 3 0 ... 137 126 140 0 133 224 4 0 ... 0 0 0 0 0 0 ... ... ... ... ... ... ... ... ... 9995 0 ... 32 23 14 20 0 0 9996 0 ... 0 0 0 2 52 23 9997 0 ... 175 172 172 182 199 222 9998 0 ... 0 0 0 0 0 1 9999 119 ... 111 95 75 44 1 0 pixel781 pixel782 pixel783 pixel784 0 0 0 0 0 1 0 0 0 0 2 31 0 0 0 3 222 56 0 0 4 0 0 0 0 ... ... ... ... ... 9995 1 0 0 0 9996 28 0 0 0 9997 42 0 1 0 9998 0 0 0 0 9999 0 0 0 0 [10000 rows x 785 columns]>
# From documentation, label --> clothing mapping is:
# 0 --> t-shirt/top
# 1 --> trouser
# 2 --> pullover
# 3 --> dress
# 4 --> coat
# 5 --> sandal
# 6 --> shirt
# 7 --> sneaker
# 8 --> bag
# 9 --> ankle boot
raw_train_data['class'] = raw_train_data['label'].apply(
lambda x: 1 if (x==7 or x==2) else -1 if (x==5 or x==6) else 0
)
raw_test_data['class'] = raw_test_data['label'].apply(
lambda x: 1 if (x==7 or x==2) else -1 if (x==5 or x==6) else 0
)
raw_train_data = raw_train_data[raw_train_data['class'].isin([-1, 1])]
raw_test_data = raw_test_data[raw_test_data['class'].isin([-1,1])]
train_data = raw_train_data.drop('label', axis=1)
test_data = raw_test_data.drop('label', axis=1)
train_data.head
<bound method NDFrame.head of pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 \ 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 5 0 7 0 0 0 0 0 0 0 0 0 14 0 0 0 0 1 1 0 0 0 15 0 0 0 0 0 0 0 0 16 ... ... ... ... ... ... ... ... ... ... 59988 0 0 0 0 0 0 0 0 0 59991 0 0 0 0 0 0 0 0 0 59992 0 0 0 0 0 0 0 0 0 59993 0 0 0 0 0 0 1 0 0 59999 0 0 0 0 0 0 0 0 0 pixel10 ... pixel776 pixel777 pixel778 pixel779 pixel780 \ 0 0 ... 0 0 0 0 0 2 0 ... 0 0 30 43 0 7 0 ... 0 0 0 0 0 14 0 ... 0 118 190 162 82 15 43 ... 0 1 1 1 1 ... ... ... ... ... ... ... ... 59988 0 ... 0 0 0 0 0 59991 0 ... 0 0 0 0 0 59992 0 ... 0 0 0 0 0 59993 0 ... 0 0 0 141 113 59999 0 ... 0 0 0 0 0 pixel781 pixel782 pixel783 pixel784 class 0 0 0 0 0 1 2 0 0 0 0 -1 7 0 0 0 0 -1 14 0 0 0 0 1 15 0 0 0 0 1 ... ... ... ... ... ... 59988 0 0 0 0 -1 59991 0 0 0 0 -1 59992 0 0 0 0 -1 59993 9 0 0 0 1 59999 0 0 0 0 1 [24000 rows x 785 columns]>
test_data.head
<bound method NDFrame.head of pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 \ 2 0 0 0 0 0 0 14 53 99 3 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 44 105 44 10 7 0 0 0 0 0 0 0 1 0 8 0 0 0 0 0 0 0 0 0 ... ... ... ... ... ... ... ... ... ... 9984 0 0 0 0 0 0 0 0 0 9987 0 0 0 0 0 0 0 0 0 9988 0 0 0 0 0 0 0 0 0 9990 0 0 0 0 0 0 0 0 0 9996 0 0 0 0 0 0 0 0 0 pixel10 ... pixel776 pixel777 pixel778 pixel779 pixel780 \ 2 17 ... 0 0 0 63 53 3 161 ... 126 140 0 133 224 5 0 ... 64 30 0 0 0 7 0 ... 136 155 31 0 1 8 0 ... 0 0 0 0 0 ... ... ... ... ... ... ... ... 9984 0 ... 0 0 0 0 0 9987 0 ... 44 0 0 38 38 9988 0 ... 0 0 0 0 0 9990 0 ... 0 0 0 0 0 9996 0 ... 0 0 2 52 23 pixel781 pixel782 pixel783 pixel784 class 2 31 0 0 0 1 3 222 56 0 0 1 5 0 0 0 0 1 7 0 0 0 0 -1 8 0 0 0 0 -1 ... ... ... ... ... ... 9984 0 0 0 0 -1 9987 44 0 0 0 -1 9988 0 0 0 0 1 9990 0 0 0 0 1 9996 28 0 0 0 -1 [4000 rows x 785 columns]>
# Separate X, Y for train and test
Y_train, Y_test = train_data['class'], test_data['class']
X_train, X_test = train_data.drop('class', axis=1), test_data.drop('class', axis=1)
half_train_data = train_data.sample(frac=0.5, random_state=42)
X_half_train, Y_half_train = half_train_data.drop('class', axis=1), half_train_data['class']
# Function for reporting accuracy
def report_accuracy(model, X, Y, is_balanced=False):
Y_pred = model.predict(X)
if not is_balanced:
return accuracy_score(Y, Y_pred)
else:
return balanced_accuracy_score(Y, Y_pred)
# Function for training a logistic lasso regression with a default regularization hyperparameter
def train_lr_lasso(X_train, Y_train, weights=None):
lr_lasso = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, max_iter=200)
if weights is None:
lr_lasso.fit(X_train, Y_train)
else:
lr_lasso.fit(X_train, Y_train, sample_weight=weights)
train_accuracy = report_accuracy(lr_lasso, X_train, Y_train)
return lr_lasso, train_accuracy
# Train on half of data
half_train_model, half_train_acc = train_lr_lasso(X_half_train, Y_half_train)
print(f'Training accuracy on half of training data: {half_train_acc}')
Training accuracy on half of training data: 0.9119166666666667
half_test_acc = report_accuracy(half_train_model, X_test, Y_test)
print(f'Testing accuracy on testing data: {half_test_acc}')
Testing accuracy on testing data: 0.8815
# Train on all of data
train_model, train_acc = train_lr_lasso(X_train, Y_train)
print(f'Training accuracy on all of training data: {train_acc}')
Training accuracy on all of training data: 0.9025833333333333
test_acc = report_accuracy(train_model, X_test, Y_test)
print(f'Testing accuracy on testing data: {test_acc}')
Testing accuracy on testing data: 0.886
# Function for getting biased datasets with a given fraction
def get_biased_data(data, n, bias_fraction):
sneaker_sample = data[data['label']==7].sample(n=(n/2*bias_fraction).astype(int), random_state=42)
pullover_sample = data[data['label']==2].sample(n=(n/2*(1-bias_fraction)).astype(int), random_state=42)
sandal_sample = data[data['label']==5].sample(n=(n/2*bias_fraction).astype(int), random_state=42)
shirt_sample = data[data['label']==6].sample(n=(n/2*(1-bias_fraction)).astype(int), random_state=42)
return pd.concat([sneaker_sample, pullover_sample, sandal_sample, shirt_sample], ignore_index=True)
# Get biased datasets
biased_datasets = {}
for Lambda in np.arange(0.05, 0.95, .05):
biased_datasets[f'{Lambda}'] = get_biased_data(raw_train_data, 12000, Lambda).drop('label', axis=1)
# Train on each biased dataset and evaluate training and test performance
for Lambda in biased_datasets.keys():
biased_data = biased_datasets[Lambda]
biased_X_train, biased_Y_train = biased_data.drop('class', axis=1), biased_data['class']
train_model, train_acc = train_lr_lasso(biased_X_train, biased_Y_train)
test_acc = report_accuracy(train_model, X_test, Y_test)
print(f'-----\nLambda = {Lambda}\n-----\n')
print(f'Training Accuracy: {train_acc}')
print(f'Testing Accuracy: {test_acc}\n\n')
----- Lambda = 0.05 ----- Training Accuracy: 0.8815 Testing Accuracy: 0.8325 ----- Lambda = 0.1 ----- Training Accuracy: 0.883 Testing Accuracy: 0.85 ----- Lambda = 0.15000000000000002 ----- Training Accuracy: 0.8855833333333333 Testing Accuracy: 0.863 ----- Lambda = 0.2 ----- Training Accuracy: 0.88875 Testing Accuracy: 0.87225 ----- Lambda = 0.25 ----- Training Accuracy: 0.8899166666666667 Testing Accuracy: 0.87775 ----- Lambda = 0.3 ----- Training Accuracy: 0.8956666666666667 Testing Accuracy: 0.87725 ----- Lambda = 0.35000000000000003 ----- Training Accuracy: 0.8992332055342557 Testing Accuracy: 0.87925 ----- Lambda = 0.4 ----- Training Accuracy: 0.9036666666666666 Testing Accuracy: 0.884 ----- Lambda = 0.45 ----- Training Accuracy: 0.9080833333333334 Testing Accuracy: 0.8815 ----- Lambda = 0.5 ----- Training Accuracy: 0.9125833333333333 Testing Accuracy: 0.8785 ----- Lambda = 0.55 ----- Training Accuracy: 0.916819469911652 Testing Accuracy: 0.8795 ----- Lambda = 0.6000000000000001 ----- Training Accuracy: 0.9224037339556592 Testing Accuracy: 0.878 ----- Lambda = 0.6500000000000001 ----- Training Accuracy: 0.9277379563260544 Testing Accuracy: 0.87575 ----- Lambda = 0.7000000000000001 ----- Training Accuracy: 0.9336556092682113 Testing Accuracy: 0.874 ----- Lambda = 0.7500000000000001 ----- Training Accuracy: 0.9411568594765795 Testing Accuracy: 0.87375 ----- Lambda = 0.8 ----- Training Accuracy: 0.9491581930321721 Testing Accuracy: 0.86625 ----- Lambda = 0.8500000000000001 ----- Training Accuracy: 0.9554925820970162 Testing Accuracy: 0.85925 ----- Lambda = 0.9000000000000001 ----- Training Accuracy: 0.9635772628771462 Testing Accuracy: 0.846
# Select bias with lambda = 0.1 for covariate shift correction
# Testing accuracy with no weights was 0.85
biased_data = biased_datasets['0.1']
# Change label of biased data and test data to distribution indicator
# Then combine to form dataset for propensity score model
biased_prop_score_data = biased_data.copy()
biased_prop_score_data['dataset_label'] = -1
test_prop_score_data = test_data
test_prop_score_data['dataset_label'] = 1
prop_score_data = pd.concat([biased_prop_score_data, test_prop_score_data]).drop('class', axis=1)
# We have unequal sample sizes between train and test datasets, so we need to reweight in the propensity score model
# So, we weight by 1/n_{train} for train observations and 1/n_{test} for test observations
# This up-weights whichever dataset has fewer observations and down-weights the other
n_train, n_test = biased_prop_score_data.shape[0], test_prop_score_data.shape[0]
sample_size_wts = prop_score_data['dataset_label'].apply(
lambda x: 1/n_train if (x==-1) else 1/n_test
)
# Fit (incorrect!!) propensity score model without weighting by sample size
wrong_prop_score_model = LogisticRegression(solver='liblinear', max_iter=200)
wrong_prop_score_model.fit(prop_score_data.drop('dataset_label', axis=1), prop_score_data['dataset_label'])
wrong_prop_score_bal_acc = report_accuracy(wrong_prop_score_model, prop_score_data.drop('dataset_label', axis=1), prop_score_data['dataset_label'], is_balanced=True)
# Fit propensity score model with weighting by sample size
prop_score_model = LogisticRegression(solver='liblinear', max_iter=200)
prop_score_model.fit(prop_score_data.drop('dataset_label', axis=1), prop_score_data['dataset_label'], sample_weight=sample_size_wts)
prop_score_bal_acc = report_accuracy(prop_score_model, prop_score_data.drop('dataset_label', axis=1), prop_score_data['dataset_label'], is_balanced=True)
print(f'Balanced accuracy of predicting class w/o sample size weighting: {wrong_prop_score_bal_acc}\nBalanced accuracy of predicting class w/ sample size weighting: {prop_score_bal_acc}')
Balanced accuracy of predicting class w/o sample size weighting: 0.6745 Balanced accuracy of predicting class w/ sample size weighting: 0.7094583333333333
# Predict on training data to get estimated propensity score and propensity score weights
prop_scores_pred = prop_score_model.predict_proba(biased_data.drop(['class', 'dataset_label'], axis=1))[:,1]
prop_weights = prop_scores_pred / (1-prop_scores_pred)
print(prop_weights)
[2.58588899 2.3293252 9.84757802 ... 0.37086764 0.42212846 0.64516629]
# Fit model using propensity score weights and report (unbalanced) train accuracy
cov_shift_corrected_model, cov_shift_corrected_train_acc = train_lr_lasso(biased_data.drop(['class', 'dataset_label'], axis=1), biased_data['class'], prop_weights)
print(f'Covariate-shift corrected training accuracy: {cov_shift_corrected_train_acc}')
Covariate-shift corrected training accuracy: 0.8695833333333334
# Report unbalanced test accuracy
# Note that the accuracy has improved
cov_shift_corrected_test_acc = report_accuracy(cov_shift_corrected_model, test_data.drop(['class', 'dataset_label'], axis=1), test_data['class'])
print(f'Covariate-shift corrected testing accuracy: {cov_shift_corrected_test_acc}')
Covariate-shift corrected testing accuracy: 0.867