Notebook

Setup¶

In [ ]:

# Imports
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score

In [ ]:

# You can download the data from Kaggle:
# https://www.kaggle.com/datasets/zalando-research/fashionmnist?resource=download
raw_train_data = pd.read_csv('fashion-mnist_train.csv')
raw_test_data = pd.read_csv('fashion-mnist_test.csv')

In [ ]:

raw_train_data.head

Out[ ]:

<bound method NDFrame.head of        label  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
0          2       0       0       0       0       0       0       0       0   
1          9       0       0       0       0       0       0       0       0   
2          6       0       0       0       0       0       0       0       5   
3          0       0       0       0       1       2       0       0       0   
4          3       0       0       0       0       0       0       0       0   
...      ...     ...     ...     ...     ...     ...     ...     ...     ...   
59995      9       0       0       0       0       0       0       0       0   
59996      1       0       0       0       0       0       0       0       0   
59997      8       0       0       0       0       0       0       0       0   
59998      8       0       0       0       0       0       0       0       0   
59999      7       0       0       0       0       0       0       0       0   

       pixel9  ...  pixel775  pixel776  pixel777  pixel778  pixel779  \
0           0  ...         0         0         0         0         0   
1           0  ...         0         0         0         0         0   
2           0  ...         0         0         0        30        43   
3           0  ...         3         0         0         0         0   
4           0  ...         0         0         0         0         0   
...       ...  ...       ...       ...       ...       ...       ...   
59995       0  ...         0         0         0         0         0   
59996       0  ...        73         0         0         0         0   
59997       0  ...       160       162       163       135        94   
59998       0  ...         0         0         0         0         0   
59999       0  ...         0         0         0         0         0   

       pixel780  pixel781  pixel782  pixel783  pixel784  
0             0         0         0         0         0  
1             0         0         0         0         0  
2             0         0         0         0         0  
3             1         0         0         0         0  
4             0         0         0         0         0  
...         ...       ...       ...       ...       ...  
59995         0         0         0         0         0  
59996         0         0         0         0         0  
59997         0         0         0         0         0  
59998         0         0         0         0         0  
59999         0         0         0         0         0  

[60000 rows x 785 columns]>

In [ ]:

raw_test_data.head

Out[ ]:

<bound method NDFrame.head of       label  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
0         0       0       0       0       0       0       0       0       9   
1         1       0       0       0       0       0       0       0       0   
2         2       0       0       0       0       0       0      14      53   
3         2       0       0       0       0       0       0       0       0   
4         3       0       0       0       0       0       0       0       0   
...     ...     ...     ...     ...     ...     ...     ...     ...     ...   
9995      0       0       0       0       0       0       0       0       0   
9996      6       0       0       0       0       0       0       0       0   
9997      8       0       0       0       0       0       0       0       0   
9998      8       0       1       3       0       0       0       0       0   
9999      1       0       0       0       0       0       0       0     140   

      pixel9  ...  pixel775  pixel776  pixel777  pixel778  pixel779  pixel780  \
0          8  ...       103        87        56         0         0         0   
1          0  ...        34         0         0         0         0         0   
2         99  ...         0         0         0         0        63        53   
3          0  ...       137       126       140         0       133       224   
4          0  ...         0         0         0         0         0         0   
...      ...  ...       ...       ...       ...       ...       ...       ...   
9995       0  ...        32        23        14        20         0         0   
9996       0  ...         0         0         0         2        52        23   
9997       0  ...       175       172       172       182       199       222   
9998       0  ...         0         0         0         0         0         1   
9999     119  ...       111        95        75        44         1         0   

      pixel781  pixel782  pixel783  pixel784  
0            0         0         0         0  
1            0         0         0         0  
2           31         0         0         0  
3          222        56         0         0  
4            0         0         0         0  
...        ...       ...       ...       ...  
9995         1         0         0         0  
9996        28         0         0         0  
9997        42         0         1         0  
9998         0         0         0         0  
9999         0         0         0         0  

[10000 rows x 785 columns]>

Q1¶

(a)¶

In [ ]:

# From documentation, label --> clothing mapping is:
# 0 --> t-shirt/top
# 1 --> trouser
# 2 --> pullover
# 3 --> dress
# 4 --> coat
# 5 --> sandal
# 6 --> shirt
# 7 --> sneaker
# 8 --> bag
# 9 --> ankle boot
raw_train_data['class'] = raw_train_data['label'].apply(
    lambda x: 1 if (x==7 or x==2) else -1 if (x==5 or x==6) else 0
)
raw_test_data['class'] = raw_test_data['label'].apply(
    lambda x: 1 if (x==7 or x==2) else -1 if (x==5 or x==6) else 0
)

raw_train_data = raw_train_data[raw_train_data['class'].isin([-1, 1])]
raw_test_data = raw_test_data[raw_test_data['class'].isin([-1,1])]

train_data = raw_train_data.drop('label', axis=1)
test_data = raw_test_data.drop('label', axis=1)

In [ ]:

train_data.head

Out[ ]:

<bound method NDFrame.head of        pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  pixel9  \
0           0       0       0       0       0       0       0       0       0   
2           0       0       0       0       0       0       0       5       0   
7           0       0       0       0       0       0       0       0       0   
14          0       0       0       0       1       1       0       0       0   
15          0       0       0       0       0       0       0       0      16   
...       ...     ...     ...     ...     ...     ...     ...     ...     ...   
59988       0       0       0       0       0       0       0       0       0   
59991       0       0       0       0       0       0       0       0       0   
59992       0       0       0       0       0       0       0       0       0   
59993       0       0       0       0       0       0       1       0       0   
59999       0       0       0       0       0       0       0       0       0   

       pixel10  ...  pixel776  pixel777  pixel778  pixel779  pixel780  \
0            0  ...         0         0         0         0         0   
2            0  ...         0         0        30        43         0   
7            0  ...         0         0         0         0         0   
14           0  ...         0       118       190       162        82   
15          43  ...         0         1         1         1         1   
...        ...  ...       ...       ...       ...       ...       ...   
59988        0  ...         0         0         0         0         0   
59991        0  ...         0         0         0         0         0   
59992        0  ...         0         0         0         0         0   
59993        0  ...         0         0         0       141       113   
59999        0  ...         0         0         0         0         0   

       pixel781  pixel782  pixel783  pixel784  class  
0             0         0         0         0      1  
2             0         0         0         0     -1  
7             0         0         0         0     -1  
14            0         0         0         0      1  
15            0         0         0         0      1  
...         ...       ...       ...       ...    ...  
59988         0         0         0         0     -1  
59991         0         0         0         0     -1  
59992         0         0         0         0     -1  
59993         9         0         0         0      1  
59999         0         0         0         0      1  

[24000 rows x 785 columns]>

In [ ]:

test_data.head

Out[ ]:

<bound method NDFrame.head of       pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  pixel9  \
2          0       0       0       0       0       0      14      53      99   
3          0       0       0       0       0       0       0       0       0   
5          0       0       0       0       0      44     105      44      10   
7          0       0       0       0       0       0       0       1       0   
8          0       0       0       0       0       0       0       0       0   
...      ...     ...     ...     ...     ...     ...     ...     ...     ...   
9984       0       0       0       0       0       0       0       0       0   
9987       0       0       0       0       0       0       0       0       0   
9988       0       0       0       0       0       0       0       0       0   
9990       0       0       0       0       0       0       0       0       0   
9996       0       0       0       0       0       0       0       0       0   

      pixel10  ...  pixel776  pixel777  pixel778  pixel779  pixel780  \
2          17  ...         0         0         0        63        53   
3         161  ...       126       140         0       133       224   
5           0  ...        64        30         0         0         0   
7           0  ...       136       155        31         0         1   
8           0  ...         0         0         0         0         0   
...       ...  ...       ...       ...       ...       ...       ...   
9984        0  ...         0         0         0         0         0   
9987        0  ...        44         0         0        38        38   
9988        0  ...         0         0         0         0         0   
9990        0  ...         0         0         0         0         0   
9996        0  ...         0         0         2        52        23   

      pixel781  pixel782  pixel783  pixel784  class  
2           31         0         0         0      1  
3          222        56         0         0      1  
5            0         0         0         0      1  
7            0         0         0         0     -1  
8            0         0         0         0     -1  
...        ...       ...       ...       ...    ...  
9984         0         0         0         0     -1  
9987        44         0         0         0     -1  
9988         0         0         0         0      1  
9990         0         0         0         0      1  
9996        28         0         0         0     -1  

[4000 rows x 785 columns]>

In [ ]:

# Separate X, Y for train and test
Y_train, Y_test = train_data['class'], test_data['class']
X_train, X_test = train_data.drop('class', axis=1), test_data.drop('class', axis=1)

half_train_data = train_data.sample(frac=0.5, random_state=42)
X_half_train, Y_half_train = half_train_data.drop('class', axis=1), half_train_data['class']

(b)¶

In [ ]:

# Function for reporting accuracy
def report_accuracy(model, X, Y, is_balanced=False):
    Y_pred = model.predict(X)
    if not is_balanced:
        return accuracy_score(Y, Y_pred)
    else:
        return balanced_accuracy_score(Y, Y_pred)


# Function for training a logistic lasso regression with a default regularization hyperparameter
def train_lr_lasso(X_train, Y_train, weights=None):
    lr_lasso = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, max_iter=200)
    if weights is None:
        lr_lasso.fit(X_train, Y_train)
    else:
        lr_lasso.fit(X_train, Y_train, sample_weight=weights)
    train_accuracy = report_accuracy(lr_lasso, X_train, Y_train)

    return lr_lasso, train_accuracy

In [ ]:

# Train on half of data
half_train_model, half_train_acc = train_lr_lasso(X_half_train, Y_half_train)

print(f'Training accuracy on half of training data: {half_train_acc}')

Training accuracy on half of training data: 0.9119166666666667

In [ ]:

half_test_acc = report_accuracy(half_train_model, X_test, Y_test)
print(f'Testing accuracy on testing data: {half_test_acc}')

Testing accuracy on testing data: 0.8815

In [ ]:

# Train on all of data
train_model, train_acc = train_lr_lasso(X_train, Y_train)

print(f'Training accuracy on all of training data: {train_acc}')

Training accuracy on all of training data: 0.9025833333333333

In [ ]:

test_acc = report_accuracy(train_model, X_test, Y_test)
print(f'Testing accuracy on testing data: {test_acc}')

Testing accuracy on testing data: 0.886

Q2¶

(a)¶

In [ ]:

# Function for getting biased datasets with a given fraction
def get_biased_data(data, n, bias_fraction):
    sneaker_sample = data[data['label']==7].sample(n=(n/2*bias_fraction).astype(int), random_state=42)
    pullover_sample = data[data['label']==2].sample(n=(n/2*(1-bias_fraction)).astype(int), random_state=42)
    sandal_sample = data[data['label']==5].sample(n=(n/2*bias_fraction).astype(int), random_state=42)
    shirt_sample = data[data['label']==6].sample(n=(n/2*(1-bias_fraction)).astype(int), random_state=42)

    return pd.concat([sneaker_sample, pullover_sample, sandal_sample, shirt_sample], ignore_index=True)

In [ ]:

# Get biased datasets
biased_datasets = {}
for Lambda in np.arange(0.05, 0.95, .05):
    biased_datasets[f'{Lambda}'] = get_biased_data(raw_train_data, 12000, Lambda).drop('label', axis=1)

(b)¶

In [ ]:

# Train on each biased dataset and evaluate training and test performance
for Lambda in biased_datasets.keys():
    biased_data = biased_datasets[Lambda]
    biased_X_train, biased_Y_train = biased_data.drop('class', axis=1), biased_data['class']
    train_model, train_acc = train_lr_lasso(biased_X_train, biased_Y_train)
    test_acc = report_accuracy(train_model, X_test, Y_test)

    print(f'-----\nLambda = {Lambda}\n-----\n')
    print(f'Training Accuracy: {train_acc}')
    print(f'Testing Accuracy: {test_acc}\n\n')

-----
Lambda = 0.05
-----

Training Accuracy: 0.8815
Testing Accuracy: 0.8325


-----
Lambda = 0.1
-----

Training Accuracy: 0.883
Testing Accuracy: 0.85


-----
Lambda = 0.15000000000000002
-----

Training Accuracy: 0.8855833333333333
Testing Accuracy: 0.863


-----
Lambda = 0.2
-----

Training Accuracy: 0.88875
Testing Accuracy: 0.87225


-----
Lambda = 0.25
-----

Training Accuracy: 0.8899166666666667
Testing Accuracy: 0.87775


-----
Lambda = 0.3
-----

Training Accuracy: 0.8956666666666667
Testing Accuracy: 0.87725


-----
Lambda = 0.35000000000000003
-----

Training Accuracy: 0.8992332055342557
Testing Accuracy: 0.87925


-----
Lambda = 0.4
-----

Training Accuracy: 0.9036666666666666
Testing Accuracy: 0.884


-----
Lambda = 0.45
-----

Training Accuracy: 0.9080833333333334
Testing Accuracy: 0.8815


-----
Lambda = 0.5
-----

Training Accuracy: 0.9125833333333333
Testing Accuracy: 0.8785


-----
Lambda = 0.55
-----

Training Accuracy: 0.916819469911652
Testing Accuracy: 0.8795


-----
Lambda = 0.6000000000000001
-----

Training Accuracy: 0.9224037339556592
Testing Accuracy: 0.878


-----
Lambda = 0.6500000000000001
-----

Training Accuracy: 0.9277379563260544
Testing Accuracy: 0.87575


-----
Lambda = 0.7000000000000001
-----

Training Accuracy: 0.9336556092682113
Testing Accuracy: 0.874


-----
Lambda = 0.7500000000000001
-----

Training Accuracy: 0.9411568594765795
Testing Accuracy: 0.87375


-----
Lambda = 0.8
-----

Training Accuracy: 0.9491581930321721
Testing Accuracy: 0.86625


-----
Lambda = 0.8500000000000001
-----

Training Accuracy: 0.9554925820970162
Testing Accuracy: 0.85925


-----
Lambda = 0.9000000000000001
-----

Training Accuracy: 0.9635772628771462
Testing Accuracy: 0.846

Q3¶

(a/b)¶

In [ ]:

# Select bias with lambda = 0.1 for covariate shift correction
# Testing accuracy with no weights was 0.85
biased_data = biased_datasets['0.1']

# Change label of biased data and test data to distribution indicator
# Then combine to form dataset for propensity score model
biased_prop_score_data = biased_data.copy()
biased_prop_score_data['dataset_label'] = -1
test_prop_score_data = test_data
test_prop_score_data['dataset_label'] = 1
prop_score_data = pd.concat([biased_prop_score_data, test_prop_score_data]).drop('class', axis=1)

In [ ]:

# We have unequal sample sizes between train and test datasets, so we need to reweight in the propensity score model
# So, we weight by 1/n_{train} for train observations and 1/n_{test} for test observations
# This up-weights whichever dataset has fewer observations and down-weights the other
n_train, n_test = biased_prop_score_data.shape[0], test_prop_score_data.shape[0]
sample_size_wts = prop_score_data['dataset_label'].apply(
    lambda x: 1/n_train if (x==-1) else 1/n_test
)

# Fit (incorrect!!) propensity score model without weighting by sample size
wrong_prop_score_model = LogisticRegression(solver='liblinear', max_iter=200)
wrong_prop_score_model.fit(prop_score_data.drop('dataset_label', axis=1), prop_score_data['dataset_label'])
wrong_prop_score_bal_acc = report_accuracy(wrong_prop_score_model, prop_score_data.drop('dataset_label', axis=1), prop_score_data['dataset_label'], is_balanced=True)

# Fit propensity score model with weighting by sample size
prop_score_model = LogisticRegression(solver='liblinear', max_iter=200)
prop_score_model.fit(prop_score_data.drop('dataset_label', axis=1), prop_score_data['dataset_label'], sample_weight=sample_size_wts)
prop_score_bal_acc = report_accuracy(prop_score_model, prop_score_data.drop('dataset_label', axis=1), prop_score_data['dataset_label'], is_balanced=True)


print(f'Balanced accuracy of predicting class w/o sample size weighting: {wrong_prop_score_bal_acc}\nBalanced accuracy of predicting class w/ sample size weighting: {prop_score_bal_acc}')

Balanced accuracy of predicting class w/o sample size weighting: 0.6745
Balanced accuracy of predicting class w/ sample size weighting: 0.7094583333333333

(c)¶

In [ ]:

# Predict on training data to get estimated propensity score and propensity score weights
prop_scores_pred = prop_score_model.predict_proba(biased_data.drop(['class', 'dataset_label'], axis=1))[:,1]
prop_weights = prop_scores_pred / (1-prop_scores_pred)

print(prop_weights)

[2.58588899 2.3293252  9.84757802 ... 0.37086764 0.42212846 0.64516629]

(d)¶

In [ ]:

# Fit model using propensity score weights and report (unbalanced) train accuracy
cov_shift_corrected_model, cov_shift_corrected_train_acc = train_lr_lasso(biased_data.drop(['class', 'dataset_label'], axis=1), biased_data['class'], prop_weights)
print(f'Covariate-shift corrected training accuracy: {cov_shift_corrected_train_acc}')

Covariate-shift corrected training accuracy: 0.8695833333333334

In [ ]:

# Report unbalanced test accuracy
# Note that the accuracy has improved
cov_shift_corrected_test_acc = report_accuracy(cov_shift_corrected_model, test_data.drop(['class', 'dataset_label'], axis=1), test_data['class'])
print(f'Covariate-shift corrected testing accuracy: {cov_shift_corrected_test_acc}')

Covariate-shift corrected testing accuracy: 0.867