#!/usr/bin/env python
# coding: utf-8

# # Setup

# In[ ]:

# Imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# In[ ]:

# You can download the data from Kaggle:
# https://www.kaggle.com/datasets/zalando-research/fashionmnist?resource=download
raw_train_data = pd.read_csv('fashion-mnist_train.csv')
raw_test_data = pd.read_csv('fashion-mnist_test.csv')

# In[ ]:

raw_train_data.head()

# In[ ]:

raw_test_data.head()

# # Q1

# ## (a)

# In[ ]:

# From the documentation, the label --> clothing mapping is:
# 0 --> t-shirt/top
# 1 --> trouser
# 2 --> pullover
# 3 --> dress
# 4 --> coat
# 5 --> sandal
# 6 --> shirt
# 7 --> sneaker
# 8 --> bag
# 9 --> ankle boot

# Map sneaker/pullover to class +1 and sandal/shirt to class -1, then drop all other labels
raw_train_data['class'] = raw_train_data['label'].apply(
    lambda x: 1 if x in (7, 2) else -1 if x in (5, 6) else 0
)
raw_test_data['class'] = raw_test_data['label'].apply(
    lambda x: 1 if x in (7, 2) else -1 if x in (5, 6) else 0
)
raw_train_data = raw_train_data[raw_train_data['class'].isin([-1, 1])]
raw_test_data = raw_test_data[raw_test_data['class'].isin([-1, 1])]

train_data = raw_train_data.drop('label', axis=1)
test_data = raw_test_data.drop('label', axis=1)

# In[ ]:

train_data.head()

# In[ ]:

test_data.head()

# In[ ]:

# Separate X, Y for train and test
Y_train, Y_test = train_data['class'], test_data['class']
X_train, X_test = train_data.drop('class', axis=1), test_data.drop('class', axis=1)

half_train_data = train_data.sample(frac=0.5, random_state=42)
X_half_train, Y_half_train = half_train_data.drop('class', axis=1), half_train_data['class']

# ## (b)

# In[ ]:

# Function for reporting (optionally balanced) accuracy
def report_accuracy(model, X, Y, is_balanced=False):
    Y_pred = model.predict(X)
    if not is_balanced:
        return accuracy_score(Y, Y_pred)
    else:
        return balanced_accuracy_score(Y, Y_pred)

# Function for training a logistic lasso regression with a default regularization hyperparameter
def train_lr_lasso(X_train, Y_train, weights=None):
    lr_lasso = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, max_iter=200)
    if weights is None:
        lr_lasso.fit(X_train, Y_train)
    else:
        lr_lasso.fit(X_train, Y_train, sample_weight=weights)
    train_accuracy = report_accuracy(lr_lasso, X_train, Y_train)
    return lr_lasso, train_accuracy

# In[ ]:

# Train on half of the training data
half_train_model, half_train_acc = train_lr_lasso(X_half_train, Y_half_train)
print(f'Training accuracy on half of training data: {half_train_acc}')

# In[ ]:

half_test_acc = report_accuracy(half_train_model, X_test, Y_test)
print(f'Test accuracy of half-data model: {half_test_acc}')

# In[ ]:

# Train on all of the training data
train_model, train_acc = train_lr_lasso(X_train, Y_train)
print(f'Training accuracy on all of training data: {train_acc}')

# In[ ]:

test_acc = report_accuracy(train_model, X_test, Y_test)
print(f'Test accuracy of full-data model: {test_acc}')

# # Q2

# ## (a)

# In[ ]:

# Function for building a biased dataset of size n: within each class, a fraction
# bias_fraction of the observations comes from one label (sneaker for +1, sandal
# for -1) and the remaining fraction from the other (pullover for +1, shirt for -1)
def get_biased_data(data, n, bias_fraction):
    sneaker_sample = data[data['label'] == 7].sample(n=int(n / 2 * bias_fraction), random_state=42)
    pullover_sample = data[data['label'] == 2].sample(n=int(n / 2 * (1 - bias_fraction)), random_state=42)
    sandal_sample = data[data['label'] == 5].sample(n=int(n / 2 * bias_fraction), random_state=42)
    shirt_sample = data[data['label'] == 6].sample(n=int(n / 2 * (1 - bias_fraction)), random_state=42)
    return pd.concat([sneaker_sample, pullover_sample, sandal_sample, shirt_sample], ignore_index=True)
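# In[ ]:

# Sanity check (an addition, not part of the original assignment): verify that
# get_biased_data produces the intended label composition. With n=12000 and
# bias_fraction=0.25 we expect 1500 sneakers, 4500 pullovers, 1500 sandals, and
# 4500 shirts, with the two classes staying balanced at 6000 each.
_check = get_biased_data(raw_train_data, 12000, 0.25)
print(_check['label'].value_counts())  # expect 4500 each of labels 2 and 6; 1500 each of 7 and 5
print(_check['class'].value_counts())  # expect 6000 each of +1 and -1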
# In[ ]:

# Get biased datasets for a grid of bias fractions Lambda
# (format the keys to two decimals so floating-point noise from np.arange
# does not leak into the dictionary keys)
biased_datasets = {}
for Lambda in np.arange(0.05, 0.95, 0.05):
    biased_datasets[f'{Lambda:.2f}'] = get_biased_data(raw_train_data, 12000, Lambda).drop('label', axis=1)

# ## (b)

# In[ ]:

# Train on each biased dataset and evaluate training and test performance
for Lambda, biased_data in biased_datasets.items():
    biased_X_train, biased_Y_train = biased_data.drop('class', axis=1), biased_data['class']
    biased_model, biased_train_acc = train_lr_lasso(biased_X_train, biased_Y_train)
    biased_test_acc = report_accuracy(biased_model, X_test, Y_test)
    print(f'-----\nLambda = {Lambda}\n-----\n')
    print(f'Training Accuracy: {biased_train_acc}')
    print(f'Testing Accuracy: {biased_test_acc}\n\n')

# # Q3

# ## (a/b)

# In[ ]:

# Select the biased dataset with Lambda = 0.10 for covariate shift correction
# Testing accuracy with no weights was 0.85
biased_data = biased_datasets['0.10']

# Relabel the biased (train) data and the test data with a distribution indicator,
# then combine them to form the dataset for the propensity score model
biased_prop_score_data = biased_data.copy()
biased_prop_score_data['dataset_label'] = -1
test_prop_score_data = test_data.copy()  # copy so we do not mutate test_data
test_prop_score_data['dataset_label'] = 1
prop_score_data = pd.concat([biased_prop_score_data, test_prop_score_data]).drop('class', axis=1)

# In[ ]:

# The train and test datasets have unequal sample sizes, so we reweight in the
# propensity score model: weight 1/n_train for train observations and 1/n_test
# for test observations. This gives each dataset equal total weight, up-weighting
# whichever dataset has fewer observations and down-weighting the other.
n_train, n_test = biased_prop_score_data.shape[0], test_prop_score_data.shape[0]
sample_size_wts = prop_score_data['dataset_label'].apply(
    lambda x: 1 / n_train if x == -1 else 1 / n_test
)

# Fit the (incorrect!!) propensity score model without weighting by sample size
wrong_prop_score_model = LogisticRegression(solver='liblinear', max_iter=200)
wrong_prop_score_model.fit(prop_score_data.drop('dataset_label', axis=1), prop_score_data['dataset_label'])
wrong_prop_score_bal_acc = report_accuracy(
    wrong_prop_score_model,
    prop_score_data.drop('dataset_label', axis=1),
    prop_score_data['dataset_label'],
    is_balanced=True,
)

# Fit the propensity score model with weighting by sample size
prop_score_model = LogisticRegression(solver='liblinear', max_iter=200)
prop_score_model.fit(
    prop_score_data.drop('dataset_label', axis=1),
    prop_score_data['dataset_label'],
    sample_weight=sample_size_wts,
)
prop_score_bal_acc = report_accuracy(
    prop_score_model,
    prop_score_data.drop('dataset_label', axis=1),
    prop_score_data['dataset_label'],
    is_balanced=True,
)

print(f'Balanced accuracy of predicting the dataset indicator w/o sample size weighting: {wrong_prop_score_bal_acc}\n'
      f'Balanced accuracy of predicting the dataset indicator w/ sample size weighting: {prop_score_bal_acc}')

# ## (c)

# In[ ]:

# Predict on the biased training data to get estimated propensity scores and the
# corresponding importance weights w(x) = P(test | x) / P(train | x). The second
# column of predict_proba is P(dataset_label = 1 | x), i.e. P(test | x), since
# the classes are sorted as [-1, 1].
prop_scores_pred = prop_score_model.predict_proba(biased_data.drop('class', axis=1))[:, 1]
prop_weights = prop_scores_pred / (1 - prop_scores_pred)
print(prop_weights)
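# In[ ]:

# Diagnostic (an addition, not part of the original questions): importance
# weighting can be unreliable when a handful of weights dominate, so it is worth
# summarizing the estimated weights before reusing them. The percentile choices
# here are arbitrary.
print(pd.Series(prop_weights).describe(percentiles=[0.01, 0.5, 0.99]))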
# ## (d)

# In[ ]:

# Fit the model using the propensity score weights and report the (unbalanced) training accuracy
cov_shift_corrected_model, cov_shift_corrected_train_acc = train_lr_lasso(
    biased_data.drop('class', axis=1), biased_data['class'], prop_weights
)
print(f'Covariate-shift corrected training accuracy: {cov_shift_corrected_train_acc}')

# In[ ]:

# Report the unbalanced test accuracy
# Note that the accuracy has improved relative to the unweighted Lambda = 0.10 model
cov_shift_corrected_test_acc = report_accuracy(cov_shift_corrected_model, X_test, Y_test)
print(f'Covariate-shift corrected testing accuracy: {cov_shift_corrected_test_acc}')
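# In[ ]:

# Follow-up comparison (an addition, not part of the original questions): retrain
# on the same Lambda = 0.10 biased data without propensity weights so that the
# unweighted and weighted test accuracies can be printed side by side.
uncorrected_model, _ = train_lr_lasso(biased_data.drop('class', axis=1), biased_data['class'])
uncorrected_test_acc = report_accuracy(uncorrected_model, X_test, Y_test)
print(f'Uncorrected testing accuracy: {uncorrected_test_acc}')
print(f'Covariate-shift corrected testing accuracy: {cov_shift_corrected_test_acc}')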