#!/usr/bin/env python
# coding: utf-8

# # Setup

# In[ ]:

# Imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# In[ ]:

# You can download the data from Kaggle:
# https://www.kaggle.com/datasets/zalando-research/fashionmnist?resource=download
raw_train_data = pd.read_csv('fashion-mnist_train.csv')
raw_test_data = pd.read_csv('fashion-mnist_test.csv')

# In[ ]:

raw_train_data.head()

# In[ ]:

raw_test_data.head()

# # Q1

# ## (a)

# In[ ]:

# From the documentation, the label --> clothing mapping is:
# 0 --> t-shirt/top
# 1 --> trouser
# 2 --> pullover
# 3 --> dress
# 4 --> coat
# 5 --> sandal
# 6 --> shirt
# 7 --> sneaker
# 8 --> bag
# 9 --> ankle boot

# Map sneaker/pullover to class +1 and sandal/shirt to class -1, then drop all other labels
raw_train_data['class'] = raw_train_data['label'].apply(
    lambda x: 1 if x in (7, 2) else -1 if x in (5, 6) else 0
)
raw_test_data['class'] = raw_test_data['label'].apply(
    lambda x: 1 if x in (7, 2) else -1 if x in (5, 6) else 0
)
raw_train_data = raw_train_data[raw_train_data['class'].isin([-1, 1])]
raw_test_data = raw_test_data[raw_test_data['class'].isin([-1, 1])]

train_data = raw_train_data.drop('label', axis=1)
test_data = raw_test_data.drop('label', axis=1)

# In[ ]:

train_data.head()

# In[ ]:

test_data.head()

# In[ ]:

# Separate X, Y for train and test
Y_train, Y_test = train_data['class'], test_data['class']
X_train, X_test = train_data.drop('class', axis=1), test_data.drop('class', axis=1)

half_train_data = train_data.sample(frac=0.5, random_state=42)
X_half_train, Y_half_train = half_train_data.drop('class', axis=1), half_train_data['class']

# ## (b)

# In[ ]:

# Function for reporting (optionally balanced) accuracy
def report_accuracy(model, X, Y, is_balanced=False):
    Y_pred = model.predict(X)
    if not is_balanced:
        return accuracy_score(Y, Y_pred)
    else:
        return balanced_accuracy_score(Y, Y_pred)

# Function for training a logistic lasso regression with a default regularization hyperparameter
def train_lr_lasso(X_train, Y_train, weights=None):
    lr_lasso = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, max_iter=200)
    if weights is None:
        lr_lasso.fit(X_train, Y_train)
    else:
        lr_lasso.fit(X_train, Y_train, sample_weight=weights)
    train_accuracy = report_accuracy(lr_lasso, X_train, Y_train)
    return lr_lasso, train_accuracy

# In[ ]:

# Train on half of the training data
half_train_model, half_train_acc = train_lr_lasso(X_half_train, Y_half_train)
print(f'Training accuracy on half of training data: {half_train_acc}')

# In[ ]:

half_test_acc = report_accuracy(half_train_model, X_test, Y_test)
print(f'Test accuracy of half-data model: {half_test_acc}')

# In[ ]:

# Train on all of the training data
train_model, train_acc = train_lr_lasso(X_train, Y_train)
print(f'Training accuracy on all of training data: {train_acc}')

# In[ ]:

test_acc = report_accuracy(train_model, X_test, Y_test)
print(f'Test accuracy of full-data model: {test_acc}')

# # Q2

# ## (a)

# In[ ]:

# Function for building a biased dataset of size n: within each class, a fraction
# bias_fraction of the observations comes from one label (sneaker for +1, sandal
# for -1) and the remaining fraction from the other (pullover for +1, shirt for -1)
def get_biased_data(data, n, bias_fraction):
    sneaker_sample = data[data['label'] == 7].sample(n=int(n / 2 * bias_fraction), random_state=42)
    pullover_sample = data[data['label'] == 2].sample(n=int(n / 2 * (1 - bias_fraction)), random_state=42)
    sandal_sample = data[data['label'] == 5].sample(n=int(n / 2 * bias_fraction), random_state=42)
    shirt_sample = data[data['label'] == 6].sample(n=int(n / 2 * (1 - bias_fraction)), random_state=42)
    return pd.concat([sneaker_sample, pullover_sample, sandal_sample, shirt_sample], ignore_index=True)
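# In[ ]:

# Sanity check (an addition, not part of the original assignment): verify that
# get_biased_data produces the intended label composition. With n=12000 and
# bias_fraction=0.25 we expect 1500 sneakers, 4500 pullovers, 1500 sandals, and
# 4500 shirts, with the two classes staying balanced at 6000 each.
_check = get_biased_data(raw_train_data, 12000, 0.25)
print(_check['label'].value_counts())  # expect 4500 each of labels 2 and 6; 1500 each of 7 and 5
print(_check['class'].value_counts())  # expect 6000 each of +1 and -1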
# In[ ]:

# Get biased datasets for a grid of bias fractions Lambda
# (format the keys to two decimals so floating-point noise from np.arange
# does not leak into the dictionary keys)
biased_datasets = {}
for Lambda in np.arange(0.05, 0.95, 0.05):
    biased_datasets[f'{Lambda:.2f}'] = get_biased_data(raw_train_data, 12000, Lambda).drop('label', axis=1)

# ## (b)

# In[ ]:

# Train on each biased dataset and evaluate training and test performance
for Lambda, biased_data in biased_datasets.items():
    biased_X_train, biased_Y_train = biased_data.drop('class', axis=1), biased_data['class']
    biased_model, biased_train_acc = train_lr_lasso(biased_X_train, biased_Y_train)
    biased_test_acc = report_accuracy(biased_model, X_test, Y_test)
    print(f'-----\nLambda = {Lambda}\n-----\n')
    print(f'Training Accuracy: {biased_train_acc}')
    print(f'Testing Accuracy: {biased_test_acc}\n\n')

# # Q3

# ## (a/b)

# In[ ]:

# Select the biased dataset with Lambda = 0.10 for covariate shift correction
# Testing accuracy with no weights was 0.85
biased_data = biased_datasets['0.10']

# Relabel the biased (train) data and the test data with a distribution indicator,
# then combine them to form the dataset for the propensity score model
biased_prop_score_data = biased_data.copy()
biased_prop_score_data['dataset_label'] = -1
test_prop_score_data = test_data.copy()  # copy so we do not mutate test_data
test_prop_score_data['dataset_label'] = 1
prop_score_data = pd.concat([biased_prop_score_data, test_prop_score_data]).drop('class', axis=1)

# In[ ]:

# The train and test datasets have unequal sample sizes, so we reweight in the
# propensity score model: weight 1/n_train for train observations and 1/n_test
# for test observations. This gives each dataset equal total weight, up-weighting
# whichever dataset has fewer observations and down-weighting the other.
n_train, n_test = biased_prop_score_data.shape[0], test_prop_score_data.shape[0]
sample_size_wts = prop_score_data['dataset_label'].apply(
    lambda x: 1 / n_train if x == -1 else 1 / n_test
)

# Fit the (incorrect!!) propensity score model without weighting by sample size
wrong_prop_score_model = LogisticRegression(solver='liblinear', max_iter=200)
wrong_prop_score_model.fit(prop_score_data.drop('dataset_label', axis=1), prop_score_data['dataset_label'])
wrong_prop_score_bal_acc = report_accuracy(
    wrong_prop_score_model,
    prop_score_data.drop('dataset_label', axis=1),
    prop_score_data['dataset_label'],
    is_balanced=True,
)

# Fit the propensity score model with weighting by sample size
prop_score_model = LogisticRegression(solver='liblinear', max_iter=200)
prop_score_model.fit(
    prop_score_data.drop('dataset_label', axis=1),
    prop_score_data['dataset_label'],
    sample_weight=sample_size_wts,
)
prop_score_bal_acc = report_accuracy(
    prop_score_model,
    prop_score_data.drop('dataset_label', axis=1),
    prop_score_data['dataset_label'],
    is_balanced=True,
)

print(f'Balanced accuracy of predicting the dataset indicator w/o sample size weighting: {wrong_prop_score_bal_acc}\n'
      f'Balanced accuracy of predicting the dataset indicator w/ sample size weighting: {prop_score_bal_acc}')

# ## (c)

# In[ ]:

# Predict on the biased training data to get estimated propensity scores and the
# corresponding importance weights w(x) = P(test | x) / P(train | x). The second
# column of predict_proba is P(dataset_label = 1 | x), i.e. P(test | x), since
# the classes are sorted as [-1, 1].
prop_scores_pred = prop_score_model.predict_proba(biased_data.drop('class', axis=1))[:, 1]
prop_weights = prop_scores_pred / (1 - prop_scores_pred)
print(prop_weights)
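# In[ ]:

# Diagnostic (an addition, not part of the original questions): importance
# weighting can be unreliable when a handful of weights dominate, so it is worth
# summarizing the estimated weights before reusing them. The percentile choices
# here are arbitrary.
print(pd.Series(prop_weights).describe(percentiles=[0.01, 0.5, 0.99]))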
# ## (d)

# In[ ]:

# Fit the model using the propensity score weights and report the (unbalanced) training accuracy
cov_shift_corrected_model, cov_shift_corrected_train_acc = train_lr_lasso(
    biased_data.drop('class', axis=1), biased_data['class'], prop_weights
)
print(f'Covariate-shift corrected training accuracy: {cov_shift_corrected_train_acc}')

# In[ ]:

# Report the unbalanced test accuracy
# Note that the accuracy has improved relative to the unweighted Lambda = 0.10 model
cov_shift_corrected_test_acc = report_accuracy(cov_shift_corrected_model, X_test, Y_test)
print(f'Covariate-shift corrected testing accuracy: {cov_shift_corrected_test_acc}')
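# In[ ]:

# Follow-up comparison (an addition, not part of the original questions): retrain
# on the same Lambda = 0.10 biased data without propensity weights so that the
# unweighted and weighted test accuracies can be printed side by side.
uncorrected_model, _ = train_lr_lasso(biased_data.drop('class', axis=1), biased_data['class'])
uncorrected_test_acc = report_accuracy(uncorrected_model, X_test, Y_test)
print(f'Uncorrected testing accuracy: {uncorrected_test_acc}')
print(f'Covariate-shift corrected testing accuracy: {cov_shift_corrected_test_acc}')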