This is a regular cross entropy classifier as a baseline for comparison with ordinal regression methods.
We will be using the cement_strength dataset from https://github.com/gagolews/ordinal_regression_data/blob/master/cement_strength.csv.
First, we are going to download and prepare the and save it as CSV files locally. This is a general procedure that is not specific to CORN.
This dataset has 5 ordinal labels (1, 2, 3, 4, and 5). Note that we require labels to be starting at 0, which is why we subtract "1" from the label column.
import pandas as pd
import numpy as np
data_df = pd.read_csv("https://raw.githubusercontent.com/gagolews/ordinal_regression_data/master/cement_strength.csv")
data_df["response"] = data_df["response"]-1 # labels should start at 0
data_labels = data_df["response"]
data_features = data_df.loc[:, ["V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8"]]
print('Number of features:', data_features.shape[1])
print('Number of examples:', data_features.shape[0])
print('Labels:', np.unique(data_labels.values))
Number of features: 8 Number of examples: 998 Labels: [0 1 2 3 4]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
data_features.values,
data_labels.values,
test_size=0.2,
random_state=1,
stratify=data_labels.values)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
In this section, we set up the data set and data loaders. This is a general procedure that is not specific to the method.
import torch
##########################
### SETTINGS
##########################
# Hyperparameters
random_seed = 1
learning_rate = 0.001
num_epochs = 50
batch_size = 128
# Architecture
NUM_CLASSES = 5
# Other
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Training on', DEVICE)
Training on cuda:0
from torch.utils.data import Dataset
class MyDataset(Dataset):
def __init__(self, feature_array, label_array, dtype=np.float32):
self.features = feature_array.astype(np.float32)
self.labels = label_array
def __getitem__(self, index):
inputs = self.features[index]
label = self.labels[index]
return inputs, label
def __len__(self):
return self.labels.shape[0]
import torch
from torch.utils.data import DataLoader
# Note transforms.ToTensor() scales input images
# to 0-1 range
train_dataset = MyDataset(X_train_std, y_train)
test_dataset = MyDataset(X_test_std, y_test)
train_loader = DataLoader(dataset=train_dataset,
batch_size=batch_size,
shuffle=True, # want to shuffle the dataset
num_workers=0) # number processes/CPUs to use
test_loader = DataLoader(dataset=test_dataset,
batch_size=batch_size,
shuffle=False,
num_workers=0)
# Checking the dataset
for inputs, labels in train_loader:
print('Input batch dimensions:', inputs.shape)
print('Input label dimensions:', labels.shape)
break
Input batch dimensions: torch.Size([128, 8]) Input label dimensions: torch.Size([128])
In this section, we are implementing a simple MLP for ordinal regression. To implement the Beckham et al. method, we add the parameter layer a
as self.a
, which is used to compute the predictions for the loss function later in the training loop:
class MLP(torch.nn.Module):
def __init__(self, in_features, num_classes, num_hidden_1=300, num_hidden_2=300):
super().__init__()
self.num_classes = num_classes
self.my_network = torch.nn.Sequential(
# 1st hidden layer
torch.nn.Linear(in_features, num_hidden_1, bias=False),
torch.nn.LeakyReLU(),
torch.nn.Dropout(0.2),
torch.nn.BatchNorm1d(num_hidden_1),
# 2nd hidden layer
torch.nn.Linear(num_hidden_1, num_hidden_2, bias=False),
torch.nn.LeakyReLU(),
torch.nn.Dropout(0.2),
torch.nn.BatchNorm1d(num_hidden_2),
# Output layer
torch.nn.Linear(num_hidden_2, num_classes)
)
def forward(self, x):
logits = self.my_network(x)
return logits
torch.manual_seed(random_seed)
model = MLP(in_features=8, num_classes=NUM_CLASSES)
model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
model = model.train()
for batch_idx, (features, class_labels) in enumerate(train_loader):
class_labels = class_labels.to(DEVICE)
features = features.to(DEVICE)
logits = model(features)
logits = model(features)
loss = torch.nn.functional.cross_entropy(logits, class_labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
### LOGGING
if not batch_idx % 200:
print ('Epoch: %03d/%03d | Batch %03d/%03d | Cost: %.4f'
%(epoch+1, num_epochs, batch_idx,
len(train_loader), loss))
Epoch: 001/050 | Batch 000/007 | Cost: 1.8506 Epoch: 002/050 | Batch 000/007 | Cost: 1.2779 Epoch: 003/050 | Batch 000/007 | Cost: 1.0849 Epoch: 004/050 | Batch 000/007 | Cost: 1.0136 Epoch: 005/050 | Batch 000/007 | Cost: 1.0655 Epoch: 006/050 | Batch 000/007 | Cost: 0.9198 Epoch: 007/050 | Batch 000/007 | Cost: 0.9269 Epoch: 008/050 | Batch 000/007 | Cost: 0.8566 Epoch: 009/050 | Batch 000/007 | Cost: 0.9192 Epoch: 010/050 | Batch 000/007 | Cost: 0.8459 Epoch: 011/050 | Batch 000/007 | Cost: 0.8595 Epoch: 012/050 | Batch 000/007 | Cost: 0.8126 Epoch: 013/050 | Batch 000/007 | Cost: 0.7344 Epoch: 014/050 | Batch 000/007 | Cost: 0.7982 Epoch: 015/050 | Batch 000/007 | Cost: 0.7587 Epoch: 016/050 | Batch 000/007 | Cost: 0.7278 Epoch: 017/050 | Batch 000/007 | Cost: 0.5626 Epoch: 018/050 | Batch 000/007 | Cost: 0.6570 Epoch: 019/050 | Batch 000/007 | Cost: 0.6695 Epoch: 020/050 | Batch 000/007 | Cost: 0.8091 Epoch: 021/050 | Batch 000/007 | Cost: 0.6433 Epoch: 022/050 | Batch 000/007 | Cost: 0.5846 Epoch: 023/050 | Batch 000/007 | Cost: 0.6255 Epoch: 024/050 | Batch 000/007 | Cost: 0.6438 Epoch: 025/050 | Batch 000/007 | Cost: 0.6645 Epoch: 026/050 | Batch 000/007 | Cost: 0.6947 Epoch: 027/050 | Batch 000/007 | Cost: 0.5889 Epoch: 028/050 | Batch 000/007 | Cost: 0.6015 Epoch: 029/050 | Batch 000/007 | Cost: 0.6087 Epoch: 030/050 | Batch 000/007 | Cost: 0.5184 Epoch: 031/050 | Batch 000/007 | Cost: 0.5749 Epoch: 032/050 | Batch 000/007 | Cost: 0.5191 Epoch: 033/050 | Batch 000/007 | Cost: 0.5260 Epoch: 034/050 | Batch 000/007 | Cost: 0.6051 Epoch: 035/050 | Batch 000/007 | Cost: 0.5267 Epoch: 036/050 | Batch 000/007 | Cost: 0.5485 Epoch: 037/050 | Batch 000/007 | Cost: 0.4345 Epoch: 038/050 | Batch 000/007 | Cost: 0.5198 Epoch: 039/050 | Batch 000/007 | Cost: 0.4047 Epoch: 040/050 | Batch 000/007 | Cost: 0.5052 Epoch: 041/050 | Batch 000/007 | Cost: 0.5436 Epoch: 042/050 | Batch 000/007 | Cost: 0.4116 Epoch: 043/050 | Batch 000/007 | Cost: 0.4640 Epoch: 044/050 | Batch 000/007 | Cost: 0.5765 Epoch: 045/050 | Batch 000/007 | Cost: 0.5034 Epoch: 046/050 | Batch 000/007 | Cost: 0.5579 Epoch: 047/050 | Batch 000/007 | Cost: 0.4336 Epoch: 048/050 | Batch 000/007 | Cost: 0.5188 Epoch: 049/050 | Batch 000/007 | Cost: 0.5183 Epoch: 050/050 | Batch 000/007 | Cost: 0.5013
Finally, after model training, we can evaluate the performance of the model. For example, via the mean absolute error and mean squared error measures.
def beckham_logits_to_labels(logits, model, num_classes):
predictions = beckham_logits_to_predictions(logits, model, num_classes)
return torch.round(predictions).float()
def compute_mae_and_mse(model, data_loader, device):
with torch.no_grad():
mae, mse, acc, num_examples = 0., 0., 0., 0
for i, (features, targets) in enumerate(data_loader):
features = features.to(device)
targets = targets.float().to(device)
logits = model(features)
predicted_labels = torch.argmax(logits, dim=1)
num_examples += targets.size(0)
mae += torch.sum(torch.abs(predicted_labels - targets))
mse += torch.sum((predicted_labels - targets)**2)
mae = mae / num_examples
mse = mse / num_examples
return mae, mse
train_mae, train_mse = compute_mae_and_mse(model, train_loader, DEVICE)
test_mae, test_mse = compute_mae_and_mse(model, test_loader, DEVICE)
print(f'Mean absolute error (train/test): {train_mae:.2f} | {test_mae:.2f}')
print(f'Mean squared error (train/test): {train_mse:.2f} | {test_mse:.2f}')
Mean absolute error (train/test): 0.22 | 0.37 Mean squared error (train/test): 0.27 | 0.41