CORN MLP for predicting cement strength (cement_strength)
This tutorial explains how to train a deep neural network (here: multilayer perceptron) with the CORN loss function for ordinal regression.
0 -- Obtaining and preparing the cement_strength dataset
We will be using the cement_strength dataset from https://github.com/gagolews/ordinal_regression_data/blob/master/cement_strength.csv.
First, we are going to download and prepare the dataset and save it as a CSV file locally. This is a general procedure that is not specific to CORN.
This dataset has 5 ordinal labels (1, 2, 3, 4, and 5). Note that CORN requires the labels to start at 0, which is why we subtract 1 from the label column.
import pandas as pd
import numpy as np
data_df = pd.read_csv("https://raw.githubusercontent.com/gagolews/ordinal_regression_data/master/cement_strength.csv")
data_df["response"] = data_df["response"]-1 # labels should start at 0
data_labels = data_df["response"]
data_features = data_df.loc[:, ["V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8"]]
print('Number of features:', data_features.shape[1])
print('Number of examples:', data_features.shape[0])
print('Labels:', np.unique(data_labels.values))
Number of features: 8
Number of examples: 998
Labels: [0 1 2 3 4]
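As an optional check before splitting, we can look at how many examples fall into each class; this also shows why a stratified split (used below) is sensible.
# Optional: inspect the class label distribution before splitting
print(data_df["response"].value_counts().sort_index())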
Split into training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
data_features.values,
data_labels.values,
test_size=0.2,
random_state=1,
stratify=data_labels.values)
Standardize features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
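As a quick optional sanity check, the standardized training features should now have (approximately) zero mean and unit variance:
# Optional sanity check: per-feature mean ~0 and standard deviation ~1 on the training set
print(X_train_std.mean(axis=0).round(2))
print(X_train_std.std(axis=0).round(2))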
1 -- Setting up the dataset and dataloader
In this section, we set up the dataset and data loaders. This is a general procedure that is not specific to CORN.
import torch
##########################
### SETTINGS
##########################
# Hyperparameters
random_seed = 1
learning_rate = 0.001
num_epochs = 20
batch_size = 128
# Architecture
NUM_CLASSES = 5
# Other
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Training on', DEVICE)
Training on cuda:0
from torch.utils.data import Dataset
class MyDataset(Dataset):

    def __init__(self, feature_array, label_array, dtype=np.float32):
        self.features = feature_array.astype(dtype)
        self.labels = label_array

    def __getitem__(self, index):
        inputs = self.features[index]
        label = self.labels[index]
        return inputs, label

    def __len__(self):
        return self.labels.shape[0]
import torch
from torch.utils.data import DataLoader
train_dataset = MyDataset(X_train_std, y_train)
test_dataset = MyDataset(X_test_std, y_test)
train_loader = DataLoader(dataset=train_dataset,
batch_size=batch_size,
shuffle=True, # want to shuffle the dataset
num_workers=0) # number processes/CPUs to use
test_loader = DataLoader(dataset=test_dataset,
batch_size=batch_size,
shuffle=False,
num_workers=0)
# Checking the dataset
for inputs, labels in train_loader:
    print('Input batch dimensions:', inputs.shape)
    print('Input label dimensions:', labels.shape)
    break
Input batch dimensions: torch.Size([128, 8])
Input label dimensions: torch.Size([128])
2 -- Equipping MLP with a CORN layer
In this section, we implement a simple MLP for ordinal regression with CORN. Note that the only CORN-specific modification required is setting the number of output units of the last layer (a fully connected layer) to the number of classes minus 1; these outputs correspond to the binary tasks used in the extended binary classification framework described in the paper.
class MLP(torch.nn.Module):

    def __init__(self, in_features, num_classes, num_hidden_1=300, num_hidden_2=300):
        super().__init__()

        self.my_network = torch.nn.Sequential(
            # 1st hidden layer
            torch.nn.Linear(in_features, num_hidden_1, bias=False),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.BatchNorm1d(num_hidden_1),

            # 2nd hidden layer
            torch.nn.Linear(num_hidden_1, num_hidden_2, bias=False),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.BatchNorm1d(num_hidden_2),

            ### Specify CORN layer
            torch.nn.Linear(num_hidden_2, (num_classes-1))
            ###--------------------------------------------------------------------###
        )

    def forward(self, x):
        logits = self.my_network(x)
        return logits
torch.manual_seed(random_seed)
model = MLP(in_features=8, num_classes=NUM_CLASSES)
model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
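As an optional sanity check, the CORN output layer should produce num_classes - 1 = 4 logits per example:
# Optional sanity check: the last layer outputs NUM_CLASSES - 1 logits per example
model.eval()
with torch.no_grad():
    dummy_input = torch.randn(2, 8).to(DEVICE)
    print(model(dummy_input).shape)  # expected: torch.Size([2, 4])
model.train()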
3 -- Using the CORN loss for model training
During training, all you need to do is use the corn_loss function provided via coral_pytorch. The loss function takes care of the conditional training-set processing and of modeling the conditional probabilities used in the chain rule (aka general product rule).
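As a minimal illustration of the expected shapes (using random tensors, independent of the actual training loop below), corn_loss takes logits of shape (batch_size, num_classes - 1), integer class labels in the range 0 to num_classes - 1, and the number of classes:
# Minimal shape illustration with random tensors (for demonstration only)
from coral_pytorch.losses import corn_loss
dummy_logits = torch.randn(8, NUM_CLASSES - 1)       # shape: (batch_size, num_classes - 1)
dummy_labels = torch.randint(0, NUM_CLASSES, (8,))   # labels in {0, ..., 4}
print(corn_loss(dummy_logits, dummy_labels, NUM_CLASSES))  # scalar loss value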
from coral_pytorch.losses import corn_loss
for epoch in range(num_epochs):

    model = model.train()
    for batch_idx, (features, class_labels) in enumerate(train_loader):

        class_labels = class_labels.to(DEVICE)
        features = features.to(DEVICE)
        logits = model(features)

        #### CORN loss
        loss = corn_loss(logits, class_labels, NUM_CLASSES)
        ###--------------------------------------------------------------------###

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        ### LOGGING
        if not batch_idx % 200:
            print('Epoch: %03d/%03d | Batch %03d/%03d | Cost: %.4f'
                  % (epoch+1, num_epochs, batch_idx,
                     len(train_loader), loss))
Epoch: 001/020 | Batch 000/007 | Cost: 0.7095
Epoch: 002/020 | Batch 000/007 | Cost: 0.5793
Epoch: 003/020 | Batch 000/007 | Cost: 0.5107
Epoch: 004/020 | Batch 000/007 | Cost: 0.4893
Epoch: 005/020 | Batch 000/007 | Cost: 0.4294
Epoch: 006/020 | Batch 000/007 | Cost: 0.3942
Epoch: 007/020 | Batch 000/007 | Cost: 0.3905
Epoch: 008/020 | Batch 000/007 | Cost: 0.3877
Epoch: 009/020 | Batch 000/007 | Cost: 0.3327
Epoch: 010/020 | Batch 000/007 | Cost: 0.3442
Epoch: 011/020 | Batch 000/007 | Cost: 0.3513
Epoch: 012/020 | Batch 000/007 | Cost: 0.3395
Epoch: 013/020 | Batch 000/007 | Cost: 0.3272
Epoch: 014/020 | Batch 000/007 | Cost: 0.3372
Epoch: 015/020 | Batch 000/007 | Cost: 0.2994
Epoch: 016/020 | Batch 000/007 | Cost: 0.3409
Epoch: 017/020 | Batch 000/007 | Cost: 0.3158
Epoch: 018/020 | Batch 000/007 | Cost: 0.2988
Epoch: 019/020 | Batch 000/007 | Cost: 0.2793
Epoch: 020/020 | Batch 000/007 | Cost: 0.2516
4 -- Evaluate model
Finally, after model training, we can evaluate the performance of the model, for example, via the mean absolute error (MAE) and mean squared error (MSE) measures.
For this, we are going to use the corn_label_from_logits utility function from coral_pytorch to convert the logits into the predicted rank labels.
from coral_pytorch.dataset import corn_label_from_logits
def compute_mae_and_mse(model, data_loader, device):

    with torch.no_grad():

        mae, mse, num_examples = 0., 0., 0

        for i, (features, targets) in enumerate(data_loader):

            features = features.to(device)
            targets = targets.float().to(device)

            logits = model(features)
            predicted_labels = corn_label_from_logits(logits).float()

            num_examples += targets.size(0)
            mae += torch.sum(torch.abs(predicted_labels - targets))
            mse += torch.sum((predicted_labels - targets)**2)

        mae = mae / num_examples
        mse = mse / num_examples
        return mae, mse
train_mae, train_mse = compute_mae_and_mse(model, train_loader, DEVICE)
test_mae, test_mse = compute_mae_and_mse(model, test_loader, DEVICE)
print(f'Mean absolute error (train/test): {train_mae:.2f} | {test_mae:.2f}')
print(f'Mean squared error (train/test): {train_mse:.2f} | {test_mse:.2f}')
Mean absolute error (train/test): 0.29 | 0.36
Mean squared error (train/test): 0.34 | 0.39
Note that cement_strength is an ordinal dataset (its class labels are ordered), so the MAE and MSE are meaningful performance measures here.
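In addition to MAE and MSE, one could also report the accuracy of the predicted rank labels; a minimal sketch using the same corn_label_from_logits helper:
# Optional: accuracy of the predicted rank labels
from coral_pytorch.dataset import corn_label_from_logits

def compute_accuracy(model, data_loader, device):
    correct, num_examples = 0., 0
    with torch.no_grad():
        for features, targets in data_loader:
            features = features.to(device)
            targets = targets.to(device)
            logits = model(features)
            predicted_labels = corn_label_from_logits(logits)
            correct += (predicted_labels == targets).sum()
            num_examples += targets.size(0)
    return correct / num_examples

print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}')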
5 -- Rank probabilities from logits
To obtain the rank probabilities from the logits, you can use the sigmoid function to get the conditional probabilities for each binary task and then compute the rank probabilities via the chain rule of probability. Note that this is also done internally by the corn_label_from_logits function we used above.
logits = model(features)
with torch.no_grad():
    probas = torch.sigmoid(logits)
    probas = torch.cumprod(probas, dim=1)
    print(probas)
tensor([[8.4400e-01, 1.1552e-01, 2.4885e-02, 2.1235e-02],
[9.6955e-01, 9.6440e-01, 7.9017e-01, 4.0131e-01],
[9.6926e-01, 9.6164e-01, 2.8837e-01, 1.1151e-01],
[2.7557e-01, 1.7854e-03, 1.3533e-04, 6.4534e-05],
[4.4200e-04, 2.9050e-05, 1.8071e-05, 8.5216e-06],
[4.1626e-02, 6.8911e-06, 1.1300e-06, 1.1232e-06],
[9.5031e-01, 3.2661e-01, 7.6083e-03, 3.6258e-03],
[9.8467e-01, 9.0953e-01, 4.3580e-01, 3.9399e-01],
[8.0870e-01, 1.9610e-01, 1.9341e-02, 1.6238e-03],
[9.6289e-01, 7.2809e-01, 2.1034e-01, 1.4426e-01],
[9.8087e-01, 3.4986e-01, 7.5893e-03, 2.1336e-04],
[8.3218e-02, 2.9795e-04, 8.8117e-05, 7.7257e-05],
[6.4886e-01, 3.3336e-01, 1.7751e-01, 1.1291e-01],
[8.0380e-01, 5.5894e-03, 3.1419e-04, 2.4602e-04],
[9.3716e-01, 9.3670e-01, 9.3338e-01, 8.3394e-01],
[9.0723e-01, 9.0255e-01, 8.7473e-01, 4.9182e-01],
[9.8959e-01, 3.3517e-01, 5.4329e-02, 1.7331e-03],
[9.6824e-01, 8.0327e-01, 2.5958e-01, 8.4942e-03],
[9.6470e-01, 9.1665e-01, 6.9238e-01, 3.8931e-01],
[9.6623e-01, 9.6491e-01, 9.4429e-01, 4.3117e-01],
[8.0910e-02, 1.5353e-04, 2.7122e-05, 2.1541e-05],
[9.9247e-01, 8.6671e-01, 6.3087e-01, 6.6279e-02],
[8.8915e-01, 2.5603e-02, 1.8793e-03, 1.5186e-03],
[6.2060e-01, 1.8354e-01, 4.0813e-02, 2.1553e-02],
[9.5856e-01, 9.5805e-01, 9.2657e-01, 1.6030e-01],
[9.9292e-01, 6.5836e-01, 1.8671e-01, 6.0837e-02],
[1.0555e-01, 4.6840e-03, 1.1164e-03, 1.7749e-04],
[9.6029e-01, 4.0485e-01, 3.0195e-02, 2.0155e-03],
[9.8264e-01, 9.1183e-01, 4.3322e-01, 2.3925e-03],
[8.9595e-01, 3.6590e-01, 3.0114e-02, 1.9936e-03]], device='cuda:0')
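The predicted rank label for each example is then simply the number of cumulative probabilities above 0.5, which is what corn_label_from_logits computes internally; a short check using the probas tensor from above:
# The predicted label equals the number of cumulative probabilities > 0.5
predicted_labels = torch.sum(probas > 0.5, dim=1)
print(predicted_labels)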