Fine-tuning DistilBERT with the CORN Loss for Ordinal Regression

# pip install transformers
# pip install datasets
# pip install lightning
%load_ext watermark
%watermark -p torch,transformers,datasets,lightning,coral_pytorch
torch        : 2.0.0+cu118
transformers : 4.26.1
datasets     : 2.9.0
lightning    : 2.0.0
coral_pytorch: 1.4.0

1 Loading the Dataset

import pandas as pd
import numpy as np

df = pd.read_csv(

6995 beautiful hotel, stay punta cana majestic colo... 5
6996 stay, n't stay, stayed week april, weather ama... 5
6997 stay hotel fantastic, great location, looked n... 5
6998 birthday meal havnt stayed hotel staying barce... 5
6999 great hotel great location stayed royal magda ... 5

Basic dataset analysis and sanity checks

print("Class distribution:")
Class distribution:

array([   0, 1400, 1400, 1400, 1400, 1400])
array([1400, 1400, 1400, 1400, 1400])

Performance baseline

data_labels = df["LABEL_COLUMN_NAME"]

# Constant-prediction baseline: always predict the median label, which is the
# constant that minimizes the mean absolute error.
avg_prediction = np.median(data_labels.values)
abs_errors = np.abs(data_labels.values - avg_prediction)
baseline_mae = abs_errors.mean()
print(f'Baseline MAE: {baseline_mae:.2f}')
Baseline MAE: 1.20

Split data into training, validation, and test sets

# Shuffle all rows reproducibly. reset_index() keeps the previous row labels
# as an extra "index" column, which is therefore written to the CSV files too.
df_shuffled = df.sample(frac=1, random_state=1).reset_index()

# 70% training / 10% validation / remaining rows for testing.
num_rows = len(df_shuffled)
train_idx = int(num_rows * 0.7)
val_idx = int(num_rows * 0.1)
val_end = train_idx + val_idx

df_train = df_shuffled.iloc[:train_idx]
df_val = df_shuffled.iloc[train_idx:val_end]
df_test = df_shuffled.iloc[val_end:]

# Persist each split so it can be reloaded from disk later.
for split_frame, csv_path in (
    (df_train, "train.csv"),
    (df_val, "validation.csv"),
    (df_test, "test.csv"),
):
    split_frame.to_csv(csv_path, index=False, encoding="utf-8")

2 Tokenization and Numericalization

Load the dataset via load_dataset

from datasets import load_dataset

my_dataset = load_dataset(
        "train": "train.csv",
        "validation": "validation.csv",
        "test": "test.csv",

Using custom data configuration default-c2106402015b5d25

    train: Dataset({
        features: ['index', 'TEXT_COLUMN_NAME', 'LABEL_COLUMN_NAME'],
        num_rows: 4900
    validation: Dataset({
        features: ['index', 'TEXT_COLUMN_NAME', 'LABEL_COLUMN_NAME'],
        num_rows: 700
    test: Dataset({
        features: ['index', 'TEXT_COLUMN_NAME', 'LABEL_COLUMN_NAME'],
        num_rows: 1400

Tokenize the dataset

from transformers import AutoTokenizer

# Load the pretrained tokenizer published under the "distilbert-base-uncased"
# checkpoint name and report its maximum input length and vocabulary size.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print("Tokenizer input max length:", tokenizer.model_max_length)
print("Tokenizer vocabulary size:", tokenizer.vocab_size)
Tokenizer input max length: 512
Tokenizer vocabulary size: 30522
def tokenize_text(batch):
    """Tokenize the texts of one batch with the module-level ``tokenizer``.

    Args:
        batch: Mapping with a "TEXT_COLUMN_NAME" entry holding the texts.

    Returns:
        The tokenizer's output for those texts, produced with truncation and
        padding enabled.
    """
    texts = batch["TEXT_COLUMN_NAME"]
    return tokenizer(texts, truncation=True, padding=True)
# Tokenize every split of the dataset. With batched=True and batch_size=None
# each split is passed to tokenize_text as a single batch, so padding=True
# pads all examples of a split to a common length.
# NOTE(review): the left-hand call `my_dataset.map(tokenize_text, ...)` was
# reconstructed from a garbled line -- confirm against the original notebook.
data_tokenized = my_dataset.map(tokenize_text, batched=True, batch_size=None)
# Expose only the columns the model needs, as PyTorch tensors.
data_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "LABEL_COLUMN_NAME"])
import os
# Presumably set to avoid the tokenizers library's parallelism warning once
# DataLoader worker processes are started -- TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

3 Set Up DataLoaders

from torch.utils.data import DataLoader, Dataset


class MyDataset(Dataset):
    """Map-style PyTorch dataset exposing one partition of a dataset dict.

    Args:
        dataset_dict: Mapping from partition name to an indexable, sized
            collection of examples.
        partition_key: Name of the partition to expose (default "train").
    """

    def __init__(self, dataset_dict, partition_key="train"):
        self.partition = dataset_dict[partition_key]

    def __getitem__(self, index):
        """Return the example stored at ``index`` in the partition."""
        return self.partition[index]

    def __len__(self):
        """Return the number of examples in the partition."""
        # len() instead of .num_rows so any sized container works as a partition.
        return len(self.partition)
# Wrap each tokenized split in a MyDataset (train, validation, test order).
train_dataset, val_dataset, test_dataset = (
    MyDataset(data_tokenized, partition_key=split_name)
    for split_name in ("train", "validation", "test")
)


train_loader = DataLoader(

val_loader = DataLoader(

test_loader = DataLoader(

4 Initializing DistilBERT

from transformers import AutoModelForSequenceClassification

# np.bincount has one entry per integer from 0 up to the largest label, so its
# length is used as the number of classes.
label_counts = np.bincount(df["LABEL_COLUMN_NAME"].values)
NUM_CLASSES = label_counts.shape[0]
print("Number of classes:", NUM_CLASSES)

# NOTE(review): CORN is normally used with NUM_CLASSES - 1 output logits, while
# this head has NUM_CLASSES outputs -- confirm this is intended before changing
# it, since the labels are also passed to the model's own forward().
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=NUM_CLASSES,
)
Number of classes: 5

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, out_features=3072, bias=True)
            (lin2): Linear(in_features=3072, out_features=768, bias=True)
            (activation): GELUActivation()
          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (pre_classifier): Linear(in_features=768, out_features=768, bias=True)
  (classifier): Linear(in_features=768, out_features=5, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)

5 Finetuning

Wrap in LightningModule for Training

import lightning as L
import torch
import torchmetrics

from coral_pytorch.losses import corn_loss
from coral_pytorch.dataset import corn_label_from_logits

class LightningModel(L.LightningModule):
    """LightningModule that fine-tunes a sequence classifier with the CORN loss.

    The wrapped model's logits are trained with ``corn_loss`` and converted to
    label predictions with ``corn_label_from_logits``; mean absolute error is
    tracked separately for the training, validation, and test stages.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output provides a ``"logits"`` entry.
        num_classes: Number of ordinal classes, forwarded to ``corn_loss``.
        learning_rate: Learning rate for the Adam optimizer.

    NOTE(review): the ``super().__init__()`` call and the continuation lines of
    the ``self(...)`` / ``corn_loss(...)`` calls were reconstructed from the
    ``forward`` signature and the stored ``num_classes`` -- confirm against the
    original notebook.
    """

    def __init__(self, model, num_classes, learning_rate=5e-5):
        # Module state must be initialized before submodules/metrics are assigned.
        super().__init__()

        self.learning_rate = learning_rate
        self.model = model

        self.num_classes = num_classes

        # One metric object per stage so their accumulated states never mix.
        self.train_mae = torchmetrics.MeanAbsoluteError()
        self.val_mae = torchmetrics.MeanAbsoluteError()
        self.test_mae = torchmetrics.MeanAbsoluteError()

    def forward(self, input_ids, attention_mask, labels):
        """Run the wrapped model and return its output unchanged."""
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        """Compute and log the CORN loss and the training MAE for one batch."""
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=batch["LABEL_COLUMN_NAME"])

        loss = corn_loss(outputs["logits"], batch["LABEL_COLUMN_NAME"],
                         num_classes=self.num_classes)

        self.log("train_loss", loss)

        predicted_labels = corn_label_from_logits(outputs["logits"])
        # Update the *training* metric: this is the object logged below.
        # (Updating test_mae here would leave train_mae empty and contaminate
        # the test metric with training batches.)
        self.train_mae(predicted_labels, batch["LABEL_COLUMN_NAME"])
        self.log("train_mae", self.train_mae, prog_bar=True)

        return loss  # this is passed to the optimizer for training

    def validation_step(self, batch, batch_idx):
        """Log the CORN loss and the validation MAE for one batch."""
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=batch["LABEL_COLUMN_NAME"])

        loss = corn_loss(outputs["logits"], batch["LABEL_COLUMN_NAME"],
                         num_classes=self.num_classes)
        self.log("val_loss", loss, prog_bar=True)

        predicted_labels = corn_label_from_logits(outputs["logits"])
        self.val_mae(predicted_labels, batch["LABEL_COLUMN_NAME"])
        self.log("val_mae", self.val_mae, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Log the test MAE for one batch."""
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=batch["LABEL_COLUMN_NAME"])

        predicted_labels = corn_label_from_logits(outputs["logits"])
        self.test_mae(predicted_labels, batch["LABEL_COLUMN_NAME"])
        self.log("test_mae", self.test_mae, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters of this module."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer
# Wrap the classifier for training; learning_rate keeps the class default (5e-5).
lightning_model = LightningModel(model, num_classes=NUM_CLASSES)
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger

callbacks = [
        save_top_k=1, mode="min", monitor="val_mae"
    )  # save top 1 model
# CSV logger configured with save directory "logs/" and run name "my-model".
logger = CSVLogger(save_dir="logs/", name="my-model")
trainer = L.Trainer(
  | Name      | Type                                | Params
0 | model     | DistilBertForSequenceClassification | 67.0 M
1 | train_mae | MeanAbsoluteError                   | 0     
2 | val_mae   | MeanAbsoluteError                   | 0     
3 | test_mae  | MeanAbsoluteError                   | 0     
67.0 M    Trainable params
0         Non-trainable params
67.0 M    Total params
267.829   Total estimated model params size (MB)
# Evaluate the "best" checkpoint on the training set. The value is reported as
# "test_mae" because test_step logs under that name for whichever loader is passed.
trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
┃        Test metric               DataLoader 0        ┃
│         test_mae              0.3761734664440155     │
[{'test_mae': 0.3761734664440155}]
# Evaluate the "best" checkpoint on the validation set (also reported as "test_mae").
trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
┃        Test metric               DataLoader 0        ┃
│         test_mae              0.38999998569488525    │
[{'test_mae': 0.38999998569488525}]
# Evaluate the "best" checkpoint on the held-out test set.
trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
┃        Test metric               DataLoader 0        ┃
│         test_mae              0.4214285612106323     │
[{'test_mae': 0.4214285612106323}]