2. Loss Functions

import numpy as np
import torch
import torch.nn as nn

seed = 42

torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)

%matplotlib inline

2.1. Perceptron

Each node in a neural network is called a perceptron unit, which has three “knobs”: a set of weights (\(w\)), a bias (\(b\)), and an activation function (\(f\)). The weights and bias are learned from the data, while the activation function is hand-picked by the network designer based on their intuition about the network and its target outputs. Mathematically,

\(y = f(wx + b)\)

class Perceptron(nn.Module):
    """
    A perceptron is one linear layer 
    """
    
    def __init__(self, input_dim):
        """
        Args:
            input_dim (int): size of the input features
        """
        super(Perceptron, self).__init__()
        self.fc1 = nn.Linear(input_dim, 1)

    def forward(self, x_in):
        """The forward pass of the perceptron

        Args:
            x_in (torch.Tensor): an input data tensor
                x_in.shape should be (batch, num_features)
        Returns:
            the resulting tensor. tensor.shape should be (batch,).
        """
        return torch.sigmoid(self.fc1(x_in)).squeeze()
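
A quick usage sketch of the class above (the input dimension and batch size below are arbitrary, made-up values for illustration):

# hypothetical example: a batch of 8 samples with 4 features each
perceptron = Perceptron(input_dim=4)
x = torch.randn(8, 4)
y = perceptron(x)
print(y.shape)  # torch.Size([8]) -- the trailing size-1 dimension was squeezed away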

What does .squeeze() do? It removes all dimensions of size 1 from an array's (or tensor's) shape, so the (batch, 1) output of the linear layer above becomes a (batch,) tensor. The NumPy version works the same way:

# Python program explaining
# numpy.squeeze function
  
import numpy as np
  
in_arr = np.array([[[2, 2, 2], [2, 2, 2]]])
   
print ("Input array : ", in_arr) 
print("Shape of input array : ", in_arr.shape)  
  
out_arr = np.squeeze(in_arr) 
  
print ("output squeezed array : ", out_arr)
print("Shape of output array : ", out_arr.shape) 
Input array :  [[[2 2 2]
  [2 2 2]]]
Shape of input array :  (1, 2, 3)
output squeezed array :  [[2 2 2]
 [2 2 2]]
Shape of output array :  (2, 3)
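
The torch.Tensor.squeeze() method behaves identically, which is what the perceptron's forward pass relies on. A minimal sketch:

t = torch.zeros(4, 1)
print(t.shape)            # torch.Size([4, 1])
print(t.squeeze().shape)  # torch.Size([4])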

2.2. Softmax

softmax = nn.Softmax(dim=1)
x_input = torch.randn(1, 3)
y_output = softmax(x_input)
print(x_input)
print(y_output)
print(torch.sum(y_output, dim=1))
tensor([[0.3367, 0.1288, 0.2345]])
tensor([[0.3683, 0.2992, 0.3325]])
tensor([1.])
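
As a small variation on the example above, softmax with dim=1 normalizes each row of a batch independently, so every row of the output sums to 1:

x_batch = torch.randn(2, 3)
y_batch = nn.Softmax(dim=1)(x_batch)
print(y_batch)
print(torch.sum(y_batch, dim=1))  # each row sums to 1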

2.3. MSELoss

import torch
import torch.nn as nn

mse_loss = nn.MSELoss()
torch.manual_seed(42)
yhat = torch.randn(3, 5, requires_grad=True)
y = torch.randn(3, 5)
loss = mse_loss(yhat, y)
loss.backward()
print(loss)
tensor(1.0192, grad_fn=<MseLossBackward>)
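
As a sanity check (not part of the original example), MSELoss with its default mean reduction is just the mean of the squared differences:

# manual mean squared error; should match the loss printed above
print(torch.mean((yhat - y) ** 2))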

2.4. MAE Loss - L1Loss

import torch
import torch.nn as nn

mae_loss = nn.L1Loss()
torch.manual_seed(42)
yhat = torch.randn(3, 5, requires_grad=True)
y = torch.randn(3, 5)
loss = mae_loss(yhat, y)
print(yhat)
print(y)
loss.backward()
print(loss)
tensor([[ 0.3367,  0.1288,  0.2345,  0.2303, -1.1229],
        [-0.1863,  2.2082, -0.6380,  0.4617,  0.2674],
        [ 0.5349,  0.8094,  1.1103, -1.6898, -0.9890]], requires_grad=True)
tensor([[ 0.9580,  1.3221,  0.8172, -0.7658, -0.7506],
        [ 1.3525,  0.6863, -0.3278,  0.7950,  0.2815],
        [ 0.0562,  0.5227, -0.2384, -0.0499,  0.5263]])
tensor(0.8502, grad_fn=<L1LossBackward>)
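
Likewise, a quick manual check: L1Loss with the default mean reduction is the mean of the absolute differences:

# manual mean absolute error; should match the loss printed above
print(torch.mean(torch.abs(yhat - y)))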

2.5. RMSE Loss

class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self, yhat, y):
        eps = 1e-7
        return torch.sqrt(self.mse(yhat, y) + eps)

criterion = RMSELoss()

torch.manual_seed(42)
yhat = torch.randn(3, 5, requires_grad=True)
y = torch.randn(3, 5)
loss = criterion(yhat,y)
print(loss)
tensor(1.0096, grad_fn=<SqrtBackward>)
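
Since yhat and y here are generated with the same seed as in the MSE example, this loss is just the square root of the MSE computed there (√1.0192 ≈ 1.0096, up to the small eps term). A manual check, ignoring the eps stabilizer:

# manual root mean squared error
print(torch.sqrt(torch.mean((yhat - y) ** 2)))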

2.6. CrossEntropyLoss

import torch
import torch.nn as nn

ce_loss = nn.CrossEntropyLoss()

torch.manual_seed(42)
outputs = torch.randn(3, 5, requires_grad=True)
targets = torch.tensor([1, 0, 3], dtype=torch.int64)
loss = ce_loss(outputs, targets)
loss.backward()
print (loss)
tensor(2.6812, grad_fn=<NllLossBackward>)
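
As a cross-check (a minimal sketch, not part of the original example), CrossEntropyLoss is equivalent to applying log-softmax to the outputs and then averaging the negative log probabilities of the target classes (the NLL loss covered later):

import torch.nn.functional as F

# manual cross entropy; should match the loss printed above
log_probs = F.log_softmax(outputs, dim=-1)
print(-log_probs.gather(1, targets.view(-1, 1)).mean())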

2.6.1. Binary Cross Entropy Loss - Manual Calculation

labels = torch.tensor([1.0, 0.0])
predictions = torch.tensor([.9, .2])
# Positive class (labels == 1)
positive_pred = predictions[labels == 1]
first_summation = torch.log(positive_pred).sum()
# Negative class (labels == 0)
negative_pred = predictions[labels == 0]
second_summation = torch.log(1 - negative_pred).sum()
# n_total = n_pos + n_neg
n_total = labels.size(0)
loss = -(first_summation + second_summation) / n_total
loss
tensor(0.1643)
positive_pred = predictions[labels == 1]
print(labels == 1)
print(positive_pred)
tensor([ True, False])
tensor([0.9000])
summation = torch.sum(labels * torch.log(predictions) + (1 - labels) * torch.log(1 - predictions))
loss = -summation / n_total
loss
tensor(0.1643)

2.6.2. Binary Cross Entropy Loss - PyTorch

bce_loss = nn.BCELoss()
sigmoid = nn.Sigmoid()

torch.manual_seed(42)
probabilities = sigmoid(torch.randn(4, 1, requires_grad=True))
print(probabilities)

targets = torch.tensor([1, 0, 1, 0], dtype=torch.float32).view(4, 1)
loss = bce_loss(probabilities, targets)
loss.backward()
print(loss)
tensor([[0.5834],
        [0.5322],
        [0.5583],
        [0.5573]], grad_fn=<SigmoidBackward>)
tensor(0.6741, grad_fn=<BinaryCrossEntropyBackward>)
labels = torch.tensor([1.0, 0.0])
predictions = torch.tensor([.9, .2])
# RIGHT: predictions first, labels second
right_loss = bce_loss(predictions, labels)
# WRONG: arguments swapped -- the log(0) terms get clamped at -100, hence the huge loss
wrong_loss = bce_loss(labels, predictions)
print(right_loss, wrong_loss)
tensor(0.1643) tensor(15.0000)

2.6.3. BCE with Logits Loss

import numpy as np
import torch.nn as nn

def log_odds_ratio(prob):
    return np.log(prob/(1-prob))
logit1 = log_odds_ratio(.9)
logit2 = log_odds_ratio(.2)
labels = torch.tensor([1.0, 0.0])
logits = torch.tensor([logit1, logit2])
print(logits)
loss_fn_logits = nn.BCEWithLogitsLoss()
loss = loss_fn_logits(logits, labels)
loss
tensor([ 2.1972, -1.3863], dtype=torch.float64)
tensor(0.1643)
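
A quick check that BCEWithLogitsLoss is the same as applying a sigmoid and then BCELoss (the logits are cast to float32 here to match the labels' dtype):

probs = torch.sigmoid(logits.float())
print(nn.BCELoss()(probs, labels))  # should match the loss above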

2.7. Negative Log-Likelihood Loss

2.7.1. Log Softmax

import torch.nn as nn
import torch.nn.functional as F
logits = torch.tensor([ 1.3863, 0.0000, -0.6931])

2.7.2. Compute Softmax Manually

# e^z is odds ratios
odds_ratios = torch.exp(logits) 
odds_ratios
tensor([4.0000, 1.0000, 0.5000])
softmaxed = odds_ratios/odds_ratios.sum()
softmaxed
tensor([0.7273, 0.1818, 0.0909])

2.7.3. Compute Softmax in PyTorch

PyTorch provides the typical implementations both as a function (F.softmax) and as a module (nn.Softmax); both take logits (the log odds) as input. We also need to tell softmax which dimension to apply the normalization along.

In general, our models will produce logits with shape (number of data points, number of classes), so the right dimension to apply softmax along is the last one (dim=-1).

# Using module
softmaxed_torch = nn.Softmax(dim=-1)(logits)
softmaxed_torch
tensor([0.7273, 0.1818, 0.0909])
# Using functional
softmaxed_torch = F.softmax(logits, dim=-1)
softmaxed_torch
tensor([0.7273, 0.1818, 0.0909])

2.8. LogSoftmax

torch.log(softmaxed)
tensor([-0.3185, -1.7048, -2.3979])
# Use functional
log_probs = F.log_softmax(logits, dim=-1)
log_probs
tensor([-0.3185, -1.7048, -2.3979])
# Use module
log_probs = nn.LogSoftmax(dim=-1)(logits)
log_probs
tensor([-0.3185, -1.7048, -2.3979])

2.9. Negative Log-Likelihood Loss

NLL loss takes the log probabilities (the log-softmax output), picks out the log probability of the correct class for each data point, negates it, and averages over the data points.

Let's walk through the NLL loss calculation for 5 data points and 3 classes.

logits \(\rightarrow\) log softmax (log probability) \(\rightarrow\) NLL

# manual implementation

torch.manual_seed(11)
logits = torch.randn((5, 3))
labels = torch.tensor([0, 0, 1, 2, 1])
log_probs = F.log_softmax(logits, dim=-1)
log_probs
tensor([[-1.5229, -0.3146, -2.9600],
        [-1.7934, -1.0044, -0.7607],
        [-1.2513, -1.0136, -1.0471],
        [-2.6799, -0.2219, -2.0367],
        [-1.0728, -1.9098, -0.6737]])

With the default mean reduction, NLL negates the log probability of the correct class for each data point and averages those values.

2.9.1. Manual Calculation

#indices = torch.tensor([[0], [0], [1], [2], [1]])
indices = torch.tensor([[idx] for idx in labels])
log_probs_cls = log_probs.gather(1, indices)
# for each data point, we select the log_probs of corresponding class
-log_probs_cls.mean()
tensor(1.6553)

2.9.2. Using PyTorch

F.nll_loss(log_probs, labels)
tensor(1.6553)
loss_fn = nn.NLLLoss()
loss_fn(log_probs, labels)
tensor(1.6553)

The preferred module implementation, nn.NLLLoss, is a higher-order function: its constructor takes three optional arguments worth knowing (the others are deprecated and you can safely ignore them) and returns the actual loss function.

reduction: it takes either mean, sum, or none. The default, mean, corresponds to the averaging we did in the manual calculation above. As expected, sum returns the sum of the errors instead of the average. The last option, none, corresponds to the unreduced form, that is, it returns the full array of per-data-point errors (a short sketch of these options follows the argument descriptions below).

weight: it takes a tensor of length C, that is, containing as many weights as there are classes.

Important

This weight argument can be used to handle imbalanced datasets, unlike the weight argument in the binary cross-entropy losses we have seen before. Also, unlike the pos_weight argument of BCEWithLogitsLoss, NLLLoss computes a true weighted average when this argument is used.

ignore_index: it takes one integer, corresponding to the one (and only one) class index that should be ignored when computing the loss. It can be used to mask a particular label that is not relevant to the classification task.
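
To make the reduction option concrete, here is a short sketch reusing the log_probs and labels from the manual calculation above:

# reduction='none' returns one loss value per data point
per_sample = nn.NLLLoss(reduction='none')(log_probs, labels)
print(per_sample)
# the default 'mean' is the average of those values; 'sum' is their sum
print(per_sample.mean())
print(nn.NLLLoss(reduction='sum')(log_probs, labels))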

2.9.3. Example of Using Weight

What if we want to balance our dataset, giving data points with label (y=2) double the weight of the other classes?

loss_fn = nn.NLLLoss(weight=torch.tensor([1., 1., 2.]))
loss_fn(log_probs, labels)
tensor(1.7188)
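
As a manual check, with a weight tensor the loss becomes a true weighted average: each data point's negative log probability is multiplied by the weight of its class, and the result is divided by the sum of the weights actually used:

class_weights = torch.tensor([1., 1., 2.])
per_sample_nll = -log_probs.gather(1, labels.view(-1, 1)).squeeze()
sample_weights = class_weights[labels]
# weighted average; should match the 1.7188 above
print((sample_weights * per_sample_nll).sum() / sample_weights.sum())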

2.9.4. Example of Using ignore_index

What if we want to simply ignore data points with label (y=2)?

loss_fn = nn.NLLLoss(ignore_index=2)
loss_fn(log_probs, labels)
tensor(1.5599)
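
And a manual check for ignore_index: the data points whose label equals the ignored index are simply dropped before averaging:

mask = labels != 2
# average over the 4 remaining points; should match the loss above
print(-log_probs[mask].gather(1, labels[mask].view(-1, 1)).mean())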