2. Loss Functions

import numpy as np
import torch
import torch.nn as nn

seed = 42

torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)

%matplotlib inline

2.1. Perceptron

Each node in a neural network is called a perceptron unit, which has three “knobs”: a set of weights (\(w\)), a bias (\(b\)), and an activation function (\(f\)). The weights and bias are learned from the data, while the activation function is hand-picked by the network designer based on their intuition about the network and its target outputs. Mathematically,

\(y = f(wx + b)\)

class Perceptron(nn.Module):
    """
    A perceptron is one linear layer 
    """
    
    def __init__(self, input_dim):
        """
        Args:
            input_dim (int): size of the input features
        """
        super(Perceptron, self).__init__()
        self.fc1 = nn.Linear(input_dim, 1)

    def forward(self, x_in):
        """The forward pass of the perceptron

        Args:
            x_in (torch.Tensor): an input data tensor
                x_in.shape should be (batch, num_features)
        Returns:
            the resulting tensor. tensor.shape should be (batch,).
        """
        return torch.sigmoid(self.fc1(x_in)).squeeze()
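
A quick usage sketch of the class above (the input dimension and batch size below are arbitrary, made-up values for illustration):

# hypothetical example: a batch of 8 samples with 4 features each
perceptron = Perceptron(input_dim=4)
x = torch.randn(8, 4)
y = perceptron(x)
print(y.shape)  # torch.Size([8]) -- the trailing size-1 dimension was squeezed away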

What does .squeeze() do? It removes all dimensions of size 1 from an array's (or tensor's) shape, so the (batch, 1) output of the linear layer above becomes a (batch,) tensor. The NumPy version works the same way:

# Python program explaining
# numpy.squeeze function
  
import numpy as np
  
in_arr = np.array([[[2, 2, 2], [2, 2, 2]]])
   
print ("Input array : ", in_arr) 
print("Shape of input array : ", in_arr.shape)  
  
out_arr = np.squeeze(in_arr) 
  
print ("output squeezed array : ", out_arr)
print("Shape of output array : ", out_arr.shape) 
Input array :  [[[2 2 2]
  [2 2 2]]]
Shape of input array :  (1, 2, 3)
output squeezed array :  [[2 2 2]
 [2 2 2]]
Shape of output array :  (2, 3)
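
The torch.Tensor.squeeze() method behaves identically, which is what the perceptron's forward pass relies on. A minimal sketch:

t = torch.zeros(4, 1)
print(t.shape)            # torch.Size([4, 1])
print(t.squeeze().shape)  # torch.Size([4])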

2.2. Softmax

softmax = nn.Softmax(dim=1)
x_input = torch.randn(1, 3)
y_output = softmax(x_input)
print(x_input)
print(y_output)
print(torch.sum(y_output, dim=1))
tensor([[0.3367, 0.1288, 0.2345]])
tensor([[0.3683, 0.2992, 0.3325]])
tensor([1.])
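
As a small variation on the example above, softmax with dim=1 normalizes each row of a batch independently, so every row of the output sums to 1:

x_batch = torch.randn(2, 3)
y_batch = nn.Softmax(dim=1)(x_batch)
print(y_batch)
print(torch.sum(y_batch, dim=1))  # each row sums to 1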

2.3. MSELoss

import torch
import torch.nn as nn

mse_loss = nn.MSELoss()
torch.manual_seed(42)
yhat = torch.randn(3, 5, requires_grad=True)
y = torch.randn(3, 5)
loss = mse_loss(yhat, y)
loss.backward()
print(loss)
tensor(1.0192, grad_fn=<MseLossBackward>)
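
As a sanity check (not part of the original example), MSELoss with its default mean reduction is just the mean of the squared differences:

# manual mean squared error; should match the loss printed above
print(torch.mean((yhat - y) ** 2))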

2.4. MAE Loss - L1Loss

import torch
import torch.nn as nn

mae_loss = nn.L1Loss()
torch.manual_seed(42)
yhat = torch.randn(3, 5, requires_grad=True)
y = torch.randn(3, 5)
loss = mae_loss(yhat, y)
print(yhat)
print(y)
loss.backward()
print(loss)
tensor([[ 0.3367,  0.1288,  0.2345,  0.2303, -1.1229],
        [-0.1863,  2.2082, -0.6380,  0.4617,  0.2674],
        [ 0.5349,  0.8094,  1.1103, -1.6898, -0.9890]], requires_grad=True)
tensor([[ 0.9580,  1.3221,  0.8172, -0.7658, -0.7506],
        [ 1.3525,  0.6863, -0.3278,  0.7950,  0.2815],
        [ 0.0562,  0.5227, -0.2384, -0.0499,  0.5263]])
tensor(0.8502, grad_fn=<L1LossBackward>)
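
Likewise, a quick manual check: L1Loss with the default mean reduction is the mean of the absolute differences:

# manual mean absolute error; should match the loss printed above
print(torch.mean(torch.abs(yhat - y)))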

2.5. RMSE Loss

class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self, yhat, y):
        eps = 1e-7
        return torch.sqrt(self.mse(yhat, y) + eps)

criterion = RMSELoss()

torch.manual_seed(42)
yhat = torch.randn(3, 5, requires_grad=True)
y = torch.randn(3, 5)
loss = criterion(yhat,y)
print(loss)
tensor(1.0096, grad_fn=<SqrtBackward>)
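
Since yhat and y here are generated with the same seed as in the MSE example, this loss is just the square root of the MSE computed there (√1.0192 ≈ 1.0096, up to the small eps term). A manual check, ignoring the eps stabilizer:

# manual root mean squared error
print(torch.sqrt(torch.mean((yhat - y) ** 2)))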

2.6. CrossEntropyLoss

import torch
import torch.nn as nn

ce_loss = nn.CrossEntropyLoss()

torch.manual_seed(42)
outputs = torch.randn(3, 5, requires_grad=True)
targets = torch.tensor([1, 0, 3], dtype=torch.int64)
loss = ce_loss(outputs, targets)
loss.backward()
print (loss)
tensor(2.6812, grad_fn=<NllLossBackward>)
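
As a cross-check (a minimal sketch, not part of the original example), CrossEntropyLoss is equivalent to applying log-softmax to the outputs and then averaging the negative log probabilities of the target classes (the NLL loss covered later):

import torch.nn.functional as F

# manual cross entropy; should match the loss printed above
log_probs = F.log_softmax(outputs, dim=-1)
print(-log_probs.gather(1, targets.view(-1, 1)).mean())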

2.6.1. Binary Cross Entropy Loss - Manual Calculation

labels = torch.tensor([1.0, 0.0])
predictions = torch.tensor([.9, .2])
# Positive class (labels == 1)
positive_pred = predictions[labels == 1]
first_summation = torch.log(positive_pred).sum()
# Negative class (labels == 0)
negative_pred = predictions[labels == 0]
second_summation = torch.log(1 - negative_pred).sum()
# n_total = n_pos + n_neg
n_total = labels.size(0)
loss = -(first_summation + second_summation) / n_total
loss
tensor(0.1643)
positive_pred = predictions[labels == 1]
print(labels == 1)
print(positive_pred)
tensor([ True, False])
tensor([0.9000])
summation = torch.sum(labels * torch.log(predictions) + (1 - labels) * torch.log(1 - predictions))
loss = -summation / n_total
loss
tensor(0.1643)

2.6.2. Binary Cross Entropy Loss - PyTorch

bce_loss = nn.BCELoss()
sigmoid = nn.Sigmoid()

torch.manual_seed(42)
probabilities = sigmoid(torch.randn(4, 1, requires_grad=True))
print(probabilities)

targets = torch.tensor([1, 0, 1, 0], dtype=torch.float32).view(4, 1)
loss = bce_loss(probabilities, targets)
loss.backward()
print(loss)
tensor([[0.5834],
        [0.5322],
        [0.5583],
        [0.5573]], grad_fn=<SigmoidBackward>)
tensor(0.6741, grad_fn=<BinaryCrossEntropyBackward>)
labels = torch.tensor([1.0, 0.0])
predictions = torch.tensor([.9, .2])
# RIGHT: predictions first, labels second
right_loss = bce_loss(predictions, labels)
# WRONG: arguments swapped -- the log(0) terms get clamped at -100, hence the huge loss
wrong_loss = bce_loss(labels, predictions)
print(right_loss, wrong_loss)
tensor(0.1643) tensor(15.0000)

2.6.3. BCE with Logits Loss

import numpy as np
import torch.nn as nn

def log_odds_ratio(prob):
    return np.log(prob/(1-prob))
logit1 = log_odds_ratio(.9)
logit2 = log_odds_ratio(.2)
labels = torch.tensor([1.0, 0.0])
logits = torch.tensor([logit1, logit2])
print(logits)
loss_fn_logits = nn.BCEWithLogitsLoss()
loss = loss_fn_logits(logits, labels)
loss
tensor([ 2.1972, -1.3863], dtype=torch.float64)
tensor(0.1643)
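
A quick check that BCEWithLogitsLoss is the same as applying a sigmoid and then BCELoss (the logits are cast to float32 here to match the labels' dtype):

probs = torch.sigmoid(logits.float())
print(nn.BCELoss()(probs, labels))  # should match the loss above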

2.7. Negative Log-Likelihood Loss

2.7.1. Log Softmax

import torch.nn as nn
import torch.nn.functional as F
logits = torch.tensor([ 1.3863, 0.0000, -0.6931])

2.7.2. Compute Softmax Manually

# e^z is odds ratios
odds_ratios = torch.exp(logits) 
odds_ratios
tensor([4.0000, 1.0000, 0.5000])
softmaxed = odds_ratios/odds_ratios.sum()
softmaxed
tensor([0.7273, 0.1818, 0.0909])

2.7.3. Compute Softmax in PyTorch

PyTorch provides the typical implementations both as a function (F.softmax) and as a module (nn.Softmax); both take logits (the log odds) as input. We also need to tell softmax which dimension to apply the normalization along.

In general, our models will produce logits with shape (number of data points, number of classes), so the right dimension to apply softmax along is the last one (dim=-1).

# Using module
softmaxed_torch = nn.Softmax(dim=-1)(logits)
softmaxed_torch
tensor([0.7273, 0.1818, 0.0909])
# Using functional
softmaxed_torch = F.softmax(logits, dim=-1)
softmaxed_torch
tensor([0.7273, 0.1818, 0.0909])

2.8. LogSoftmax

torch.log(softmaxed)
tensor([-0.3185, -1.7048, -2.3979])
# Use functional
log_probs = F.log_softmax(logits, dim=-1)
log_probs
tensor([-0.3185, -1.7048, -2.3979])
# Use module
log_probs = nn.LogSoftmax(dim=-1)(logits)
log_probs
tensor([-0.3185, -1.7048, -2.3979])

2.9. Negative Log-Likelihood Loss

NLL loss takes the log probabilities (the log-softmax output), picks out the log probability of the correct class for each data point, negates it, and averages over the data points.

Let's walk through the NLL loss calculation for 5 data points and 3 classes.

logits \(\rightarrow\) log softmax (log probability) \(\rightarrow\) NLL

# manual implementation

torch.manual_seed(11)
logits = torch.randn((5, 3))
labels = torch.tensor([0, 0, 1, 2, 1])
log_probs = F.log_softmax(logits, dim=-1)
log_probs
tensor([[-1.5229, -0.3146, -2.9600],
        [-1.7934, -1.0044, -0.7607],
        [-1.2513, -1.0136, -1.0471],
        [-2.6799, -0.2219, -2.0367],
        [-1.0728, -1.9098, -0.6737]])

With the default mean reduction, NLL negates the log probability of the correct class for each data point and averages those values.

2.9.1. Manual Calculation

#indices = torch.tensor([[0], [0], [1], [2], [1]])
indices = torch.tensor([[idx] for idx in labels])
log_probs_cls = log_probs.gather(1, indices)
# for each data point, we select the log_probs of corresponding class
-log_probs_cls.mean()
tensor(1.6553)

2.9.2. Using PyTorch

F.nll_loss(log_probs, labels)
tensor(1.6553)
loss_fn = nn.NLLLoss()
loss_fn(log_probs, labels)
tensor(1.6553)

The preferred module implementation, nn.NLLLoss, is a higher-order function: its constructor takes three optional arguments worth knowing (the others are deprecated and you can safely ignore them) and returns the actual loss function.

reduction: it takes either mean, sum, or none. The default, mean, corresponds to the averaging we did in the manual calculation above. As expected, sum returns the sum of the errors instead of the average. The last option, none, corresponds to the unreduced form, that is, it returns the full array of per-data-point errors (a short sketch of these options follows the argument descriptions below).

weight: it takes a tensor of length C, that is, containing as many weights as there are classes.

Important

This weight argument can be used to handle imbalanced datasets, unlike the weight argument in the binary cross-entropy losses we have seen before. Also, unlike the pos_weight argument of BCEWithLogitsLoss, NLLLoss computes a true weighted average when this argument is used.

ignore_index: it takes one integer, corresponding to the one (and only one) class index that should be ignored when computing the loss. It can be used to mask a particular label that is not relevant to the classification task.
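
To make the reduction option concrete, here is a short sketch reusing the log_probs and labels from the manual calculation above:

# reduction='none' returns one loss value per data point
per_sample = nn.NLLLoss(reduction='none')(log_probs, labels)
print(per_sample)
# the default 'mean' is the average of those values; 'sum' is their sum
print(per_sample.mean())
print(nn.NLLLoss(reduction='sum')(log_probs, labels))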

2.9.3. Example of Using Weight

What if we want to balance our dataset, giving data points with label (y=2) double the weight of the other classes?

loss_fn = nn.NLLLoss(weight=torch.tensor([1., 1., 2.]))
loss_fn(log_probs, labels)
tensor(1.7188)
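
As a manual check, with a weight tensor the loss becomes a true weighted average: each data point's negative log probability is multiplied by the weight of its class, and the result is divided by the sum of the weights actually used:

class_weights = torch.tensor([1., 1., 2.])
per_sample_nll = -log_probs.gather(1, labels.view(-1, 1)).squeeze()
sample_weights = class_weights[labels]
# weighted average; should match the 1.7188 above
print((sample_weights * per_sample_nll).sum() / sample_weights.sum())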

2.9.4. Example of Using ignore_index

What if we want to simply ignore data points with label (y=2)?

loss_fn = nn.NLLLoss(ignore_index=2)
loss_fn(log_probs, labels)
tensor(1.5599)
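
And a manual check for ignore_index: the data points whose label equals the ignored index are simply dropped before averaging:

mask = labels != 2
# average over the 4 remaining points; should match the loss above
print(-log_probs[mask].gather(1, labels[mask].view(-1, 1)).mean())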