COMP5329 – Deep Learning
Tutorial 4 – Regularization
Semester 1, 2021
Objectives:
• To learn about regularization.
• To be familiar with how regularization methods work, e.g., L2 regularization, dropout, batch normalization, early stopping, etc.
• To learn how to implement regularization methods with deep learning frameworks (in this tutorial we use PyTorch).
Instructions:
• Install PyTorch and the related packages into your Anaconda environment.
• Read and run this “4-reg.ipynb” file.
• Complete the exercises.
Lecturer: Chang Xu
0. Loading the packages
In [ ]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
import numpy as np
import matplotlib.pyplot as plt
from torch.autograd import Variable
from torch.nn import init
from IPython.display import clear_output
%matplotlib inline
1. Hyperparameters
Definition of the hyperparameters:
N_SAMPLES : number of samples
NOISE_RATE : standard deviation (amplitude) of the noise
N_INPUT_LAYER : dimension of the input layer
N_HIDDEN_LAYER : dimension of each hidden layer
N_OUTPUT_LAYER : dimension of the output layer
N_HIDDEN : number of hidden layers
BATCH_SIZE : batch size for training
EPOCH : number of training epochs
LEARNING_RATE : learning rate
DROPOUT_RATE : dropout rate
WEIGHT_DECAY : coefficient of the L2 regularization penalty term
MAX_COUNT : patience parameter of the early stopping criterion
ACTIVATION : activation function
LOSS_FUNC : loss function
In [ ]:
N_SAMPLES = 20
NOISE_RATE = 0.4
N_INPUT_LAYER = 1
N_HIDDEN_LAYER = 100
N_OUTPUT_LAYER = 1
N_HIDDEN = 1
BATCH_SIZE = 10
EPOCH = 500
LEARNING_RATE = 0.01
DROPOUT_RATE = 0.5
WEIGHT_DECAY = 5e-3
MAX_COUNT = 5
ACTIVATION = nn.ReLU()
LOSS_FUNC = nn.MSELoss()
2. The Dataset
In this tutorial, we use an MLP to fit a noisy linear function, where the noise follows a normal distribution: $$y = x + \mathcal{N}(0, \text{NOISE\_RATE}).$$ For the training and test sets, we evenly sample 20 points in the domain $x \in [-1, 1]$, while the validation set is half the size of the training set.
In [ ]:
# training data
train_x = np.linspace(-1, 1, num=int(N_SAMPLES))[:, np.newaxis]
noise = np.random.normal(0, NOISE_RATE, train_x.shape)
train_y = train_x + noise
# test data
test_x = np.linspace(-1, 1, num=int(N_SAMPLES))[:, np.newaxis]
noise = np.random.normal(0, NOISE_RATE, test_x.shape)
test_y = test_x + noise
# validation data
validate_x = np.linspace(-1, 1, num=int(N_SAMPLES/2))[:, np.newaxis]
noise = np.random.normal(0, NOISE_RATE, validate_x.shape)
validate_y = validate_x + noise
train_x, train_y = torch.from_numpy(train_x).float(), torch.from_numpy(train_y).float()
test_x, test_y = torch.from_numpy(test_x).float(), torch.from_numpy(test_y).float()
validate_x, validate_y = torch.from_numpy(validate_x).float(), torch.from_numpy(validate_y).float()
train_dataset = Data.TensorDataset(train_x, train_y)
train_loader = Data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
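As a quick, optional sanity check (not part of the tutorial pipeline), you can iterate over train_loader once to see how it shuffles the data and yields mini-batches of size BATCH_SIZE:
# Optional sanity check: inspect the shape of each mini-batch produced by the DataLoader.
for batch_id, (batch_x, batch_y) in enumerate(train_loader):
    print(batch_id, batch_x.shape, batch_y.shape)  # each tensor has shape (BATCH_SIZE, 1)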
3. Neural Networks
3.1 Vanilla MLP
In [ ]:
class FC_Classifier(nn.Module):
    """Custom module for a simple MLP"""
    def __init__(self):
        super(FC_Classifier, self).__init__()
        self.fcs = []
        self.fc_i = nn.Linear(N_INPUT_LAYER, N_HIDDEN_LAYER)
        # define and name every hidden layer in the module
        for i in range(N_HIDDEN):
            fc = nn.Linear(N_HIDDEN_LAYER, N_HIDDEN_LAYER)
            setattr(self, 'fc%i' % i, fc)  # IMPORTANT: register the layer as an attribute of the module; otherwise its parameters will not be tracked
            self._set_init(fc)  # parameter initialization
            self.fcs.append(fc)
        self.fc_o = nn.Linear(N_HIDDEN_LAYER, N_OUTPUT_LAYER)
        self._set_init(self.fc_i)
        self._set_init(self.fc_o)

    def _set_init(self, layer):
        init.normal_(layer.weight, mean=0., std=.1)
        init.constant_(layer.bias, 0)

    def forward(self, x):
        x = x.view(-1, N_INPUT_LAYER)
        x = ACTIVATION(self.fc_i(x))
        for i in range(N_HIDDEN):
            x = ACTIVATION(self.fcs[i](x))
        x = self.fc_o(x)
        return x
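A side note: the setattr trick above is one way to make the hidden layers visible to PyTorch. An alternative, shown in the sketch below (it is not used in the rest of this tutorial), is nn.ModuleList, which registers every layer it contains automatically:
class FC_Classifier_ModuleList(nn.Module):
    """Sketch of the same MLP using nn.ModuleList, so the hidden layers
    are registered with the module automatically (no setattr needed)."""
    def __init__(self):
        super(FC_Classifier_ModuleList, self).__init__()
        self.fc_i = nn.Linear(N_INPUT_LAYER, N_HIDDEN_LAYER)
        self.fcs = nn.ModuleList(
            [nn.Linear(N_HIDDEN_LAYER, N_HIDDEN_LAYER) for _ in range(N_HIDDEN)])
        self.fc_o = nn.Linear(N_HIDDEN_LAYER, N_OUTPUT_LAYER)

    def forward(self, x):
        x = x.view(-1, N_INPUT_LAYER)
        x = ACTIVATION(self.fc_i(x))
        for fc in self.fcs:
            x = ACTIVATION(fc(x))
        return self.fc_o(x)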
3.2 MLP with Dropout
In [ ]:
class Dropout_Classifier(nn.Module):
    """Custom module for an MLP with dropout"""
    def __init__(self):
        super(Dropout_Classifier, self).__init__()
        self.fcs = []
        self.dropout = nn.Dropout(DROPOUT_RATE)
        self.fc_i = nn.Linear(N_INPUT_LAYER, N_HIDDEN_LAYER)
        # define and name every hidden layer in the module
        for i in range(N_HIDDEN):
            fc = nn.Linear(N_HIDDEN_LAYER, N_HIDDEN_LAYER)
            setattr(self, 'fc%i' % i, fc)  # IMPORTANT: register the layer as an attribute of the module; otherwise its parameters will not be tracked
            self._set_init(fc)  # parameter initialization
            self.fcs.append(fc)
        self.fc_o = nn.Linear(N_HIDDEN_LAYER, N_OUTPUT_LAYER)
        self._set_init(self.fc_i)
        self._set_init(self.fc_o)

    def _set_init(self, layer):
        init.normal_(layer.weight, mean=0., std=.1)
        init.constant_(layer.bias, 0)

    def forward(self, x):
        x = x.view(-1, N_INPUT_LAYER)
        x = self.fc_i(x)
        x = self.dropout(x)
        x = ACTIVATION(x)
        for i in range(N_HIDDEN):
            x = self.fcs[i](x)
            # IMPORTANT: when implementing dropout with F.dropout(), pass training=self.training
            # so that dropout is active in train mode and disabled in eval mode
            x = F.dropout(x, p=DROPOUT_RATE, training=self.training)
            x = ACTIVATION(x)
        x = self.fc_o(x)
        return x
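To see why net.eval() matters for dropout (the training loop in Section 5 relies on this), the small sketch below, which is only an illustration, applies an nn.Dropout layer to a tensor of ones in both modes. PyTorch uses inverted dropout: in train mode roughly a DROPOUT_RATE fraction of the activations is zeroed and the survivors are scaled by 1/(1 - DROPOUT_RATE); in eval mode the layer is the identity.
# Illustration only: dropout behaves differently in train and eval mode.
drop = nn.Dropout(DROPOUT_RATE)
x = torch.ones(1, 10)
drop.train()
print(drop(x))  # about half the entries are 0, the rest are scaled to 2.0
drop.eval()
print(drop(x))  # identical to x: dropout is disabled at evaluation time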
3.3 MLP with Batch Normalization
In [ ]:
class Batch_Normalization_Classifier(nn.Module):
    """Custom module for an MLP with batch normalization"""
    def __init__(self):
        super(Batch_Normalization_Classifier, self).__init__()
        self.fcs = []
        self.bns = []
        self.fc_i = nn.Linear(N_INPUT_LAYER, N_HIDDEN_LAYER)
        self.input_bn = nn.BatchNorm1d(N_INPUT_LAYER)
        self.first_bn = nn.BatchNorm1d(N_HIDDEN_LAYER)
        # define and name every hidden layer in the module
        for i in range(N_HIDDEN):
            fc = nn.Linear(N_HIDDEN_LAYER, N_HIDDEN_LAYER)
            setattr(self, 'fc%i' % i, fc)  # IMPORTANT: register the layer as an attribute of the module; otherwise its parameters will not be tracked
            self._set_init(fc)  # parameter initialization
            self.fcs.append(fc)
            bn = nn.BatchNorm1d(N_HIDDEN_LAYER)
            setattr(self, 'bn%i' % i, bn)  # IMPORTANT: register the batch norm layer as well, for the same reason
            self.bns.append(bn)
        self.fc_o = nn.Linear(N_HIDDEN_LAYER, N_OUTPUT_LAYER)
        self._set_init(self.fc_i)
        self._set_init(self.fc_o)

    def _set_init(self, layer):
        init.normal_(layer.weight, mean=0., std=.1)
        init.constant_(layer.bias, 0)

    def forward(self, x):
        x = x.view(-1, N_INPUT_LAYER)
        x = self.input_bn(x)
        x = self.fc_i(x)
        x = ACTIVATION(self.first_bn(x))
        for i in range(N_HIDDEN):
            x = self.fcs[i](x)
            x = self.bns[i](x)  # batch normalization
            x = ACTIVATION(x)
        x = self.fc_o(x)
        return x
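Batch normalization also behaves differently in train and eval mode: in train mode each BatchNorm1d layer normalizes with the statistics of the current mini-batch and updates its running estimates of the mean and variance; in eval mode it uses those running estimates instead. The sketch below (an illustration, not part of the tutorial pipeline) makes this visible:
# Illustration only: BatchNorm1d updates running statistics in train mode
# and uses them (instead of batch statistics) in eval mode.
bn = nn.BatchNorm1d(N_HIDDEN_LAYER)
h = torch.randn(BATCH_SIZE, N_HIDDEN_LAYER)
bn.train()
out_train = bn(h)           # normalized with this batch's mean and variance
print(bn.running_mean[:3])  # running estimates have been updated
bn.eval()
out_eval = bn(h)            # normalized with the running estimates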
4. Build Networks and Optimizers
In [ ]:
fc_net = FC_Classifier()
l2_net = FC_Classifier()
early_stop_net = FC_Classifier()
dropped_net = Dropout_Classifier()
bned_net = Batch_Normalization_Classifier()
fc_opt = torch.optim.Adam(fc_net.parameters(), lr=LEARNING_RATE)
l2_opt = torch.optim.Adam(l2_net.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
early_stop_opt = torch.optim.Adam(early_stop_net.parameters(), lr=LEARNING_RATE)
dropped_opt = torch.optim.Adam(dropped_net.parameters(), lr=LEARNING_RATE)
bned_opt = optim.Adam(bned_net.parameters(), lr=LEARNING_RATE)
nets = [fc_net, l2_net, dropped_net, bned_net]
opts = [fc_opt, l2_opt, dropped_opt, bned_opt]
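Note how L2 regularization is obtained here: the weight_decay argument of the optimizer shrinks the weights towards zero at every step, which for plain SGD is equivalent to adding a penalty of 0.5 * WEIGHT_DECAY * ||w||^2 to the loss (with Adam the penalty interacts with the adaptive step sizes, but the idea is the same). A manual version would look like the sketch below, where l2_penalty is an illustrative helper, not part of the tutorial:
# Sketch of a manual L2 penalty, equivalent in spirit to the weight_decay argument above.
def l2_penalty(net, weight_decay=WEIGHT_DECAY):
    penalty = 0.
    for param in net.parameters():
        penalty = penalty + param.pow(2).sum()
    return 0.5 * weight_decay * penalty

# usage inside a training step:
# loss = LOSS_FUNC(net(data), target) + l2_penalty(net)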
5. Training and Testing
• In this tutorial, the early stopping criterion is: if the validation loss has not decreased for MAX_COUNT consecutive validation checks (the loss is checked every 10 epochs, so 50 epochs in total), we stop training the network. A standalone sketch of this criterion is given below, and the training loop that follows implements it inline.
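The following sketch isolates the patience logic; EarlyStopping is an illustrative helper name, and the loop below embeds the same logic directly rather than using this class:
# Illustrative helper (not used in the loop below): count how many consecutive
# validation checks fail to improve on the best loss so far, and stop once the
# counter reaches MAX_COUNT.
class EarlyStopping:
    def __init__(self, patience=MAX_COUNT):
        self.patience = patience
        self.best_loss = float('inf')
        self.count = 0

    def should_stop(self, validation_loss):
        if validation_loss < self.best_loss:
            self.best_loss = validation_loss
            self.count = 0
        else:
            self.count += 1
        return self.count >= self.patience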
In [ ]:
is_early_stop = False
last_validation_loss = LOSS_FUNC(early_stop_net(validate_x), validate_y).data.numpy()
flag = 'training'
count = 0
for epoch in range(EPOCH):
    # the dataset API gives us pythonic batching
    for batch_id, (data, label) in enumerate(train_loader):
        data = Variable(data)
        target = Variable(label)
        for net, opt in zip(nets, opts):  # train each network
            preds = net(data)
            loss = LOSS_FUNC(preds, target)
            opt.zero_grad()
            loss.backward()
            opt.step()  # this also updates the learnable parameters of batch normalization
            # loss_history[nets.index(net)].append(loss.data)
        if not is_early_stop:
            preds = early_stop_net(data)
            loss = LOSS_FUNC(preds, target)
            early_stop_opt.zero_grad()
            loss.backward()
            early_stop_opt.step()
    if epoch % 10 == 0:
        # switch to eval mode so that dropout and batch normalization behave deterministically
        for net in nets:
            net.eval()  # dropout and batch norm behave differently from train mode
        early_stop_net.eval()
        if not is_early_stop:
            validate_pred_early_stop = early_stop_net(validate_x)
            if LOSS_FUNC(validate_pred_early_stop, validate_y).data.numpy() > last_validation_loss:
                count += 1
            else:
                last_validation_loss = LOSS_FUNC(validate_pred_early_stop, validate_y).data.numpy()
                count = 0
            if count == MAX_COUNT:
                print('early stopped!!!')
                flag = 'stopped'
                is_early_stop = True
        # plotting
        clear_output(wait=True)
        plt.figure(figsize=(15, 10))
        test_pred_fc = fc_net(test_x)
        test_pred_l2 = l2_net(test_x)
        test_pred_early_stop = early_stop_net(test_x)
        test_pred_drop = dropped_net(test_x)
        test_pred_bn = bned_net(test_x)
        plt.scatter(train_x.data.numpy(), train_y.data.numpy(), c='magenta', s=50, alpha=0.3, label='train')
        plt.scatter(test_x.data.numpy(), test_y.data.numpy(), c='cyan', s=50, alpha=0.3, label='test')
        plt.plot(test_x.data.numpy(), test_pred_fc.data.numpy(), 'r-', lw=3, label='overfitting')
        plt.plot(test_x.data.numpy(), test_pred_l2.data.numpy(), 'y-v', lw=3, label='L2 regularization')
        plt.plot(test_x.data.numpy(), test_pred_early_stop.data.numpy(), 'k-*', lw=3, label='early stopping')
        plt.plot(test_x.data.numpy(), test_pred_drop.data.numpy(), 'b--', lw=3, label='dropout({})'.format(DROPOUT_RATE))
        plt.plot(test_x.data.numpy(), test_pred_bn.data.numpy(), 'g-.', lw=3, label='batch normalization')
        plt.text(0.1, -1.2, 'overfitting loss=%.4f' % LOSS_FUNC(test_pred_fc, test_y).data.numpy(), fontdict={'size': 20, 'color': 'r'})
        plt.text(0.1, -1.5, 'L2 regularization loss=%.4f' % LOSS_FUNC(test_pred_l2, test_y).data.numpy(), fontdict={'size': 20, 'color': 'y'})
        plt.text(0.1, -1.8, 'early stopping loss=%.4f ' % LOSS_FUNC(test_pred_early_stop, test_y).data.numpy() + flag, fontdict={'size': 20, 'color': 'k'})
        plt.text(0.1, -2.1, 'dropout loss=%.4f' % LOSS_FUNC(test_pred_drop, test_y).data.numpy(), fontdict={'size': 20, 'color': 'b'})
        plt.text(0.1, -2.4, 'batch normalization loss=%.4f' % LOSS_FUNC(test_pred_bn, test_y).data.numpy(), fontdict={'size': 20, 'color': 'g'})
        plt.legend(loc='upper left'); plt.ylim((-2.5, 2.5)); plt.pause(0.1)
        plt.show()
        # switch back to train mode
        for net in nets:
            net.train()
        early_stop_net.train()
6. Exercises
• Try different configurations of the networks (e.g., structure, hyperparameters) and different training sets.
• Set your own early stopping criterion.