Generating Validation Set Splits
In [1]:
import torch
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader
# Mini-batch size passed as `batch_size` to the DataLoader objects built below.
BATCH_SIZE = 64
In [2]:
##########################
### MNIST DATASET
##########################

# Note transforms.ToTensor() scales input images
# to 0-1 range
train_dataset = datasets.MNIST(root='data',
                               train=True,
                               transform=transforms.ToTensor(),
                               download=True)

# No `download` flag here: the download triggered for the training split
# above also fetches the t10k (test) files into the same root directory.
test_dataset = datasets.MNIST(root='data',
                              train=False,
                              transform=transforms.ToTensor())

# Training batches are reshuffled every epoch; test batches keep a fixed order.
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          num_workers=4,
                          shuffle=True)

test_loader = DataLoader(dataset=test_dataset,
                         batch_size=BATCH_SIZE,
                         num_workers=4,
                         shuffle=False)

# Checking the dataset: only the first batch is inspected, hence the `break`.
for images, labels in train_loader:
    print('Image batch dimensions:', images.shape)
    print('Image label dimensions:', labels.shape)
    break
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz
HBox(children=(FloatProgress(value=0.0, max=9912422.0), HTML(value=”)))
Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz
HBox(children=(FloatProgress(value=0.0, max=28881.0), HTML(value=”)))
Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz
HBox(children=(FloatProgress(value=0.0, max=1648877.0), HTML(value=”)))
Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz
HBox(children=(FloatProgress(value=0.0, max=4542.0), HTML(value=”)))
Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw
Processing…
Done!
/Users/monajmn/opt/anaconda3/lib/python3.8/site-packages/torchvision/datasets/mnist.py:502: UserWarning: The given NumPy array is not writeable, and PyTorch does not support non-writeable tensors. This means you can write to the underlying (supposedly non-writeable) NumPy array using the tensor. You may want to copy the array to protect its data or make it writeable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:143.)
return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
Image batch dimensions: torch.Size([64, 1, 28, 28])
Image label dimensions: torch.Size([64])
In [3]:
# Report the size of the full training split before any validation hold-out.
print(f'Total number of training examples: {len(train_dataset)}')
Total number of training examples: 60000
Subset Method
In [ ]:
from torch.utils.data.dataset import Subset
In [ ]:
# Load the full MNIST training split once; the train and validation sets
# below are index-based views onto this single dataset object.
train_and_valid = datasets.MNIST(root='data',
                                 train=True,
                                 transform=transforms.ToTensor(),
                                 download=True)

# Hold out the first `num_valid` examples for validation and train on the
# rest. The upper bound comes from the dataset itself instead of a
# hard-coded 60000, so the split stays correct if the dataset size differs.
num_valid = 1000
valid_indices = torch.arange(0, num_valid)
train_indices = torch.arange(num_valid, len(train_and_valid))

train_dataset = Subset(train_and_valid, train_indices)
valid_dataset = Subset(train_and_valid, valid_indices)
In [ ]:
# Loader over the training subset; batch order is reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
)

# Loader over the validation subset; iterated in a fixed order.
valid_loader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)
In [ ]:
# Checking the dataset: print the tensor shapes of the first training batch
# only, then stop iterating.
for images, labels in train_loader:
    print('Image batch dimensions:', images.shape)
    print('Image label dimensions:', labels.shape)
    break
In [ ]:
# Sanity check for shuffling: run through the loader for two epochs and
# print the first ten labels of the final batch each time. The labels should
# appear in random order, and the second epoch should print a different
# sequence than the first.
for _epoch in range(2):
    for images, labels in train_loader:
        pass  # exhaust the loader; `labels` is left holding the last batch
    print(labels[:10])
In [ ]:
# Sanity check for reproducible shuffling: the labels should still be in
# random order, but because the random seed is reset to the same value
# before each pass, both epochs should print the same label sequence.
for _epoch in range(2):
    torch.manual_seed(123)
    for images, labels in train_loader:
        pass  # exhaust the loader; `labels` is left holding the last batch
    print(labels[:10])
SubsetRandomSampler Method
In [ ]:
from torch.utils.data import SubsetRandomSampler
In [ ]:
# Training uses a random 28x28 crop of the 32x32-resized image; validation
# and test use a center crop of the same size.
training_transform = transforms.Compose([transforms.Resize((32, 32)),
                                         transforms.RandomCrop((28, 28)),
                                         transforms.ToTensor()])

valid_transform = transforms.Compose([transforms.Resize((32, 32)),
                                      transforms.CenterCrop((28, 28)),
                                      transforms.ToTensor()])

train_dataset = datasets.MNIST(root='data',
                               train=True,
                               transform=training_transform,
                               download=True)

# note that this is the same dataset as "train_dataset" above
# however, we can now choose a different transform method
valid_dataset = datasets.MNIST(root='data',
                               train=True,
                               transform=valid_transform,
                               download=False)

test_dataset = datasets.MNIST(root='data',
                              train=False,
                              transform=valid_transform,
                              download=False)

# Both train_dataset and valid_dataset wrap the full training split; the
# samplers below restrict each loader to its own, non-overlapping index
# range. The upper bound is taken from the dataset instead of a hard-coded
# 60000.
num_valid = 1000
valid_indices = torch.arange(0, num_valid)
train_indices = torch.arange(num_valid, len(train_dataset))

# SubsetRandomSampler draws the given indices in random order, so the
# validation batches are shuffled as well.
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(valid_indices)

train_loader = DataLoader(train_dataset,
                          batch_size=BATCH_SIZE,
                          num_workers=4,
                          sampler=train_sampler)

valid_loader = DataLoader(valid_dataset,
                          batch_size=BATCH_SIZE,
                          num_workers=4,
                          sampler=valid_sampler)

test_loader = DataLoader(dataset=test_dataset,
                         batch_size=BATCH_SIZE,
                         num_workers=4,
                         shuffle=False)
In [ ]:
# Checking the dataset: print the tensor shapes of the first training batch
# only, then stop iterating.
for images, labels in train_loader:
    print('Image batch dimensions:', images.shape)
    print('Image label dimensions:', labels.shape)
    break
In [ ]:
# Sanity check for shuffling: iterate over the training loader for two
# epochs, printing the first ten labels of the last batch after each one.
# The labels should be in random order and should differ between the two
# epochs.
for _epoch in range(2):
    for images, labels in train_loader:
        pass  # run to the end; `labels` keeps the final batch
    print(labels[:10])
In [ ]:
# Sanity check for reproducible shuffling: the label order should still be
# random, but with the random seed fixed to the same value before each pass,
# both epochs should return the same label sequence.
for _epoch in range(2):
    torch.manual_seed(123)
    for images, labels in train_loader:
        pass  # run to the end; `labels` keeps the final batch
    print(labels[:10])