from __future__ import print_function, division

import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

from skimage import io, transform
import skimage.color as skio

# Ignore warnings
warnings.filterwarnings("ignore")
To build the dataloader, I made small tweaks to the examples used in the PyTorch tutorial. In this part, the only transformation we apply to the images is a resize to 60x80 pixels. We use 192 of the 240 images for our training set and the remaining 48 for validation (via a seeded random split). The ASF files store coordinates normalized to [0, 1], which is why the plotting code below scales x by the image width (80) and y by the height (60). In this part, we are only interested in detecting the location of the nose keypoint.
# Create a CSV that matches the Dataset format: filename + 58 (x, y) pairs
landmarks_df = pd.DataFrame(columns=range(117))
for i, filename in enumerate(os.listdir('imm_face_db')):
    if filename.endswith('asf'):
        with open('imm_face_db/' + filename) as fp:
            img_name = filename.replace('asf', 'jpg')
            # Lines 16-74 of each ASF file hold the 58 landmark rows
            points = fp.readlines()[16:74]
            landmark = [img_name]
            for point in points:
                # Fields 2 and 3 of each row are the normalized x, y coordinates
                x, y = point.split('\t')[2:4]
                landmark.append(float(x))
                landmark.append(float(y))
            landmarks_df.loc[i] = landmark
landmarks_df.to_csv('landmarks.csv')
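As a quick sanity check (a sketch on my part, not part of the pipeline), the resulting CSV should have one row per image and 1 + 58*2 = 117 columns:

# Sketch: one row per image, 117 columns (filename + 58 (x, y) pairs)
df = pd.read_csv('landmarks.csv', index_col=0)
print(df.shape)  # expected (240, 117) for the 240-image IMM face database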
class FaceLandmarksDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.landmarks_frame = pd.read_csv(csv_file, index_col=[0])
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        img_name = os.path.join(self.root_dir,
                                self.landmarks_frame.iloc[idx, 0])
        image = skio.rgb2gray(io.imread(img_name))
        image = transform.resize(image, (60, 80))
        landmarks = self.landmarks_frame.iloc[idx, 1:]
        landmarks = np.array([landmarks]).astype(float).reshape(-1, 2)
        sample = {'image': image, 'landmarks': landmarks}
        if self.transform:
            sample = self.transform(sample)
        return sample
face_dataset = FaceLandmarksDataset(csv_file='landmarks.csv', root_dir='images')
split = torch.utils.data.random_split(face_dataset, lengths=[192, 48], generator=torch.Generator().manual_seed(42))
train_loader = DataLoader(split[0], batch_size=4, shuffle=True)
test_loader = DataLoader(split[1], batch_size=4, shuffle=False)
def show_landmarks_batch(sample_batched):
    images_batch, landmarks_batch = sample_batched['image'], sample_batched['landmarks']
    batch_size = len(images_batch)
    f, axs = plt.subplots(1, 4, figsize=(15, 15))
    for i in range(batch_size):
        sample_image = images_batch[i]
        sample_landmark = landmarks_batch[i]
        ax = plt.subplot(1, 4, i + 1)
        plt.tight_layout()
        plt.imshow(sample_image, cmap='gray')
        # Index -6 selects the nose keypoint; coordinates are normalized,
        # so scale x by the width (80) and y by the height (60)
        plt.scatter(sample_landmark[-6][0]*80, sample_landmark[-6][1]*60, s=200, marker='.', c='g')
Here are four sample images from the training set with the corresponding nose keypoint marked in green:
for i_batch, sample_batched in enumerate(train_loader):
    show_landmarks_batch(sample_batched)
    break
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 12, 3)
        self.conv2 = nn.Conv2d(12, 32, 5)
        self.conv3 = nn.Conv2d(32, 12, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(12*5*7, 120)
        self.fc2 = nn.Linear(120, 2)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(x.size(0), 12*5*7)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
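To see where the 12*5*7 in fc1 comes from, here is a quick shape trace for a 60x80 input (a sanity-check sketch, not part of the training code): each valid convolution shrinks the spatial dims by kernel_size - 1, and each 2x2 max-pool halves them (flooring).

# Sketch: trace the feature-map shapes for a 60x80 input
net = Net()
with torch.no_grad():
    x = torch.zeros(1, 1, 60, 80)
    x = net.pool(F.relu(net.conv1(x)))  # (1, 12, 29, 39)
    x = net.pool(F.relu(net.conv2(x)))  # (1, 32, 12, 17)
    x = net.pool(F.relu(net.conv3(x)))  # (1, 12, 5, 7) -> 12*5*7 = 420 features
print(x.shape)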
Nose-detection CNN Hyperparameters:
optimizer=adam
loss=MSELoss()
batch_size=4
learning_rate=0.001
epochs=15
net = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(params=net.parameters(), lr=1e-3)
training_losses = []
val_losses = []
all_outputs = torch.empty((0, 2))
for epoch in range(15):
    # TRAINING
    for i, data in enumerate(train_loader):
        # data is a dict with 'image' and 'landmarks'; index -6 is the nose keypoint
        inputs, labels = data['image'].view(4, 1, 60, 80).float(), data['landmarks'][:, -6].float()
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # TRAINING INFERENCE
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        inputs, labels = data['image'].view(4, 1, 60, 80).float(), data['landmarks'][:, -6].float()
        with torch.no_grad():
            outputs = net(inputs)
            loss = criterion(outputs, labels)
        running_loss += loss.item()
    running_loss /= len(train_loader)
    training_losses.append(running_loss)

    # VALIDATION INFERENCE
    running_loss = 0.0
    for i, data in enumerate(test_loader):
        inputs, labels = data['image'].view(4, 1, 60, 80).float(), data['landmarks'][:, -6].float()
        with torch.no_grad():
            outputs = net(inputs)
            loss = criterion(outputs, labels)
        running_loss += loss.item()
        all_outputs = torch.cat((all_outputs, outputs))
    running_loss /= len(test_loader)
    val_losses.append(running_loss)
print('Finished Training')
plt.figure(figsize=(20, 8))
sns.lineplot(x=range(15), y=training_losses, label='Training MSE')
sns.lineplot(x=range(15), y=val_losses, label='Validation MSE')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.title('MSE at each epoch')
plt.legend();
fig, axs = plt.subplots(2, 2, figsize=(15, 15))
plt.tight_layout()

ax = plt.subplot(2, 2, 1)
i = 205
plt.imshow(face_dataset[i]['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(face_dataset[i]['image']).float().view(1, 1, 60, 80)).detach()[0]
plt.scatter(out[0].item()*80, out[1].item()*60, s=60, color='r')
plt.scatter(face_dataset[i]['landmarks'][-6][0]*80, face_dataset[i]['landmarks'][-6][1]*60, s=60, color='g')

ax = plt.subplot(2, 2, 2)
i = 222
plt.imshow(face_dataset[i]['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(face_dataset[i]['image']).float().view(1, 1, 60, 80)).detach()[0]
plt.scatter(out[0].item()*80, out[1].item()*60, s=60, color='r')
plt.scatter(face_dataset[i]['landmarks'][-6][0]*80, face_dataset[i]['landmarks'][-6][1]*60, s=60, color='g')

ax = plt.subplot(2, 2, 3)
i = 215
plt.imshow(face_dataset[i]['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(face_dataset[i]['image']).float().view(1, 1, 60, 80)).detach()[0]
plt.scatter(out[0].item()*80, out[1].item()*60, s=60, color='r')
plt.scatter(face_dataset[i]['landmarks'][-6][0]*80, face_dataset[i]['landmarks'][-6][1]*60, s=60, color='g')

ax = plt.subplot(2, 2, 4)
i = 230
plt.imshow(face_dataset[i]['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(face_dataset[i]['image']).float().view(1, 1, 60, 80)).detach()[0]
plt.scatter(out[0].item()*80, out[1].item()*60, s=60, color='r')
plt.scatter(face_dataset[i]['landmarks'][-6][0]*80, face_dataset[i]['landmarks'][-6][1]*60, s=60, color='g');
The true nose location is shown in green and the detected output of our CNN is shown in red.
Our neural network detects the nose correctly in the top two images and incorrectly in the bottom two. It appears that our model does much better on front-facing images; detection for the bottom two images likely failed because the faces were tilted or rotated.
In this part, we attempt to detect all 58 facial keypoints. Unlike Part 1, we apply a few non-trivial transformations to the images before loading them into the dataloader: a resize to 120x160 pixels, a random linear brightness/contrast jitter, and a random rotation of up to 15 degrees in either direction (applied to both the image and its landmarks).

import torchvision.transforms as transforms
from torchvision.utils import save_image
class FaceLandmarksDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.landmarks_frame = pd.read_csv(csv_file, index_col=[0])
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        img_name = os.path.join(self.root_dir,
                                self.landmarks_frame.iloc[idx, 0])
        image = skio.rgb2gray(io.imread(img_name))
        image = transform.resize(image, (120, 160)).astype(np.float32)
        # Change brightness/contrast with a random linear jitter
        image = image*np.random.choice(np.arange(-0.99, 1.00, 0.02)) + np.random.choice(np.arange(-0.5, 0.5, 0.01))
        # Rotate the image by a random angle in [-15, 15] degrees
        angle = np.random.randint(-15, 16)
        image = transform.rotate(image, angle=angle)
        landmarks = self.landmarks_frame.iloc[idx, 1:]
        landmarks = np.array([landmarks]).astype(np.float32).reshape(-1, 2)
        # Rotate the landmarks about the image center (0.5, 0.5)
        theta = np.radians(angle)
        rotation = np.array([
            [np.cos(theta), -np.sin(theta)],
            [np.sin(theta),  np.cos(theta)]
        ])
        landmarks = (landmarks - 0.5) @ rotation + 0.5
        sample = {'image': image, 'landmarks': landmarks}
        return sample
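A note on why the landmark rotation matches the image rotation (a derivation of mine for sanity, assuming skimage's rotate is counter-clockwise about the center): in normalized coordinates with center c = (0.5, 0.5), a point p on the face moves to p' = R^T (p - c) + c, where R = [[cos(theta), -sin(theta)], [sin(theta), cos(theta)]]; the transpose appears because image coordinates have y pointing down. Since the landmarks are stored as row vectors, (landmarks - 0.5) @ rotation + 0.5 right-multiplies by R, which applies exactly R^T to each point, so the keypoints stay glued to the rotated face.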
face_dataset = FaceLandmarksDataset(csv_file='landmarks.csv', root_dir='images')
split = torch.utils.data.random_split(face_dataset, lengths=[192, 48], generator=torch.Generator().manual_seed(42))
train_loader = DataLoader(split[0], batch_size=4, shuffle=True)
test_loader = DataLoader(split[1], batch_size=4, shuffle=False)
def show_landmarks_batch(sample_batched):
    images_batch, landmarks_batch = sample_batched['image'], sample_batched['landmarks']
    batch_size = len(images_batch)
    f, axs = plt.subplots(1, 4, figsize=(15, 15))
    for i in range(batch_size):
        sample_image = images_batch[i]
        sample_landmark = landmarks_batch[i]
        ax = plt.subplot(1, 4, i + 1)
        plt.tight_layout()
        plt.imshow(sample_image, cmap='gray')
        plt.scatter(sample_landmark[:, 0]*160, sample_landmark[:, 1]*120, s=20, marker='.', c='g')
Here are four images from our training set with the corresponding keypoints labeled in green.
for i_batch, sample_batched in enumerate(train_loader):
    show_landmarks_batch(sample_batched)
    break
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 12, 3)
        self.conv2 = nn.Conv2d(12, 32, 3)
        self.conv3 = nn.Conv2d(32, 32, 3)
        self.conv4 = nn.Conv2d(32, 32, 5)
        self.conv5 = nn.Conv2d(32, 12, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(1512, 256)
        self.fc2 = nn.Linear(256, 116)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = F.relu(self.conv2(x))
        x = self.pool(F.relu(self.conv3(x)))
        x = F.relu(self.conv4(x))
        x = self.pool(F.relu(self.conv5(x)))
        x = x.view(x.size(0), 1512)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
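The 1512 in fc1 again follows from the shape arithmetic; here is a quick sketch tracing a 120x160 input through the layers (note that conv2 and conv4 are not followed by pooling):

# Sketch: trace the feature-map shapes for a 120x160 input
net = Net()
with torch.no_grad():
    x = torch.zeros(1, 1, 120, 160)
    x = net.pool(F.relu(net.conv1(x)))  # (1, 12, 59, 79)
    x = F.relu(net.conv2(x))            # (1, 32, 57, 77)
    x = net.pool(F.relu(net.conv3(x)))  # (1, 32, 27, 37)
    x = F.relu(net.conv4(x))            # (1, 32, 23, 33)
    x = net.pool(F.relu(net.conv5(x)))  # (1, 12, 9, 14) -> 12*9*14 = 1512 features
print(x.shape)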
We use the neural network architecture above with the following hyperparameters:
batch_size=4
learning_rate=0.002
epochs=15
optimizer=adam
loss=MSELoss()
net = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.002)
training_losses = []
val_losses = []
all_outputs = torch.empty((0, 58, 2))
for epoch in range(15):
    # TRAINING
    for i, data in enumerate(train_loader):
        # data is a dict with the image and all 58 (x, y) landmarks
        inputs, labels = data['image'].view(4, 1, 120, 160).float(), data['landmarks'].float()
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels.view(4, 116))
        loss.backward()
        optimizer.step()

    # TRAINING INFERENCE
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        inputs, labels = data['image'].view(4, 1, 120, 160).float(), data['landmarks'].float()
        with torch.no_grad():
            outputs = net(inputs)
            loss = criterion(outputs, labels.view(4, 116))
        running_loss += loss.item()
    running_loss /= len(train_loader)
    training_losses.append(running_loss)

    # VALIDATION INFERENCE
    running_loss = 0.0
    for i, data in enumerate(test_loader):
        inputs, labels = data['image'].view(4, 1, 120, 160).float(), data['landmarks'].float()
        with torch.no_grad():
            outputs = net(inputs)
            loss = criterion(outputs, labels.view(4, 116))
        running_loss += loss.item()
        all_outputs = torch.cat((all_outputs, outputs.view(4, 58, 2)))
    running_loss /= len(test_loader)
    val_losses.append(running_loss)
print('Finished Training')

plt.figure(figsize=(20, 8))
sns.lineplot(x=range(15), y=training_losses, label='Training MSE')
sns.lineplot(x=range(15), y=val_losses, label='Validation MSE')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.title('MSE at each epoch')
plt.legend()
fig, axs = plt.subplots(2, 2, figsize=(15, 15))
plt.tight_layout()

ax = plt.subplot(2, 2, 1)
i = 222
sample = face_dataset[i]
plt.imshow(sample['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(sample['image']).float().view(1, 1, 120, 160)).detach()[0].view(58, 2)
plt.scatter(out[:, 0]*160, out[:, 1]*120, s=20, color='r')
plt.scatter(sample['landmarks'][:, 0]*160, sample['landmarks'][:, 1]*120, s=60, color='g')

ax = plt.subplot(2, 2, 2)
i = 210
sample = face_dataset[i]
plt.imshow(sample['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(sample['image']).float().view(1, 1, 120, 160)).detach()[0].view(58, 2)
plt.scatter(out[:, 0]*160, out[:, 1]*120, s=20, color='r')
plt.scatter(sample['landmarks'][:, 0]*160, sample['landmarks'][:, 1]*120, s=60, color='g')

ax = plt.subplot(2, 2, 3)
i = 239
sample = face_dataset[i]
plt.imshow(sample['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(sample['image']).float().view(1, 1, 120, 160)).detach()[0].view(58, 2)
plt.scatter(out[:, 0]*160, out[:, 1]*120, s=20, color='r')
plt.scatter(sample['landmarks'][:, 0]*160, sample['landmarks'][:, 1]*120, s=60, color='g')

ax = plt.subplot(2, 2, 4)
i = 233
sample = face_dataset[i]
plt.imshow(sample['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(sample['image']).float().view(1, 1, 120, 160)).detach()[0].view(58, 2)
plt.scatter(out[:, 0]*160, out[:, 1]*120, s=20, color='r')
plt.scatter(sample['landmarks'][:, 0]*160, sample['landmarks'][:, 1]*120, s=60, color='g')
True landmarks are shown in green and detected landmarks are shown in red.
As expected, our model performs extremely well on front-facing images (even with non-trivial rotation) but terribly on images where the person is primarily looking to the side.
# Code Source:
# https://colab.research.google.com/github/Niranjankumar-c/DeepLearning-PadhAI/blob/master/DeepLearning_Materials/6_VisualizationCNN_Pytorch/CNNVisualisation.ipynb
def plot_filters_single_channel_big(t):
    # setting the rows and columns
    nrows = t.shape[0]*t.shape[2]
    ncols = t.shape[1]*t.shape[3]
    npimg = np.array(t.numpy(), np.float32)
    npimg = npimg.transpose((0, 2, 1, 3))
    npimg = npimg.ravel().reshape(nrows, ncols)
    npimg = npimg.T
    fig, ax = plt.subplots(figsize=(ncols/10, nrows/200))
    imgplot = sns.heatmap(npimg, xticklabels=False, yticklabels=False, cmap='gray', ax=ax, cbar=False)
def plot_filters_single_channel(t):
    # kernel depth * number of kernels
    nplots = t.shape[0]*t.shape[1]
    ncols = 12
    nrows = 1 + nplots//ncols
    # convert tensor to numpy image
    npimg = np.array(t.numpy(), np.float32)
    count = 0
    fig = plt.figure(figsize=(ncols, nrows))
    # looping through all the kernels in each channel
    for i in range(t.shape[0]):
        for j in range(t.shape[1]):
            count += 1
            ax1 = fig.add_subplot(nrows, ncols, count)
            npimg = np.array(t[i, j].numpy(), np.float32)
            npimg = (npimg - np.mean(npimg)) / np.std(npimg)
            npimg = np.minimum(1, np.maximum(0, (npimg + 0.5)))
            ax1.imshow(npimg)
            ax1.set_title(str(i) + ',' + str(j))
            ax1.axis('off')
            ax1.set_xticklabels([])
            ax1.set_yticklabels([])
    plt.tight_layout()
    plt.show()
def plot_filters_multi_channel(t):
    # get the number of kernels
    num_kernels = t.shape[0]
    # define number of columns for subplots
    num_cols = 12
    # rows = number of kernels
    num_rows = num_kernels
    # set the figure size
    fig = plt.figure(figsize=(num_cols, num_rows))
    # looping through all the kernels
    for i in range(t.shape[0]):
        ax1 = fig.add_subplot(num_rows, num_cols, i+1)
        # for each kernel, we convert the tensor to numpy
        npimg = np.array(t[i].numpy(), np.float32)
        # standardize the numpy image
        npimg = (npimg - np.mean(npimg)) / np.std(npimg)
        npimg = np.minimum(1, np.maximum(0, (npimg + 0.5)))
        npimg = npimg.transpose((1, 2, 0))
        ax1.imshow(npimg)
        ax1.axis('off')
        ax1.set_title(str(i))
        ax1.set_xticklabels([])
        ax1.set_yticklabels([])
    plt.savefig('myimage.png', dpi=100)
    plt.tight_layout()
    plt.show()
def plot_weights(model, layer_num, single_channel=True, collated=False):
    # extracting the model features at the particular layer number
    layer = list(model.children())[layer_num]
    # checking whether the layer is a convolutional layer or not
    if isinstance(layer, nn.Conv2d):
        # getting the weight tensor data
        weight_tensor = layer.weight.data
        if single_channel:
            if collated:
                plot_filters_single_channel_big(weight_tensor)
            else:
                plot_filters_single_channel(weight_tensor)
        else:
            if weight_tensor.shape[1] == 3:
                plot_filters_multi_channel(weight_tensor)
            else:
                print("Can only plot weights with three channels with single_channel=False")
    else:
        print("Can only visualize layers which are convolutional")
plot_weights(net, 0, single_channel = True)
plot_weights(net, 1, single_channel = True)
plot_weights(net, 2, single_channel = True)
In this part, we continue with keypoint detection on a much larger dataset (6,666 images). The images in this dataset have much greater variance in image size and face positioning, so we crop out just the face, resize the crop to 224x224 pixels, and apply the augmentations from Part 2. Here we use an 80/20 train-validation split, although we ultimately train on the entire provided dataset since the true test set is unlabeled. Since the dataset for this part is extremely large, we train our convolutional neural network on Google Colab with a dedicated GPU.
import xml.etree.ElementTree as ET

%%capture
if not os.path.exists('/content/ibug_300W_large_face_landmark_dataset'):
    !wget https://people.eecs.berkeley.edu/~zhecao/ibug_300W_large_face_landmark_dataset.zip
    !unzip 'ibug_300W_large_face_landmark_dataset.zip'
    !rm -r 'ibug_300W_large_face_landmark_dataset.zip'
class FullDataset(Dataset):
    def __init__(self, transform=None):
        # Source code provided in the project spec
        tree = ET.parse('ibug_300W_large_face_landmark_dataset/labels_ibug_300W_train.xml')
        root = tree.getroot()
        root_dir = 'ibug_300W_large_face_landmark_dataset'
        bboxes = []  # face bounding box used to crop the image
        landmarks = []  # the facial keypoints/landmarks for the whole training dataset
        img_filenames = []  # the image names for the whole dataset
        for filename in root[2]:
            img_filenames.append(os.path.join(root_dir, filename.attrib['file']))
            box = filename[0].attrib
            # x, y for the top-left corner of the box; w, h for box width and height
            bboxes.append([box['left'], box['top'], box['width'], box['height']])
            landmark = []
            for num in range(68):
                x_coordinate = int(filename[0][num].attrib['x'])
                y_coordinate = int(filename[0][num].attrib['y'])
                landmark.append([x_coordinate, y_coordinate])
            landmarks.append(landmark)
        landmarks = np.array(landmarks).astype('float32')
        bboxes = np.array(bboxes).astype('float32')
        self.img_filenames = img_filenames
        self.landmarks = landmarks
        self.bboxes = bboxes

    def __len__(self):
        return len(self.img_filenames)

    def __getitem__(self, idx):
        image = skio.rgb2gray(plt.imread(self.img_filenames[idx]))
        landmarks = self.landmarks[idx]
        bboxes = self.bboxes[idx]
        # Expand the bounding box so every landmark falls inside the crop
        min_x = max(0, min(int(bboxes[0]), min(landmarks[:, 0]).astype(int)))
        min_y = max(0, min(int(bboxes[1]), min(landmarks[:, 1]).astype(int)))
        width = int(bboxes[2]*1.3)
        height = int(bboxes[3]*1.3)
        new_img = image[min_y:min_y+height, min_x:min_x+width]
        # Normalize the landmarks to [0, 1] relative to the crop
        new_landmarks = landmarks.copy()
        new_landmarks[:, 0] = (new_landmarks[:, 0] - min_x) / new_img.shape[1]
        new_landmarks[:, 1] = (new_landmarks[:, 1] - min_y) / new_img.shape[0]
        new_img = transform.resize(new_img, (224, 224)).astype(np.float32)
        # Change brightness/contrast with a random linear jitter
        new_img = new_img*np.random.choice(np.arange(-0.49, 0.51, 0.02)) + np.random.choice(np.arange(-0.5, 0.5, 0.01))
        # Rotate the image by a random angle in [-15, 15] degrees
        angle = np.random.randint(-15, 16)
        new_img = transform.rotate(new_img, angle=angle)
        # Rotate the landmarks about the image center (see the note in Part 2)
        theta = np.radians(angle)
        rotation = np.array([
            [np.cos(theta), -np.sin(theta)],
            [np.sin(theta),  np.cos(theta)]
        ])
        new_landmarks = (new_landmarks - 0.5) @ rotation + 0.5
        sample = {'image': new_img, 'landmarks': new_landmarks, 'old_shape': image.shape, 'cropped': (min_x, min_y)}
        return sample
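Since __getitem__ normalizes each landmark to [0, 1] relative to the crop (x_norm = (x - min_x) / crop_width, and likewise for y), mapping a prediction back to original pixel coordinates just inverts this map: x = x_norm * crop_width + min_x. That inversion is exactly what the test-inference code further below does with the old_shape and cropped fields of each sample.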
full_dataset = FullDataset()
split = torch.utils.data.random_split(full_dataset, lengths=[5333, 1333], generator=torch.Generator().manual_seed(42))
train_loader = DataLoader(split[0], batch_size=16, shuffle=True, num_workers=2)
test_loader = DataLoader(split[1], batch_size=16, shuffle=False, num_workers=2)
Here are 16 sample images from the training set and their corresponding facial keypoints.
def show_landmarks_batch(sample_batched):
    images_batch, landmarks_batch = sample_batched['image'], sample_batched['landmarks']
    batch_size = len(images_batch)
    # 16 images per batch, shown on a 4x4 grid
    f, axs = plt.subplots(4, 4, figsize=(15, 15))
    for i in range(batch_size):
        sample_image = images_batch[i]
        sample_landmark = landmarks_batch[i]
        ax = plt.subplot(4, 4, i + 1)
        plt.tight_layout()
        plt.imshow(sample_image, cmap='gray')
        plt.scatter(sample_landmark[:, 0]*224, sample_landmark[:, 1]*224, s=20, marker='.', c='g')

for i_batch, sample_batched in enumerate(train_loader):
    show_landmarks_batch(sample_batched)
    break
from torchvision import models
Here, we use the resnet18 architecture (not pre-trained) from https://pytorch.org/docs/stable/torchvision/models.html, with slight modifications to the first convolutional layer (to accept 1-channel grayscale input) and the final linear layer (to output 136 values, i.e. 68 keypoints) so that the CNN is compatible with our data dimensions.
Hyperparameters:
learning_rate=0.001
batch_size=16
epochs=10
optimizer=adam
loss=MSELoss()
The chosen hyperparameters are fairly arbitrary; I didn't have enough time to tune them since training takes roughly 2-3 hours (a sketch of what a learning-rate sweep could look like follows the setup code below).
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
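Had time allowed, a coarse learning-rate sweep on the existing 80/20 split would have been the natural first tuning step. A minimal sketch of what that could look like (hypothetical; I did not actually run this):

# Hypothetical sketch: train one epoch per candidate learning rate on the
# 80/20 split and compare validation MSE
for lr in [1e-2, 1e-3, 1e-4]:
    model = models.resnet18()
    model.conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
    model.fc = torch.nn.Linear(512, 136)
    model = model.to(device)
    opt = optim.Adam(model.parameters(), lr=lr)
    for data in train_loader:
        inputs = data['image'].view(-1, 1, 224, 224).float().to(device)
        labels = data['landmarks'].view(-1, 136).float().to(device)
        opt.zero_grad()
        loss = nn.MSELoss()(model(inputs), labels)
        loss.backward()
        opt.step()
    with torch.no_grad():
        val_mse = sum(
            nn.MSELoss()(model(d['image'].view(-1, 1, 224, 224).float().to(device)),
                         d['landmarks'].view(-1, 136).float().to(device)).item()
            for d in test_loader) / len(test_loader)
    print(f'lr={lr}: validation MSE {val_mse:.5f}')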
from torchvision import models
resnet = models.resnet18()
resnet.conv1 = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
resnet.fc = torch.nn.Linear(512, 136, bias=True)
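As a quick sanity check (a one-line sketch of my own), the modified network should map a 1-channel 224x224 batch to 136 outputs, i.e. 68 (x, y) pairs:

with torch.no_grad():
    print(resnet(torch.zeros(2, 1, 224, 224)).shape)  # torch.Size([2, 136])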
criterion = nn.MSELoss()
optimizer = optim.Adam(params=resnet.parameters(), lr=1e-3)
training_losses = []
val_losses = []
resnet = resnet.to(device)
for epoch in range(10):
    # TRAINING
    for i, data in enumerate(train_loader):
        inputs = data['image'].view(data['image'].shape[0], 1, 224, 224).float().to(device)
        labels = data['landmarks'].float().to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = resnet(inputs)
        loss = criterion(outputs, labels.view(data['image'].shape[0], 136))
        loss.backward()
        optimizer.step()

    # TRAINING INFERENCE
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        inputs = data['image'].view(data['image'].shape[0], 1, 224, 224).float().to(device)
        labels = data['landmarks'].float().to(device)
        with torch.no_grad():
            outputs = resnet(inputs)
            loss = criterion(outputs, labels.view(data['image'].shape[0], 136))
        running_loss += loss.item()
    running_loss /= len(train_loader)
    training_losses.append(running_loss)

    # VALIDATION INFERENCE
    running_loss = 0.0
    for i, data in enumerate(test_loader):
        inputs = data['image'].view(data['image'].shape[0], 1, 224, 224).float().to(device)
        labels = data['landmarks'].float().to(device)
        with torch.no_grad():
            outputs = resnet(inputs)
            loss = criterion(outputs, labels.view(data['image'].shape[0], 136))
        running_loss += loss.item()
    running_loss /= len(test_loader)
    val_losses.append(running_loss)
print('Finished Training')
sns.lineplot(x=range(10), y=training_losses, label='Training MSE')
sns.lineplot(x=range(10), y=val_losses, label='Validation MSE')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.title('MSE at each epoch')
plt.legend()
# Train on the entire dataset
from torchvision import models

# all_loader was not defined in the original notebook; it is assumed to
# iterate over the full 6666-image dataset with the same batch size as above
all_loader = DataLoader(full_dataset, batch_size=16, shuffle=True, num_workers=2)

resnet = models.resnet18()
resnet.conv1 = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
resnet.fc = torch.nn.Linear(512, 136, bias=True)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=resnet.parameters(), lr=1e-3)
training_losses = []
val_losses = []
resnet = resnet.to(device)
for epoch in range(10):
    # TRAINING
    for i, data in enumerate(all_loader):
        inputs = data['image'].view(data['image'].shape[0], 1, 224, 224).float().to(device)
        labels = data['landmarks'].float().to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = resnet(inputs)
        loss = criterion(outputs, labels.view(data['image'].shape[0], 136))
        loss.backward()
        optimizer.step()
    print(f'EPOCH {epoch}: {loss.item()}')

# Save the weights so the trained model can be reloaded for test-time inference
torch.save(resnet.state_dict(), 'resnet.th')
# These losses are unfortunately hardcoded because my Google Colab instance
# died and I lost the exact values
training_losses = [0.0055, 0.0056, 0.0048, 0.0051, 0.0054, 0.0032, 0.0023, 0.0025, 0.0008, 0.0006]
val_losses = [0.0055, 0.0058, 0.005, 0.0052, 0.0055, 0.0033, 0.0024, 0.0026, 0.001, 0.0008]
plt.figure(figsize=(20, 8))
sns.lineplot(x=range(10), y=training_losses, label='Training MSE')
sns.lineplot(x=range(10), y=val_losses, label='Validation MSE')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.title('MSE at each Epoch');
tree = ET.parse('labels_ibug_300W_test_parsed.xml')
root = tree.getroot()
root_dir = 'ibug_300W_large_face_landmark_dataset'
bboxes = []  # face bounding box used to crop the image
img_filenames = []  # the image names for the whole dataset
for filename in root[2]:
    img_filenames.append(os.path.join(root_dir, filename.attrib['file']))
    box = filename[0].attrib
    # x, y for the top-left corner of the box; w, h for box width and height
    bboxes.append([box['left'], box['top'], box['width'], box['height']])
bboxes = np.array(bboxes).astype('float32')
class TestDataset(Dataset):
    def __init__(self, transform=None):
        # Source code provided in the project spec
        tree = ET.parse('labels_ibug_300W_test_parsed.xml')
        root = tree.getroot()
        root_dir = 'ibug_300W_large_face_landmark_dataset'
        bboxes = []  # face bounding box used to crop the image
        img_filenames = []  # the image names for the whole dataset
        for filename in root[2]:
            img_filenames.append(os.path.join(root_dir, filename.attrib['file']))
            box = filename[0].attrib
            # x, y for the top-left corner of the box; w, h for box width and height
            bboxes.append([box['left'], box['top'], box['width'], box['height']])
        bboxes = np.array(bboxes).astype('float32')
        self.img_filenames = img_filenames
        self.bboxes = bboxes

    def __len__(self):
        return len(self.img_filenames)

    def __getitem__(self, idx):
        image = skio.rgb2gray(plt.imread(self.img_filenames[idx]))
        bboxes = self.bboxes[idx]
        min_x = max(0, int(bboxes[0]))
        min_y = max(0, int(bboxes[1]))
        width = int(bboxes[2]*1.3)
        height = int(bboxes[3]*1.3)
        new_img = image[min_y:min_y+height, min_x:min_x+width]
        old_shape = [new_img.shape[0], new_img.shape[1]]
        new_img = transform.resize(new_img, (224, 224)).astype(np.float32)
        sample = {'image': new_img, 'old_shape': old_shape, 'cropped': (min_x, min_y)}
        return sample
# Load the model trained on ALL 6666 samples
resnet = models.resnet18()
resnet.conv1 = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
resnet.fc = torch.nn.Linear(512, 136, bias=True)
resnet.load_state_dict(torch.load('resnet.th'))

test_dataset = TestDataset()
landmarks = torch.empty((0, 68, 2))
with torch.no_grad():
    for sample in test_dataset:
        img = torch.from_numpy(sample['image'])
        outputs = resnet(img.view(1, 1, 224, 224)).view(1, 68, 2)
        # Map normalized crop coordinates back to original image pixels
        outputs[:, :, 0] = outputs[:, :, 0]*sample['old_shape'][1] + sample['cropped'][0]
        outputs[:, :, 1] = outputs[:, :, 1]*sample['old_shape'][0] + sample['cropped'][1]
        landmarks = torch.cat((landmarks, outputs))
landmarks = landmarks.numpy()
# 1008 test images x 68 points x 2 coordinates = 137088 predictions
pd.DataFrame(data={'Id': np.arange(137088), 'Predicted': landmarks.flatten()}).to_csv('kaggle.csv')
f, axs = plt.subplots(2, 2, figsize=(15, 15))
plt.tight_layout()
i = 15
ax = plt.subplot(2, 2, 1)
plt.imshow(plt.imread(test_dataset.img_filenames[i]))
plt.scatter(landmarks[i][:, 0], landmarks[i][:, 1])
i = 130
ax = plt.subplot(2, 2, 2)
plt.imshow(plt.imread(test_dataset.img_filenames[i]))
plt.scatter(landmarks[i][:, 0], landmarks[i][:, 1])
i = 115
ax = plt.subplot(2, 2, 3)
plt.imshow(plt.imread(test_dataset.img_filenames[i]))
plt.scatter(landmarks[i][:, 0], landmarks[i][:, 1])
i = 112
ax = plt.subplot(2, 2, 4)
plt.imshow(plt.imread(test_dataset.img_filenames[i]))
plt.scatter(landmarks[i][:, 0], landmarks[i][:, 1]);
f, axs = plt.subplots(1, 2, figsize=(15, 8))
plt.tight_layout()
i = 200
ax = plt.subplot(1, 2, 1)
plt.imshow(plt.imread(test_dataset.img_filenames[i]))
plt.scatter(landmarks[i][:, 0], landmarks[i][:, 1])
i = 873
ax = plt.subplot(1, 2, 2)
plt.imshow(plt.imread(test_dataset.img_filenames[i]))
plt.scatter(landmarks[i][:, 0], landmarks[i][:, 1])
The keypoint detection range seems to be way too large for these two images, likely because both faces are at fairly unusual angles (it's also possible that this is because the children's faces are particularly round, while most of our training data are adults with more oval-shaped faces!)
obama = plt.imread('obama.jpg')
biden = plt.imread('biden.jpg')
trump = plt.imread('trump.jpg')
pence = plt.imread('pence.jpg')
images = [obama, biden, trump, pence]
politicians = {}

f, axs = plt.subplots(2, 2, figsize=(15, 15))
plt.tight_layout()

ax = plt.subplot(2, 2, 1)
with torch.no_grad():
    # Hand-cropped region around the face; predictions are mapped back to the
    # full image by scaling by the crop size and adding the crop offset
    obama_new = skio.rgb2gray(obama)[0:800, 400:1000]
    out = resnet(torch.from_numpy(transform.resize(obama_new, (224, 224))).view(1, 1, 224, 224).float())
    out = out.view(68, 2)
    out[:, 0] = out[:, 0]*obama_new.shape[1] + 400
    out[:, 1] = out[:, 1]*obama_new.shape[0] + 0
plt.imshow(obama)
plt.scatter(out[:, 0], out[:, 1])

ax = plt.subplot(2, 2, 2)
with torch.no_grad():
    biden_new = skio.rgb2gray(biden)[0:800, 100:800]
    out = resnet(torch.from_numpy(transform.resize(biden_new, (224, 224))).view(1, 1, 224, 224).float())
    out = out.view(68, 2)
    out[:, 0] = out[:, 0]*biden_new.shape[1] + 100
    out[:, 1] = out[:, 1]*biden_new.shape[0] + 0
plt.imshow(biden)
plt.scatter(out[:, 0], out[:, 1])

ax = plt.subplot(2, 2, 3)
with torch.no_grad():
    trump_new = skio.rgb2gray(trump[50:300, 200:400])
    out = resnet(torch.from_numpy(transform.resize(trump_new, (224, 224))).view(1, 1, 224, 224).float())
    out = out.view(68, 2)
    out[:, 0] = out[:, 0]*trump_new.shape[1] + 200
    out[:, 1] = out[:, 1]*trump_new.shape[0] + 50
plt.imshow(trump)
plt.scatter(out[:, 0], out[:, 1])

ax = plt.subplot(2, 2, 4)
with torch.no_grad():
    pence_new = skio.rgb2gray(pence)[0:600, 400:800]
    out = resnet(torch.from_numpy(transform.resize(pence_new, (224, 224))).view(1, 1, 224, 224).float())
    out = out.view(68, 2)
    out[:, 0] = out[:, 0]*pence_new.shape[1] + 400
    out[:, 1] = out[:, 1]*pence_new.shape[0] + 0
plt.imshow(pence)
plt.scatter(out[:, 0], out[:, 1]);
The model seems to perform fairly well at finding the overall outline of the face, but does a pretty poor job of outlining the actual facial features. I wonder if this has to do with the way that I am cropping the images.
FILE = "main.html"
with open(FILE, 'r') as html_file:
    content = html_file.read()
# Get rid of prompts and source code in the exported HTML by injecting
# display: none into the relevant CSS rules
content = content.replace("div.input_area {", "div.input_area {\n\tdisplay: none;")
content = content.replace(".prompt {", ".prompt {\n\tdisplay: none;")
with open(FILE, 'w') as f:
    f.write(content)