In [420]:
from skimage import io, transform
import skimage.color as skio
import numpy as np
from __future__ import print_function, division
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import seaborn as sns

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

Facial Keypoint Detection with Neural Networks

David Yi

Part 1: Nose Tip Detection

Building the Dataloader

To build the dataloader, I made small tweaks to the examples used in the Pytorch Tutorial. In this part, the only transformation that we make apply to the images is a resizing to 60x80 pixels. We use the first 192 images for our training set and the remaining 48 images for validation. In this part, we are only interested in detecting the location of the nose keypoint.

In [421]:
# Create CSV that matches Dataset format

landmarks_df = pd.DataFrame(columns=range(117))

for i, filename in enumerate(os.listdir('imm_face_db')):
    if filename.endswith('asf'):
        with open('imm_face_db/' + filename) as fp:
            img_name = filename.replace('asf', 'jpg')
            points = fp.readlines()[16:74]
            landmark = [img_name]
            for point in points:
                x, y = point.split('\t')[2:4]
                landmark.append(float(x))
                landmark.append(float(y))
            landmarks_df.loc[i] = landmark
landmarks_df.to_csv('landmarks.csv')
In [422]:
class FaceLandmarksDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.landmarks_frame = pd.read_csv(csv_file, index_col=[0])
        self.root_dir = root_dir
        self.transform = transform
    
    def __len__(self):
        return len(self.landmarks_frame)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        img_name = os.path.join(self.root_dir,
                               self.landmarks_frame.iloc[idx, 0])
        image = skio.rgb2gray(io.imread(img_name))
        image = transform.resize(image, (60, 80))
        landmarks = self.landmarks_frame.iloc[idx, 1:]
        landmarks = np.array([landmarks]).astype(float).reshape(-1, 2)
        sample = {'image': image, 'landmarks': landmarks}
        
        if self.transform:
            sample = self.transform(sample)
        
        return sample
In [423]:
face_dataset = FaceLandmarksDataset(csv_file='landmarks.csv', root_dir='images')
split = torch.utils.data.random_split(face_dataset, lengths=[192, 48], generator=torch.Generator().manual_seed(42))
train_loader = DataLoader(split[0], batch_size=4, shuffle=True)
test_loader = DataLoader(split[1], batch_size=4, shuffle=False)
In [426]:
def show_landmarks_batch(sample_batched):
    images_batch, landmarks_batch = sample_batched['image'], sample_batched['landmarks']
    batch_size = len(images_batch)
    
    f, axs = plt.subplots(1, 4, figsize=(15,15))
    for i in range(batch_size):
        sample_image = images_batch[i]
        sample_landmark = landmarks_batch[i]
        
        ax = plt.subplot(1, 4, i+1)
        plt.tight_layout()
        plt.imshow(sample_image, cmap='gray')
        plt.scatter(sample_landmark[-6][0]*80, sample_landmark[-6][1]*60, s=200, marker='.', c='g')

Here are four sample images from the training set with the corresponding nose keypoint marked in green:

In [427]:
for i_batch, sample_batched in enumerate(train_loader):
    show_landmarks_batch(sample_batched)
    break

Convolutional Neural Network (Nose Tip Detection)

In [445]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
In [446]:
class Net(nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 12, 3)
        self.conv2 = nn.Conv2d(12, 32, 5)
        self.conv3 = nn.Conv2d(32, 12, 3)
        self.pool = nn.MaxPool2d(2, 2)
        
        self.fc1 = nn.Linear(12*5*7, 120)
        self.fc2 = nn.Linear(120, 2)
    
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(x.size(0), 12*5*7)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 12, 3)
        self.conv2 = nn.Conv2d(12, 32, 5)
        self.conv3 = nn.Conv2d(32, 12, 3)
        self.pool = nn.MaxPool2d(2, 2)

        self.fc1 = nn.Linear(12*5*7, 120)
        self.fc2 = nn.Linear(120, 2)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(x.size(0), 12*5*7)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)

        return x

Nose-detection CNN Hyperparameters:

  • optimizer=adam
  • loss=MSELoss()
  • batch_size=4
  • learning_rate=0.001
  • epochs=15

Training

In [471]:
net = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(params=net.parameters(), lr=1e-3)

training_losses = []
val_losses = []
all_outputs = torch.empty((0, 2))
for epoch in range(15): 

    # TRAINING
    for i, data in enumerate(train_loader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['image'].view(4, 1, 60, 80).float(), data['landmarks'][:, -6].float()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    # TRAINING INFERENCE
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['image'].view(4, 1, 60, 80).float(), data['landmarks'][:, -6].float()
        with torch.no_grad():
            outputs = net(inputs)
            loss = criterion(outputs, labels)
        running_loss += loss.item()
    running_loss /= len(train_loader)
    training_losses.append(running_loss)
    
    # VALIDATION INFERENCE
    running_loss = 0.0
    for i, data in enumerate(test_loader):
        inputs, labels = data['image'].view(4, 1, 60, 80).float(), data['landmarks'][:, -6].float()
        with torch.no_grad():
            outputs = net(inputs)
            loss = criterion(outputs, labels)
        running_loss += loss.item()
        all_outputs = torch.cat((all_outputs, outputs))
    running_loss /= len(test_loader)
    val_losses.append(running_loss)
        
print('Finished Training')
In [449]:
plt.figure(figsize=(20, 8))

sns.lineplot(range(15), training_losses, label='Training MSE')
sns.lineplot(range(15), val_losses, label='Validation MSE')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.title('MSE at each epoch')
plt.legend();

Sample Detections on the Test Set

In [460]:
fig, axs = plt.subplots(2, 2, figsize=(15, 15))
plt.tight_layout()

ax = plt.subplot(2, 2, 1)
i = 205
plt.imshow(face_dataset[i]['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(face_dataset[i]['image']).float().view(1, 1, 60, 80)).detach()[0]
plt.scatter(out[0].item()*80, out[1].item()*60, s=60, color='r')
plt.scatter(face_dataset[i]['landmarks'][-6][0]*80, face_dataset[i]['landmarks'][-6][1]*60, s=60, color='g')

ax = plt.subplot(2, 2, 2)
i = 222
plt.imshow(face_dataset[i]['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(face_dataset[i]['image']).float().view(1, 1, 60, 80)).detach()[0]
plt.scatter(out[0].item()*80, out[1].item()*60, s=60, color='r')
plt.scatter(face_dataset[i]['landmarks'][-6][0]*80, face_dataset[i]['landmarks'][-6][1]*60, s=60, color='g')


ax = plt.subplot(2, 2, 3)
i = 215
plt.imshow(face_dataset[i]['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(face_dataset[i]['image']).float().view(1, 1, 60, 80)).detach()[0]
plt.scatter(out[0].item()*80, out[1].item()*60, s=60, color='r')
plt.scatter(face_dataset[i]['landmarks'][-6][0]*80, face_dataset[i]['landmarks'][-6][1]*60, s=60, color='g')

ax = plt.subplot(2, 2, 4)
i = 230
plt.imshow(face_dataset[i]['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(face_dataset[i]['image']).float().view(1, 1, 60, 80)).detach()[0]
plt.scatter(out[0].item()*80, out[1].item()*60, s=60, color='r')
plt.scatter(face_dataset[i]['landmarks'][-6][0]*80, face_dataset[i]['landmarks'][-6][1]*60, s=60, color='g');

The true nose location is shown in green and the detected output of our CNN is shown in red.

Our neural network detects the nost correctly in the top 2 images and incorrectly for the bottom two images. It appears that our classifier does much better on front-facing images, and detection for the bottom two images likely failed because the faces were tilted or rotated.

Part 2: Full Facial Keypoints Detection

In this part, we attempt to detect all facial keypoints. Unlike part 1, we apply a few non-trivial transformations to the images before loading them into the dataloader including:

  • Resizing the image to 120x160 pixels
  • Randomly changing the brightness and saturation of the image
  • Randomly rotating the image by -15 to 15 degrees
In [461]:
import torchvision.transforms as transforms
from torchvision.utils import save_image
In [462]:
class FaceLandmarksDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.landmarks_frame = pd.read_csv(csv_file, index_col=[0])
        self.root_dir = root_dir
        self.transform = None
    
    def __len__(self):
        return len(self.landmarks_frame)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        img_name = os.path.join(self.root_dir,
                               self.landmarks_frame.iloc[idx, 0])
        image = skio.rgb2gray(io.imread(img_name))
        image = transform.resize(image, (120, 160)).astype(np.float32)
        
        # Change brightness/saturation
        image = image*np.random.choice(np.arange(-0.99, 1.00, 0.02)) + np.random.choice(np.arange(-0.5, 0.5, 0.01))
        
        # Rotate Image
        angle = np.random.randint(-15, 16)
        image = transform.rotate(image, angle=angle)
        
        landmarks = self.landmarks_frame.iloc[idx, 1:]
        landmarks = np.array([landmarks]).astype(np.float32).reshape(-1, 2)
        
        # Rotate Landmarks
        theta = np.radians(angle)
        rotation = np.array([
            [np.cos(theta), -np.sin(theta)],
            [np.sin(theta), np.cos(theta)]
        ])
        landmarks = (landmarks-0.5) @ rotation + 0.5
        
        # landmarks transforms
        sample = {'image': image, 'landmarks': landmarks}
        
        return sample
In [463]:
face_dataset = FaceLandmarksDataset(csv_file='landmarks.csv', root_dir='images')
split = torch.utils.data.random_split(face_dataset, lengths=[192, 48], generator=torch.Generator().manual_seed(42))
train_loader = DataLoader(split[0], batch_size=4, shuffle=True)
test_loader = DataLoader(split[1], batch_size=4, shuffle=False)
In [464]:
def show_landmarks_batch(sample_batched):
    images_batch, landmarks_batch = sample_batched['image'], sample_batched['landmarks']
    batch_size = len(images_batch)
    
    f, axs = plt.subplots(1, 4, figsize=(15,15))
    for i in range(batch_size):
        sample_image = images_batch[i]
        sample_landmark = landmarks_batch[i]
        
        ax = plt.subplot(1, 4, i+1)
        plt.tight_layout()
        plt.imshow(sample_image, cmap='gray')
        plt.scatter(sample_landmark[:, 0]*160, sample_landmark[:, 1]*120, s=20, marker='.', c='g')

Here are four images from our training set with the corresponding keypoints labeled in green.

In [18]:
for i_batch, sample_batched in enumerate(train_loader):
    show_landmarks_batch(sample_batched)
    break
In [596]:
class Net(nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 12, 3)
        self.conv2 = nn.Conv2d(12, 32, 3)
        self.conv3 = nn.Conv2d(32, 32, 3)
        self.conv4 = nn.Conv2d(32, 32, 5)
        self.conv5 = nn.Conv2d(32, 12, 5)
        self.pool = nn.MaxPool2d(2, 2)
        
        self.fc1 = nn.Linear(1512, 256)
        self.fc2 = nn.Linear(256, 116)
    
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = F.relu(self.conv2(x))
        x = self.pool(F.relu(self.conv3(x)))
        x = F.relu(self.conv4(x))
        x = self.pool(F.relu(self.conv5(x)))
        x = x.view(x.size(0), 1512)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x

net = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.002)
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 12, 3)
        self.conv2 = nn.Conv2d(12, 32, 3)
        self.conv3 = nn.Conv2d(32, 32, 3)
        self.conv4 = nn.Conv2d(32, 32, 5)
        self.conv5 = nn.Conv2d(32, 12, 5)
        self.pool = nn.MaxPool2d(2, 2)

        self.fc1 = nn.Linear(1512, 256)
        self.fc2 = nn.Linear(256, 116)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = F.relu(self.conv2(x))
        x = self.pool(F.relu(self.conv3(x)))
        x = F.relu(self.conv4(x))
        x = self.pool(F.relu(self.conv5(x)))
        x = x.view(x.size(0), 1512)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)

        return x

We use the following neural network architecture with the following hyperparameters:

  • batch_size=4
  • learning_rate=0.002
  • epochs=15
  • optimizer=adam
  • loss=MSELoss()
In [597]:
net = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.002)

training_losses = []
val_losses = []
all_outputs = torch.empty((0, 58, 2))

for epoch in range(15): 

    # TRAINING
    for i, data in enumerate(train_loader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['image'].view(4, 1, 120, 160).float(), data['landmarks'].float()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels.view(4, 116))
        loss.backward()
        optimizer.step()
    
    # TRAINING INFERENCE
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['image'].view(4, 1, 120, 160).float(), data['landmarks'].float()
        with torch.no_grad():
            outputs = net(inputs)
            loss = criterion(outputs, labels.view(4, 116))
        running_loss += loss.item()
    running_loss /= len(train_loader)
    training_losses.append(running_loss)
    
    # VALIDATION INFERENCE
    running_loss = 0.0
    for i, data in enumerate(test_loader):
        inputs, labels = data['image'].view(4, 1, 120, 160).float(), data['landmarks'].float()
        with torch.no_grad():
            outputs = net(inputs)
            loss = criterion(outputs, labels.view(4, 116))
        running_loss += loss.item()
        all_outputs = torch.cat((all_outputs, outputs.view(4, 58, 2)))
    running_loss /= len(test_loader)
    val_losses.append(running_loss)
    
print('Finished Training')
Finished Training
In [599]:
plt.figure(figsize=(20, 8))
sns.lineplot(range(15), training_losses, label='Training MSE')
sns.lineplot(range(15), val_losses, label='Validation MSE')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.title('MSE at each epoch')
plt.legend()
Out[599]:
<matplotlib.legend.Legend at 0x2a1787a74f0>

Sample Detections on the Test Set

In [617]:
fig, axs = plt.subplots(2, 2, figsize=(15, 15))
plt.tight_layout()

ax = plt.subplot(2, 2, 1)
i = 222
sample = face_dataset[i]
plt.imshow(sample['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(sample['image']).float().view(1, 1, 120, 160)).detach()[0].view(58, 2)
plt.scatter(out[:, 0]*160, out[:, 1]*120, s=20, color='r')
plt.scatter(sample['landmarks'][:, 0]*160, sample['landmarks'][:, 1]*120, s=60, color='g')

ax = plt.subplot(2, 2, 2)
i = 210
sample = face_dataset[i]
plt.imshow(sample['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(sample['image']).float().view(1, 1, 120, 160)).detach()[0].view(58, 2)
plt.scatter(out[:, 0]*160, out[:, 1]*120, s=20, color='r')
plt.scatter(sample['landmarks'][:, 0]*160, sample['landmarks'][:, 1]*120, s=60, color='g')


ax = plt.subplot(2, 2, 3)
i = 239
sample = face_dataset[i]
plt.imshow(sample['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(sample['image']).float().view(1, 1, 120, 160)).detach()[0].view(58, 2)
plt.scatter(out[:, 0]*160, out[:, 1]*120, s=20, color='r')
plt.scatter(sample['landmarks'][:, 0]*160, sample['landmarks'][:, 1]*120, s=60, color='g')

ax = plt.subplot(2, 2, 4)
i = 233
sample = face_dataset[i]
plt.imshow(sample['image'], cmap='gray')
with torch.no_grad():
    out = net(torch.from_numpy(sample['image']).float().view(1, 1, 120, 160)).detach()[0].view(58, 2)
plt.scatter(out[:, 0]*160, out[:, 1]*120, s=20, color='r')
plt.scatter(sample['landmarks'][:, 0]*160, sample['landmarks'][:, 1]*120, s=60, color='g')
Out[617]:
<matplotlib.collections.PathCollection at 0x2a162db2850>

True landmarks are shown in green and detected landmarks are shown in red

As expected, our model performs extremely well for front-facing images (even with non-trivial rotation) but performs terribly for images where the person is primarily looking to the side.

In [23]:
# Code Source:
#https://colab.research.google.com/github/Niranjankumar-c/DeepLearning-PadhAI/blob/master/DeepLearning_Materials/6_VisualizationCNN_Pytorch/CNNVisualisation.ipynb
def plot_filters_single_channel_big(t):
    
    #setting the rows and columns
    nrows = t.shape[0]*t.shape[2]
    ncols = t.shape[1]*t.shape[3]
    
    
    npimg = np.array(t.numpy(), np.float32)
    npimg = npimg.transpose((0, 2, 1, 3))
    npimg = npimg.ravel().reshape(nrows, ncols)
    
    npimg = npimg.T
    
    fig, ax = plt.subplots(figsize=(ncols/10, nrows/200))    
    imgplot = sns.heatmap(npimg, xticklabels=False, yticklabels=False, cmap='gray', ax=ax, cbar=False)
    
def plot_filters_single_channel(t):
    
    #kernels depth * number of kernels
    nplots = t.shape[0]*t.shape[1]
    ncols = 12
    
    nrows = 1 + nplots//ncols
    #convert tensor to numpy image
    npimg = np.array(t.numpy(), np.float32)
    
    count = 0
    fig = plt.figure(figsize=(ncols, nrows))
    
    #looping through all the kernels in each channel
    for i in range(t.shape[0]):
        for j in range(t.shape[1]):
            count += 1
            ax1 = fig.add_subplot(nrows, ncols, count)
            npimg = np.array(t[i, j].numpy(), np.float32)
            npimg = (npimg - np.mean(npimg)) / np.std(npimg)
            npimg = np.minimum(1, np.maximum(0, (npimg + 0.5)))
            ax1.imshow(npimg)
            ax1.set_title(str(i) + ',' + str(j))
            ax1.axis('off')
            ax1.set_xticklabels([])
            ax1.set_yticklabels([])
   
    plt.tight_layout()
    plt.show()
    
def plot_filters_multi_channel(t):
    
    #get the number of kernals
    num_kernels = t.shape[0]    
    
    #define number of columns for subplots
    num_cols = 12
    #rows = num of kernels
    num_rows = num_kernels
    
    #set the figure size
    fig = plt.figure(figsize=(num_cols,num_rows))
    
    #looping through all the kernels
    for i in range(t.shape[0]):
        ax1 = fig.add_subplot(num_rows,num_cols,i+1)
        
        #for each kernel, we convert the tensor to numpy 
        npimg = np.array(t[i].numpy(), np.float32)
        #standardize the numpy image
        npimg = (npimg - np.mean(npimg)) / np.std(npimg)
        npimg = np.minimum(1, np.maximum(0, (npimg + 0.5)))
        npimg = npimg.transpose((1, 2, 0))
        ax1.imshow(npimg)
        ax1.axis('off')
        ax1.set_title(str(i))
        ax1.set_xticklabels([])
        ax1.set_yticklabels([])
        
    plt.savefig('myimage.png', dpi=100)    
    plt.tight_layout()
    plt.show()


def plot_weights(model, layer_num, single_channel = True, collated = False):
  
    #extracting the model features at the particular layer number
    layer = list(model.children())[layer_num]
  
    #checking whether the layer is convolution layer or not 
    if isinstance(layer, nn.Conv2d):
    #getting the weight tensor data
        weight_tensor = list(model.children())[layer_num].weight.data

        if single_channel:
            if collated:
                plot_filters_single_channel_big(weight_tensor)
            else:
                plot_filters_single_channel(weight_tensor)

        else:
            if weight_tensor.shape[1] == 3:
                plot_filters_multi_channel(weight_tensor)
            else:
                print("Can only plot weights with three channels with single channel = False")
        
    else:
        print("Can only visualize layers which are convolutional")

Convolutional Layer Filters

Layer 1: (1, 12)

In [24]:
plot_weights(net, 0, single_channel = True)

Layer 2: (12, 32)

In [25]:
plot_weights(net, 1, single_channel = True)

Layer 3: (32, 32)

In [26]:
plot_weights(net, 2, single_channel = True)

Part 3: Train with Larger Dataset

In this part, we continue with keypoint detection on a much larger dataset (6666) images. The images in this dataset have much larger variance in terms of image size and face positioning, so we crop just the face, resize the image to 244x244 pixels, and apply the transformations from part 2. Here, we use an 80/20 train-test validation split, although we ultimately train on the entire provided dataset since the true test dataset is unlabeled. Since the dataset for this part is extremely large, we train our convolutional neural network on Google Colab with a dedicated GPU.

In [29]:
import xml.etree.ElementTree as ET 

%%capture
if not os.path.exists('/content/ibug_300W_large_face_landmark_dataset'):
    !wget https://people.eecs.berkeley.edu/~zhecao/ibug_300W_large_face_landmark_dataset.zip
    !unzip 'ibug_300W_large_face_landmark_dataset.zip'    
    !rm -r 'ibug_300W_large_face_landmark_dataset.zip'
In [391]:
class FullDataset(Dataset):
    def __init__(self, transform=None):
    
        # Source code provided in the project spec
        tree = ET.parse('ibug_300W_large_face_landmark_dataset/labels_ibug_300W_train.xml')
        root = tree.getroot()
        root_dir = 'ibug_300W_large_face_landmark_dataset'

        bboxes = [] # face bounding box used to crop the image
        landmarks = [] # the facial keypoints/landmarks for the whole training dataset
        img_filenames = [] # the image names for the whole dataset

        for filename in root[2]:
            img_filenames.append(os.path.join(root_dir, filename.attrib['file']))
            box = filename[0].attrib
            # x, y for the top left corner of the box, w, h for box width and height
            bboxes.append([box['left'], box['top'], box['width'], box['height']]) 

            landmark = []
            for num in range(68):
                x_coordinate = int(filename[0][num].attrib['x'])
                y_coordinate = int(filename[0][num].attrib['y'])
                landmark.append([x_coordinate, y_coordinate])
            landmarks.append(landmark)

        landmarks = np.array(landmarks).astype('float32')     
        bboxes = np.array(bboxes).astype('float32') 
        
        self.img_filenames = img_filenames
        self.landmarks = landmarks
        self.bboxes = bboxes
    
    def __len__(self):
        return len(self.img_filenames)
    
    def __getitem__(self, idx):
        image = skio.rgb2gray(plt.imread(self.img_filenames[idx]))
        landmarks = self.landmarks[idx]
        bboxes = self.bboxes[idx]
        
        min_x = max(0, min(int(bboxes[0]), min(landmarks[:, 0]).astype(int)))
        min_y = max(0, min(int(bboxes[1]), min(landmarks[:, 1]).astype(int)))
        width = int(bboxes[2]*1.3)
        height = int(bboxes[3]*1.3)
        
        new_img = image[min_y:min_y+height, min_x:min_x+width]
        new_landmarks = landmarks.copy()
        new_landmarks[:, 0] = (new_landmarks[:, 0]-min_x) / new_img.shape[1]
        new_landmarks[:, 1] = (new_landmarks[:, 1]-min_y) / new_img.shape[0]
        
        new_img = transform.resize(new_img, (224, 224)).astype(np.float32)
        
        # Change brightness/saturation
        new_img = new_img*np.random.choice(np.arange(-0.49, 0.51, 0.02)) + np.random.choice(np.arange(-0.5, 0.5, 0.01))
        
        # Rotate Image
        angle = np.random.randint(-15, 16)
        new_img = transform.rotate(new_img, angle=angle)
        
        # Rotate Landmarks
        theta = np.radians(angle)
        rotation = np.array([
            [np.cos(theta), -np.sin(theta)],
            [np.sin(theta), np.cos(theta)]
        ])
        new_landmarks = (new_landmarks-0.5) @ rotation + 0.5
        
        # landmarks transforms
        sample = {'image': new_img, 'landmarks': new_landmarks, 'old_shape': image.shape, 'cropped': (min_x, min_y)}
        
        return sample
In [416]:
full_dataset = FullDataset()
split = torch.utils.data.random_split(full_dataset, lengths=[5333, 1333], generator=torch.Generator().manual_seed(42))
train_loader = DataLoader(split[0], batch_size=16, shuffle=True, num_workers=2)
test_loader = DataLoader(split[1], batch_size=16, shuffle=False, num_workers=2)

16 Sample Images

Here are 16 sample images from the training set and their corresponding facial keypoints.

In [400]:
def show_landmarks_batch(sample_batched):
    images_batch, landmarks_batch = sample_batched['image'], sample_batched['landmarks']
    batch_size = len(images_batch)
    
    f, axs = plt.subplots(1, 4, figsize=(15,15))
    for i in range(batch_size):
        for j in range(4):
            sample_image = images_batch[i]
            sample_landmark = landmarks_batch[i]

            ax = plt.subplot(4, 4, i+1)
            plt.tight_layout()
            plt.imshow(sample_image, cmap='gray')
            plt.scatter(sample_landmark[:, 0]*224, sample_landmark[:, 1]*224, s=20, marker='.', c='g')

for i_batch, sample_batched in enumerate(train_loader):
    show_landmarks_batch(sample_batched)
    break

CODE IN THIS SECTION RAN IN GOOGLE COLAB

-----------------------------------------------------------------------

Model Building

In [418]:
from torchvision import models

Here, we use the non pre-trained resnet18 architecture from https://pytorch.org/docs/stable/torchvision/models.html with slight modifications to the first convolutional layer and the final linear layer so that the CNN is compatible with our data dimensions.

Hyperparameters:

  • learning_rate=0.001
  • batch_size=16
  • epochs=10
  • optimizer=adam
  • loss=MSELoss()

The chosen hyperparameters are fairly random, but I didn't have enough time to hypertune since training takes roughly 2-3 hours.

In [413]:
np.random.seed(42)
torch.manual_seed(42)

if torch.cuda.is_available():
    torch.backends.cudnn.deterministic=True
    torch.cuda.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Training/Validation Evaluation

In [ ]:
from torchvision import models

resnet = models.resnet18()
resnet.conv1 = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
resnet.fc = torch.nn.Linear(512, 136, bias=True)

criterion = nn.MSELoss()
optimizer = optim.Adam(params=resnet.parameters(), lr=1e-3)

training_losses = []
val_losses = []

resnet = resnet.to(device)

for epoch in range(1): 

    # TRAINING
    for i, data in enumerate(train_loader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['image'].view(data['image'].shape[0], 1, 224, 224).float().to(device), data['landmarks'].float().to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = resnet(inputs)
        loss = criterion(outputs, labels.view(data['image'].shape[0], 136))
        loss.backward()
        optimizer.step()
    
    # TRAINING INFERENCE
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['image'].view(data['image'].shape[0], 1, 224, 224).float().to(device), data['landmarks'].float().to(device)
        with torch.no_grad():
            outputs = resnet(inputs)
            loss = criterion(outputs, labels.view(data['image'].shape[0], 136))
        running_loss += loss.item()
    running_loss /= len(train_loader)
    training_losses.append(running_loss)
    
    # VALIDATION INFERENCE
    running_loss = 0.0
    for i, data in enumerate(test_loader):
        inputs, labels = data['image'].view(data['image'].shape[0], 1, 224, 224).float().to(device), data['landmarks'].float().to(device)
        with torch.no_grad():
            outputs = resnet(inputs)
            loss = criterion(outputs, labels.view(data['image'].shape[0], 136))
        running_loss += loss.item()
    running_loss /= len(test_loader)
    val_losses.append(running_loss)
    
print('Finished Training')
In [ ]:
sns.lineplot(range(15), training_losses, label='Training MSE')
sns.lineplot(range(15), val_losses, label='Validation MSE')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.title('MSE at each epoch')
plt.legend()
In [ ]:
# Train on the entire dataset
from torchvision import models

resnet = models.resnet18()
resnet.conv1 = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
resnet.fc = torch.nn.Linear(512, 136, bias=True)

criterion = nn.MSELoss()
optimizer = optim.Adam(params=resnet.parameters(), lr=1e-3)

training_losses = []
val_losses = []

resnet = resnet.to(device)

for epoch in range(10): 

    # TRAINING
    for i, data in enumerate(all_loader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['image'].view(data['image'].shape[0], 1, 224, 224).float().to(device), data['landmarks'].float().to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = resnet(inputs)
        loss = criterion(outputs, labels.view(data['image'].shape[0], 136))
        loss.backward()
        optimizer.step()

    print(f'EPOCH {epoch}: {loss.item()}')
In [859]:
# These losses are unfortunately hardcoded because my google colab died and i lost the exact values
training_losses = [0.0055, 0.0056, 0.0048, 0.0051, 0.0054, 0.0032, 0.0023, 0.0025, 0.0008, 0.0006]
val_losses = [0.0055, 0.0058, 0.005, 0.0052, 0.0055, 0.0033, 0.0024, 0.0026, 0.001, 0.0008]
plt.figure(figsize=(20, 8))
sns.lineplot(range(10), training_losses, label='Training MSE')
sns.lineplot(range(10), val_losses, label='Validation MSE')
plt.xlabel('epoch')
plt.ylabel('MSE')
plt.title('MSE at each Epoch');

-----------------------------------------------------------------------

In [516]:
tree = ET.parse('labels_ibug_300W_test_parsed.xml')
root = tree.getroot()
root_dir = 'ibug_300W_large_face_landmark_dataset'

bboxes = [] # face bounding box used to crop the image
img_filenames = [] # the image names for the whole dataset

for filename in root[2]:
    img_filenames.append(os.path.join(root_dir, filename.attrib['file']))
    box = filename[0].attrib
    # x, y for the top left corner of the box, w, h for box width and height
    bboxes.append([box['left'], box['top'], box['width'], box['height']]) 

bboxes = np.array(bboxes).astype('float32') 
In [682]:
class TestDataset(Dataset):
    def __init__(self, transform=None):
    
        # Source code provided in the project spec
        tree = ET.parse('labels_ibug_300W_test_parsed.xml')
        root = tree.getroot()
        root_dir = 'ibug_300W_large_face_landmark_dataset'

        bboxes = [] # face bounding box used to crop the image
        img_filenames = [] # the image names for the whole dataset

        for filename in root[2]:
            img_filenames.append(os.path.join(root_dir, filename.attrib['file']))
            box = filename[0].attrib
            # x, y for the top left corner of the box, w, h for box width and height
            bboxes.append([box['left'], box['top'], box['width'], box['height']]) 

        bboxes = np.array(bboxes).astype('float32') 
        
        self.img_filenames = img_filenames
        self.bboxes = bboxes
    
    def __len__(self):
        return len(self.img_filenames)
    
    def __getitem__(self, idx):
        image = skio.rgb2gray(plt.imread(self.img_filenames[idx]))
        bboxes = self.bboxes[idx]
        
        min_x = max(0, int(bboxes[0]))
        min_y = max(0, int(bboxes[1]))
        width = int(bboxes[2]*1.3)
        height = int(bboxes[3]*1.3)
        
        new_img = image[min_y:min_y+height, min_x:min_x+width]
        old_shape = [new_img.shape[0], new_img.shape[1]]
        new_img = transform.resize(new_img, (224, 224)).astype(np.float32)
        
        # landmarks transforms
        sample = {'image': new_img, 'old_shape': old_shape, 'cropped': (min_x, min_y)}
        
        return sample
In [836]:
# Load the model trained on ALL 6666 samples

resnet = models.resnet18()
resnet.conv1 = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
resnet.fc = torch.nn.Linear(512, 136, bias=True)
resnet.load_state_dict(torch.load('resnet.th'))
In [685]:
test_dataset = TestDataset()

landmarks = torch.empty((0, 68, 2))
with torch.no_grad():
    for sample in test_dataset:
        img = torch.from_numpy(sample['image'])
        outputs = resnet(img.view(1, 1, 224, 224)).view(1, 68, 2)
        outputs[:, :, 0] = outputs[:, :, 0]*sample['old_shape'][1] + sample['cropped'][0]
        outputs[:, :, 1] = outputs[:, :, 1]*sample['old_shape'][0] + sample['cropped'][1]
        landmarks = torch.cat((landmarks, outputs))
landmarks = landmarks.numpy()
In [720]:
pd.DataFrame(data={'Id': np.arange(137088), 'Predicted': landmarks.flatten()}).to_csv('kaggle.csv')

Kaggle Results

Name: Kvah

MAE Score: 19.11511

Keypoint Predictions on the Test Set (Successful)

In [827]:
f, axs = plt.subplots(1, 4, figsize=(15,15))
plt.tight_layout()

i=15
ax = plt.subplot(2, 2, 1)
plt.imshow(plt.imread(test_dataset.img_filenames[i]))
plt.scatter(landmarks[i][:, 0], landmarks[i][:, 1])

i=130
ax = plt.subplot(2, 2, 2)
plt.imshow(plt.imread(test_dataset.img_filenames[i]))
plt.scatter(landmarks[i][:, 0], landmarks[i][:, 1])

i=115
ax = plt.subplot(2, 2, 3)
plt.imshow(plt.imread(test_dataset.img_filenames[i]))
plt.scatter(landmarks[i][:, 0], landmarks[i][:, 1])

i=112
ax = plt.subplot(2, 2, 4)
plt.imshow(plt.imread(test_dataset.img_filenames[i]))
plt.scatter(landmarks[i][:, 0], landmarks[i][:, 1]);

Keypoint Predictions on Test Set (Bad)

In [710]:
f, axs = plt.subplots(1, 4, figsize=(15,15))
plt.tight_layout()

i=200
ax = plt.subplot(2, 2, 1)
plt.imshow(plt.imread(test_dataset.img_filenames[i]))
plt.scatter(landmarks[i][:, 0], landmarks[i][:, 1])

i=873
ax = plt.subplot(2, 2, 2)
plt.imshow(plt.imread(test_dataset.img_filenames[i]))
plt.scatter(landmarks[i][:, 0], landmarks[i][:, 1])
Out[710]:
<matplotlib.collections.PathCollection at 0x2a13f66e340>

The keypoint detection range seems to be way to large for these two images, likely because both faces are at fairly strange angles (its also possible that this is because the children's faces are particularly round and most of our training data are adults with more oval shaped faces!)

In [ ]:
obama = plt.imread('obama.jpg')
biden = plt.imread('biden.jpg')
trump = plt.imread('trump.jpg')
pence = plt.imread('pence.jpg')

images = [obama, biden, trump, pence]
politicians = {}    

Keypoint Detection on four random invidiuals:

In [834]:
f, axs = plt.subplots(1, 4, figsize=(15,15))
plt.tight_layout()

ax = plt.subplot(2, 2, 1)
with torch.no_grad():
    obama_new = skio.rgb2gray(obama)[0:800, 400:1000]
    out = resnet(torch.from_numpy(transform.resize(obama_new, (224, 224))).view(1, 1, 224, 224).float())
    out = out.view(68, 2)
    out[:, 0] = out[:, 0]*obama_new.shape[1] + 400
    out[:, 1] = out[:, 1]*obama_new.shape[0] + 0
plt.imshow(obama)
plt.scatter(out[:, 0], out[:, 1])

ax = plt.subplot(2, 2, 2)
with torch.no_grad():
    biden_new = skio.rgb2gray(biden)[0:800, 100:800]
    out = resnet(torch.from_numpy(transform.resize(biden_new, (224, 224))).view(1, 1, 224, 224).float())
    out = out.view(68, 2)
    out[:, 0] = out[:, 0]*biden_new.shape[1] + 100
    out[:, 1] = out[:, 1]*biden_new.shape[0] + 0
plt.imshow(biden)
plt.scatter(out[:, 0], out[:, 1])

ax = plt.subplot(2, 2, 3)
with torch.no_grad():
    trump_new = skio.rgb2gray(trump[50:300, 200:400])
    out = resnet(torch.from_numpy(transform.resize(trump_new, (224, 224))).view(1, 1, 224, 224).float())
    out = out.view(68, 2)
    out[:, 0] = out[:, 0]*trump_new.shape[1] + 200
    out[:, 1] = out[:, 1]*trump_new.shape[0] + 50
plt.imshow(trump)
plt.scatter(out[:, 0], out[:, 1])

ax = plt.subplot(2, 2, 4)
with torch.no_grad():
    pence_new = skio.rgb2gray(pence)[0:600, 400:800]
    out = resnet(torch.from_numpy(transform.resize(pence_new, (224, 224))).view(1, 1, 224, 224).float())
    out = out.view(68, 2)
    out[:, 0] = out[:, 0]*pence_new.shape[1] + 400
    out[:, 1] = out[:, 1]*pence_new.shape[0] + 0
plt.imshow(pence)
plt.scatter(out[:, 0], out[:, 1]);

The model seems to perform fairly well in terms of finding the overall outline of the face, but does a pretty poor job in outlining actual facial features. I wonder if this has to do with the way that I am cropping the images.

In [860]:
FILE = "main.html"

with open(FILE, 'r') as html_file:
    content = html_file.read()

# Get rid off prompts and source code
content = content.replace("div.input_area {
	display: none;","div.input_area {
	display: none;\n\tdisplay: none;")    
content = content.replace(".prompt {
	display: none;",".prompt {
	display: none;\n\tdisplay: none;")

f = open(FILE, 'w')
f.write(content)
f.close()