diff --git a/.github/linters/.python-lint b/.github/linters/.python-lint index 892053922..2c02be588 100644 --- a/.github/linters/.python-lint +++ b/.github/linters/.python-lint @@ -7,7 +7,7 @@ ignored-classes = ModelProto max-line-length = 99 [DESIGN] max-locals=100 -max-statements=300 +max-statements=350 min-public-methods=1 max-branches=120 max-module-lines=5000 diff --git a/.gitmodules b/.gitmodules index d37537691..ecdf70507 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,6 +2,3 @@ path = distiller url = https://github.com/MaximIntegratedAI/distiller.git branch = pytorch-1.8 -[submodule "datasets/face_id/facenet_pytorch"] - path = datasets/face_id/facenet_pytorch - url = https://github.com/MaximIntegratedAI/facenet-pytorch.git diff --git a/.pylintrc b/.pylintrc index 22fe62009..26cbe5d6a 100644 --- a/.pylintrc +++ b/.pylintrc @@ -7,7 +7,7 @@ ignored-classes = ModelProto max-line-length = 99 [DESIGN] max-locals=100 -max-statements=300 +max-statements=350 min-public-methods=1 max-branches=120 max-module-lines=5000 diff --git a/README.md b/README.md index 8dbe7cf2a..8a3161e66 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # ADI MAX78000/MAX78002 Model Training and Synthesis -November 13, 2023 +January 10, 2024 ADI’s MAX78000/MAX78002 project is comprised of five repositories: @@ -1534,6 +1534,11 @@ The following table describes the most important command line arguments for `tra | `--nas` | Enable network architecture search | | | `--nas-policy` | Define NAS policy in YAML file | `--nas-policy nas/nas_policy.yaml` | | `--regression` | Select regression instead of classification (changes Loss function, and log output) | | +| `--dr` | Set target embedding dimensionality for dimensionality reduction |`--dr 64` | +| `--scaf-lr` | Initial learning rate for sub-center ArcFace loss optimizer | | +| `--scaf-scale` |Scale hyperparameter for sub-center ArcFace loss | | +| `--scaf-margin` |Margin hyperparameter for sub-center ArcFace loss | | +| `--backbone-checkpoint` |Path to checkpoint from which to load backbone weights | | | *Display and statistics* | | | | `--enable-tensorboard` | Enable logging to TensorBoard (default: disabled) | | | `--confusion` | Display the confusion matrix | | @@ -1552,6 +1557,7 @@ The following table describes the most important command line arguments for `tra | `--summary onnx_simplified` | Export trained model to simplified [ONNX](https://onnx.ai/) file (default name: model.onnx) | | | `--summary-filename` | Change the file name for the exported model | `--summary-filename mnist.onnx` | | `--save-sample` | Save data[index] from the test set to a NumPy pickle for use as sample data | `--save-sample 10` | +| `--slice-sample` | For models that require RGB input, when the sample from the dataset has additional channels, slice the sample into 3 channels | | #### ONNX Model Export @@ -3340,6 +3346,8 @@ Additional information about the evaluation kits, and the software development k [AHB Addresses for MAX78000 and MAX78002](docs/AHBAddresses.md) +[Facial Recognition System](https://github.com/MaximIntegratedAI/ai8x-training/blob/develop/docs/FacialRecognitionSystem.md) + --- diff --git a/README.pdf b/README.pdf index 944c6fe87..fbea4318b 100644 Binary files a/README.pdf and b/README.pdf differ diff --git a/ai8x_blocks.py b/ai8x_blocks.py index e95603675..fc2b314c7 100644 --- a/ai8x_blocks.py +++ b/ai8x_blocks.py @@ -1,6 +1,6 @@ ################################################################################################### # -# Copyright (C) 2020-2022 Maxim 
Integrated Products, Inc. All Rights Reserved. +# Copyright (C) 2020-2023 Maxim Integrated Products, Inc. All Rights Reserved. # # Maxim Integrated Products, Inc. Default Copyright Notice: # https://www.maximintegrated.com/en/aboutus/legal/copyrights.html @@ -116,6 +116,80 @@ def forward(self, x): # pylint: disable=arguments-differ return self.resid(y, x) +class ConvResidualBottleneck(nn.Module): + """ + AI8X module based on Residual Bottleneck Layer. + Depthwise convolution is replaced with standard convolution. + This module uses ReLU activation not ReLU6 as the original study suggests [1], + because of MAX7800X capabilities. + + Args: + in_channels: number of input channels + out_channels: number of output channels + expansion_factor: expansion_factor + stride: stirde size (default=1) + bias: determines if bias used at non-depthwise layers. + depthwise_bias: determines if bias used at depthwise layers. + + References: + [1] https://arxiv.org/pdf/1801.04381.pdf (MobileNetV2) + """ + def __init__(self, in_channels, out_channels, expansion_factor, stride=1, bias=False, + depthwise_bias=False, **kwargs): + super().__init__() + self.stride = stride + hidden_channels = int(round(in_channels * expansion_factor)) + if hidden_channels == in_channels: + self.conv1 = ai8x.Empty() + else: + self.conv1 = ai8x.FusedConv2dBNReLU(in_channels, hidden_channels, 1, padding=0, + bias=bias, **kwargs) + if stride == 1: + if depthwise_bias: + self.conv2 = ai8x.FusedConv2dBN(hidden_channels, out_channels, 3, + padding=1, stride=stride, + bias=depthwise_bias, **kwargs) + + else: + self.conv2 = ai8x.Conv2d(hidden_channels, out_channels, 3, + padding=1, stride=stride, + bias=depthwise_bias, **kwargs) + + else: + if depthwise_bias: + self.conv2 = ai8x.FusedMaxPoolConv2dBN(hidden_channels, + out_channels, 3, + padding=1, pool_size=stride, + pool_stride=stride, + bias=depthwise_bias, **kwargs) + + else: + self.conv2 = ai8x.FusedMaxPoolConv2d(hidden_channels, + out_channels, 3, + padding=1, pool_size=stride, + pool_stride=stride, + bias=depthwise_bias, **kwargs) + + if (stride == 1) and (in_channels == out_channels): + self.resid = ai8x.Add() + else: + self.resid = self.NoResidual() + + class NoResidual(nn.Module): + """ + Does nothing. + """ + def forward(self, *x): # pylint: disable=arguments-differ + """Forward prop""" + return x[0] + + def forward(self, x): # pylint: disable=arguments-differ + """Forward prop""" + y = self.conv1(x) + y = self.conv2(y) + return self.resid(y, x) + + class MBConvBlock(nn.Module): """Mobile Inverted Residual Bottleneck Block. diff --git a/datasets/face_id/README.md b/datasets/face_id/README.md deleted file mode 100644 index e14459abb..000000000 --- a/datasets/face_id/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# FaceID Data Generation - -This folder contains scripts to generate data to train and test models for FaceID model using the following datasets: - - VGGFace-2: A large-scale face recognition dataset. [https://www.robots.ox.ac.uk/~vgg/data/](https://www.robots.ox.ac.uk/~vgg/data/) - - YouTubeFaces: A database of face videos designed for studying the problem of unconstrained face recognition. [https://www.cs.tau.ac.il/~wolf/ytfaces/](https://www.cs.tau.ac.il/~wolf/ytfaces/) - -## Dataset Generation - -### VGGFace-2 -**Warning:** The original dataset is about 40GB and the following scripts generate a new dataset with size of 183 GB. Be sure there is enough space on your hard drive before starting the execution. - -\ -Follow these steps for both train and test sets. -1. 
Download train and test the *VGG Face 2 Dataset* from [https://www.robots.ox.ac.uk/~vgg/data/](https://www.robots.ox.ac.uk/~vgg/data/) and extract the .tar.gz files. into the same folder. -2. Run gen_vggface2_embeddings.py: - ``` - python gen_vggface2_embeddings.py -r -d --type - ``` -3. Run merge_vggface2_dataset.py - ``` - python merge_vggface2_dataset.py -p --type - ``` - -### YouTubeFaces - -**Warning:** The original dataset is about 29GB and the following scripts generate a new dataset with size of 15 GB. Be sure there is enough space on your hard drive before starting the execution. - -\ -Follow these steps. -1. Download the dataset from [here](http://www.cslab.openu.ac.il/download/) and extract the tar.gz files. into the same folder. -2. Run gen_youtubefaces_embeddings.py: - ``` - python gen_youtubefaces_embeddings.py -r -d --type test - ``` -3. Run merge_youtubefaces_dataset.py - ``` - python merge_youtubefaces_dataset.py -p --type test - ``` - -**Note:** The default paths for generated dataset is set to AI8X_TRAINING_HOME/data so the data loaders can load them with default parameters. If the destination folder is changed, the ---data option should be added to the model training script. diff --git a/datasets/face_id/facenet_pytorch b/datasets/face_id/facenet_pytorch deleted file mode 160000 index 14b312d4e..000000000 --- a/datasets/face_id/facenet_pytorch +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 14b312d4ef4b1baf90438f89d49d3942e3c6df7f diff --git a/datasets/face_id/gen_vggface2_embeddings.py b/datasets/face_id/gen_vggface2_embeddings.py deleted file mode 100755 index 95043a229..000000000 --- a/datasets/face_id/gen_vggface2_embeddings.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 -################################################################################################### -# -# Copyright (C) 2020-2021 Maxim Integrated Products, Inc. All Rights Reserved. -# -# Maxim Integrated Products, Inc. Default Copyright Notice: -# https://www.maximintegrated.com/en/aboutus/legal/copyrights.html -# -################################################################################################### -""" -Script to generate dataset for FaceID training and validation from VGGFace-2 dataset. -""" - -import argparse -import json -import os - -import numpy as np -import torch - -import scipy.ndimage -from facenet_pytorch import MTCNN, InceptionResnetV1 # pylint: disable=no-name-in-module -from matplotlib.image import imread - - -def generate_image(img, box, count): # pylint: disable=too-many-locals - """ - Generates images in size 120x160x3 that includes the detected face in the image. - - img, box are the original image and box. 
- count is how many pics you wanna generate - - box format: x1, y1, x3, y3 - """ - box[0] = np.max((box[0], 0)) - box[1] = np.max((box[1], 0)) - box[2] = np.min((box[2], img.shape[1])) - box[3] = np.min((box[3], img.shape[0])) - - factor = 1 - height = img.shape[0] - width = img.shape[1] - new_img = img - new_box = box - while True: - if height < 160 or width < 120: - factor += 1 - new_img = scipy.ndimage.zoom(img, [factor, factor, 1], order=1) - new_box = box * factor - height = new_img.shape[0] - width = new_img.shape[1] - else: - break - new_box = np.round(new_box).astype(np.int) - new_box_height = new_box[3] - new_box[1] - new_box_width = new_box[2] - new_box[0] - - scale_list = np.concatenate((np.arange(0.9, 0, -0.1), np.arange(0.09, 0, -0.02))) - ind = 0 - while (new_box_height > 160 or new_box_width > 120): - if ind < scale_list.size: - new_img = scipy.ndimage.zoom(img, [scale_list[ind], scale_list[ind], 1], order=1) - new_box = box * scale_list[ind] - new_box = np.round(new_box).astype(np.int) - new_box_height = new_box[3] - new_box[1] - new_box_width = new_box[2] - new_box[0] - ind += 1 - else: - pass - - min_x = np.max((0, new_box[0] - (120 - new_box_width))) - min_y = np.max((0, new_box[1] - (160 - new_box_height))) - max_x = np.min((new_box[0], width-120)) - max_y = np.min((new_box[1], height-160)) - - start_x = np.random.choice(np.arange(min_x, max_x+1), count, replace=True) - start_y = np.random.choice(np.arange(min_y, max_y+1), count, replace=True) - img_arr = [] - box_arr = [] - for i in range(count): - img_arr.append(new_img[start_y[i]:start_y[i]+160, start_x[i]:start_x[i]+120]) - temp_box = new_box.copy() - temp_box[0] -= start_x[i] - temp_box[2] -= start_x[i] - temp_box[1] -= start_y[i] - temp_box[3] -= start_y[i] - box_arr.append(temp_box) - new_img = img_arr - new_box = box_arr - return new_img, new_box, img, box - - -def main(source_path, dest_path): # pylint: disable=too-many-locals - """ - Main function to iterate over the images in the raw data and generate data samples - to train/test FaceID model. 
- """ - - device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') - print(f'Running on device: {device}') - - mtcnn = MTCNN( - image_size=80, margin=0, min_face_size=20, - thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=True, - device=device - ) - - resnet = InceptionResnetV1(pretrained='vggface2').eval().to(device) - - data_dir_list = os.listdir(source_path) - for i, folder in enumerate(data_dir_list): - if i % 10 == 0: - print(f'{i} of {len(data_dir_list)}') - folder_path = os.path.join(source_path, folder) - prcssd_folder_path = os.path.join(dest_path, folder) - if not os.path.exists(prcssd_folder_path): - os.makedirs(prcssd_folder_path) - else: - continue - embedding_dict = {} - for image in os.listdir(folder_path): - image_path = os.path.join(folder_path, image) - img = imread(image_path) - x_aligned, prob, box = mtcnn(img, return_prob=True) - if box is not None and prob > 0.9: - x_aligned = x_aligned[None, :] - x_aligned = x_aligned.to(device) - embeddings = resnet(x_aligned).detach().cpu() - embedding_list = embeddings.numpy().ravel().tolist() - img_arr, _, img, box = generate_image(img, box, 1) - for ind, new_img in enumerate(img_arr): - new_img_name = image+'_160_120_'+str(ind)+'.npy' - new_img_path = os.path.join(prcssd_folder_path, new_img_name) - np.save(new_img_path, new_img) - embedding_dict[new_img_name] = embedding_list - json_bin = json.dumps(embedding_dict) - with open( - os.path.join(prcssd_folder_path, 'embeddings.json'), - mode='w', - encoding='utf-8', - ) as out_file: - out_file.write(json_bin) - - -def parse_args(): - """Parses command line arguments""" - data_folder = os.path.abspath(__file__) - for _ in range(3): - data_folder = os.path.dirname(data_folder) - data_folder = os.path.join(data_folder, 'data') - - parser = argparse.ArgumentParser(description='Generate VGGFace-2 dataset to train/test \ - FaceID model.') - parser.add_argument('-r', '--raw', dest='raw_data_path', type=str, - default=os.path.join(data_folder, 'VGGFace-2', 'raw'), - help='Path to raw VGG-Face-2 dataset folder.') - parser.add_argument('-d', '--dest', dest='dest_data_path', type=str, - default=os.path.join(data_folder, 'VGGFace-2'), - help='Folder path to store processed data') - parser.add_argument('--type', dest='data_type', type=str, required=True, - help='Data type to generate (train/test)') - args = parser.parse_args() - - source_path = os.path.join(args.raw_data_path, args.data_type) - dest_path = os.path.join(args.dest_data_path, args.data_type, 'temp') - return source_path, dest_path - - -if __name__ == "__main__": - raw_data_path, dest_data_path = parse_args() - main(raw_data_path, dest_data_path) diff --git a/datasets/face_id/gen_youtubefaces_embeddings.py b/datasets/face_id/gen_youtubefaces_embeddings.py deleted file mode 100755 index 2d271c2c0..000000000 --- a/datasets/face_id/gen_youtubefaces_embeddings.py +++ /dev/null @@ -1,216 +0,0 @@ -#!/usr/bin/env python3 -################################################################################################### -# -# Copyright (C) 2020-2021 Maxim Integrated Products, Inc. All Rights Reserved. -# -# Maxim Integrated Products, Inc. Default Copyright Notice: -# https://www.maximintegrated.com/en/aboutus/legal/copyrights.html -# -################################################################################################### -""" -Script to generate dataset for FaceID training and validation from YouTubeFaces dataset. 
-""" - -import argparse -import json -import os - -import numpy as np -import torch - -import scipy.ndimage -from facenet_pytorch import MTCNN, InceptionResnetV1 # pylint: disable=no-name-in-module -from matplotlib.image import imread - - -def generate_image(img, box, count): # pylint: disable=too-many-locals - """ - Generates images in size 120x160x3 that includes the detected face in the image. - - img, box are the original image and box. - count is how many pics you wanna generate - - box format: x1, y1, x3, y3 - """ - box[0] = np.max((box[0], 0)) - box[1] = np.max((box[1], 0)) - box[2] = np.min((box[2], img.shape[1])) - box[3] = np.min((box[3], img.shape[0])) - - factor = 1 - height = img.shape[0] - width = img.shape[1] - new_img = img - new_box = box - while True: - if height < 160 or width < 120: - factor += 1 - new_img = scipy.ndimage.zoom(img, [factor, factor, 1], order=1) - new_box = box * factor - height = new_img.shape[0] - width = new_img.shape[1] - else: - break - new_box = np.round(new_box).astype(np.int) - new_box_height = new_box[3] - new_box[1] - new_box_width = new_box[2] - new_box[0] - - scale_list = np.arange(0.9, 0, -0.1) - ind = 0 - while (new_box_height > 160 or new_box_width > 120): - new_img = scipy.ndimage.zoom(img, [scale_list[ind], scale_list[ind], 1], order=1) - new_box = box * scale_list[ind] - new_box = np.round(new_box).astype(np.int) - new_box_height = new_box[3] - new_box[1] - new_box_width = new_box[2] - new_box[0] - ind += 1 - - min_x = np.max((0, new_box[0] - (120 - new_box_width))) - min_y = np.max((0, new_box[1] - (160 - new_box_height))) - max_x = np.min((new_box[0], width-120)) - max_y = np.min((new_box[1], height-160)) - - start_x = np.random.choice(np.arange(min_x, max_x+1), count, replace=True) - start_y = np.random.choice(np.arange(min_y, max_y+1), count, replace=True) - img_arr = [] - box_arr = [] - for i in range(count): - img_arr.append(new_img[start_y[i]:start_y[i]+160, start_x[i]:start_x[i]+120]) - temp_box = new_box.copy() - temp_box[0] -= start_x[i] - temp_box[2] -= start_x[i] - temp_box[1] -= start_y[i] - temp_box[3] -= start_y[i] - box_arr.append(temp_box) - new_img = img_arr - new_box = box_arr - return new_img, new_box, img, box - - -def main(source_path, dest_path): - """ - Main function to iterate over the images in the raw data and generate data samples - to train/test FaceID model. 
- """ - - # img_dir = os.path.join(raw_data_path, 'aligned_images_DB') - frame_dir = os.path.join(source_path, 'frame_images_DB') - - if not os.path.exists(dest_path): - os.makedirs(dest_path) - - # set parameters - num_imgs_per_face = 1 - target_im_shape = (160, 120) - - # set device - device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') - print(f'Running on device: {device}') - - # create models - mtcnn = MTCNN( - image_size=80, margin=0, min_face_size=20, - thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=True, - device=device - ) - resnet = InceptionResnetV1(pretrained='vggface2').eval().to(device) - - # run models on the images - num_persons = 0 - num_faces = 0 - - embedding_dict = {} - subj_name_list = os.listdir(frame_dir) - - for f_n, face_file in enumerate(subj_name_list): - if (f_n % 100) == 0: - print(f'Subject {f_n} of {len(subj_name_list)}') - f_path = os.path.join(frame_dir, face_file) - if os.path.isfile(f_path): - if face_file.endswith('txt'): - with open(f_path, mode='r', encoding='utf-8') as file: - lines = file.readlines() - num_persons += 1 - for line in lines: - num_faces += 1 - img_name = line.split(',')[0] - subj_name, video_no, file_name = img_name.split('\\') - img_path = os.path.join(frame_dir, subj_name, video_no, file_name) - img = imread(img_path) - - x_aligned, _, _ = mtcnn(img, return_prob=True) - if x_aligned is not None: - aligned = x_aligned[None, :, :, :].to(device) - embedding = resnet(aligned).detach().cpu().numpy()[0] - - if subj_name not in embedding_dict: - embedding_dict[subj_name] = {} - subj_path = os.path.join(dest_path, subj_name) - if not os.path.exists(subj_path): - os.mkdir(subj_path) - if video_no not in embedding_dict[subj_name]: - embedding_dict[subj_name][video_no] = {} - video_path = os.path.join(dest_path, subj_name, video_no) - if not os.path.exists(video_path): - os.mkdir(video_path) - - embedding_dict[subj_name][video_no][file_name] = embedding.tolist() - x_aligned_int = x_aligned.cpu().numpy() - x_aligned_int -= np.min(x_aligned_int) - x_aligned_int /= np.max(x_aligned_int) - x_aligned_int = (255.0 * x_aligned_int).astype(np.uint8) - np.save(os.path.join(dest_path, subj_name, video_no, file_name), - x_aligned_int) - - rect = line.split(',')[2:6] - for i in range(4): - rect[i] = int(rect[i]) - - box = np.array([int(rect[0]) - int(rect[2])//2, - int(rect[1]) - int(rect[3])//2, - int(rect[0]) + int(rect[2])//2, - int(rect[1]) + int(rect[3])//2]) - - img_arr, _, img, box = generate_image(img, box, num_imgs_per_face) - for img_idx in range(num_imgs_per_face): - new_file_name = '_'.join([file_name, str(target_im_shape[0]), - str(target_im_shape[1]), str(img_idx)]) - cropped_im_path = os.path.join(dest_path, subj_name, video_no, - new_file_name) - np.save(cropped_im_path, img_arr[img_idx]) - - print(f'Number of People: {num_persons}') - print(f'Number of Faces: {num_faces}') - - # save embeddings to json file - with open(os.path.join(dest_path, 'embeddings.json'), mode='w', encoding='utf-8') as out_file: - json.dump(embedding_dict, out_file) - - -def parse_args(): - """Parses command line arguments""" - data_folder = os.path.abspath(__file__) - for _ in range(3): - data_folder = os.path.dirname(data_folder) - data_folder = os.path.join(data_folder, 'data') - - parser = argparse.ArgumentParser(description='Generate YouTubeFaces dataset to train/test \ - FaceID model.') - parser.add_argument('-r', '--raw', dest='raw_data_path', type=str, - default=os.path.join(data_folder, 'YouTubeFaces', 'raw'), - help='Path to raw 
YouTubeFaces dataset folder.') - parser.add_argument('-d', '--dest', dest='dest_data_path', type=str, - default=os.path.join(data_folder, 'YouTubeFaces'), - help='Folder path to store processed data') - parser.add_argument('--type', dest='data_type', type=str, required=True, - help='Data type to generate (train/test)') - args = parser.parse_args() - - source_path = args.raw_data_path - dest_path = os.path.join(args.dest_data_path, args.data_type, 'temp') - return source_path, dest_path - - -if __name__ == "__main__": - raw_data_path, dest_data_path = parse_args() - main(raw_data_path, dest_data_path) diff --git a/datasets/face_id/merge_vggface2_dataset.py b/datasets/face_id/merge_vggface2_dataset.py deleted file mode 100755 index 0db35b3a9..000000000 --- a/datasets/face_id/merge_vggface2_dataset.py +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env python3 -################################################################################################### -# -# Copyright (C) 2020-2023 Maxim Integrated Products, Inc. All Rights Reserved. -# -# Maxim Integrated Products, Inc. Default Copyright Notice: -# https://www.maximintegrated.com/en/aboutus/legal/copyrights.html -# -################################################################################################### -""" -Script to merge VGGFace-2 data samples into more compact file series to effectively use during -FaceID model training. -""" - -import argparse -import json -import os -import pickle - -import numpy as np - - -def save_dataset(data, merged_data_path, part_no): - """ - Function to save merged file. - """ - merged_file_path = os.path.join(merged_data_path, f'whole_set_{part_no:02d}.pkl') - with open(merged_file_path, 'wb') as handle: - pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL) - - -def main(data_path): # pylint: disable=too-many-locals - """ - Main function to iterate over the data samples to merge. 
- """ - img_size = (3, 160, 120) - - subj_list = sorted(os.listdir(os.path.join(data_path, 'temp'))) - part_no = 0 - dataset = {} - num_empty_subjs = 0 - - for i, subj in enumerate(subj_list): - if subj == 'merged': - print(f'Folder {subj} skipped') - continue - - if (i % 250) == 0: - print(f'{i} of {subj_list}') - if i > 0: - save_dataset(dataset, data_path, part_no) - dataset = {} - part_no += 1 - - if subj not in dataset: - dataset[subj] = {} - - subj_path = os.path.join(data_path, 'temp', subj) - if not os.path.isdir(subj_path): - continue - - if not os.listdir(subj_path): - print(f'Empty folder: {subj_path}') - num_empty_subjs += 1 - continue - - embedding_path = os.path.join(subj_path, 'embeddings.json') - with open(embedding_path, encoding='utf-8') as file: - embeddings = json.load(file) - - for img_name, emb in embeddings.items(): - img_path = os.path.join(subj_path, img_name) - img = np.load(img_path).transpose([2, 0, 1]) - - if img.shape == img_size: - if np.min(img) != np.max(img): - dataset[subj][img_name] = {'embedding': emb, 'img': img} - - if dataset: - save_dataset(dataset, data_path, part_no) - - -def parse_args(): - """Parses command line arguments""" - parser = argparse.ArgumentParser(description='Merge VGGFace-2 data samples to effectively use\ - during training/testing FaceID model.') - default_data_path = os.path.abspath(__file__) - for _ in range(3): - default_data_path = os.path.dirname(default_data_path) - default_data_path = os.path.join(default_data_path, 'data', 'VGGFace-2') - parser.add_argument('-p', '--data_path', dest='data_path', type=str, - default=default_data_path, - help='Folder path to processed data') - parser.add_argument('--type', dest='data_type', type=str, required=True, - help='Data type to generate (train/test)') - args = parser.parse_args() - - data_path = os.path.join(args.data_path, args.data_type) - return data_path - - -if __name__ == "__main__": - data_folder = parse_args() - main(data_folder) diff --git a/datasets/face_id/merge_youtubefaces_dataset.py b/datasets/face_id/merge_youtubefaces_dataset.py deleted file mode 100755 index a4eb1af5d..000000000 --- a/datasets/face_id/merge_youtubefaces_dataset.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python3 -################################################################################################### -# -# Copyright (C) 2020-2023 Maxim Integrated Products, Inc. All Rights Reserved. -# -# Maxim Integrated Products, Inc. Default Copyright Notice: -# https://www.maximintegrated.com/en/aboutus/legal/copyrights.html -# -################################################################################################### -""" -Script to merge YouTubeFaces data samples into more compact file series to effectively use during -FaceID model training. -""" - -import argparse -import json -import os -import pickle - -import numpy as np - - -def save_dataset(data, merged_data_path, part_no): - """ - Function to save merged file. - """ - merged_file_path = os.path.join(merged_data_path, f'whole_set_{part_no:02d}.pkl') - with open(merged_file_path, 'wb') as handle: - pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL) - - -def main(data_path): # pylint: disable=too-many-locals - """ - Main function to iterate over the data samples to merge. 
- """ - img_size = (3, 160, 120) - - num_imgs_per_face = 1 - - dataset = {} - part_no = 0 - - embedding_path = os.path.join(data_path, 'temp', 'embeddings.json') - with open(embedding_path, encoding='utf-8') as file: - embeddings = json.load(file) - - for i, (subj, val) in enumerate(embeddings.items()): # pylint: disable=too-many-nested-blocks - if (i % 200) == 0: - print(f'{i} of {len(embeddings)}') - if i > 0: - save_dataset(dataset, data_path, part_no) - dataset = {} - part_no += 1 - - if subj not in dataset: - dataset[subj] = {} - for video_num, val2 in val.items(): - img_folder = os.path.join(data_path, 'temp', subj, str(video_num)) - - if video_num not in dataset[subj]: - dataset[subj][video_num] = {} - - for img_name, embedding in val2.items(): - for idx in range(num_imgs_per_face): - img_name = '_'.join([img_name, str(img_size[1]), str(img_size[2]), str(idx)]) - img_path = os.path.join(img_folder, '.'.join([img_name, 'npy'])) - img = np.load(img_path).transpose([2, 0, 1]) - - if img.shape == img_size: - if np.min(img) != np.max(img): - dataset[subj][video_num][img_name] = {'embedding': embedding, - 'img': img} - - if dataset: - save_dataset(dataset, data_path, part_no) - - -def parse_args(): - """Parses command line arguments""" - parser = argparse.ArgumentParser(description='Merge YouTubeFaces data samples to effectively\ - use during training/testing FaceID model.') - default_data_path = os.path.abspath(__file__) - for _ in range(3): - default_data_path = os.path.dirname(default_data_path) - default_data_path = os.path.join(default_data_path, 'data', 'YouTubeFaces') - parser.add_argument('-p', '--data_path', dest='data_path', type=str, - default=default_data_path, - help='Folder path to processed data') - parser.add_argument('--type', dest='data_type', type=str, required=True, - help='Data type to generate (train/test)') - args = parser.parse_args() - - data_path = os.path.join(args.data_path, args.data_type) - return data_path - - -if __name__ == "__main__": - data_folder = parse_args() - main(data_folder) diff --git a/datasets/faceid.py b/datasets/faceid.py deleted file mode 100644 index 3f0039750..000000000 --- a/datasets/faceid.py +++ /dev/null @@ -1,77 +0,0 @@ -################################################################################################### -# -# Copyright (C) 2019-2023 Maxim Integrated Products, Inc. All Rights Reserved. -# -# Maxim Integrated Products, Inc. Default Copyright Notice: -# https://www.maximintegrated.com/en/aboutus/legal/copyrights.html -# -################################################################################################### -""" -Classes and functions used to utilize the Face ID dataset. -""" -import os - -from torchvision import transforms - -import ai8x -from datasets.vggface2 import VGGFace2Dataset -from datasets.youtube_faces import YouTubeFacesDataset - - -def faceid_get_datasets(data, load_train=True, load_test=True): - """ - Load the faceID dataset - - The dataset is loaded from the archive file, so the file is required for this version. - - The dataset consists of actually 2 different datasets, VGGFace2 for training and YouTubeFaces - for the test. The reason of this is proof-of-concept models are obtained by this way and the - losses At YTFaces are tracked for the sake of benchmarking. - - The images are all 3-color 160x120 sized and consist the face image. - """ - (data_dir, args) = data - - # These are hard coded for now, need to come from above in future. 
- train_resample_subj = 1 - train_resample_img_per_subj = 6 - test_resample_subj = 1 - test_resample_img_per_subj = 2 - train_data_dir = os.path.join(data_dir, 'VGGFace-2') - test_data_dir = os.path.join(data_dir, 'YouTubeFaces') - - transform = transforms.Compose([ - ai8x.normalize(args=args) - ]) - - if load_train: - train_dataset = VGGFace2Dataset(root_dir=train_data_dir, d_type='train', - transform=transform, - resample_subj=train_resample_subj, - resample_img_per_subj=train_resample_img_per_subj) - else: - train_dataset = None - - if load_test: - test_dataset = YouTubeFacesDataset(root_dir=test_data_dir, d_type='test', - transform=transform, - resample_subj=test_resample_subj, - resample_img_per_subj=test_resample_img_per_subj) - - if args.truncate_testset: - test_dataset.data = test_dataset.data[:1] # type: ignore # .data exists - else: - test_dataset = None - - return train_dataset, test_dataset - - -datasets = [ - { - 'name': 'FaceID', - 'input': (3, 160, 120), - 'output': ('id'), - 'regression': True, - 'loader': faceid_get_datasets, - }, -] diff --git a/datasets/vggface2.py b/datasets/vggface2.py index 6f872f967..24e95624a 100644 --- a/datasets/vggface2.py +++ b/datasets/vggface2.py @@ -1,6 +1,6 @@ ################################################################################################### # -# Copyright (C) 2019-2022 Maxim Integrated Products, Inc. All Rights Reserved. +# Copyright (C) 2019-2023 Maxim Integrated Products, Inc. All Rights Reserved. # # Maxim Integrated Products, Inc. Default Copyright Notice: # https://www.maximintegrated.com/en/aboutus/legal/copyrights.html @@ -10,80 +10,436 @@ VGGFace2: A Dataset for Recognising Faces across Pose and Age https://ieeexplore.ieee.org/abstract/document/8373813 """ + +import errno +import glob import os import pickle -import time import numpy as np import torch -from torch.utils import data +import torchvision.transforms.functional as FT +from torch.utils.data import Dataset +from torchvision import transforms + +import cv2 +import kornia.geometry.transform as GT +import onnxruntime +from hawk_eyes.face import RetinaFace +from PIL import Image +from skimage import transform as trans +from tqdm import tqdm + +import ai8x +from utils import augmentation_utils -class VGGFace2Dataset(data.Dataset): +class VGGFace2(Dataset): """ - VGGFace2: A Dataset for Recognising Faces across Pose and Age - https://ieeexplore.ieee.org/abstract/document/8373813 + VGGFace2 Dataset """ - def __init__( - self, - root_dir, - d_type, - transform=None, - resample_subj=1, - resample_img_per_subj=1, - ): - data_folder = os.path.join(root_dir, d_type) - assert os.path.isdir(data_folder), (f'No dataset at {data_folder}.' 
- ' Follow the steps at datasets/face_id/README.md') - - data_file_list = sorted([d for d in os.listdir(data_folder) if d.startswith('whole_set')]) - - self.sid_list = [] - self.embedding_list = [] - self.img_list = [] + def __init__(self, root_dir, d_type, mode, transform=None, + teacher_transform=None, img_size=(112, 112)): + + if d_type not in ('test', 'train'): + raise ValueError("d_type can only be set to 'test' or 'train'") + + if mode not in ('detection', 'identification', 'identification_dr'): + raise ValueError("mode can only be set to 'detection', 'identification'," + "or 'identification_dr'") + + self.root_dir = root_dir + self.d_type = d_type self.transform = transform + self.teacher_transform = teacher_transform + self.img_size = img_size + self.mode = mode + self.dataset_path = os.path.join(self.root_dir, "VGGFace-2") + self.__makedir_exist_ok(self.dataset_path) + self.count = 0 + self.tform = trans.SimilarityTransform() + self.src = np.array([ + [38.2946, 51.6963], + [73.5318, 51.5014], + [56.0252, 71.7366], + [41.5493, 92.3655], + [70.7299, 92.2041]], dtype=np.float32) + + self.__makedir_exist_ok(self.dataset_path) + self.__makedir_exist_ok(os.path.join(self.dataset_path, "processed")) + + if self.d_type in ('train', 'test'): + self.gt_path = os.path.join(self.dataset_path, "processed", + self.d_type+"_vggface2.pickle") + self.d_path = os.path.join(self.dataset_path, self.d_type) + if not os.path.exists(self.gt_path): + assert os.path.isdir(self.d_path), (f'No dataset at {self.d_path}.\n' + ' Please review the term and' + ' conditions at https://www.robots.ox.ac.uk/' + '~vgg/data/vgg_face2/ . Then, download the' + ' dataset and extract raw images to the' + ' train and test subfolders.\n' + ' Expected folder structure: \n' + ' - root_dir \n' + ' - VGGFace-2 \n' + ' - train \n' + ' - test \n') + + print("Extracting ground truth from the " + self.d_type + " set") + self.__extract_gt() + + f = open(self.gt_path, 'rb') + self.pickle_dict = pickle.load(f) + f.close() + + else: + print(f'Unknown data type: {self.d_type}') + return - subj_idx = 0 - n_elems = 0 + def __extract_gt(self): + """ + Extracts the ground truth from the dataset + """ + onnxruntime.set_default_logger_severity(3) # suppress onnxruntime warnings + retina = RetinaFace(model_name='retina_l', conf=0.5) + img_paths = list(glob.glob(os.path.join(self.d_path + '/**/', '*.jpg'), recursive=True)) + nf_number = 0 + n_words = 0 + pickle_dict = {key: [] for key in ["boxes", "landmarks", "img_list", "lbl_list"]} + pickle_dict["word2index"] = {} - t_start = time.time() - print('Data loading...') - for n_file, data_file in enumerate(data_file_list): - if ((n_file+1) % 5) == 0: - print(f'\t{n_file+1} of {len(data_file_list)}') - f_path = os.path.join(data_folder, data_file) + for jpg in tqdm(img_paths): + boxes = [] + image = cv2.imread(jpg) + bboxes, lndmrks = retina.detect(image) + if len(bboxes) == 0: + nf_number += 1 + continue - with open(f_path, 'rb') as f: - x = pickle.load(f) + for box in bboxes: + box = np.clip(box[:4], 0, None) + boxes.append(box) - for key in list(x)[::resample_subj]: - val = x[key] - for key2 in list(val)[::resample_img_per_subj]: - self.img_list.append(val[key2]['img']) - self.embedding_list.append(np.array(val[key2]['embedding']).astype(np.float32)) - self.sid_list.append(subj_idx) - n_elems += 1 - subj_idx += resample_subj + lndmrks = lndmrks[0] - t_end = time.time() - print(f'{n_elems} of data samples loaded in {t_end-t_start:.4f} seconds.') + dir_name = os.path.dirname(jpg) + lbl = 
os.path.relpath(dir_name, self.d_path) - def __normalize_data(self, data_item): - data_item = data_item.astype(np.float32) - data_item /= 256 - return data_item + if lbl not in pickle_dict["word2index"]: + pickle_dict["word2index"][lbl] = n_words + n_words += 1 + + pickle_dict["lbl_list"].append(lbl) + pickle_dict["boxes"].append(boxes) + pickle_dict["landmarks"].append(lndmrks) + pickle_dict["img_list"].append(os.path.relpath(jpg, self.dataset_path)) + if nf_number > 0: + print(f'Not found any faces in {nf_number} images ') + + with open(self.gt_path, 'wb') as f: + pickle.dump(pickle_dict, f) def __len__(self): - return len(self.img_list) + return len(self.pickle_dict["img_list"]) - 1 + + def __getitem__(self, index): + """ + Get the image and associated target according to the mode + """ + if index >= len(self): + raise IndexError + + if self.mode == 'detection': + return self.__getitem_detection(index) + + if self.mode == 'identification': + return self.__getitem_identification(index) + + if self.mode == 'identification_dr': + return self.__getitem_identification_dr(index) + + # Will never reached + return None + + def __getitem_detection(self, index): + """ + Get the image and associated target for face detection + """ + if torch.is_tensor(index): + index = index.tolist() + + img = Image.open(os.path.join(self.dataset_path, self.pickle_dict["img_list"][index])) + img = FT.to_tensor(img) + + boxes = self.pickle_dict["boxes"][index] + boxes = torch.as_tensor(boxes, dtype=torch.float32) + + img, boxes = augmentation_utils.resize(img, boxes, + dims=(self.img_size[0], self.img_size[1])) - def __getitem__(self, idx): - embedding = self.embedding_list[idx] - embedding = np.expand_dims(embedding, 1) - embedding = np.expand_dims(embedding, 2) - embedding *= 6.0 + labels = [1] * boxes.shape[0] - inp = torch.tensor(self.__normalize_data(self.img_list[idx]), dtype=torch.float) if self.transform is not None: - inp = self.transform(inp) + img = self.transform(img) + + boxes = boxes.clamp_(min=0, max=1) + labels = torch.as_tensor(labels, dtype=torch.int64) + + return img, (boxes, labels) + + def __getitem_identification(self, index): + """ + Get the image and associated target for face identification + """ + if torch.is_tensor(index): + index = index.tolist() + + lbl = self.pickle_dict["lbl_list"][index] + lbl_index = self.pickle_dict["word2index"][lbl] + lbl_index = torch.tensor(lbl_index, dtype=torch.long) + box = self.pickle_dict["boxes"][index][0] + img = Image.open(os.path.join(self.dataset_path, self.pickle_dict["img_list"][index])) + img_A = img.copy() + + # Apply transformation to the image that will be aligned + if self.teacher_transform is not None: + img_A = self.teacher_transform(img_A) + + # Apply transformation to the image that will be cropped + if self.transform is not None: + img = self.transform(img) + + # Use landmarks to estimate affine transformation + landmark = self.pickle_dict["landmarks"][index] + self.tform.estimate(landmark, self.src) + A = self.tform.params[0:2, :] + A = torch.as_tensor(A, dtype=torch.float32) + A = A.unsqueeze(0) + + # Apply affine transformation to obtain aligned image + img_A = GT.warp_affine(img_A.unsqueeze(0), A, (self.img_size[0], self.img_size[1])) + img_A = img_A.squeeze(0) + + # Convert bounding box to square + height = box[3] - box[1] + width = box[2] - box[0] + max_dim = max(height, width) + box[0] = np.clip(box[0] - (max_dim - width) / 2, 0, img.shape[2]) + box[1] = np.clip(box[1] - (max_dim - height) / 2, 0, img.shape[1]) + box[2] = 
np.clip(box[2] + (max_dim - width) / 2, 0, img.shape[2]) + box[3] = np.clip(box[3] + (max_dim - height) / 2, 0, img.shape[1]) + + # Crop image with the square bounding box + img_C = FT.crop(img=img, top=int(box[1]), left=int(box[0]), + height=int(box[3]-box[1]), width=int(box[2]-box[0])) + + # Check if the cropped image is square, if not, pad it + _, h, w = img_C.shape + if w != h: + max_dim = max(w, h) + h_padding = (max_dim - h) / 2 + w_padding = (max_dim - w) / 2 + l_pad = w_padding if w_padding % 1 == 0 else w_padding+0.5 + t_pad = h_padding if h_padding % 1 == 0 else h_padding+0.5 + r_pad = w_padding if w_padding % 1 == 0 else w_padding-0.5 + b_pad = h_padding if h_padding % 1 == 0 else h_padding-0.5 + padding = (int(l_pad), int(t_pad), int(r_pad), int(b_pad)) + img_C = FT.pad(img_C, padding, 0, 'constant') + + # Resize cropped image to the desired size + img_C = FT.resize(img_C, (self.img_size[0], self.img_size[1])) + + # Concatenate images + concat_img = torch.cat((img_C, img_A), 0) + + return concat_img, lbl_index + + def __getitem_identification_dr(self, index): + """ + Get the image and associated target for dimensionality reduction + """ + if torch.is_tensor(index): + index = index.tolist() + + lbl = self.pickle_dict["lbl_list"][index] + lbl_index = self.pickle_dict["word2index"][lbl] + lbl_index = torch.tensor(lbl_index, dtype=torch.long) + img = Image.open(os.path.join(self.dataset_path, self.pickle_dict["img_list"][index])) + + # Apply transformation to the image that will be aligned + if self.transform is not None: + img = self.transform(img) + + # Use landmarks to estimate affine transformation + landmark = self.pickle_dict["landmarks"][index] + self.tform.estimate(landmark, self.src) + A = self.tform.params[0:2, :] + A = torch.as_tensor(A, dtype=torch.float32) + A = A.unsqueeze(0) + + # Apply affine transformation to obtain aligned image + img = GT.warp_affine(img.unsqueeze(0), A, (self.img_size[0], self.img_size[1])) + img = img.squeeze(0) + + return img, lbl_index + + @staticmethod + def __makedir_exist_ok(dirpath): + """Make directory if not already exists + """ + try: + os.makedirs(dirpath) + except OSError as e: + if e.errno == errno.EEXIST: + pass + else: + raise + + @staticmethod + def collate_fn(batch): + """ + Since each image may have a different number of objects, we need a collate function + (to be passed to the DataLoader). + This describes how to combine these tensors of different sizes. We use lists. 
+ :param batch: an iterable of N sets from __getitem__() + :return: a tensor of images, lists of varying-size tensors of bounding boxes and labels + """ + images = [] + boxes_and_labels = [] + + for b in batch: + images.append(b[0]) + boxes_and_labels.append(b[1]) + + images = torch.stack(images, dim=0) + return images, boxes_and_labels + + +def VGGFace2_FaceID_get_datasets(data, load_train=True, load_test=True, img_size=(112, 112)): + + """ Returns FaceID Dataset + """ + (data_dir, args) = data + + train_transform = transforms.Compose([ + transforms.ToTensor(), + transforms.RandomHorizontalFlip(p=0.5), + transforms.ColorJitter(brightness=(0.6, 1.4), saturation=(0.6, 1.4), + contrast=(0.6, 1.4), hue=(-0.4, 0.4)), + transforms.RandomErasing(p=0.1), + ai8x.normalize(args=args)]) + + teacher_transform = transforms.Compose([ + transforms.ToTensor(), + ai8x.normalize(args=args)]) + + if load_train: + + train_dataset = VGGFace2(root_dir=data_dir, d_type='train', mode='identification', + transform=train_transform, teacher_transform=teacher_transform, + img_size=img_size) + + print(f'Train dataset length: {len(train_dataset)}\n') + else: + train_dataset = None + + if load_test: + test_transform = transforms.Compose([transforms.ToTensor(), + ai8x.normalize(args=args)]) + + test_dataset = VGGFace2(root_dir=data_dir, d_type='test', mode='identification', + transform=test_transform, teacher_transform=teacher_transform, + img_size=img_size) + + print(f'Test dataset length: {len(test_dataset)}\n') + else: + test_dataset = None + + return train_dataset, test_dataset + + +def VGGFace2_FaceID_dr_get_datasets(data, load_train=True, load_test=True, img_size=(112, 112)): + + """ Returns FaceID Dataset for dimensionality reduction + """ + (data_dir, args) = data + + train_transform = transforms.Compose([ + transforms.ToTensor(), + transforms.RandomHorizontalFlip(p=0.5), + ai8x.normalize(args=args)]) + + if load_train: + + train_dataset = VGGFace2(root_dir=data_dir, d_type='train', mode='identification_dr', + transform=train_transform, img_size=img_size) + + print(f'Train dataset length: {len(train_dataset)}\n') + else: + train_dataset = None + + if load_test: + test_transform = transforms.Compose([transforms.ToTensor(), + ai8x.normalize(args=args)]) + + test_dataset = VGGFace2(root_dir=data_dir, d_type='test', mode='identification_dr', + transform=test_transform, img_size=img_size) + + print(f'Test dataset length: {len(test_dataset)}\n') + else: + test_dataset = None + + return train_dataset, test_dataset + + +def VGGFace2_Facedet_get_datasets(data, load_train=True, load_test=True, img_size=(224, 168)): + + """ Returns FaceDetection Dataset + """ + (data_dir, args) = data + + if load_train: + train_transform = transforms.Compose([ + ai8x.normalize(args=args)]) + + train_dataset = VGGFace2(root_dir=data_dir, d_type='train', mode='detection', + transform=train_transform, img_size=img_size) + + print(f'Train dataset length: {len(train_dataset)}\n') + else: + train_dataset = None + + if load_test: + test_transform = transforms.Compose([ai8x.normalize(args=args)]) + + test_dataset = VGGFace2(root_dir=data_dir, d_type='test', mode='detection', + transform=test_transform, img_size=img_size) + + print(f'Test dataset length: {len(test_dataset)}\n') + else: + test_dataset = None + + return train_dataset, test_dataset + - return inp, torch.tensor(embedding, dtype=torch.float) +datasets = [ + { + 'name': 'VGGFace2_FaceID', + 'input': (3, 112, 112), + 'output': ('id'), + 'loader': VGGFace2_FaceID_get_datasets, + }, + { + 
'name': 'VGGFace2_FaceID_dr', + 'input': (3, 112, 112), + 'output': [*range(0, 8631, 1)], + 'loader': VGGFace2_FaceID_dr_get_datasets, + }, + { + 'name': 'VGGFace2_FaceDetection', + 'input': (3, 224, 168), + 'output': ([1]), + 'loader': VGGFace2_Facedet_get_datasets, + 'collate': VGGFace2.collate_fn + } +] diff --git a/datasets/vggface2_facedet.py b/datasets/vggface2_facedet.py deleted file mode 100644 index d63a4dc2b..000000000 --- a/datasets/vggface2_facedet.py +++ /dev/null @@ -1,214 +0,0 @@ -################################################################################################### -# -# Copyright (C) 2023 Maxim Integrated Products, Inc. All Rights Reserved. -# -# Maxim Integrated Products, Inc. Default Copyright Notice: -# https://www.maximintegrated.com/en/aboutus/legal/copyrights.html -# -################################################################################################### -""" -VGGFace2: A Dataset for Recognising Faces across Pose and Age -https://ieeexplore.ieee.org/abstract/document/8373813 -""" - - -import errno -import glob -import os -import pickle - -import torch -from torch.utils.data import Dataset -from torchvision import transforms - -from PIL import Image -from tqdm import tqdm - -import ai8x -from datasets.face_id.facenet_pytorch import MTCNN - - -class VGGFace2_FaceDetectionDataset(Dataset): - """ - VGGFace2 Dataset for face detection - - MTCNN is used to extract the ground truth from the dataset as it provides - the ground truth for multiple faces in an image. - - GT Format: 0-3:Box Coordinates - - """ - def __init__(self, root_dir, d_type, transform=None, img_size=(224, 168)): - - if d_type not in ('test', 'train'): - raise ValueError("d_type can only be set to 'test' or 'train'") - - self.root_dir = root_dir - self.d_type = d_type - self.transform = transform - self.img_size = img_size - self.dataset_path = os.path.join(self.root_dir, "VGGFace-2") - self.__makedir_exist_ok(self.dataset_path) - self.__makedir_exist_ok(os.path.join(self.dataset_path, "processed")) - - if self.d_type in ('train', 'test'): - self.gt_path = os.path.join(self.dataset_path, "processed", self.d_type+"_gt.pickle") - self.d_path = os.path.join(self.dataset_path, self.d_type) - if not os.path.exists(self.gt_path): - assert os.path.isdir(self.d_path), (f'No dataset at {self.d_path}.\n' - ' Please review the term and' - ' conditions at https://www.robots.ox.ac.uk/' - '~vgg/data/vgg_face2/ . 
Then, download the' - ' dataset and extract raw images to the' - ' train and test subfolders.\n' - ' Expected folder structure: \n' - ' - root_dir \n' - ' - VGGFace-2 \n' - ' - train \n' - ' - test \n') - - print("Extracting ground truth from the " + self.d_type + " set") - self.__extract_gt() - - else: - print(f'Unknown data type: {self.d_type}') - return - - f = open(self.gt_path, 'rb') - self.pickle_dict = pickle.load(f) - f.close() - - def __extract_gt(self): - """ - Extracts the ground truth from the dataset - """ - mtcnn = MTCNN() - img_paths = list(glob.glob(os.path.join(self.d_path + '/**/', '*.jpg'), recursive=True)) - nf_number = 0 - pickle_dict = {key: [] for key in ["gt", "img_list"]} - - for jpg in tqdm(img_paths): - img = Image.open(jpg) - img = img.resize((self.img_size[1], self.img_size[0])) - # pylint: disable-next=unbalanced-tuple-unpacking - gt, _ = mtcnn.detect(img, landmarks=False) # type: ignore # returns tuple of 2 - - if gt is None or None in gt: - nf_number += 1 - continue - - pickle_dict["gt"].append(gt) - pickle_dict["img_list"].append(os.path.relpath(jpg, self.dataset_path)) - - if nf_number > 0: - print(f'Not found any faces in {nf_number} images ') - - with open(self.gt_path, 'wb') as f: - pickle.dump(pickle_dict, f) - - def __len__(self): - return len(self.pickle_dict["img_list"]) - 1 - - def __getitem__(self, index): - if index >= len(self): - raise IndexError - - if torch.is_tensor(index): - index = index.tolist() - - img = Image.open(os.path.join(self.dataset_path, self.pickle_dict["img_list"][index])) - - ground_truth = self.pickle_dict["gt"][index] - - lbls = [1] * ground_truth.shape[0] - if self.transform is not None: - img = self.transform(img) - for box in ground_truth: - box[0] = box[0] / self.img_size[1] - box[2] = box[2] / self.img_size[1] - box[1] = box[1] / self.img_size[0] - box[3] = box[3] / self.img_size[0] - - boxes = torch.as_tensor(ground_truth, dtype=torch.float32) - boxes = boxes.clamp_(min=0, max=1) - - labels = torch.as_tensor(lbls, dtype=torch.int64) - - return img, (boxes, labels) - - @staticmethod - def collate_fn(batch): - """ - Since each image may have a different number of objects, we need a collate function - (to be passed to the DataLoader). - This describes how to combine these tensors of different sizes. We use lists. 
- :param batch: an iterable of N sets from __getitem__() - :return: a tensor of images, lists of varying-size tensors of bounding boxes and labels - """ - images = [] - boxes_and_labels = [] - - for b in batch: - images.append(b[0]) - boxes_and_labels.append(b[1]) - - images = torch.stack(images, dim=0) - return images, boxes_and_labels - - @staticmethod - def __makedir_exist_ok(dirpath): - """Make directory if not already exists - """ - try: - os.makedirs(dirpath) - except OSError as e: - if e.errno == errno.EEXIST: - pass - else: - raise - - -def VGGFace2_Facedet_get_datasets(data, load_train=True, load_test=True, img_size=(224, 168)): - - """ Returns FaceDetection Dataset - """ - (data_dir, args) = data - - if load_train: - train_transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Resize(img_size), - ai8x.normalize(args=args) - ]) - - train_dataset = VGGFace2_FaceDetectionDataset(root_dir=data_dir, d_type='train', - transform=train_transform, img_size=img_size) - - print(f'Train dataset length: {len(train_dataset)}\n') - else: - train_dataset = None - - if load_test: - test_transform = transforms.Compose([transforms.ToTensor(), - transforms.Resize(img_size), - ai8x.normalize(args=args)]) - - test_dataset = VGGFace2_FaceDetectionDataset(root_dir=data_dir, d_type='test', - transform=test_transform, img_size=img_size) - - print(f'Test dataset length: {len(test_dataset)}\n') - else: - test_dataset = None - - return train_dataset, test_dataset - - -datasets = [ - { - 'name': 'VGGFace2_FaceDetection', - 'input': (3, 224, 168), - 'output': ([1]), - 'loader': VGGFace2_Facedet_get_datasets, - 'collate': VGGFace2_FaceDetectionDataset.collate_fn - } -] diff --git a/datasets/youtube_faces.py b/datasets/youtube_faces.py deleted file mode 100644 index 6bf4bee58..000000000 --- a/datasets/youtube_faces.py +++ /dev/null @@ -1,91 +0,0 @@ -################################################################################################### -# -# Copyright (C) 2019-2022 Maxim Integrated Products, Inc. All Rights Reserved. -# -# Maxim Integrated Products, Inc. Default Copyright Notice: -# https://www.maximintegrated.com/en/aboutus/legal/copyrights.html -# -################################################################################################### -""" -YouTube Faces Dataset -https://www.cs.tau.ac.il/~wolf/ytfaces/ -""" -import os -import pickle -import time - -import numpy as np -import torch -from torch.utils import data - - -class YouTubeFacesDataset(data.Dataset): - """ - YouTube Faces Dataset - https://www.cs.tau.ac.il/~wolf/ytfaces/ - """ - def __init__( - self, - root_dir, - d_type, - transform=None, - resample_subj=1, - resample_img_per_subj=1, - ): - data_folder = os.path.join(root_dir, d_type) - assert os.path.isdir(data_folder), (f'No dataset at {data_folder}.' 
- ' Follow the steps at datasets/face_id/README.md') - - data_file_list = sorted([d for d in os.listdir(data_folder) if d.startswith('whole_set')]) - - self.sid_list = [] - self.embedding_list = [] - self.img_list = [] - self.transform = transform - - subj_idx = 0 - n_elems = 0 - - t_start = time.time() - print('Data loading...') - for n_file, data_file in enumerate(data_file_list): - print(f'\t{n_file+1} of {len(data_file_list)}') - f_path = os.path.join(data_folder, data_file) - - with open(f_path, 'rb') as f: - x = pickle.load(f) - - for key in list(x)[::resample_subj]: - val = x[key] - for key2 in list(val)[::resample_img_per_subj]: - for key3 in list(val[key2]): - self.img_list.append(val[key2][key3]['img']) - self.embedding_list.append( - np.array(val[key2][key3]['embedding']).astype(np.float32) - ) - self.sid_list.append(subj_idx) - n_elems += 1 - subj_idx += resample_subj - - t_end = time.time() - print(f'{n_elems} of data samples loaded in {t_end-t_start:.4f} seconds.') - - def __normalize_data(self, data_item): - data_item = data_item.astype(np.float32) - data_item /= 256 - return data_item - - def __len__(self): - return len(self.img_list) - - def __getitem__(self, idx): - embedding = self.embedding_list[idx] - embedding = np.expand_dims(embedding, 1) - embedding = np.expand_dims(embedding, 2) - embedding *= 6.0 - - inp = torch.tensor(self.__normalize_data(self.img_list[idx]), dtype=torch.float) - if self.transform is not None: - inp = self.transform(inp) - - return inp, torch.tensor(embedding, dtype=torch.float) diff --git a/distiller b/distiller index ea4436dba..0477a66ef 160000 --- a/distiller +++ b/distiller @@ -1 +1 @@ -Subproject commit ea4436dba4bc17a60216f27c8ded69b7faa5f501 +Subproject commit 0477a66ef0ace09f5572f27c0178ea422ed9bf4e diff --git a/docs/FacialRecognitionSystem.md b/docs/FacialRecognitionSystem.md new file mode 100644 index 000000000..92cb03eff --- /dev/null +++ b/docs/FacialRecognitionSystem.md @@ -0,0 +1,102 @@ +# Facial Recognition System + +This document aims to explain facial recognition applications for MAX7800x series microcontrollers. Facial recognition task consists from three main parts, face detection, face identification and dot product. + + - The face detection model detects faces in the captured image and extracts a rectangular sub-image containing only one face. + - The face Identification model identifies a person from their facial images by generating the embedding for a given face image. + - The dot product layer outputs the dot product representing the similarity between the embedding from the given image and embeddings in the database. + +Figure 1 depicts the facial recognition system sequential diagram. + + + +​Figure 1. MAX7800x facial recognition system + +## Dataset + +The first step will be the dataset preparation. The dataset is VGGFace-2 [1]. +Please review the term and conditions at [VGGFace2](https://www.robots.ox.ac.uk/~vgg/data/vgg_face2/). Then, download the dataset and extract raw images to the train and test subfolders. + +Expected folder structure: + + - root_dir + - VGGFace-2 + - train + - test + +FaceID and Face Detection tasks share the same ground truth pickle, and it will be automatically generated when one of these tasks started. + +## Face Detection + +To be able to localize faces in a facial recognition system, a face detection algorithm is generally used in facial recognition systems. Face detection is an object detection problem that has various solutions in the literature. 
+
+## Face Detection
+
+A face detection algorithm is generally used to localize faces in a facial recognition system. Face detection is an object detection problem with various solutions in the literature. In this work, a face detection algorithm that runs on MAX7800x series microcontrollers with real-time performance was targeted.
+
+For the digit detection problem, a Tiny SSD [2] based MAX7800x object detection model, named Tinier SSD, had previously been developed. The face detection model is a modified version of this digit detection model; the modifications reduce the number of parameters and enable a larger input size.
+
+To train the face detection model, the "scripts/train_facedet_tinierssd.sh" script can be used.
+
+## FaceID
+
+Training a FaceID model for MAX7800x microcontrollers involves multiple steps. Since the MAX7800x FaceID models are trained in a knowledge distillation fashion, the first step is downloading a backbone checkpoint for the teacher model.
+
+The suggested teacher model is IR-152, but the other teacher models defined in "model_irse_drl.py" may be used as well. Please review the terms and conditions at the face.evoLVe [3] repository, and download the checkpoint that corresponds to your teacher model selection.
+
+There are two FaceID models: one for the MAX78000 and one for the MAX78002. The MAX78000 model, named faceid_112, is relatively lightweight. To get more performance out of the MAX78002, a more complex model named mobilefacenet_112 was developed. To train the FaceID models, the "scripts/train_faceid_112.sh" and "scripts/train_mobilefacenet_112.sh" scripts can be used, respectively. The training scripts perform the Dimensionality Reduction and Relation-Based Knowledge Distillation steps automatically; both steps are summarized in the following sub-sections.
+
+### Dimensionality Reduction on the Teacher Model
+
+Reducing the embedding dimensionality can greatly reduce the post-processing operations and memory usage of the facial recognition system. To achieve this, the teacher backbone is frozen and two additional Conv1d layers, called dimension reduction layers, are added to the teacher model. In the example in the repository, the length of the embeddings produced by the teacher model is 512, and the optimum length for the student model was found to be 64. Other choices such as 32, 128 or 256 can still be examined for different application areas. A summary of the dimensionality reduction is shown in Figure 2, and the dimension reduction layers are detailed in Table 1.
+
+
+
+Figure 2. Dimensionality Reduction
+
+
+
+Table 1. Dimension Reduction Layers
+
+| Layer1                                | Layer2                               |
+|---------------------------------------|--------------------------------------|
+| Conv1d(In=512ch, Out=512ch, Kernel=1) | Conv1d(In=512ch, Out=64ch, Kernel=1) |
+| BatchNorm1d(512)                      | BatchNorm1d(64)                      |
+| PReLU(512)                            |                                      |
+
+
+
+The dimension reduction layers are trained with the Sub-Center ArcFace loss, which was presented in [4]; a summary of the training framework is shown in Figure 3. The loss function uses cosine similarity as the distance metric, and in this framework the embedding network is trained as part of a classification problem. The normalized sub-centers (also known as the prototypes) must be learned from scratch, as no model is available to extract embeddings at the beginning.
+
+
+
+Figure 3. Sub-Center ArcFace Loss [4]
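+
+For reference, the following sketch condenses the dimension reduction layers of Table 1 and the loss setup into a few lines of PyTorch. It is an illustration only: the class name, the number of identities, and the hyperparameter values are assumptions, while the actual implementation lives in "models/model_irse_drl.py" and "train.py" (see the `--dr`, `--scaf-lr`, `--scaf-scale`, and `--scaf-margin` arguments).
+
+```python
+# Illustrative sketch of the dimension reduction layers (Table 1) and the
+# Sub-Center ArcFace loss setup. Names and values are assumptions.
+import torch
+from torch import nn
+from pytorch_metric_learning import losses
+
+
+class DimensionReduction(nn.Module):
+    """Two Conv1d dimension reduction layers (512 -> 64), as in Table 1."""
+    def __init__(self, dimensionality=64):
+        super().__init__()
+        self.layer1 = nn.Sequential(nn.Conv1d(512, 512, 1), nn.BatchNorm1d(512),
+                                    nn.PReLU(512))
+        self.layer2 = nn.Sequential(nn.Conv1d(512, dimensionality, 1),
+                                    nn.BatchNorm1d(dimensionality))
+
+    def forward(self, x):    # x: (batch, 512) embeddings from the frozen teacher backbone
+        x = x.unsqueeze(2)   # Conv1d expects (batch, channels, length)
+        x = self.layer2(self.layer1(x))
+        return x.squeeze(2)  # (batch, dimensionality)
+
+
+drl = DimensionReduction(dimensionality=64)
+# The loss holds the learnable sub-centers, so it gets its own optimizer (--scaf-lr).
+criterion = losses.SubCenterArcFaceLoss(num_classes=8631,  # assumed number of training identities
+                                        embedding_size=64, margin=28.6, scale=32)
+loss_optimizer = torch.optim.Adam(criterion.parameters(), lr=1e-2)
+```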
+
+### Relation-Based Knowledge Distillation
+
+The knowledge distillation approach chosen for the FaceID models is a relation-based one. The distillation loss is calculated as the MSE between the outputs of the teacher model and the student model.
+
+No student loss is used when training the student FaceID models, so the student loss weight is set to 0.
+
+Figure 4 illustrates the relation-based knowledge distillation scheme.
+
+
+
+Figure 4. Relation-Based Knowledge Distillation [5]
+
+
+
+## Dot Product Layer
+
+The weights of the dot product layer are populated with the embeddings generated by the MAX7800x FaceID models. The outputs of the FaceID models are normalized at both inference and recording time, so the result of the dot product layer equals the cosine similarity. Using the cosine similarity as a distance metric, the image is identified as either one of the known subjects or 'Unknown', depending on the embedding distances. To record new people into the database, there are two options. The first is to use the Python scripts that are available in the SDK demos. The second is the "record on hardware" mode, which does not require any external connection. The second option is not available on all platforms, so please check the SDK demo READMEs to see whether it is supported.
+
+
+
+## References
+
+[1] [Cao, Qiong, et al. "Vggface2: A dataset for recognising faces across pose and age." 2018 13th IEEE international conference on automatic face & gesture recognition (FG 2018). IEEE, 2018.](https://arxiv.org/abs/1710.08092)
+
+[2] [A. Womg, M. J. Shafiee, F. Li and B. Chwyl, "Tiny SSD: A Tiny Single-Shot Detection Deep Convolutional Neural Network for Real-Time Embedded Object Detection," 2018 15th Conference on Computer and Robot Vision (CRV), Toronto, ON, Canada, 2018, pp. 95-101, doi: 10.1109/CRV.2018.00023.](https://ieeexplore.ieee.org/document/8575741)
+
+[3] [face.evoLVe, High-Performance Face Recognition Library on PaddlePaddle & PyTorch](https://github.com/ZhaoJ9014/face.evoLVe)
+
+[4] [Deng, Jiankang, et al. "Arcface: Additive angular margin loss for deep face recognition." Proceedings of the IEEE/CVF conference on computer vision and pattern recognition. 2019.](https://arxiv.org/abs/1801.07698)
+
+[5] [Gou, Jianping, et al. "Knowledge distillation: A survey." International Journal of Computer Vision 129 (2021): 1789-1819.](https://arxiv.org/abs/2006.05525)
diff --git a/docs/RelationBasedKD.png b/docs/RelationBasedKD.png
new file mode 100644
index 000000000..a32f42c52
Binary files /dev/null and b/docs/RelationBasedKD.png differ
diff --git a/docs/SubCenterArcFaceLoss.png b/docs/SubCenterArcFaceLoss.png
new file mode 100644
index 000000000..11581aa42
Binary files /dev/null and b/docs/SubCenterArcFaceLoss.png differ
diff --git a/docs/dimensionreductionlayers.png b/docs/dimensionreductionlayers.png
new file mode 100644
index 000000000..f7a0b2b4d
Binary files /dev/null and b/docs/dimensionreductionlayers.png differ
diff --git a/docs/facialrecognition.png b/docs/facialrecognition.png
new file mode 100644
index 000000000..4ee5c8035
Binary files /dev/null and b/docs/facialrecognition.png differ
diff --git a/models/ai85net-faceid_112.py b/models/ai85net-faceid_112.py
new file mode 100644
index 000000000..eeb33e2bd
--- /dev/null
+++ b/models/ai85net-faceid_112.py
@@ -0,0 +1,141 @@
+###################################################################################################
+#
+# Copyright (C) 2019-2023 Maxim Integrated Products, Inc. All Rights Reserved.
+#
+# Maxim Integrated Products, Inc.
Default Copyright Notice: +# https://www.maximintegrated.com/en/aboutus/legal/copyrights.html +# +################################################################################################### +""" +FaceID networks for MAX78000 + +""" +import torch.nn.functional as F +from torch import nn + +import ai8x +import ai8x_blocks + + +class AI85FaceIDNet_112(nn.Module): + """ + FaceID Network for MAX78000 with 112x112 input + """ + + def __init__( # pylint: disable=too-many-arguments + self, + pre_layer_stride, + bottleneck_settings, + last_layer_width, + emb_dimensionality, + num_classes=None, # pylint: disable=unused-argument + avg_pool_size=(7, 7), + num_channels=3, + dimensions=(112, 112), # pylint: disable=unused-argument + bias=False, + depthwise_bias=False, + reduced_depthwise_bias=False, + **kwargs + ): + super().__init__() + # bias = False due to streaming + self.pre_stage = ai8x.FusedConv2dReLU(num_channels, bottleneck_settings[0][1], 3, + padding=1, stride=pre_layer_stride, + bias=False, **kwargs) + # bias = False due to streaming + self.pre_stage_2 = ai8x.FusedMaxPoolConv2dReLU(bottleneck_settings[0][1], + bottleneck_settings[0][1], 3, padding=1, + stride=1, pool_size=2, pool_stride=2, + bias=False, **kwargs) + self.feature_stage = nn.ModuleList([]) + for setting in bottleneck_settings: + self._create_bottleneck_stage(setting, bias, depthwise_bias, + reduced_depthwise_bias, **kwargs) + + self.post_stage = ai8x.FusedConv2dReLU(bottleneck_settings[-1][2], last_layer_width, 1, + padding=0, stride=1, bias=False, **kwargs) + + self.pre_avg = ai8x.Conv2d(last_layer_width, last_layer_width, 3, padding=1, stride=1, + bias=False, **kwargs) + self.avg_pool = ai8x.AvgPool2d(avg_pool_size, stride=1) + self.linear = ai8x.Linear(last_layer_width, emb_dimensionality, bias=bias, **kwargs) + + def _create_bottleneck_stage(self, setting, bias, depthwise_bias, + reduced_depthwise_bias, **kwargs): + """Function to create bottlencek stage. Setting format is: + [num_repeat, in_channels, out_channels, stride, expansion_factor] + """ + stage = [] + + if setting[0] > 0: + stage.append(ai8x_blocks.ConvResidualBottleneck(in_channels=setting[1], + out_channels=setting[2], + stride=setting[3], + expansion_factor=setting[4], bias=bias, + depthwise_bias=depthwise_bias, + **kwargs)) + + for i in range(1, setting[0]): + if reduced_depthwise_bias: + stage.append(ai8x_blocks.ConvResidualBottleneck(in_channels=setting[2], + out_channels=setting[2], + stride=1, + expansion_factor=setting[4], + bias=bias, + depthwise_bias=(i % 2 == 0) and + depthwise_bias, **kwargs)) + else: + stage.append(ai8x_blocks.ConvResidualBottleneck(in_channels=setting[2], + out_channels=setting[2], + stride=1, + expansion_factor=setting[4], + bias=bias, + depthwise_bias=depthwise_bias, + **kwargs)) + + self.feature_stage.append(nn.Sequential(*stage)) + + def forward(self, x): # pylint: disable=arguments-differ + """Forward prop""" + if x.shape[1] == 6: + x = x[:, 0:3, :, :] + x = self.pre_stage(x) + x = self.pre_stage_2(x) + for stage in self.feature_stage: + x = stage(x) + x = self.post_stage(x) + x = self.pre_avg(x) + x = self.avg_pool(x) + x = x.view(x.size(0), -1) + x = self.linear(x) + x = F.normalize(x, p=2, dim=1) + return x + + +def ai85faceidnet_112(pretrained=False, **kwargs): + """ + Constructs a FaceIDNet_112 model. 
+ """ + assert not pretrained + # settings for bottleneck stages in format + # [num_repeat, in_channels, out_channels, stride, expansion_factor] + bottleneck_settings = [ + [1, 32, 48, 2, 2], + [1, 48, 64, 2, 4], + [1, 64, 64, 1, 2], + [1, 64, 96, 2, 4], + [1, 96, 128, 1, 2] + ] + + return AI85FaceIDNet_112(pre_layer_stride=1, bottleneck_settings=bottleneck_settings, + last_layer_width=128, emb_dimensionality=64, avg_pool_size=(7, 7), + depthwise_bias=True, reduced_depthwise_bias=True, **kwargs) + + +models = [ + { + 'name': 'ai85faceidnet_112', + 'min_input': 1, + 'dim': 3, + } +] diff --git a/models/ai87net-mobilefacenet_112.py b/models/ai87net-mobilefacenet_112.py new file mode 100644 index 000000000..c039d1e3e --- /dev/null +++ b/models/ai87net-mobilefacenet_112.py @@ -0,0 +1,140 @@ +################################################################################################### +# +# Copyright (C) 2023 Maxim Integrated Products, Inc. All Rights Reserved. +# +# Maxim Integrated Products, Inc. Default Copyright Notice: +# https://www.maximintegrated.com/en/aboutus/legal/copyrights.html +# +################################################################################################### +""" +MobileFaceNet [1] network implementation for MAX78002. + +[1] Chen, Sheng, et al. "Mobilefacenets: Efficient cnns for accurate real-time face verification +on mobile devices." Biometric Recognition: 13th Chinese Conference, CCBR 2018, Urumqi, China, +August 11-12, 2018, Proceedings 13. Springer International Publishing, 2018. +""" +import torch.nn.functional as F +from torch import nn + +import ai8x +import ai8x_blocks + + +class AI87MobileFaceNet(nn.Module): + """ + MobileFaceNet for MAX78002 + """ + def __init__( # pylint: disable=too-many-arguments + self, + pre_layer_stride, + bottleneck_settings, + last_layer_width, + emb_dimensionality, + num_classes=None, # pylint: disable=unused-argument + avg_pool_size=(7, 7), + num_channels=3, + dimensions=(112, 112), # pylint: disable=unused-argument + bias=False, + depthwise_bias=False, + reduced_depthwise_bias=False, + **kwargs + ): + super().__init__() + + # bias = False due to streaming + self.pre_stage = ai8x.FusedConv2dReLU(num_channels, bottleneck_settings[0][1], 3, + padding=1, stride=pre_layer_stride, + bias=False, **kwargs) + + self.dwise = ai8x.FusedMaxPoolDepthwiseConv2dReLU(64, 64, 3, padding=1, stride=1, + pool_size=2, pool_stride=2, + bias=depthwise_bias, **kwargs) + self.feature_stage = nn.ModuleList([]) + for setting in bottleneck_settings: + self._create_bottleneck_stage(setting, bias, depthwise_bias, + reduced_depthwise_bias, **kwargs) + + self.post_stage = ai8x.FusedConv2dReLU(bottleneck_settings[-1][2], last_layer_width, 1, + padding=0, stride=1, bias=False, **kwargs) + self.classifier = ai8x.FusedAvgPoolConv2d(last_layer_width, emb_dimensionality, + 1, padding=0, stride=1, pool_size=avg_pool_size, + pool_stride=1, bias=False, wide=False, + **kwargs) + + def _create_bottleneck_stage(self, setting, bias, depthwise_bias, + reduced_depthwise_bias, **kwargs): + """Function to create bottlencek stage. 
Setting format is: + [num_repeat, in_channels, out_channels, stride, expansion_factor] + """ + stage = [] + + if setting[0] > 0: + stage.append(ai8x_blocks.ResidualBottleneck(in_channels=setting[1], + out_channels=setting[2], + stride=setting[3], + expansion_factor=setting[4], + bias=bias, depthwise_bias=depthwise_bias, + **kwargs)) + + for i in range(1, setting[0]): + if reduced_depthwise_bias: + stage.append(ai8x_blocks.ResidualBottleneck(in_channels=setting[2], + out_channels=setting[2], + stride=1, + expansion_factor=setting[4], + bias=bias, + depthwise_bias=(i % 2 == 0) and + depthwise_bias, **kwargs)) + else: + stage.append(ai8x_blocks.ResidualBottleneck(in_channels=setting[2], + out_channels=setting[2], + stride=1, + expansion_factor=setting[4], + bias=bias, + depthwise_bias=depthwise_bias, + **kwargs)) + + self.feature_stage.append(nn.Sequential(*stage)) + + def forward(self, x): # pylint: disable=arguments-differ + """Forward prop""" + if x.shape[1] == 6: + x = x[:, 0:3, :, :] + x = self.pre_stage(x) + x = self.dwise(x) + for stage in self.feature_stage: + x = stage(x) + x = self.post_stage(x) + x = self.classifier(x) + x = F.normalize(x, p=2, dim=1) + x = x.squeeze() + return x + + +def ai87netmobilefacenet_112(pretrained=False, **kwargs): + """ + Constructs a MobileFaceNet model. + """ + assert not pretrained + # settings for bottleneck stages in format + # [num_repeat, in_channels, out_channels, stride, expansion_factor] + bottleneck_settings = [ + [5, 64, 64, 2, 2], + [1, 64, 128, 2, 4], + [6, 128, 128, 1, 2], + [1, 128, 128, 2, 4], + [2, 128, 128, 1, 2] + ] + + return AI87MobileFaceNet(pre_layer_stride=1, bottleneck_settings=bottleneck_settings, + last_layer_width=128, emb_dimensionality=64, avg_pool_size=(7, 7), + depthwise_bias=True, reduced_depthwise_bias=True, **kwargs) + + +models = [ + { + 'name': 'ai87netmobilefacenet_112', + 'min_input': 1, + 'dim': 3, + }, +] diff --git a/models/model_irse_drl.py b/models/model_irse_drl.py new file mode 100644 index 000000000..eba2b0795 --- /dev/null +++ b/models/model_irse_drl.py @@ -0,0 +1,430 @@ +################################################################################################### +# +# MIT License +# +# Copyright (c) 2019 Jian Zhao +# Portions Copyright (C) 2023-2024 Maxim Integrated Products, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +################################################################################################### +""" +FaceID Teacher Model to be used for Knowledge Distillation +""" +from collections import namedtuple + +import torch +import torch.nn.functional as F +import torchvision.transforms.functional as FT +from torch import nn + + +class DRL(nn.Module): + """ + Dimensionality reduction layers + Expects unnormalized 512 embeddings from the Teacher Model + """ + def __init__( + self, + dimensionality, + bias=True, + ): + super().__init__() + self.conv1 = nn.Conv1d(512, 512, 1, padding=0, bias=bias) + self.BN1 = nn.BatchNorm1d(512) + self.PRelu1 = nn.PReLU(512) + self.conv2 = nn.Conv1d(512, dimensionality, 1, padding=0, bias=bias) + self.BN2 = nn.BatchNorm1d(dimensionality) + + def forward(self, x): # pylint: disable=arguments-differ + """Forward prop""" + x = torch.unsqueeze(x, 2) + x = self.conv1(x) + x = self.BN1(x) + x = self.PRelu1(x) + x = self.conv2(x) + x = self.BN2(x) + x = torch.squeeze(x, 2) + return x + + +class Ensemble(nn.Module): + """ + Ensemble of Teacher and DRL + """ + def __init__(self, resnet, drl): + super().__init__() + self.resnet = resnet + self.DRL = drl + self.Teacher_mode = False + + def forward(self, x): + """Forward prop""" + if x.shape[1] == 6: + if not self.Teacher_mode: + self.Teacher_mode = True + x = x[:, 3:, :, :] + x_flip = FT.hflip(x) + x = torch.cat((x, x_flip), 0) + x = self.resnet(x) + x = self.DRL(x) + if self.Teacher_mode: + x = x[:x.shape[0]//2] + x[x.shape[0]//2:] # Flip fusion + x = F.normalize(x, p=2, dim=1) + return x + + +class Flatten(nn.Module): + """Flattens the input""" + def forward(self, x): + """Forward prop""" + return x.view(x.size(0), -1) + + +def l2_norm(x, axis=1): + """l2 norm""" + norm = torch.norm(x, 2, axis, True) + output = torch.div(x, norm) + return output + + +class SEModule(nn.Module): + """ + SEModule + """ + def __init__(self, channels, reduction): + super().__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Conv2d( + channels, channels // reduction, kernel_size=1, padding=0, bias=False) + + nn.init.xavier_uniform_(self.fc1.weight.data) + + self.relu = nn.ReLU(inplace=True) + self.fc2 = nn.Conv2d( + channels // reduction, channels, kernel_size=1, padding=0, bias=False) + + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + """Forward prop""" + module_input = x + x = self.avg_pool(x) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.sigmoid(x) + + return module_input * x + + +class bottleneck_IR(nn.Module): + """ + IR bottleneck module + """ + def __init__(self, in_channel, depth, stride): + super().__init__() + if in_channel == depth: + self.shortcut_layer = nn.MaxPool2d(1, stride) + else: + self.shortcut_layer = nn.Sequential( + nn.Conv2d(in_channel, depth, (1, 1), stride, bias=False), nn.BatchNorm2d(depth)) + self.res_layer = nn.Sequential( + nn.BatchNorm2d(in_channel), + nn.Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), nn.PReLU(depth), + nn.Conv2d(depth, depth, (3, 3), stride, 1, bias=False), nn.BatchNorm2d(depth)) + + def forward(self, x): + """Forward prop""" + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + + return res + shortcut + + +class bottleneck_IR_SE(nn.Module): + """ + IR bottleneck module with SE + """ + def __init__(self, in_channel, depth, stride): + super().__init__() + if in_channel == depth: + self.shortcut_layer = nn.MaxPool2d(1, stride) + else: + self.shortcut_layer = nn.Sequential( + nn.Conv2d(in_channel, depth, (1, 1), stride, 
bias=False), + nn.BatchNorm2d(depth)) + self.res_layer = nn.Sequential( + nn.BatchNorm2d(in_channel), + nn.Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), + nn.PReLU(depth), + nn.Conv2d(depth, depth, (3, 3), stride, 1, bias=False), + nn.BatchNorm2d(depth), + SEModule(depth, 16) + ) + + def forward(self, x): + """Forward prop""" + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + + return res + shortcut + + +class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])): + '''A named tuple describing a ResNet block.''' + + +def get_block(in_channel, depth, num_units, stride=2): + """Creates a bottleneck block.""" + return [Bottleneck(in_channel, depth, stride)] + [Bottleneck(depth, depth, 1) + for i in range(num_units - 1)] + + +def get_blocks(num_layers): + """Creates the block architecture for the given model.""" + if num_layers == 50: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=4), + get_block(in_channel=128, depth=256, num_units=14), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 100: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=13), + get_block(in_channel=128, depth=256, num_units=30), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 152: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=8), + get_block(in_channel=128, depth=256, num_units=36), + get_block(in_channel=256, depth=512, num_units=3) + ] + + return blocks + + +class Backbone(nn.Module): + """ + Constructs a backbone with the given parameters. + """ + def __init__(self, input_size, num_layers, mode='ir'): + super().__init__() + assert input_size[0] in [112, 224], "input_size should be [112, 112] or [224, 224]" + assert num_layers in [50, 100, 152], "num_layers should be 50, 100 or 152" + assert mode in ['ir', 'ir_se'], "mode should be ir or ir_se" + blocks = get_blocks(num_layers) + if mode == 'ir': + unit_module = bottleneck_IR + elif mode == 'ir_se': + unit_module = bottleneck_IR_SE + self.input_layer = nn.Sequential(nn.Conv2d(3, 64, (3, 3), 1, 1, bias=False), + nn.BatchNorm2d(64), + nn.PReLU(64)) + if input_size[0] == 112: + # Dropout is set to 0, due to the train.py structure + self.output_layer = nn.Sequential(nn.BatchNorm2d(512), + nn.Dropout(p=0), + Flatten(), + nn.Linear(512 * 7 * 7, 512), + nn.BatchNorm1d(512)) + else: + self.output_layer = nn.Sequential(nn.BatchNorm2d(512), + nn.Dropout(p=0), + Flatten(), + nn.Linear(512 * 14 * 14, 512), + nn.BatchNorm1d(512)) + + modules = [] + for block in blocks: + for bottleneck in block: + modules.append( + unit_module(bottleneck.in_channel, + bottleneck.depth, + bottleneck.stride)) + self.body = nn.Sequential(*modules) + + self._initialize_weights() + + def forward(self, x): + """Forward prop""" + x = self.input_layer(x) + x = self.body(x) + x = self.output_layer(x) + return x + + def _initialize_weights(self): + """Initializes the weights.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.xavier_uniform_(m.weight.data) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm1d): + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight.data) + if m.bias is not None: + m.bias.data.zero_() + + +def 
ir_50(input_size=(112, 112), # pylint: disable=unused-argument + dimensionality=64, + backbone_checkpoint=None, **kwargs): + """Constructs a ir-50 model. + """ + model = Backbone(input_size, 50, 'ir') + if backbone_checkpoint is not None: + model.load_state_dict(torch.load(backbone_checkpoint, map_location=torch.device('cpu'))) + for param in model.parameters(): + param.requires_grad = False + drl = DRL(dimensionality) + ensemble = Ensemble(model, drl) + + return ensemble + + +def ir_101(input_size=(112, 112), # pylint: disable=unused-argument + dimensionality=64, + backbone_checkpoint=None, **kwargs): + """Constructs a ir-101 model. + """ + model = Backbone(input_size, 100, 'ir') + if backbone_checkpoint is not None: + model.load_state_dict(torch.load(backbone_checkpoint, map_location=torch.device('cpu'))) + for param in model.parameters(): + param.requires_grad = False + drl = DRL(dimensionality) + ensemble = Ensemble(model, drl) + + return ensemble + + +def ir_152(input_size=(112, 112), # pylint: disable=unused-argument + dimensionality=64, + backbone_checkpoint=None, **kwargs): + """Constructs a ir-152 model. + """ + model = Backbone(input_size, 152, 'ir') + if backbone_checkpoint is not None: + model.load_state_dict(torch.load(backbone_checkpoint, map_location=torch.device('cpu'))) + for param in model.parameters(): + param.requires_grad = False + drl = DRL(dimensionality) + + ensemble = Ensemble(model, drl) + + return ensemble + + +def ir_se_50(input_size=(112, 112), # pylint: disable=unused-argument + dimensionality=64, + backbone_checkpoint=None, **kwargs): + """Constructs a ir_se-50 model. + """ + model = Backbone(input_size, 50, 'ir_se') + if backbone_checkpoint is not None: + model.load_state_dict(torch.load(backbone_checkpoint, map_location=torch.device('cpu'))) + for param in model.parameters(): + param.requires_grad = False + drl = DRL(dimensionality) + ensemble = Ensemble(model, drl) + + return ensemble + + +def ir_se_101(input_size=(112, 112), # pylint: disable=unused-argument + dimensionality=64, + backbone_checkpoint=None, **kwargs): + """Constructs a ir_se-101 model. + """ + model = Backbone(input_size, 100, 'ir_se') + if backbone_checkpoint is not None: + model.load_state_dict(torch.load(backbone_checkpoint, map_location=torch.device('cpu'))) + for param in model.parameters(): + param.requires_grad = False + drl = DRL(dimensionality) + ensemble = Ensemble(model, drl) + + return ensemble + + +def ir_se_152(input_size=(112, 112), # pylint: disable=unused-argument + dimensionality=64, + backbone_checkpoint=None, **kwargs): + """Constructs a ir_se-152 model. 
+ """ + model = Backbone(input_size, 152, 'ir_se') + if backbone_checkpoint is not None: + model.load_state_dict(torch.load(backbone_checkpoint, map_location=torch.device('cpu'))) + for param in model.parameters(): + param.requires_grad = False + drl = DRL(dimensionality) + ensemble = Ensemble(model, drl) + + return ensemble + + +models = [ + { + 'name': 'ir_50', + 'min_input': 1, + 'dim': 2, + 'dr': True, + }, + { + 'name': 'ir_101', + 'min_input': 1, + 'dim': 2, + 'dr': True, + }, + { + 'name': 'ir_152', + 'min_input': 1, + 'dim': 2, + 'dr': True, + }, + { + 'name': 'ir_se_50', + 'min_input': 1, + 'dim': 2, + 'dr': True, + }, + { + 'name': 'ir_se_101', + 'min_input': 1, + 'dim': 2, + 'dr': True, + }, + { + 'name': 'ir_se_152', + 'min_input': 1, + 'dim': 2, + 'dr': True, + }, + +] diff --git a/parsecmd.py b/parsecmd.py index 6f703b36a..6f0690dbb 100644 --- a/parsecmd.py +++ b/parsecmd.py @@ -67,6 +67,17 @@ def get_parser(model_names, dataset_names): parser.add_argument('--avg-pool-rounding', action='store_true', default=False, help='when simulating, use "round()" in AvgPool operations ' '(default: use "floor()")') + parser.add_argument('--dr', type=int, default=None, + help='Embedding dimensionality for dimensionality' + 'reduction (default: None)') + parser.add_argument('--scaf-margin', default=28.6, + type=float, help='Margin hyperparameter' + 'for Sub-center ArcFace Loss') + parser.add_argument('--scaf-scale', default=64, + type=int, help='Scale hyperparameter for Sub-center ArcFace Loss') + parser.add_argument('--backbone-checkpoint', type=str, default=None, metavar='PATH', + help='path to checkpoint from which to load' + 'backbone weights (default: None)') parser.add_argument('--copy-output-folder', type=str, default=None, metavar='PATH', help='Path to copy output folder (default: None)') parser.add_argument('--kd-relationbased', action='store_true', default=False, @@ -95,6 +106,10 @@ def get_parser(model_names, dataset_names): help='optimizer for training (default: SGD)') optimizer_args.add_argument('--lr', '--learning-rate', type=float, metavar='LR', help='initial learning rate') + optimizer_args.add_argument('--scaf-lr', default=1e-4, + type=float, metavar='SCAF_LR', + help='initial learning rate for Sub-center' + 'ArcFace Loss optimizer') optimizer_args.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') optimizer_args.add_argument('--weight-decay', '--wd', default=1e-4, type=float, @@ -126,6 +141,9 @@ def get_parser(model_names, dataset_names): help='save as CSVs with the given prefix during evaluation') mgroup.add_argument('--save-sample', dest='generate_sample', type=int, help='save the sample at given index as NumPy sample data') + parser.add_argument('--slice-sample', action='store_true', default=False, + help='for models that require RGB input, when the sample from the dataset ' + 'has additional channels, slice the sample into 3 channels') parser.add_argument('--shap', default=0, type=int, help='select # of images from the test set and plot SHAP after evaluation') parser.add_argument('--activation-stats', '--act-stats', nargs='+', metavar='PHASE', diff --git a/policies/qat_policy_faceid_112.yaml b/policies/qat_policy_faceid_112.yaml new file mode 100644 index 000000000..a40fdeaea --- /dev/null +++ b/policies/qat_policy_faceid_112.yaml @@ -0,0 +1,17 @@ +--- +start_epoch: 25 +weight_bits: 4 +shift_quantile: 0.6 +overrides: + pre_stage: + weight_bits: 8 + pre_stage_2: + weight_bits: 8 + feature_stage.1.0.conv2: + weight_bits: 2 + 
feature_stage.2.0.conv2: + weight_bits: 2 + feature_stage.4.0.conv2: + weight_bits: 2 + linear: + weight_bits: 8 diff --git a/policies/qat_policy_mobilefacenet_112.yaml b/policies/qat_policy_mobilefacenet_112.yaml new file mode 100644 index 000000000..f8a4e6247 --- /dev/null +++ b/policies/qat_policy_mobilefacenet_112.yaml @@ -0,0 +1,3 @@ +--- +start_epoch: 25 +weight_bits: 8 diff --git a/policies/schedule-faceid_112.yaml b/policies/schedule-faceid_112.yaml new file mode 100644 index 000000000..6d36bb38e --- /dev/null +++ b/policies/schedule-faceid_112.yaml @@ -0,0 +1,13 @@ +--- +lr_schedulers: + training_lr: + class: MultiStepLR + milestones: [10, 15, 20, 25, 40, 50, 60] + gamma: 0.5 + +policies: + - lr_scheduler: + instance_name: training_lr + starting_epoch: 0 + ending_epoch: 80 + frequency: 1 diff --git a/policies/schedule-mobilefacenet_112.yaml b/policies/schedule-mobilefacenet_112.yaml new file mode 100644 index 000000000..5959971ff --- /dev/null +++ b/policies/schedule-mobilefacenet_112.yaml @@ -0,0 +1,13 @@ +--- +lr_schedulers: + training_lr: + class: MultiStepLR + milestones: [10, 15, 20, 25, 26, 28, 30, 32, 34] + gamma: 0.5 + +policies: + - lr_scheduler: + instance_name: training_lr + starting_epoch: 0 + ending_epoch: 35 + frequency: 1 diff --git a/requirements.txt b/requirements.txt index bb7a713bb..a0e0ac082 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,22 +1,29 @@ -numpy>=1.22,<1.23 -PyYAML>=5.1.1 -scipy>=1.3.0 -librosa>=0.7.2 -Pillow>=7 -shap>=0.34.0 -tk>=0.1.0 torch==1.8.1 torchaudio==0.8.1 torchvision==0.9.1 -tensorboard>=2.9.0,<2.10.0 -protobuf>=3.20.1,<4.0 +GitPython>=3.1.18 +Pillow>=7 +PyYAML>=5.1.1 +albumentations>=1.3.0 +faiss-cpu==1.7.4 +h5py>=3.7.0 +hawk-eyes==2.1.0 +imutils==0.5.4 +kornia==0.6.8 +librosa>=0.7.2 numba<0.50.0 +numpy>=1.22,<1.23 +onnx==1.15.0 +onnxruntime==1.7.0 opencv-python>=4.4.0 -h5py>=3.7.0 -torchmetrics==0.6.0 -pycocotools==2.0.6 -albumentations>=1.3.0 -pytube>=12.1.3 +protobuf>=3.20.1,<4.0 +pycocotools==2.0.7 pyffmpeg==2.0 -GitPython>=3.1.18 +pytorch-metric-learning==2.3.0 +pytube>=12.1.3 +scipy>=1.3.0 +shap>=0.34.0 +tensorboard>=2.9.0,<2.10.0 +tk>=0.1.0 +torchmetrics==0.6.0 -e distiller diff --git a/sample.py b/sample.py index 21e0cc63e..c1616e497 100644 --- a/sample.py +++ b/sample.py @@ -19,6 +19,7 @@ def generate( outputs, # pylint: disable=unused-argument dataset_name, search=False, # pylint: disable=unused-argument + slice_sample=False, ): """ Save the element `index` from the `inputs` batch to a file named "sample_`dataset_name`.npy". 
@@ -33,6 +34,8 @@ def generate( print(f'==> Saving sample at index {index} to {sample_name}.npy') x = inputs[index].cpu().numpy().astype('int64') + if slice_sample: + x = x[0:3, :, :] x = np.clip(x, -128, 127) np.save(sample_name, x, allow_pickle=False, fix_imports=False) diff --git a/scripts/evaluate_faceid.sh b/scripts/evaluate_faceid.sh deleted file mode 100755 index c12272e99..000000000 --- a/scripts/evaluate_faceid.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -python train.py --model ai85faceidnet --dataset FaceID --regression --evaluate --exp-load-weights-from ../ai8x-synthesis/trained/ai85-faceid-qat8-q.pth.tar -8 --device MAX78000 "$@" diff --git a/scripts/evaluate_faceid_112.sh b/scripts/evaluate_faceid_112.sh new file mode 100755 index 000000000..d2634a5a8 --- /dev/null +++ b/scripts/evaluate_faceid_112.sh @@ -0,0 +1,2 @@ +#!/bin/sh +python train.py --model ai85faceidnet_112 --dataset VGGFace2_FaceID --kd-student-wt 0 --kd-distill-wt 1 --kd-teacher ir_152 --kd-resume pretrained/ir152_dim64/best.pth.tar --kd-relationbased --evaluate --device MAX78000 --exp-load-weights-from ../ai8x-synthesis/trained/ai85-faceid_112-qat-q.pth.tar -8 --use-bias --save-sample 10 --slice-sample "$@" \ No newline at end of file diff --git a/scripts/evaluate_mobilefacenet_112.sh b/scripts/evaluate_mobilefacenet_112.sh new file mode 100755 index 000000000..8b7399506 --- /dev/null +++ b/scripts/evaluate_mobilefacenet_112.sh @@ -0,0 +1,2 @@ +#!/bin/sh +python train.py --model ai87netmobilefacenet_112 --dataset VGGFace2_FaceID --kd-student-wt 0 --kd-distill-wt 1 --kd-teacher ir_152 --kd-resume pretrained/ir152_dim64/best.pth.tar --kd-relationbased --evaluate --device MAX78002 --exp-load-weights-from ../ai8x-synthesis/trained/ai87-mobilefacenet_112_qat_best-q.pth.tar -8 --use-bias --save-sample 10 --slice-sample "$@" \ No newline at end of file diff --git a/scripts/train_facedet_tinierssd.sh b/scripts/train_facedet_tinierssd.sh index 94ebd2549..d3c67f68f 100755 --- a/scripts/train_facedet_tinierssd.sh +++ b/scripts/train_facedet_tinierssd.sh @@ -1,2 +1,2 @@ #!/bin/sh -python train.py --deterministic --print-freq 1 --epochs 3 --optimizer Adam --lr 1e-3 --wd 5e-4 --model ai85tinierssdface --use-bias --momentum 0.9 --dataset VGGFace2_FaceDetection --device MAX78000 --obj-detection --obj-detection-params parameters/obj_detection_params_facedet.yaml --batch-size 100 --qat-policy policies/qat_policy_facedet.yaml --validation-split 0.1 "$@" +python train.py --deterministic --print-freq 100 --epochs 3 --optimizer Adam --lr 1e-3 --wd 5e-4 --model ai85tinierssdface --use-bias --momentum 0.9 --dataset VGGFace2_FaceDetection --device MAX78000 --obj-detection --obj-detection-params parameters/obj_detection_params_facedet.yaml --batch-size 100 --qat-policy policies/qat_policy_facedet.yaml --validation-split 0.1 "$@" diff --git a/scripts/train_faceid.sh b/scripts/train_faceid.sh deleted file mode 100755 index 1465c8b5d..000000000 --- a/scripts/train_faceid.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -python train.py --epochs 100 --optimizer Adam --lr 0.001 --wd 0 --deterministic --compress policies/schedule-faceid.yaml --model ai85faceidnet --dataset FaceID --batch-size 100 --device MAX78000 --regression --print-freq 250 "$@" diff --git a/scripts/train_faceid_112.sh b/scripts/train_faceid_112.sh new file mode 100755 index 000000000..594ff32d3 --- /dev/null +++ b/scripts/train_faceid_112.sh @@ -0,0 +1,3 @@ +#!/bin/sh +python train.py --epochs 4 --optimizer Adam --lr 0.001 --scaf-lr 1e-2 --scaf-scale 32 --copy-output-folder 
pretrained/ir152_dim64 --wd 5e-4 --deterministic --workers 8 --qat-policy None --model ir_152 --dr 64 --backbone-checkpoint pretrained/Backbone_IR_152_Epoch_112_Batch_2547328_Time_2019-07-13-02-59_checkpoint.pth --use-bias --dataset VGGFace2_FaceID_dr --batch-size 64 --device MAX78000 --validation-split 0 --print-freq 250 "$@" +python train.py --epochs 80 --optimizer Adam --lr 0.001 --compress policies/schedule-faceid_112.yaml --kd-student-wt 0 --kd-distill-wt 1 --qat-policy policies/qat_policy_faceid_112.yaml --model ai85faceidnet_112 --kd-teacher ir_152 --kd-resume pretrained/ir152_dim64/best.pth.tar --kd-relationbased --wd 0 --deterministic --workers 8 --use-bias --dataset VGGFace2_FaceID --batch-size 256 --device MAX78000 --print-freq 100 --validation-split 0 "$@" diff --git a/scripts/train_mobilefacenet_112.sh b/scripts/train_mobilefacenet_112.sh new file mode 100755 index 000000000..47a48c942 --- /dev/null +++ b/scripts/train_mobilefacenet_112.sh @@ -0,0 +1,3 @@ +#!/bin/sh +python train.py --epochs 4 --optimizer Adam --lr 0.001 --scaf-lr 1e-2 --scaf-scale 32 --copy-output-folder pretrained/ir152_dim64 --wd 5e-4 --deterministic --workers 8 --qat-policy None --model ir_152 --dr 64 --backbone-checkpoint pretrained/Backbone_IR_152_Epoch_112_Batch_2547328_Time_2019-07-13-02-59_checkpoint.pth --use-bias --dataset VGGFace2_FaceID_dr --batch-size 64 --device MAX78000 --validation-split 0 --print-freq 250 "$@" +python train.py --epochs 35 --optimizer Adam --lr 0.001 --compress policies/schedule-mobilefacenet_112.yaml --kd-student-wt 0 --kd-distill-wt 1 --qat-policy policies/qat_policy_mobilefacenet_112.yaml --model ai87netmobilefacenet_112 --kd-teacher ir_152 --kd-resume pretrained/ir152_dim64/best.pth.tar --kd-relationbased --wd 0 --deterministic --workers 8 --use-bias --dataset VGGFace2_FaceID --batch-size 100 --device MAX78002 --validation-split 0 --print-freq 100 "$@" diff --git a/train.py b/train.py index 724705888..c05e90814 100644 --- a/train.py +++ b/train.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # # Copyright (c) 2018 Intel Corporation -# Portions Copyright (C) 2019-2023 Maxim Integrated Products, Inc. +# Portions Copyright (C) 2019-2024 Maxim Integrated Products, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -95,6 +95,11 @@ RecordsActivationStatsCollector, SummaryActivationStatsCollector, collectors_context) from distiller.quantization.range_linear import PostTrainLinearQuantizer +from pytorch_metric_learning import losses as pml_losses +from pytorch_metric_learning import testers +from pytorch_metric_learning.distances import CosineSimilarity +from pytorch_metric_learning.utils.accuracy_calculator import AccuracyCalculator +from pytorch_metric_learning.utils.inference import CustomKNN from torchmetrics.detection.map import MAP as MeanAveragePrecision # pylint: enable=no-name-in-module @@ -333,6 +338,7 @@ def main(): # We can optionally resume from a checkpoint optimizer = None + loss_optimizer = None if args.resumed_checkpoint_path: update_old_model_params(args.resumed_checkpoint_path, model) if qat_policy is not None: @@ -387,6 +393,28 @@ def main(): alpha=obj_detection_params['multi_box_loss']['alpha'], neg_pos_ratio=obj_detection_params['multi_box_loss'] ['neg_pos_ratio'], device=args.device).to(args.device) + + elif args.dr: + + criterion = pml_losses.SubCenterArcFaceLoss(num_classes=args.num_classes, + embedding_size=args.dr, + margin=args.scaf_margin, + scale=args.scaf_scale) + if args.resumed_checkpoint_path: + checkpoint = torch.load(args.resumed_checkpoint_path, + map_location=lambda storage, loc: storage) + criterion.W = checkpoint['extras']['loss_weights'] + criterion = criterion.to(args.device) + + loss_optimizer = torch.optim.Adam(criterion.parameters(), lr=args.scaf_lr) + if args.resumed_checkpoint_path: + loss_optimizer.load_state_dict(checkpoint['extras']['loss_optimizer_state_dict']) + + distance_fn = CosineSimilarity() + custom_knn = CustomKNN(distance_fn, batch_size=args.batch_size) + accuracy_calculator = AccuracyCalculator(knn_func=custom_knn, + include=("precision_at_1",), k=1) + else: if not args.regression: if 'weight' in selected_source: @@ -430,22 +458,14 @@ def main(): args.sensitivity_range[2]) return sensitivity_analysis(model, criterion, test_loader, pylogger, args, sensitivities) - if args.evaluate: - msglogger.info('Dataset sizes:\n\ttest=%d', len(test_loader.sampler)) - return evaluate_model(model, criterion, test_loader, pylogger, activations_collectors, - args, compression_scheduler) - - assert train_loader and val_loader - msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d', - len(train_loader.sampler), len(val_loader.sampler), len(test_loader.sampler)) - if args.compress: # The main use-case for this sample application is CNN compression. Compression # requires a compression schedule configuration file in YAML. 
compression_scheduler = distiller.file_config(model, optimizer, args.compress, compression_scheduler, (start_epoch-1) - if args.resumed_checkpoint_path else None) + if args.resumed_checkpoint_path + else None, loss_optimizer) elif compression_scheduler is None: compression_scheduler = distiller.CompressionScheduler(model) @@ -475,7 +495,8 @@ def main(): dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt) if args.kd_relationbased: - args.kd_policy = kd_relationbased.RelationBasedKDPolicy(model, teacher, dlw) + args.kd_policy = kd_relationbased.RelationBasedKDPolicy(model, teacher, + dlw, args.act_mode_8bit) else: args.kd_policy = distiller.KnowledgeDistillationPolicy(model, teacher, args.kd_temp, dlw) @@ -514,6 +535,15 @@ def main(): args.epochs) create_nas_kd_policy(model, compression_scheduler, start_epoch, kd_end_epoch, args) + if args.evaluate: + msglogger.info('Dataset sizes:\n\ttest=%d', len(test_loader.sampler)) + return evaluate_model(model, criterion, test_loader, pylogger, activations_collectors, + args, compression_scheduler) + + assert train_loader and val_loader + msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d', + len(train_loader.sampler), len(val_loader.sampler), len(test_loader.sampler)) + vloss = 10**6 for epoch in range(start_epoch, ending_epoch): # pylint: disable=unsubscriptable-object @@ -557,7 +587,8 @@ def main(): # Train for one epoch with collectors_context(activations_collectors["train"]) as collectors: train(train_loader, model, criterion, optimizer, epoch, compression_scheduler, - loggers=all_loggers, args=args) + loggers=all_loggers, args=args, loss_optimizer=loss_optimizer) + # distiller.log_weights_sparsity(model, epoch, loggers=all_loggers) distiller.log_activation_statistics(epoch, "train", loggers=all_tbloggers, collector=collectors["sparsity"]) @@ -584,8 +615,11 @@ def main(): checkpoint_name = f'nas_stg{stage}_lev{level}' with collectors_context(activations_collectors["valid"]) as collectors: - top1, top5, vloss, mAP = validate(val_loader, model, criterion, [pylogger], - args, epoch, tflogger) + if not args.dr: + top1, top5, vloss, mAP = validate(val_loader, model, criterion, [pylogger], + args, epoch, tflogger) + else: + top1, top5, vloss, mAP = scaf_test(val_loader, model, accuracy_calculator) distiller.log_activation_statistics(epoch, "valid", loggers=all_tbloggers, collector=collectors["sparsity"]) save_collectors_data(collectors, msglogger.logdir) @@ -596,7 +630,7 @@ def main(): if not args.regression: stats = ('Performance/Validation/', OrderedDict([('Loss', vloss), ('Top1', top1)])) - if args.num_classes > 5: + if args.num_classes > 5 and not args.dr: stats[1]['Top5'] = top5 else: stats = ('Performance/Validation/', OrderedDict([('Loss', vloss), @@ -621,6 +655,9 @@ def main(): is_best = False checkpoint_extras = {'current_top1': top1, 'current_mAP': mAP} + if args.dr: + checkpoint_extras['loss_weights'] = criterion.W + checkpoint_extras['loss_optimizer_state_dict'] = loss_optimizer.state_dict() apputils.save_checkpoint(epoch, args.cnn, model, optimizer=optimizer, scheduler=compression_scheduler, extras=checkpoint_extras, @@ -631,7 +668,8 @@ def main(): compression_scheduler.on_epoch_end(epoch, optimizer) # Finally run results on the test set - test(test_loader, model, criterion, [pylogger], activations_collectors, args=args) + if not args.dr: + test(test_loader, model, criterion, [pylogger], activations_collectors, args=args) if args.copy_output_folder: 
msglogger.info('Copying output folder to: %s', args.copy_output_folder) @@ -664,6 +702,9 @@ def create_model(supported_models, dimensions, args, mode='default'): if not Model: raise RuntimeError("Model " + args.kd_teacher + " not found\n") + if args.dr and ('dr' not in module or not module['dr']): + raise ValueError("Dimensionality reduction is not supported for this model") + # Set model parameters if args.act_mode_8bit: weight_bits = 8 @@ -684,6 +725,12 @@ def create_model(supported_models, dimensions, args, mode='default'): model_args["bias_bits"] = bias_bits model_args["quantize_activation"] = quantize_activation + if args.dr: + model_args["dimensionality"] = args.dr + + if args.backbone_checkpoint: + model_args["backbone_checkpoint"] = args.backbone_checkpoint + if args.obj_detection: model_args["device"] = args.device @@ -729,7 +776,7 @@ def create_nas_kd_policy(model, compression_scheduler, epoch, next_state_start_e def train(train_loader, model, criterion, optimizer, epoch, - compression_scheduler, loggers, args): + compression_scheduler, loggers, args, loss_optimizer=None): """Training loop for one epoch.""" losses = OrderedDict([(OVERALL_LOSS_KEY, tnt.AverageValueMeter()), (OBJECTIVE_LOSS_KEY, tnt.AverageValueMeter())]) @@ -823,7 +870,7 @@ def train(train_loader, model, criterion, optimizer, epoch, loss = criterion(output, target) # TODO Early exit mechanism for Object Detection case is NOT implemented yet - if not args.obj_detection and not args.kd_relationbased: + if not args.obj_detection and not args.dr and not args.kd_relationbased: if not args.earlyexit_lossweights: # Measure accuracy if the conditions are set. For `Last Batch` only accuracy # calculation last two batches are used as the last batch might include just a few @@ -868,11 +915,16 @@ def train(train_loader, model, criterion, optimizer, epoch, # Compute the gradient and do SGD step optimizer.zero_grad() + if args.dr: + loss_optimizer.zero_grad() + loss.backward() if compression_scheduler: compression_scheduler.before_parameter_optimization(epoch, train_step, steps_per_epoch, optimizer) optimizer.step() + if args.dr: + loss_optimizer.step() if compression_scheduler: compression_scheduler.on_minibatch_end(epoch, train_step, steps_per_epoch, optimizer) @@ -946,6 +998,23 @@ def update_bn_stats(train_loader, model, args): _ = model(inputs) +def get_all_embeddings(dataset, model): + """Get all embeddings from the test set""" + tester = testers.BaseTester() + return tester.get_all_embeddings(dataset, model) + + +def scaf_test(val_loader, model, accuracy_calculator): + """Perform test for SCAF""" + test_embeddings, test_labels = get_all_embeddings(val_loader.dataset, model) + test_labels = test_labels.squeeze(1) + accuracies = accuracy_calculator.get_accuracy( + test_embeddings, test_labels, None, None, True + ) + msglogger.info('Test set accuracy (Precision@1) = %f', accuracies['precision_at_1']) + return accuracies["precision_at_1"], 0, 0, 0 + + def validate(val_loader, model, criterion, loggers, args, epoch=-1, tflogger=None): """Model validation""" if epoch > -1: @@ -1192,7 +1261,8 @@ def save_tensor(t, f, regression=True): target /= 128. 
if args.generate_sample is not None and args.act_mode_8bit and not sample_saved: - sample.generate(args.generate_sample, inputs, target, output, args.dataset, False) + sample.generate(args.generate_sample, inputs, target, output, + args.dataset, False, args.slice_sample) sample_saved = True if args.csv_prefix is not None: diff --git a/train_all_models.sh b/train_all_models.sh index 5b23dcac5..1baac4a05 100755 --- a/train_all_models.sh +++ b/train_all_models.sh @@ -33,8 +33,11 @@ echo "-----------------------------" echo "Training kws20_v3 model" scripts/train_kws20_v3.sh "$@" echo "-----------------------------" -echo "Training faceid model" -scripts/train_faceid.sh "$@" +echo "Training faceid_112 model" +scripts/train_faceid_112.sh "$@" +echo "-----------------------------" +echo "Training mobilefacenet_112 model" +scripts/train_mobilefacenet_112.sh "$@" echo "-----------------------------" echo "Training unet model" scripts/train_camvid_unet.sh "$@" diff --git a/utils/kd_relationbased.py b/utils/kd_relationbased.py index 5610f8448..ed6c721b3 100644 --- a/utils/kd_relationbased.py +++ b/utils/kd_relationbased.py @@ -33,7 +33,7 @@ class RelationBasedKDPolicy(ScheduledTrainingPolicy): the distiller's ScheduledTrainingPolicy class. """ def __init__(self, student_model, teacher_model, - loss_weights=DistillationLossWeights(0.5, 0.5, 0)): + loss_weights=DistillationLossWeights(0.5, 0.5, 0), act_mode_8bit=False): super().__init__() self.student = student_model @@ -43,6 +43,7 @@ def __init__(self, student_model, teacher_model, self.loss_wts = loss_weights self.distillation_loss = nn.MSELoss() self.overall_loss = None + self.act_mode_8bit = act_mode_8bit # Active is always true, because test will be based on the overall loss and it will be # realized outside of the epoch loop @@ -65,6 +66,8 @@ def forward(self, *inputs): self.teacher_output = self.teacher(*inputs) out = self.student(*inputs) + if self.act_mode_8bit: + out /= 128. self.student_output = out.clone() return out