diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..adc29960 --- /dev/null +++ b/.gitignore @@ -0,0 +1,165 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Mac OS files +.DS_Store +.AppleDouble +.LSOverride + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..9fdea7f2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Aleksandr Karakulev + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 00000000..4834fd6f --- /dev/null +++ b/README.md @@ -0,0 +1,55 @@ +# RLVI: Robust Learning via Variational Inference + +Implementation of [Adaptive Robust Learning using Latent Bernoulli Variables](https://arxiv.org/pdf/2312.00585). + +Our method, called RLVI, enables robust maximization of the likelihood. The learned parametric model is thus accurate even when the training data is corrupted. Additionally, RLVI is well-suited for online or stochastic optimization as it does not require estimating the total ratio of contaminated data and adaptively infers the probabilities of sample corruption. + + +## Standard parameter estimation +Benchmark, reproduced from [(Osama et al., 2020)](https://doi.org/10.1109/OJSP.2020.3039632), that compares robust learning algorithms on the four test problems with corrupted samples in the training data: linear and logistic regression, principal component analysis, covariance estimation. +``` +cd standard-learning +python3 main.py +``` + +## Online learning +Binary classification for Human Activity Recognition dataset [(Amine El Helou, 2023)](https://www.mathworks.com/matlabcentral/fileexchange/54138-sensor-har-recognition-app), performed in batches with varying level of corruption to simulate the online learning setting. +``` +cd online-learning +python3 main.py +``` + +Accuracy levels are higher with RLVI than with stochastic likelihood maximization (SGD). +
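The online-learning script itself (`online-learning/main.py`) is not shown in this diff. Purely as a schematic sketch of the idea behind the comparison — scaling each sample's loss by its inferred probability of being clean before each stochastic update — and not the repository's actual RLVI implementation:
```
# Schematic sketch only -- not the RLVI update from the paper or this repository.
# Assumes `model` and `optimizer` exist and that `weights` holds, for each sample
# in the batch, the inferred probability that its label is clean.
import torch
import torch.nn.functional as F

def weighted_step(model, optimizer, x_batch, y_batch, weights):
    optimizer.zero_grad()
    logits = model(x_batch)
    per_sample_loss = F.cross_entropy(logits, y_batch, reduction='none')
    # Samples suspected to be corrupted (small weight) barely affect the gradient.
    (weights * per_sample_loss).mean().backward()
    optimizer.step()
```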
+ +
+ +## Deep learning +### Synthetic noise (MNIST, CIFAR10, CIFAR100) +Experiments with the datasets in which training data is corrupted with synthetic noise. +There are four types of noise: `symmetric`, `asymmetric`, `pairflip`, and `instance`. Noise rate from 0 to 1 needs to be specified to corrupt the training set. +For the method, one can use `rlvi` or one of the following: `regular`, `coteaching` [(Han et al., 2018)](https://papers.nips.cc/paper_files/paper/2018/hash/a19744e268754fb0148b017647355b7b-Abstract.html), `jocor` [(Wei et al., 2020)](https://openaccess.thecvf.com/content_CVPR_2020/papers/Wei_Combating_Noisy_Labels_by_Agreement_A_Joint_Training_Method_with_CVPR_2020_paper.pdf), `cdr` [(Xia et al., 2020)](https://openreview.net/forum?id=Eql5b1_hTE4), `usdnl` [(Xu et al., 2023)](https://doi.org/10.1609/aaai.v37i9.26264), and `bare` [(Patel & Sastry, 2023)](https://openaccess.thecvf.com/content/WACV2023/papers/Patel_Adaptive_Sample_Selection_for_Robust_Learning_Under_Label_Noise_WACV_2023_paper.pdf). + +Example: +``` +cd deep-learning +python3 main.py \ + --method=rlvi \ + --dataset=mnist \ + --noise_type=pairflip \ + --noise_rate=0.45 +``` + ++ +
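The synthetic corruption itself is generated by helpers added in `deep-learning/data_tools.py` later in this diff. As a minimal usage sketch (run from the `deep-learning/` directory; the column-shaped label array mirrors how `dataset_split` passes labels internally):
```
# Minimal sketch: flip labels with symmetric noise at rate 0.45 using the helper
# defined in deep-learning/data_tools.py (added in this PR).
import numpy as np
import data_tools as tools

y_clean = np.random.randint(0, 10, size=(1000, 1))   # integer class labels as a column vector
y_noisy, actual_rate, P = tools.noisify_multiclass_symmetric(
    y_clean, noise=0.45, random_state=1, nb_classes=10
)
print(actual_rate)   # empirical fraction of flipped labels, close to 0.45
```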
+ +### Real noise (Food101) +Experiments with the dataset in which training data is corrupted by nature: some of the training images are mislabeled and contain some noise. +For the method, one can specify `rlvi` or one of the following: `regular`, `coteaching`, `jocor`, `cdr`, `usdnl`, `bare`. + +Example: +``` +cd deep-learning +python3 food.py --method=rlvi +``` diff --git a/Roboto.ttf b/Roboto.ttf new file mode 100644 index 00000000..0a1657d2 Binary files /dev/null and b/Roboto.ttf differ diff --git a/deep-learning/data_load.py b/deep-learning/data_load.py new file mode 100644 index 00000000..51a4aa1c --- /dev/null +++ b/deep-learning/data_load.py @@ -0,0 +1,515 @@ +import numpy as np +import torch +import torch.utils.data as Data +from PIL import Image +import os +import sys +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle + +import data_tools as tools + + +class Mnist(Data.Dataset): + urls = [ + 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', + 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', + 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', + 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', + ] + raw_folder = 'raw' + processed_folder = 'processed' + training_file = 'training.pt' + test_file = 'test.pt' + + def __init__(self, root, + train=True, transform=None, target_transform=None, download=False, + dataset='mnist', noise_type='symmetric', noise_rate=0.5, split_per=0.9, random_seed=1, num_class=10): + self.root = os.path.join(os.path.expanduser(root), 'MNIST') + self.transform = transform + self.target_transform = target_transform + self.train = train + + if download: + self.download() + + if not self._check_exists(): + raise RuntimeError('Dataset not found.' 
+ + ' You can use download=True to download it') + + self.train_data, self.train_labels = torch.load( + os.path.join(self.root, self.processed_folder, self.training_file)) + self.train_labels = np.array(self.train_labels) + + # clean images and noisy labels (training and validation) + if noise_rate > 0: + self.train_data, self.val_data, self.train_labels, self.val_labels, self.noise_mask = tools.dataset_split( + self.train_data, self.train_labels, dataset, noise_type, noise_rate, split_per, random_seed, num_class + ) + else: + self.train_data, self.val_data, self.train_labels, self.val_labels, self.noise_mask = tools.dataset_split_without_noise( + self.train_data, self.train_labels, split_per, random_seed + ) + + def __getitem__(self, index): + if self.train: + img, label = self.train_data[index], self.train_labels[index] + else: + img, label = self.val_data[index], self.val_labels[index] + + img = Image.fromarray(img.numpy()) + + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + label = self.target_transform(label) + + return img, label, index + + def __len__(self): + if self.train: + return len(self.train_data) + else: + return len(self.val_data) + + def _check_exists(self): + return os.path.exists(os.path.join(self.root, self.processed_folder, self.training_file)) and \ + os.path.exists(os.path.join(self.root, self.processed_folder, self.test_file)) + + def download(self): + """Download the MNIST data if it doesn't exist in processed_folder already.""" + from six.moves import urllib + import gzip + + if self._check_exists(): + return + + # download files + try: + os.makedirs(os.path.join(self.root, self.raw_folder)) + os.makedirs(os.path.join(self.root, self.processed_folder)) + except OSError as e: + if e.errno == errno.EEXIST: + pass + else: + raise + + for url in self.urls: + print('Downloading ' + url) + data = urllib.request.urlopen(url) + filename = url.rpartition('/')[2] + file_path = os.path.join(self.root, self.raw_folder, filename) + with open(file_path, 'wb') as f: + f.write(data.read()) + with open(file_path.replace('.gz', ''), 'wb') as out_f, \ + gzip.GzipFile(file_path) as zip_f: + out_f.write(zip_f.read()) + os.unlink(file_path) + + # process and save as torch files + print('Processing...') + + training_set = ( + tools.read_image_file(os.path.join(self.root, self.raw_folder, 'train-images-idx3-ubyte')), + tools.read_label_file(os.path.join(self.root, self.raw_folder, 'train-labels-idx1-ubyte')) + ) + test_set = ( + tools.read_image_file(os.path.join(self.root, self.raw_folder, 't10k-images-idx3-ubyte')), + tools.read_label_file(os.path.join(self.root, self.raw_folder, 't10k-labels-idx1-ubyte')) + ) + with open(os.path.join(self.root, self.processed_folder, self.training_file), 'wb') as f: + torch.save(training_set, f) + with open(os.path.join(self.root, self.processed_folder, self.test_file), 'wb') as f: + torch.save(test_set, f) + + print('Done!') + + def __repr__(self): + fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' + fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) + tmp = 'train' if self.train is True else 'test' + fmt_str += ' Split: {}\n'.format(tmp) + fmt_str += ' Root Location: {}\n'.format(self.root) + tmp = ' Transforms (if any): ' + fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) + tmp = ' Target Transforms (if any): ' + fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) + return 
fmt_str + + +class MnistTest(Data.Dataset): + processed_folder = 'processed' + test_file = 'test.pt' + + def __init__(self, root, transform=None, target_transform=None): + self.root = os.path.join(os.path.expanduser(root), 'MNIST') + self.transform = transform + self.target_transform = target_transform + + self.test_data, self.test_labels = torch.load( + os.path.join(self.root, self.processed_folder, self.test_file)) + self.test_labels = np.array(self.test_labels) + + + def __getitem__(self, index): + img, label = self.test_data[index], self.test_labels[index] + img = Image.fromarray(img.numpy()) + + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + label = self.target_transform(label) + + return img, label, index + + def __len__(self): + return len(self.test_data) + + +class Cifar10(Data.Dataset): + base_folder = 'cifar-10-batches-py' + url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" + filename = "cifar-10-python.tar.gz" + tgz_md5 = 'c58f30108f718f92721af3b95e74349a' + train_list = [ + ['data_batch_1', 'c99cafc152244af753f735de768cd75f'], + ['data_batch_2', 'd4bba439e000b95fd0a9bffe97cbabec'], + ['data_batch_3', '54ebc095f3ab1f0389bbae665268c751'], + ['data_batch_4', '634d18415352ddfa80567beed471001a'], + ['data_batch_5', '482c414d41f54cd18b22e5b47cb7c3cb'], + ] + + test_list = [ + ['test_batch', '40351d587109b95175f43aff81a1287e'], + ] + + def __init__(self, root, train=True, transform=None, target_transform=None, download=False, + dataset='cifar10', noise_type='symmetric', noise_rate=0.5, + split_per=0.9, random_seed=1, num_class=10 + ): + self.root = os.path.join(os.path.expanduser(root), 'CIFAR') + self.transform = transform + self.target_transform = target_transform + self.train = train + + if download: + self.download() + + if not self._check_integrity(): + raise RuntimeError('Dataset not found or corrupted.' 
+ + ' You can use download=True to download it') + + # now load the picked numpy arrays + self.train_data = [] + self.train_labels = [] + for fentry in self.train_list: + f = fentry[0] + file = os.path.join(self.root, self.base_folder, f) + fo = open(file, 'rb') + if sys.version_info[0] == 2: + entry = pickle.load(fo) + else: + entry = pickle.load(fo, encoding='latin1') + self.train_data.append(entry['data']) + if 'labels' in entry: + self.train_labels += entry['labels'] + else: + self.train_labels += entry['fine_labels'] + fo.close() + + self.train_data = np.concatenate(self.train_data) + self.train_labels = np.array(self.train_labels) + # self.train_data = self.train_data.reshape((50000, 3, 32, 32)) + # self.train_data = self.train_data.transpose((0, 2, 3, 1)) # convert to HWC + + # clean images and noisy labels (training and validation) + if noise_rate > 0: + self.train_data, self.val_data, self.train_labels, self.val_labels, self.noise_mask = tools.dataset_split( + self.train_data, self.train_labels, dataset, noise_type, noise_rate, split_per, random_seed, num_class + ) + else: + self.train_data, self.val_data, self.train_labels, self.val_labels, self.noise_mask = tools.dataset_split_without_noise( + self.train_data, self.train_labels, split_per, random_seed + ) + + if self.train: + self.train_data = self.train_data.reshape((-1, 3, 32, 32)) + self.train_data = self.train_data.transpose((0, 2, 3, 1)) + + else: + self.val_data = self.val_data.reshape((-1, 3, 32, 32)) + self.val_data = self.val_data.transpose((0, 2, 3, 1)) + + def __getitem__(self, index): + + if self.train: + img, label = self.train_data[index], self.train_labels[index] + + else: + img, label = self.val_data[index], self.val_labels[index] + + img = Image.fromarray(img) + + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + label = self.target_transform(label) + + return img, label, index + + def __len__(self): + if self.train: + return len(self.train_data) + else: + return len(self.val_data) + + def _check_integrity(self): + root = self.root + for fentry in (self.train_list + self.test_list): + filename, md5 = fentry[0], fentry[1] + fpath = os.path.join(root, self.base_folder, filename) + if not tools.check_integrity(fpath, md5): + return False + return True + + def download(self): + import tarfile + + if self._check_integrity(): + print('Files already downloaded and verified') + return + + root = self.root + tools.download_url(self.url, root, self.filename, self.tgz_md5) + + # extract file + cwd = os.getcwd() + tar = tarfile.open(os.path.join(root, self.filename), "r:gz") + os.chdir(root) + tar.extractall() + tar.close() + os.chdir(cwd) + + +class Cifar10Test(Data.Dataset): + base_folder = 'cifar-10-batches-py' + test_list = [ + ['test_batch', '40351d587109b95175f43aff81a1287e'], + ] + + def __init__(self, root, transform=None, target_transform=None): + self.root = os.path.join(os.path.expanduser(root), 'CIFAR') + self.transform = transform + self.target_transform = target_transform + + f = self.test_list[0][0] + file = os.path.join(self.root, self.base_folder, f) + fo = open(file, 'rb') + if sys.version_info[0] == 2: + entry = pickle.load(fo) + else: + entry = pickle.load(fo, encoding='latin1') + self.test_data = entry['data'] + if 'labels' in entry: + self.test_labels = entry['labels'] + else: + self.test_labels = entry['fine_labels'] + fo.close() + self.test_data = self.test_data.reshape((-1, 3, 32, 32)) + self.test_data = self.test_data.transpose((0, 2, 3, 1)) + 
self.test_labels = np.array(self.test_labels) + + def __getitem__(self, index): + img, label = self.test_data[index], self.test_labels[index] + img = Image.fromarray(img) + + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + label = self.target_transform(label) + + return img, label, index + + def __len__(self): + return len(self.test_data) + + +class Cifar100(Data.Dataset): + + base_folder = 'cifar-100-python' + url = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz" + filename = "cifar-100-python.tar.gz" + tgz_md5 = 'eb9058c3a382ffc7106e4002c42a8d85' + train_list = [ + ['train', '16019d7e3df5f24257cddd939b257f8d'], + ] + + test_list = [ + ['test', 'f0ef6b0ae62326f3e7ffdfab6717acfc'], + ] + + def __init__(self, root, train=True, transform=None, target_transform=None, download=False, + dataset='cifar100', noise_type='symmetric', noise_rate=0.5, + split_per=0.9, random_seed=1, num_class=100 + ): + self.root = os.path.join(os.path.expanduser(root), 'CIFAR') + self.transform = transform + self.target_transform = target_transform + self.train = train + + if download: + self.download() + + if not self._check_integrity(): + raise RuntimeError('Dataset not found or corrupted.' + + ' You can use download=True to download it') + + # now load the picked numpy arrays + self.train_data = [] + self.train_labels = [] + for fentry in self.train_list: + f = fentry[0] + file = os.path.join(self.root, self.base_folder, f) + fo = open(file, 'rb') + if sys.version_info[0] == 2: + entry = pickle.load(fo) + else: + entry = pickle.load(fo, encoding='latin1') + self.train_data.append(entry['data']) + if 'labels' in entry: + self.train_labels += entry['labels'] + else: + self.train_labels += entry['fine_labels'] + fo.close() + + self.train_data = np.concatenate(self.train_data) + self.train_labels = np.array(self.train_labels) + + # clean images and noisy labels (training and validation) + if noise_rate > 0: + self.train_data, self.val_data, self.train_labels, self.val_labels, self.noise_mask = tools.dataset_split( + self.train_data, self.train_labels, dataset, noise_type, noise_rate, split_per, random_seed, num_class + ) + else: + self.train_data, self.val_data, self.train_labels, self.val_labels, self.noise_mask = tools.dataset_split_without_noise( + self.train_data, self.train_labels, split_per, random_seed + ) + + if self.train: + self.train_data = self.train_data.reshape((-1, 3, 32, 32)) + self.train_data = self.train_data.transpose((0, 2, 3, 1)) + + else: + self.val_data = self.val_data.reshape((-1, 3, 32, 32)) + self.val_data = self.val_data.transpose((0, 2, 3, 1)) + + def __getitem__(self, index): + if self.train: + img, label = self.train_data[index], self.train_labels[index] + else: + img, label = self.val_data[index], self.val_labels[index] + + img = Image.fromarray(img) + + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + label = self.target_transform(label) + + return img, label, index + + def __len__(self): + if self.train: + return len(self.train_data) + else: + return len(self.val_data) + + def _check_integrity(self): + root = self.root + for fentry in (self.train_list + self.test_list): + filename, md5 = fentry[0], fentry[1] + fpath = os.path.join(root, self.base_folder, filename) + if not tools.check_integrity(fpath, md5): + return False + return True + + def download(self): + import tarfile + + if self._check_integrity(): + print('Files already downloaded and verified') + return + 
+ root = self.root + tools.download_url(self.url, root, self.filename, self.tgz_md5) + + # extract file + cwd = os.getcwd() + tar = tarfile.open(os.path.join(root, self.filename), "r:gz") + os.chdir(root) + tar.extractall() + tar.close() + os.chdir(cwd) + + def __repr__(self): + fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' + fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) + tmp = 'train' if self.train is True else 'test' + fmt_str += ' Split: {}\n'.format(tmp) + fmt_str += ' Root Location: {}\n'.format(self.root) + tmp = ' Transforms (if any): ' + fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) + tmp = ' Target Transforms (if any): ' + fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) + return fmt_str + + +class Cifar100Test(Data.Dataset): + + base_folder = 'cifar-100-python' + test_list = [ + ['test', 'f0ef6b0ae62326f3e7ffdfab6717acfc'], + ] + + def __init__(self, root, transform=None, target_transform=None): + self.root = os.path.join(os.path.expanduser(root), 'CIFAR') + self.transform = transform + self.target_transform = target_transform + + f = self.test_list[0][0] + file = os.path.join(self.root, self.base_folder, f) + fo = open(file, 'rb') + if sys.version_info[0] == 2: + entry = pickle.load(fo) + else: + entry = pickle.load(fo, encoding='latin1') + self.test_data = entry['data'] + if 'labels' in entry: + self.test_labels = entry['labels'] + else: + self.test_labels = entry['fine_labels'] + fo.close() + self.test_data = self.test_data.reshape((-1, 3, 32, 32)) + self.test_data = self.test_data.transpose((0, 2, 3, 1)) + self.test_labels = np.array(self.test_labels) + + def __getitem__(self, index): + img, label = self.test_data[index], self.test_labels[index] + img = Image.fromarray(img) + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + label = self.target_transform(label) + + return img, label, index + + def __len__(self): + return len(self.test_data) diff --git a/deep-learning/data_tools.py b/deep-learning/data_tools.py new file mode 100644 index 00000000..039a0062 --- /dev/null +++ b/deep-learning/data_tools.py @@ -0,0 +1,503 @@ +import os +import hashlib +import errno +import codecs + +import numpy as np +from numpy.testing import assert_array_almost_equal +from scipy import stats + +import torch +import torch.nn.functional as F +import torch.nn as nn + + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def get_int(b): + return int(codecs.encode(b, 'hex'), 16) + + +def read_label_file(path): + with open(path, 'rb') as f: + data = f.read() + assert get_int(data[:4]) == 2049 + length = get_int(data[4:8]) + parsed = np.frombuffer(data, dtype=np.uint8, offset=8) + parsed = np.array(parsed) + return torch.from_numpy(parsed).view(length).long() + + +def read_image_file(path): + with open(path, 'rb') as f: + data = f.read() + assert get_int(data[:4]) == 2051 + length = get_int(data[4:8]) + num_rows = get_int(data[8:12]) + num_cols = get_int(data[12:16]) + parsed = np.frombuffer(data, dtype=np.uint8, offset=16) + parsed = np.array(parsed) + return torch.from_numpy(parsed).view(length, num_rows, num_cols) + + +def check_integrity(fpath, md5): + if not os.path.isfile(fpath): + return False + md5o = hashlib.md5() + with open(fpath, 'rb') as f: + # read in 1MB chunks + for chunk in iter(lambda: f.read(1024 * 1024), b''): + md5o.update(chunk) + md5c = md5o.hexdigest() + if md5c != md5: + 
return False + return True + + +def download_url(url, root, filename, md5): + from six.moves import urllib + + root = os.path.expanduser(root) + fpath = os.path.join(root, filename) + + try: + os.makedirs(root) + except OSError as e: + if e.errno == errno.EEXIST: + pass + else: + raise + + # downloads file + if os.path.isfile(fpath) and check_integrity(fpath, md5): + print('Using downloaded and verified file: ' + fpath) + else: + try: + print('Downloading ' + url + ' to ' + fpath) + urllib.request.urlretrieve(url, fpath) + except: + if url[:5] == 'https': + url = url.replace('https:', 'http:') + print('Failed download. Trying https -> http instead.' + ' Downloading ' + url + ' to ' + fpath) + urllib.request.urlretrieve(url, fpath) + + +def list_dir(root, prefix=False): + """List all directories at a given root + Args: + root (str): Path to directory whose folders need to be listed + prefix (bool, optional): If true, prepends the path to each result, otherwise + only returns the name of the directories found + """ + root = os.path.expanduser(root) + directories = list( + filter( + lambda p: os.path.isdir(os.path.join(root, p)), + os.listdir(root) + ) + ) + + if prefix is True: + directories = [os.path.join(root, d) for d in directories] + + return directories + + +def list_files(root, suffix, prefix=False): + """List all files ending with a suffix at a given root + Args: + root (str): Path to directory whose folders need to be listed + suffix (str or tuple): Suffix of the files to match, e.g. '.png' or ('.jpg', '.png'). + It uses the Python "str.endswith" method and is passed directly + prefix (bool, optional): If true, prepends the path to each result, otherwise + only returns the name of the files found + """ + root = os.path.expanduser(root) + files = list( + filter( + lambda p: os.path.isfile(os.path.join(root, p)) and p.endswith(suffix), + os.listdir(root) + ) + ) + + if prefix is True: + files = [os.path.join(root, d) for d in files] + + return files + + +# basic function +def multiclass_noisify(y, P, random_state=1): + """ Flip classes according to transition probability matrix T. + It expects a number between 0 and the number of classes - 1. + """ + assert P.shape[0] == P.shape[1] + assert np.max(y) < P.shape[0] + + # row stochastic matrix + assert_array_almost_equal(P.sum(axis=1), np.ones(P.shape[1])) + assert (P >= 0.0).all() + + m = y.shape[0] + new_y = y.copy() + flipper = np.random.RandomState(random_state) + + for idx in np.arange(m): + i = y[idx] + # draw a vector with only an 1 + flipped = flipper.multinomial(1, P[i, :][0], 1)[0] + new_y[idx] = np.where(flipped == 1)[0] + + return new_y + + +def noisify_pairflip(y_train, noise, random_state=1, nb_classes=10): + """mistakes: + flip in the pair + """ + P = np.eye(nb_classes) + n = noise + + if n > 0.0: + # 0 -> 1 + P[0, 0], P[0, 1] = 1. - n, n + for i in range(1, nb_classes-1): + P[i, i], P[i, i + 1] = 1. - n, n + P[nb_classes-1, nb_classes-1], P[nb_classes-1, 0] = 1. - n, n + + y_train_noisy = multiclass_noisify(y_train, P=P, + random_state=random_state) + actual_noise = (y_train_noisy != y_train).mean() + assert actual_noise > 0.0 + print('Actual noise %.2f' % actual_noise) + y_train = y_train_noisy + + return y_train, actual_noise,P + +def noisify_multiclass_symmetric(y_train, noise, random_state=None, nb_classes=10): + """mistakes: + flip in the symmetric way + """ + P = np.ones((nb_classes, nb_classes)) + n = noise + P = (n / (nb_classes - 1)) * P + + if n > 0.0: + # 0 -> 1 + P[0, 0] = 1. 
- n + for i in range(1, nb_classes-1): + P[i, i] = 1. - n + P[nb_classes-1, nb_classes-1] = 1. - n + + y_train_noisy = multiclass_noisify(y_train, P=P, + random_state=random_state) + actual_noise = (y_train_noisy != y_train).mean() + assert actual_noise > 0.0 + print('Actual noise %.2f' % actual_noise) + y_train = y_train_noisy + + return y_train, actual_noise,P + +def noisify_trid(y_train, noise, random_state=1, nb_classes=10): + """mistakes: + flip in the trid + """ + P = np.eye(nb_classes) + n = noise + + if n > 0.0: + # 0 -> 1 + P[0, 0], P[0, 1], P[0, nb_classes-1] = 1. - n, n / 2, n /2 + for i in range(1, nb_classes-1): + P[i, i], P[i, i + 1], P[i, i - 1] = 1. - n, n / 2, n /2 + P[nb_classes-1, nb_classes-1], P[nb_classes-1, 0], P[nb_classes-1, nb_classes-2] = 1. - n, n / 2, n /2 + + y_train_noisy = multiclass_noisify(y_train, P=P, + random_state=random_state) + actual_noise = (y_train_noisy != y_train).mean() + y_train = y_train_noisy + + return y_train, actual_noise, P + + + +def noisify_multiclass_asymmetric_mnist(y_train, noise, random_state=None, nb_classes=10): + """mistakes: + flip in the symmetric way + """ + P = np.eye(10) + n = noise + + # 2 -> 7 + P[2, 2], P[2, 7] = 1. - n, n + + # 5 <-> 6 + P[5, 5], P[5, 6] = 1. - n, n + P[6, 6], P[6, 5] = 1. - n, n + + # 3 -> 8 + P[3, 3], P[3, 8] = 1. - n, n + + y_train_noisy = multiclass_noisify(y_train, P=P, random_state=random_state) + actual_noise = (y_train_noisy != y_train).mean() + assert actual_noise > 0.0 + print('Actual noise %.2f' % actual_noise) + + y_train = y_train_noisy + + return y_train, actual_noise, P + + +def build_for_cifar100(size, noise): + """ random flip between two random classes. + """ + assert(noise >= 0.) and (noise <= 1.) + + P = (1. - noise) * np.eye(size) + for i in np.arange(size - 1): + P[i, i+1] = noise + + # adjust last row + P[size-1, 0] = noise + + assert_array_almost_equal(P.sum(axis=1), 1, 1) + return P + + +def noisify_multiclass_asymmetric_cifar10(y_train, noise, random_state=None, nb_classes=10): + """mistakes: + flip in the symmetric way + """ + source_class = [9, 2, 3, 5, 4] + target_class = [1, 0, 5, 3, 7] + y_train_ = np.copy(np.array(y_train)) + for s, t in zip(source_class, target_class): + cls_idx = np.where(np.array(y_train) == s)[0] + n_noisy = int(noise * cls_idx.shape[0]) + noisy_sample_index = np.random.choice(cls_idx, n_noisy, replace=False) + for idx in noisy_sample_index: + y_train_[idx] = t + actual_noise = (y_train_ != np.array(y_train)).mean() + print('Actual noise %.2f' % actual_noise) + return y_train_, actual_noise, target_class + +def noisify_multiclass_asymmetric_cifar100(y_train, noise, random_state=None, nb_classes=100): + """mistakes: + flip in the symmetric way + """ + nb_classes = 100 + P = np.eye(nb_classes) + n = noise + nb_superclasses = 20 + nb_subclasses = 5 + + if n > 0.0: + for i in np.arange(nb_superclasses): + init, end = i * nb_subclasses, (i + 1) * nb_subclasses + P[init:end, init:end] = build_for_cifar100(nb_subclasses, n) + + y_train_noisy = multiclass_noisify(np.array(y_train), P=P, random_state=random_state) + actual_noise = (y_train_noisy != np.array(y_train)).mean() + assert actual_noise > 0.0 + print('Actual noise %.2f' % actual_noise) + targets = y_train_noisy + return targets, actual_noise, P + + +def noisify(dataset='mnist', nb_classes=10, train_labels=None, noise_type=None, noise_rate=0, random_state=1): + if noise_type == 'pairflip': + train_noisy_labels, actual_noise_rate = noisify_pairflip(train_labels, noise_rate, random_state=1, 
nb_classes=nb_classes) + if noise_type == 'symmetric': + train_noisy_labels, actual_noise_rate = noisify_multiclass_symmetric(train_labels, noise_rate, random_state=1, nb_classes=nb_classes) + return train_noisy_labels, actual_noise_rate + + +def init_params(net): + '''Init layer parameters.''' + for m in net.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal(m.weight, mode='fan_out') + if m.bias: + nn.init.constant(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant(m.weight, 1) + nn.init.constant(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal(m.weight, std=1e-3) + if m.bias: + nn.init.constant(m.bias, 0) + + +def transition_matrix_generate(noise_rate=0.5, num_classes=10): + P = np.ones((num_classes, num_classes)) + n = noise_rate + P = (n / (num_classes - 1)) * P + + if n > 0.0: + # 0 -> 1 + P[0, 0] = 1. - n + for i in range(1, num_classes-1): + P[i, i] = 1. - n + P[num_classes-1, num_classes-1] = 1. - n + return P + + +def fit(X, num_classes, filter_outlier=False): + # number of classes + c = num_classes + T = np.empty((c, c)) + eta_corr = X + for i in np.arange(c): + if not filter_outlier: + idx_best = np.argmax(eta_corr[:, i]) + else: + eta_thresh = np.percentile(eta_corr[:, i], 97,interpolation='higher') + robust_eta = eta_corr[:, i] + robust_eta[robust_eta >= eta_thresh] = 0.0 + idx_best = np.argmax(robust_eta) + for j in np.arange(c): + T[i, j] = eta_corr[idx_best, j] + return T + + +# flip clean labels to noisy labels +# train set and val set split +def dataset_split(train_images, train_labels, + dataset='mnist', noise_type='symmetric', noise_rate=0.5, + split_per=0.9, random_seed=1, num_classes=10 + ): + + clean_train_labels = train_labels[:, np.newaxis] + + if noise_type == 'symmetric': + noisy_labels, real_noise_rate, transition_matrix = noisify_multiclass_symmetric(clean_train_labels, + noise=noise_rate, + random_state=random_seed, + nb_classes=num_classes) + if noise_type == 'pairflip': + noisy_labels, real_noise_rate, transition_matrix = noisify_pairflip(clean_train_labels, + noise=noise_rate, + random_state=random_seed, + nb_classes=num_classes) + if noise_type == 'asymmetric' and dataset == 'mnist': + noisy_labels, real_noise_rate, transition_matrix = noisify_multiclass_asymmetric_mnist(clean_train_labels, + noise=noise_rate, + random_state=random_seed, + nb_classes=num_classes) + + if noise_type == 'asymmetric' and dataset == 'cifar10': + noisy_labels, real_noise_rate, transition_matrix = noisify_multiclass_asymmetric_cifar10(clean_train_labels, + noise=noise_rate, + random_state=random_seed, + nb_classes=num_classes) + + if noise_type == 'asymmetric' and dataset == 'cifar100': + noisy_labels, real_noise_rate, transition_matrix = noisify_multiclass_asymmetric_cifar100(clean_train_labels, + noise=noise_rate, + random_state=random_seed, + nb_classes=num_classes) + + if noise_type == 'instance' and dataset == 'mnist': + noisy_labels = get_instance_noisy_label(noise_rate, + images=train_images, + labels=train_labels, + num_classes=10, + feature_size=784, + norm_std=0.1, + seed=random_seed) + + if noise_type == 'instance' and dataset == 'cifar10': + noisy_labels = get_instance_noisy_label(noise_rate, + images=train_images, + labels=train_labels, + num_classes=10, + feature_size=3072, + norm_std=0.1, + seed=random_seed) + + if noise_type == 'instance' and dataset == 'cifar100': + noisy_labels = get_instance_noisy_label(noise_rate, + images=train_images, + labels=train_labels, + num_classes=100, + feature_size=3072, + norm_std=0.1, + 
seed=random_seed)
+
+    noisy_labels = noisy_labels.squeeze()
+    num_samples = int(noisy_labels.shape[0])
+    np.random.seed(random_seed)
+    train_set_index = np.random.choice(num_samples, int(num_samples*split_per), replace=False)
+    index = np.arange(train_images.shape[0])
+    val_set_index = np.delete(index, train_set_index)
+
+    train_set, val_set = train_images[train_set_index, :], train_images[val_set_index, :]
+    train_labels, val_labels = noisy_labels[train_set_index], noisy_labels[val_set_index]
+
+    noise_mask = np.transpose(clean_train_labels[train_set_index]) == np.transpose(train_labels)
+
+    return train_set, val_set, train_labels, val_labels, noise_mask
+
+
+def dataset_split_without_noise(train_images, train_labels, split_per=0.9, random_seed=1):
+    total_labels = train_labels[:, np.newaxis]
+    num_samples = int(total_labels.shape[0])
+    np.random.seed(random_seed)
+    train_set_index = np.random.choice(num_samples, int(num_samples * split_per), replace=False)
+    index = np.arange(train_images.shape[0])
+    val_set_index = np.delete(index, train_set_index)
+    train_set, val_set = train_images[train_set_index], train_images[val_set_index]
+    train_labels, val_labels = total_labels[train_set_index], total_labels[val_set_index]
+
+    noise_mask = np.full(len(train_labels), True)
+
+    return train_set, val_set, train_labels.squeeze(), val_labels.squeeze(), noise_mask
+
+
+def transform_target(label):
+    label = np.array(label)
+    target = torch.from_numpy(label).long()
+    return target
+
+
+def get_instance_noisy_label(noise_rate, images, labels, num_classes, feature_size, norm_std, seed):
+    """
+    images: mnist, cifar10, cifar100
+    label_num: class number
+    feature_size: the size of input images (e.g. 28*28)
+    norm_std: default 0.1
+    """
+
+    label_num = num_classes
+    np.random.seed(int(seed))
+    torch.manual_seed(int(seed))
+    torch.cuda.manual_seed(int(seed))
+
+    P = []
+    flip_distribution = stats.truncnorm((0 - noise_rate) / norm_std, (1 - noise_rate) / norm_std, loc=noise_rate, scale=norm_std)
+    # flip_distribution = stats.beta(a=0.01, b=(0.01 / n) - 0.01, loc=0, scale=1)
+    flip_rate = flip_distribution.rvs(labels.shape[0])
+
+    images = torch.Tensor(images).to(torch.float)
+    if not torch.is_tensor(labels):
+        labels = torch.FloatTensor(labels)
+    labels = labels.to(DEVICE)
+
+    W = np.random.randn(label_num, feature_size, label_num)
+    W = torch.FloatTensor(W).to(DEVICE)
+
+    for i, (x, y) in enumerate(zip(images, labels)):
+        x = x.to(DEVICE)
+        y = int(y)
+        A = x.view(1, -1).mm(W[y]).squeeze(0)
+        A[y] = -np.inf
+        A = flip_rate[i] * F.softmax(A, dim=0)
+        A[y] += 1 - flip_rate[i]
+        P.append(A)
+    P = torch.stack(P, 0).cpu().numpy()
+
+    l = [i for i in range(label_num)]
+    new_label = [np.random.choice(l, p=P[i]) for i in range(labels.shape[0])]
+    return np.array(new_label)
diff --git a/deep-learning/food.py b/deep-learning/food.py
new file mode 100644
index 00000000..9ac54327
--- /dev/null
+++ b/deep-learning/food.py
@@ -0,0 +1,280 @@
+import os
+import argparse
+import datetime
+import time
+
+import numpy as np
+
+import torch
+from torch.optim.lr_scheduler import MultiStepLR
+from torchvision.datasets import Food101
+from torchvision import transforms
+from torch.utils.data.sampler import SubsetRandomSampler
+
+import methods
+import utils
+from models.resnet50 import resnet50
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--result_dir', type=str, help = 'dir to save result txt files', default='results/')
+parser.add_argument('--root_dir', type=str, help = 'dir that stores dataset',
default='data/') +parser.add_argument('--lr_init', type=float, default=0.001) +parser.add_argument('--n_epoch', type=int, default=100) +parser.add_argument('--batch_size', type=int, help='batch_size', default=32) +parser.add_argument('--wd', type=float, help='l2 regularization', default=1e-4) +parser.add_argument('--method', type=str, help='[regular, rlvi, coteaching, jocor, cdr, usdnl, bare]', default='regular') +parser.add_argument('--noise_rate', type=float, help='corruption level, should be less than 1', default=0.1) +parser.add_argument('--split_percentage', type=float, help='train and validation', default=0.9) +parser.add_argument('--print_freq', type=int, default=1) +parser.add_argument('--num_workers', type=int, default=4, help='how many subprocesses to use for data loading') +parser.add_argument('--seed', type=int, default=1) +# For alternative methods +parser.add_argument('--forget_rate', type=float, help='forget rate', default=None) +parser.add_argument('--num_gradual', type=int, default=10, help='how many epochs for linear drop rate, can be 5, 10, 15. This parameter is equal to Tk for R(T) in Co-teaching paper.') +parser.add_argument('--exponent', type=float, default=1, help='exponent of the forget rate, can be 0.5, 1, 2. This parameter is equal to c in Tc for R(T) in Co-teaching paper.') + +args = parser.parse_args() + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# DEVICE = 'cpu' + +# Seed +np.random.seed(args.seed) +torch.manual_seed(args.seed) +if DEVICE == "cuda": + torch.cuda.manual_seed(args.seed) + + +# Load datasets for training, validation, and testing +transform_train = transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.RandomHorizontalFlip(), + transforms.RandomVerticalFlip(), + transforms.RandomRotation(45), + transforms.RandomAffine(45), + transforms.ColorJitter(), + transforms.ToTensor(), + transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + +transform_test = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + +class Food101Dataset(torch.utils.data.Dataset): + def __init__(self, root, train=True): + if train: + split = 'train' + transform = transform_train + else: + split = 'test' + transform = transform_test + + self.dataset = Food101(root=root, + download=True, + split=split, + transform=transform) + + def __getitem__(self, index): + data, target = self.dataset[index] + return data, target, index + + def __len__(self): + return len(self.dataset) + +train_dataset = Food101Dataset(root=args.root_dir, train=True) +test_dataset = Food101Dataset(root=args.root_dir, train=False) + + +# For alternative methods: +# create rate_schedule to gradually consider less and less samples +if args.forget_rate is None: + forget_rate = args.noise_rate +else: + forget_rate = args.forget_rate +rate_schedule = np.ones(args.n_epoch) * forget_rate +rate_schedule[:args.num_gradual] = np.linspace(0, forget_rate**args.exponent, args.num_gradual) + + +# Prepare a structured output +save_dir = f"{args.result_dir}/food101/{args.method}" +if not os.path.exists(save_dir): + os.system('mkdir -p %s' % save_dir) + +model_str = f"food101_{args.method}" +txtfile = f"{save_dir}/{model_str}-s{args.seed}.txt" +if os.path.exists(txtfile): + curr_time = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + new_dest = f"{txtfile}.bak-{curr_time}" + os.system(f"mv {txtfile} {new_dest}") + + 
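As an aside on the schedule built just above: it ramps the forget rate linearly from 0 over the first `num_gradual` epochs and then holds it constant, so the sample-discarding baselines start gently. A small standalone illustration (the numbers here are picked for readability, not the script's defaults):
```
# Illustration of the forget-rate schedule used by coteaching/jocor/cdr/usdnl.
import numpy as np

n_epoch, num_gradual, exponent, forget_rate = 8, 4, 1.0, 0.1
rate_schedule = np.ones(n_epoch) * forget_rate
rate_schedule[:num_gradual] = np.linspace(0, forget_rate**exponent, num_gradual)
print(rate_schedule)   # 0, 0.033, 0.067, then 0.1 for every remaining epoch (rounded)
```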
+def run(): + # Data Loaders + indices = np.arange(len(train_dataset)) + np.random.shuffle(indices) + split = int(np.floor(args.split_percentage * len(train_dataset))) + train_indices, val_indices = indices[:split], indices[split:] + train_sampler = SubsetRandomSampler(train_indices) + val_sampler = SubsetRandomSampler(val_indices) + + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=args.batch_size, + sampler=train_sampler, + num_workers=args.num_workers, + drop_last=False, + pin_memory=True) + + val_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=args.batch_size, + sampler=val_sampler, + num_workers=args.num_workers, + drop_last=False, + pin_memory=True) + + test_loader = torch.utils.data.DataLoader(dataset=test_dataset, + batch_size=args.batch_size, + num_workers=args.num_workers, + drop_last=False, + pin_memory=True) + + # Prepare models and optimizers + if args.method == 'usdnl': + model = resnet50(input_channel=3, num_classes=101, pretrained=True, dropout_rate=0.25) + else: + model = resnet50(input_channel=3, num_classes=101, pretrained=True) + model.to(DEVICE) + optimizer = torch.optim.Adam( + params=[ + {"params": model.fc_params(), "lr": args.lr_init}, + {"params": model.backbone_params()} + ], + lr=1e-4, weight_decay=args.wd + ) + + if args.method == 'jocor' or args.method == 'coteaching': + model_sec = resnet50(input_channel=3, num_classes=101, pretrained=True) + model_sec.to(DEVICE) + if args.method == 'jocor': + optimizer = torch.optim.Adam( + params=[ + {"params": list(model.fc_params()) + list(model_sec.fc_params()), "lr": args.lr_init}, + {"params": list(model.backbone_params()) + list(model_sec.backbone_params())} + ], + lr=1e-4, weight_decay=args.wd + ) + else: + optimizer_sec = torch.optim.Adam( + params=[ + {"params": model_sec.fc_params(), "lr": args.lr_init}, + {"params": model_sec.backbone_params()} + ], + lr=1e-4, weight_decay=args.wd + ) + + # Schedule for the learning rate + scheduler = MultiStepLR(optimizer, milestones=[10, 20, 30], gamma=0.2) + if args.method == 'coteaching': + scheduler_sec = MultiStepLR(optimizer_sec, milestones=[10, 20, 30], gamma=0.2) + + if args.method == 'rlvi': + sample_weights = torch.ones(len(train_dataset)).to(DEVICE) + residuals = torch.zeros_like(sample_weights).to(DEVICE) + overfit = False + threshold = 0 + + + # Evaluate init model + test_acc = utils.evaluate(test_loader, model) + utils.output_table(epoch=0, n_epoch=args.n_epoch, test_acc=test_acc) + with open(txtfile, "a") as myfile: + myfile.write("epoch:\ttime_ep\ttau\tfix\ttrain_acc\tval_acc\ttest_acc\n") + myfile.write(f"0:\t0\t0\t{False}\t0\t0\t{test_acc:8.4f}\n") + + # Training + for epoch in range(1, args.n_epoch): + model.train() + + time_ep = time.time() + + #### Start one epoch of training with selected method #### + + if args.method == "regular": + train_acc = methods.train_regular(train_loader, model, optimizer) + val_acc = utils.evaluate(val_loader, model) + + elif args.method == "rlvi": + train_acc, threshold = methods.train_rlvi( + train_loader, model, optimizer, + residuals, sample_weights, overfit, threshold + ) + val_acc = utils.evaluate(val_loader, model) + # Using no regularization by default (as val. 
score is increasing) + overfit = False + + elif args.method == 'coteaching': + model_sec.train() + train_acc = methods.train_coteaching( + train_loader, epoch, + model, optimizer, model_sec, optimizer_sec, + rate_schedule + ) + val_acc = utils.evaluate(val_loader, model) + scheduler_sec.step() + + elif args.method == 'jocor': + model_sec.train() + train_acc = methods.train_jocor( + train_loader, epoch, + model, model_sec, optimizer, + rate_schedule + ) + val_acc = utils.evaluate(val_loader, model) + + elif args.method == 'cdr': + train_acc = methods.train_cdr(train_loader, epoch, model, optimizer, rate_schedule) + val_acc = utils.evaluate(val_loader, model) + + elif args.method == 'usdnl': + train_acc = methods.train_usdnl(train_loader, epoch, model, optimizer, rate_schedule) + val_acc = utils.evaluate(val_loader, model) + + elif args.method == 'bare': + train_acc = methods.train_bare(train_loader, model, optimizer, num_classes=101) + val_acc = utils.evaluate(val_loader, model) + + + # Update LR + scheduler.step() + + #### Finish one epoch of training with selected method #### + + # Log info + time_ep = time.time() - time_ep + test_acc = utils.evaluate(test_loader, model) + + # Print log-table + if (epoch + 1) % args.print_freq == 0: + utils.output_table(epoch, args.n_epoch, time_ep, train_acc=train_acc, test_acc=test_acc) + else: + utils.output_table(epoch, args.n_epoch, time_ep, train_acc=train_acc) + + # Prepare output: put dummy values for alternative methods + if args.method != 'rlvi': + overfit = False + threshold = 0 + + # Save logs to the file + with open(txtfile, "a") as myfile: + myfile.write(f"{int(epoch)}:\t{time_ep:.2f}\t{threshold:.2f}\t{overfit}\t" + + f"{train_acc:8.4f}\t{val_acc:8.4f}\t{test_acc:8.4f}\n") + + +if __name__ == '__main__': + run() diff --git a/deep-learning/main.py b/deep-learning/main.py new file mode 100644 index 00000000..b0f05f39 --- /dev/null +++ b/deep-learning/main.py @@ -0,0 +1,354 @@ +import os +import argparse +import datetime +import time + +import numpy as np +import torch +from torch.optim.lr_scheduler import MultiplicativeLR, CosineAnnealingLR + +import methods +import data_load +import data_tools +import utils + +from models.lenet import LeNet +from models.resnet import ResNet18, ResNet34 +# models with dropout for USDNL algorithm: +from models.lenet import LeNetDO +from models.resnet import ResNet18DO, ResNet34DO + + +parser = argparse.ArgumentParser() +parser.add_argument('--result_dir', type=str, help = 'dir to save result txt files', default='results/') +parser.add_argument('--root_dir', type=str, help = 'dir that stores datasets', default='data/') +parser.add_argument('--dataset', type=str, help='[mnist, cifar10, cifar100]', default='mnist') +parser.add_argument('--method', type=str, help='[regular, rlvi, coteaching, jocor, cdr, usdnl, bare]', default='rlvi') +parser.add_argument('--noise_rate', type=float, help='corruption rate, should be less than 1', default=0.45) +parser.add_argument('--noise_type', type=str, help='[pairflip, symmetric, asymmetric, instance]', default='pairflip') +parser.add_argument('--split_percentage', type=float, help='train and validation', default=0.9) +parser.add_argument('--momentum', type=int, help='momentum', default=0.9) +parser.add_argument('--print_freq', type=int, default=1) +parser.add_argument('--num_workers', type=int, default=4, help='number of subprocesses for data loading') +parser.add_argument('--seed', type=int, default=1) +# For alternative methods +parser.add_argument('--forget_rate', 
type=float, help='forget rate', default=None) +parser.add_argument('--num_gradual', type=int, default=10, help='how many epochs for linear drop rate, can be 5, 10, 15. This parameter is equal to Tk for R(T) in Co-teaching paper.') +parser.add_argument('--exponent', type=float, default=1, help='exponent of the forget rate, can be 0.5, 1, 2. This parameter is equal to c in Tc for R(T) in Co-teaching paper.') + +args = parser.parse_args() + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# DEVICE = 'cpu' + +# Seed +np.random.seed(args.seed) +torch.manual_seed(args.seed) +if DEVICE == "cuda": + torch.cuda.manual_seed(args.seed) + + +# Load datasets for training, validation, and testing +if args.dataset == 'mnist': + input_channel = 1 + num_classes = 10 + args.n_epoch = 100 + args.batch_size = 32 + args.wd = 1e-3 + args.lr_init = 0.01 + + if args.method != 'usdnl': + Model = LeNet + else: + Model = LeNetDO # with dropout + + train_dataset = data_load.Mnist(root=args.root_dir, + download=True, + train=True, + transform=Model.transform_train, + target_transform=data_tools.transform_target, + dataset=args.dataset, + noise_type=args.noise_type, + noise_rate=args.noise_rate, + split_per=args.split_percentage, + random_seed=args.seed) + + val_dataset = data_load.Mnist(root=args.root_dir, + train=False, + transform=Model.transform_test, + target_transform=data_tools.transform_target, + dataset=args.dataset, + noise_type=args.noise_type, + noise_rate=args.noise_rate, + split_per=args.split_percentage, + random_seed=args.seed) + + + test_dataset = data_load.MnistTest(root=args.root_dir, + transform=Model.transform_test, + target_transform=data_tools.transform_target) + + +if args.dataset == 'cifar10': + input_channel = 3 + num_classes = 10 + args.n_epoch = 200 + args.batch_size = 128 + args.wd = 5e-4 + args.lr_init = 0.01 + + if args.method != 'usdnl': + Model = ResNet18 + else: + Model = ResNet18DO # with dropout + + train_dataset = data_load.Cifar10(root=args.root_dir, + download=True, + train=True, + transform=Model.transform_train, + target_transform=data_tools.transform_target, + dataset=args.dataset, + noise_type=args.noise_type, + noise_rate=args.noise_rate, + split_per=args.split_percentage, + random_seed=args.seed) + + val_dataset = data_load.Cifar10(root=args.root_dir, + train=False, + transform=Model.transform_test, + target_transform=data_tools.transform_target, + dataset=args.dataset, + noise_type=args.noise_type, + noise_rate=args.noise_rate, + split_per=args.split_percentage, + random_seed=args.seed) + + + test_dataset = data_load.Cifar10Test(root=args.root_dir, + transform=Model.transform_test, + target_transform=data_tools.transform_target) + + +if args.dataset == 'cifar100': + input_channel = 3 + num_classes = 100 + args.n_epoch = 200 + args.batch_size = 128 + args.wd = 5e-4 + args.lr_init = 0.01 + + if args.method != 'usdnl': + Model = ResNet34 + else: + Model = ResNet34DO # with dropout + + train_dataset = data_load.Cifar100(root=args.root_dir, + download=True, + train=True, + transform=Model.transform_train, + target_transform=data_tools.transform_target, + dataset=args.dataset, + noise_type=args.noise_type, + noise_rate=args.noise_rate, + split_per=args.split_percentage, + random_seed=args.seed) + + val_dataset = data_load.Cifar100(root=args.root_dir, + train=False, + transform=Model.transform_test, + target_transform=data_tools.transform_target, + dataset=args.dataset, + noise_type=args.noise_type, + noise_rate=args.noise_rate, + 
split_per=args.split_percentage, + random_seed=args.seed) + + test_dataset = data_load.Cifar100Test(root=args.root_dir, + transform=Model.transform_test, + target_transform=data_tools.transform_target) + + +# For alternative methods: +# create rate_schedule to gradually consider less and less samples +if args.forget_rate is None: + forget_rate = args.noise_rate + if args.noise_type == 'asymmetric': + forget_rate /= 2. +else: + forget_rate = args.forget_rate +rate_schedule = np.ones(args.n_epoch) * forget_rate +rate_schedule[:args.num_gradual] = np.linspace(0, forget_rate**args.exponent, args.num_gradual) + + +# Prepare a structured output +save_dir = f"{args.result_dir}/{args.dataset}/{args.method}" +if not os.path.exists(save_dir): + os.system('mkdir -p %s' % save_dir) + +model_str = f"{args.dataset}_{args.method}_{args.noise_type}_{args.noise_rate}" +txtfile = f"{save_dir}/{model_str}-s{args.seed}.txt" +if os.path.exists(txtfile): + curr_time = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + new_dest = f"{txtfile}.bak-{curr_time}" + os.system(f"mv {txtfile} {new_dest}") + + +def run(): + # Data Loaders + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=args.batch_size, + num_workers=args.num_workers, + drop_last=False, + shuffle=True, + pin_memory=True) + + val_loader = torch.utils.data.DataLoader(dataset=val_dataset, + batch_size=args.batch_size, + num_workers=args.num_workers, + drop_last=False, + shuffle=False, + pin_memory=True) + + + test_loader = torch.utils.data.DataLoader(dataset=test_dataset, + batch_size=args.batch_size, + num_workers=args.num_workers, + drop_last=False, + shuffle=False, + pin_memory=True) + + # Prepare models and optimizers + model = Model(input_channel=input_channel, num_classes=num_classes) + model.to(DEVICE) + optimizer = torch.optim.SGD(model.parameters(), lr=args.lr_init, weight_decay=args.wd, momentum=args.momentum) + + if args.method == 'jocor': + model_sec = Model(input_channel=input_channel, num_classes=num_classes) + model_sec.to(DEVICE) + optimizer = torch.optim.SGD( + list(model.parameters()) + list(model_sec.parameters()), + lr=args.lr_init, weight_decay=args.wd, momentum=args.momentum + ) + + # Use Multipliactive LR decay for MNIST and Cosine Annealing for CIFAR10/100 + if args.dataset == 'mnist': + scheduler = MultiplicativeLR(optimizer, utils.get_lr_factor) + else: + scheduler = CosineAnnealingLR(optimizer, T_max=200) + + if args.method == 'coteaching': + model_sec = Model(input_channel=input_channel, num_classes=num_classes) + model_sec.to(DEVICE) + optimizer_sec = torch.optim.SGD(model_sec.parameters(), lr=args.lr_init, weight_decay=args.wd, momentum=args.momentum) + if args.dataset == 'mnist': + scheduler_sec = MultiplicativeLR(optimizer_sec, utils.get_lr_factor) + else: + scheduler_sec = CosineAnnealingLR(optimizer_sec, T_max=200) + + + if args.method == 'rlvi': + sample_weights = torch.ones(len(train_dataset)).to(DEVICE) + residuals = torch.zeros_like(sample_weights).to(DEVICE) + overfit = False + threshold = 0 + val_acc_old, val_acc_old_old = 0, 0 + + + # Evaluate init model + test_acc = utils.evaluate(test_loader, model) + utils.output_table(epoch=0, n_epoch=args.n_epoch, test_acc=test_acc) + with open(txtfile, "a") as myfile: + myfile.write("epoch:\ttime_ep\ttau\tfix\tclean,%\tcorr,%\ttrain_acc\tval_acc\ttest_acc\n") + myfile.write(f"0:\t0\t0\t{False}\t100\t0\t0\t0\t{test_acc:8.4f}\n") + + # Training + for epoch in range(1, args.n_epoch): + model.train() + + time_ep = time.time() + + #### Start one 
epoch of training with selected method #### + + if args.method == "regular": + train_acc = methods.train_regular(train_loader, model, optimizer) + val_acc = utils.evaluate(val_loader, model) + + elif args.method == "rlvi": + train_acc, threshold = methods.train_rlvi( + train_loader, model, optimizer, + residuals, sample_weights, overfit, threshold + ) + # Check if overfitting has started + val_acc = utils.evaluate(val_loader, model) + if not overfit: + if epoch > 2: + # Overfitting started <=> validation score is dropping + overfit = (val_acc < 0.5 * (val_acc_old + val_acc_old_old)) + val_acc_old_old = val_acc_old + val_acc_old = val_acc + + elif args.method == 'coteaching': + model_sec.train() + train_acc = methods.train_coteaching( + train_loader, epoch, + model, optimizer, model_sec, optimizer_sec, + rate_schedule + ) + val_acc = utils.evaluate(val_loader, model) + scheduler_sec.step() + + elif args.method == 'jocor': + model_sec.train() + train_acc = methods.train_jocor( + train_loader, epoch, + model, model_sec, optimizer, + rate_schedule + ) + val_acc = utils.evaluate(val_loader, model) + + elif args.method == 'cdr': + train_acc = methods.train_cdr(train_loader, epoch, model, optimizer, rate_schedule) + val_acc = utils.evaluate(val_loader, model) + + elif args.method == 'usdnl': + train_acc = methods.train_usdnl(train_loader, epoch, model, optimizer, rate_schedule) + val_acc = utils.evaluate(val_loader, model) + + elif args.method == 'bare': + train_acc = methods.train_bare(train_loader, model, optimizer, num_classes) + val_acc = utils.evaluate(val_loader, model) + + # Update LR + scheduler.step() + + #### Finish one epoch of training with selected method #### + + # Log info + time_ep = time.time() - time_ep + test_acc = utils.evaluate(test_loader, model) + + # Print log-table + if (epoch + 1) % args.print_freq == 0: + utils.output_table(epoch, args.n_epoch, time_ep, train_acc=train_acc, test_acc=test_acc) + else: + utils.output_table(epoch, args.n_epoch, time_ep, train_acc=train_acc) + + # Prepare output: put dummy values for alternative methods + if args.method != 'rlvi': + overfit = False + threshold = 0 + clean, corr = 1, 0 + # Check the number of correctly identified corrupted samples for RLVI + if (args.method == 'rlvi') and (args.noise_rate > 0): + mask = (sample_weights > threshold).cpu() + clean, corr = utils.get_ratio_corrupted(mask, train_dataset.noise_mask) + + # Save logs to the file + with open(txtfile, "a") as myfile: + myfile.write(f"{int(epoch)}:\t{time_ep:.2f}\t{threshold:.2f}\t{overfit}\t" + + f"{clean*100:.2f}\t{corr*100:.2f}\t" + + f"{train_acc:8.4f}\t{val_acc:8.4f}\t{test_acc:8.4f}\n") + + +if __name__ == '__main__': + run() diff --git a/deep-learning/methods/__init__.py b/deep-learning/methods/__init__.py new file mode 100644 index 00000000..24948695 --- /dev/null +++ b/deep-learning/methods/__init__.py @@ -0,0 +1,7 @@ +from .train_rlvi import * +from .train_regular import * +from .train_cdr import * +from .train_coteaching import * +from .train_jocor import * +from .train_usdnl import * +from .train_bare import * diff --git a/deep-learning/methods/train_bare.py b/deep-learning/methods/train_bare.py new file mode 100644 index 00000000..da3d7782 --- /dev/null +++ b/deep-learning/methods/train_bare.py @@ -0,0 +1,82 @@ +import torch +from torch import nn +from torch.nn import functional as F +from torch.autograd import Variable +from utils import accuracy + + +__all__ = ['train_bare'] + + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# 
DEVICE = 'cpu' + + +class WeightedCCE(nn.Module): + """ + Implementing BARE with Cross-Entropy (CCE) Loss + """ + + def __init__(self, k=1, num_class=10, reduction="mean"): + super(WeightedCCE, self).__init__() + + self.k = k + self.reduction = reduction + self.num_class = num_class + + + def forward(self, prediction, target_label, one_hot=True): + EPS = 1e-8 + if one_hot: + y_true = F.one_hot(target_label.type(torch.LongTensor), + num_classes=self.num_class).to(DEVICE) + y_pred = F.softmax(prediction, dim=1) + y_pred = torch.clamp(y_pred, EPS, 1-EPS) + pred_tmp = torch.sum(y_true * y_pred, axis=-1).reshape(-1, 1) + + ## Compute batch statistics + avg_post = torch.mean(y_pred, dim=0) + avg_post = avg_post.reshape(-1, 1) + std_post = torch.std(y_pred, dim=0) + std_post = std_post.reshape(-1, 1) + avg_post_ref = torch.matmul(y_true.type(torch.float), avg_post) + std_post_ref = torch.matmul(y_true.type(torch.float), std_post) + pred_prun = torch.where((pred_tmp - avg_post_ref >= self.k * std_post_ref), + pred_tmp, torch.zeros_like(pred_tmp)) + + # prun_idx will tell us which examples are + # 'trustworthy' for the given batch + prun_idx = torch.where(pred_prun != 0.)[0] + if len(prun_idx) != 0: + prun_targets = torch.argmax(torch.index_select(y_true, 0, prun_idx), dim=1) + weighted_loss = F.cross_entropy(torch.index_select(prediction, 0, prun_idx), + prun_targets, reduction=self.reduction) + else: + weighted_loss = F.cross_entropy(prediction, target_label) + + return weighted_loss + + +def train_bare(train_loader, model, optimizer, num_classes): + train_total = 0 + train_correct = 0 + + loss_fn = WeightedCCE(k=1, num_class=num_classes, reduction="none") + for (images, labels, indexes) in train_loader: + + images = Variable(images).to(DEVICE) + labels = Variable(labels).to(DEVICE) + + logits = model(images) + + prec, _ = accuracy(logits, labels, topk=(1, 5)) + train_total += 1 + train_correct += prec + + loss = loss_fn(logits, labels) + optimizer.zero_grad() + loss.mean().backward() + optimizer.step() + + train_acc = float(train_correct) / float(train_total) + return train_acc diff --git a/deep-learning/methods/train_cdr.py b/deep-learning/methods/train_cdr.py new file mode 100644 index 00000000..1b63f7c4 --- /dev/null +++ b/deep-learning/methods/train_cdr.py @@ -0,0 +1,72 @@ +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F +from torch.autograd import Variable +from utils import accuracy + + +__all__ = ['train_cdr'] + + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# DEVICE = 'cpu' + + +def train_one_step(model, data, label, optimizer, criterion, nonzero_ratio, clip): + model.train() + pred = model(data) + loss = criterion(pred, label) + loss.backward() + + to_concat_g = [] + to_concat_v = [] + for name, param in model.named_parameters(): + if param.dim() in [2, 4]: + to_concat_g.append(param.grad.data.view(-1)) + to_concat_v.append(param.data.view(-1)) + all_g = torch.cat(to_concat_g) + all_v = torch.cat(to_concat_v) + metric = torch.abs(all_g * all_v) + num_params = all_v.size(0) + nz = int(nonzero_ratio * num_params) + top_values, _ = torch.topk(metric, nz) + thresh = top_values[-1] + + if str(DEVICE) == 'cuda': + dtype = torch.cuda.FloatTensor + else: + dtype = torch.FloatTensor + for name, param in model.named_parameters(): + if param.dim() in [2, 4]: + mask = (torch.abs(param.data * param.grad.data) >= thresh).type(dtype) + mask = mask * clip + param.grad.data = mask * param.grad.data + + optimizer.step() + 
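    # Descriptive note: the masking loop above keeps only gradient entries
    # whose |param * grad| lies in the top `nonzero_ratio` fraction (scaled
    # by `clip`), so the step just applied touches only those coordinates.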
optimizer.zero_grad() + acc = accuracy(pred, label, topk=(1, 5)) + + return float(acc[0]), loss + + +def train_cdr(train_loader, epoch, model, optimizer, rate_schedule): + train_total = 0 + train_correct = 0 + + clip = 1 - rate_schedule[epoch] + for (data, labels, indexes) in train_loader: + data = data.to(DEVICE) + labels = labels.to(DEVICE) + # Forward + Backward + Optimize + logits = model(data) + prec, _ = accuracy(logits, labels, topk=(1, 5)) + train_total += 1 + train_correct += prec + # Loss transfer + prec, loss = train_one_step( + model, data, labels, optimizer, nn.CrossEntropyLoss(), clip, clip + ) + + train_acc = float(train_correct) / float(train_total) + return train_acc diff --git a/deep-learning/methods/train_coteaching.py b/deep-learning/methods/train_coteaching.py new file mode 100644 index 00000000..5db23984 --- /dev/null +++ b/deep-learning/methods/train_coteaching.py @@ -0,0 +1,76 @@ +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F +from torch.autograd import Variable +from utils import accuracy + + +__all__ = ['train_coteaching'] + + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# DEVICE = 'cpu' + + +# Loss function +def loss_coteaching(y_1, y_2, t, forget_rate, ind): + loss_1 = F.cross_entropy(y_1, t, reduction='none') + ind_1_sorted = np.argsort(loss_1.data.cpu()).to(DEVICE) + loss_1_sorted = loss_1[ind_1_sorted] + + loss_2 = F.cross_entropy(y_2, t, reduction='none') + ind_2_sorted = np.argsort(loss_2.data.cpu()).to(DEVICE) + loss_2_sorted = loss_2[ind_2_sorted] + + remember_rate = 1 - forget_rate + num_remember = int(remember_rate * len(loss_1_sorted)) + + ind_1_update=ind_1_sorted[:num_remember] + ind_2_update=ind_2_sorted[:num_remember] + # exchange + loss_1_update = F.cross_entropy(y_1[ind_2_update], t[ind_2_update]) + loss_2_update = F.cross_entropy(y_2[ind_1_update], t[ind_1_update]) + + return torch.sum(loss_1_update)/num_remember, torch.sum(loss_2_update)/num_remember + + +# Train the Co-teaching Model +def train_coteaching(train_loader, epoch, + model1, optimizer1, + model2, optimizer2, + rate_schedule): + train_total = 0 + train_correct = 0 + train_total2 = 0 + train_correct2 = 0 + + for (images, labels, indexes) in train_loader: + ind = indexes.cpu().numpy().transpose() + + images = Variable(images).to(DEVICE) + labels = Variable(labels).to(DEVICE) + + logits1 = model1(images) + prec1, _ = accuracy(logits1, labels, topk=(1, 5)) + train_total += 1 + train_correct += prec1 + + logits2 = model2(images) + prec2, _ = accuracy(logits2, labels, topk=(1, 5)) + train_total2 += 1 + train_correct2 += prec2 + + loss_1, loss_2 = loss_coteaching( + logits1, logits2, labels, rate_schedule[epoch], ind + ) + + optimizer1.zero_grad() + loss_1.backward() + optimizer1.step() + optimizer2.zero_grad() + loss_2.backward() + optimizer2.step() + + train_acc1 = float(train_correct) / float(train_total) + return train_acc1 \ No newline at end of file diff --git a/deep-learning/methods/train_jocor.py b/deep-learning/methods/train_jocor.py new file mode 100644 index 00000000..4227637b --- /dev/null +++ b/deep-learning/methods/train_jocor.py @@ -0,0 +1,76 @@ +# -*- coding:utf-8 -*- +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +from utils import accuracy + + +__all__ = ['train_jocor'] + + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# DEVICE = 'cpu' + + +def kl_loss_compute(pred, soft_targets, reduce='none'): 
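    """
    KL divergence between the softmax outputs of the two networks,
    used as the co-regularization term of the JoCoR loss below.

    Note: `reduce` is only checked for truthiness, so the default
    string 'none' selects the mean-reduction branch.
    """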
+ kl = F.kl_div( + F.log_softmax(pred, dim=1), + F.softmax(soft_targets, dim=1), + reduction=reduce + ) + if reduce: + return torch.mean(torch.sum(kl, dim=1)) + else: + return torch.sum(kl, 1) + + +def loss_jocor(y_1, y_2, t, forget_rate, ind, co_lambda=0.1): + loss_pick_1 = F.cross_entropy(y_1, t, reduction="none") * (1-co_lambda) + loss_pick_2 = F.cross_entropy(y_2, t, reduction="none") * (1-co_lambda) + loss_pick = ( + (loss_pick_1 + loss_pick_2 + co_lambda * kl_loss_compute(y_1, y_2, reduce='none') + + co_lambda * kl_loss_compute(y_2, y_1, reduce='none')).cpu() + ) + ind_sorted = np.argsort(loss_pick.data) + loss_sorted = loss_pick[ind_sorted] + remember_rate = 1 - forget_rate + num_remember = int(remember_rate * len(loss_sorted)) + ind_update = ind_sorted[:num_remember] + # exchange + loss = torch.mean(loss_pick[ind_update]) + return loss + + +def train_jocor(train_loader, epoch, model1, model2, optimizer, rate_schedule): + train_total = 0 + train_correct = 0 + train_total2 = 0 + train_correct2 = 0 + + for (images, labels, indexes) in train_loader: + ind = indexes.cpu().numpy().transpose() + + images = Variable(images).to(DEVICE) + labels = Variable(labels).to(DEVICE) + + # Forward + Backward + Optimize + logits1 = model1(images) + prec1, _ = accuracy(logits1, labels, topk=(1, 5)) + train_total += 1 + train_correct += prec1 + + logits2 = model2(images) + prec2, _ = accuracy(logits2, labels, topk=(1, 5)) + train_total2 += 1 + train_correct2 += prec2 + + loss_1 = loss_jocor(logits1, logits2, labels, rate_schedule[epoch], ind) + + optimizer.zero_grad() + loss_1.backward() + optimizer.step() + + train_acc1 = float(train_correct) / float(train_total) + return train_acc1 diff --git a/deep-learning/methods/train_regular.py b/deep-learning/methods/train_regular.py new file mode 100644 index 00000000..a9631444 --- /dev/null +++ b/deep-learning/methods/train_regular.py @@ -0,0 +1,36 @@ +import torch +from torch import nn +from torch.nn import functional as F +from torch.autograd import Variable +from utils import accuracy + + +__all__ = ['train_regular'] + + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# DEVICE = 'cpu' + + +def train_regular(train_loader, model, optimizer): + train_total = 0 + train_correct = 0 + + for (images, labels, indexes) in train_loader: + + images = Variable(images).to(DEVICE) + labels = Variable(labels).to(DEVICE) + + logits = model(images) + + prec, _ = accuracy(logits, labels, topk=(1, 5)) + train_total += 1 + train_correct += prec + + loss = F.cross_entropy(logits, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + train_acc = float(train_correct) / float(train_total) + return train_acc \ No newline at end of file diff --git a/deep-learning/methods/train_rlvi.py b/deep-learning/methods/train_rlvi.py new file mode 100644 index 00000000..76f4bebf --- /dev/null +++ b/deep-learning/methods/train_rlvi.py @@ -0,0 +1,106 @@ +import torch +from torch.nn import functional as F +from torch.autograd import Variable +from utils import accuracy + + +__all__ = ['train_rlvi'] + + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# DEVICE = 'cpu' + + +@torch.no_grad() +def update_sample_weights(residuals, weights, tol=1e-3, maxiter=40): + """ + Optimize Bernoulli probabilities + + Parameters + ---------- + residuals : array, + shape: len(train_loader) - cross-entropy loss for each training sample. 
+ weights : array, + shape: len(train_loader) - Bernoulli probabilities pi: pi_i = probability that sample i is non-corrupted. + Updated inplace. + """ + residuals.sub_(residuals.min()) + exp_res = torch.exp(-residuals) + avg_weight = 0.95 + for _ in range(maxiter): + ratio = avg_weight / (1 - avg_weight) + new_weights = torch.div(ratio * exp_res, 1 + ratio * exp_res) + error = torch.norm(new_weights - weights) + weights[:] = new_weights + avg_weight = weights.mean() + if error < tol: + break + weights.div_(weights.max()) + + +def false_negative_criterion(weights, alpha=0.05): + '''Find threshold from the fixed probability (alpha) of type II error''' + total_positive = torch.sum(1 - weights) + beta = total_positive * alpha + sorted_weights, _ = torch.sort(weights, dim=0, descending=True) + false_negative = torch.cumsum(1 - sorted_weights, dim=0) + last_index = torch.sum(false_negative <= beta) - 1 + threshold = sorted_weights[last_index] + return threshold + + +def train_rlvi(train_loader, model, optimizer, + residuals, weights, overfit, threshold): + """ + Train one epoch: apply SGD updates using Bernoulli probabilities. + Thus, optimize variational bound of the marginal likelihood + instead of the standard neg. log-likelihood (cross-entropy for classification). + + Parameters + ---------- + residuals : array, + shape: len(train_loader) - to store cross-entropy loss on each training sample. + Shared across epochs. + weights : array, + shape: len(train_loader) - Bernoulli probabilities: pi_i = proba (in [0; 1]) that sample i is non-corrupted. + Shared across epochs. + overfit : bool - flag indicating whether overfitting has started. + threshold : float in [0; 1] - previous threshold for truncation: pi_i < threshold => pi_i = 0. + + Returns + ------- + train_acc : float - top-1 accuracy on training samples. + threshold : float in [0; 1] - updated threshold, based on type II error criterion. 
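    Notes
    -----
    The per-epoch update implemented in `update_sample_weights` above is,
    with shifted losses l_i = loss_i - min_j loss_j and
    r = pi_bar / (1 - pi_bar) for the current mean probability pi_bar,

        pi_i = r * exp(-l_i) / (1 + r * exp(-l_i)),

    iterated to a fixed point and rescaled by its maximum. When `overfit`
    is True, probabilities below the threshold returned by
    `false_negative_criterion` are truncated to zero.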
+ """ + + train_total = 0 + train_correct = 0 + + for (images, labels, indexes) in train_loader: + + images = Variable(images).to(DEVICE) + labels = Variable(labels).to(DEVICE) + + logits = model(images) + prec, _ = accuracy(logits, labels, topk=(1, 5)) + train_total += 1 + train_correct += prec + + loss = F.cross_entropy(logits, labels, reduction='none') + residuals[indexes] = loss # save the losses on the current batch + + batch_weights = weights[indexes] + loss = loss * batch_weights # modify loss with Bernoulli probabilities + loss = loss.mean() + optimizer.zero_grad() + loss.backward() + optimizer.step() + + update_sample_weights(residuals, weights) + if overfit: + # Regularization: truncate samples with high probability of corruption + threshold = max(threshold, false_negative_criterion(weights)) + weights[weights < threshold] = 0 + + train_acc = float(train_correct) / float(train_total) + return train_acc, threshold diff --git a/deep-learning/methods/train_usdnl.py b/deep-learning/methods/train_usdnl.py new file mode 100644 index 00000000..2b4034ff --- /dev/null +++ b/deep-learning/methods/train_usdnl.py @@ -0,0 +1,53 @@ +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F +from torch.autograd import Variable +from utils import accuracy + + +__all__ = ['train_usdnl'] + + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# DEVICE = 'cpu' + + +def loss_fn(logits, labels, forget_rate): + loss_pick = F.cross_entropy(logits, labels.long(), reduction='none') + loss_pick = loss_pick.cpu() + ind_sorted = np.argsort(loss_pick.data) + loss_sorted = loss_pick[ind_sorted] + + remember_rate = 1 - forget_rate + num_remember = int(remember_rate * len(loss_sorted)) + ind_update = ind_sorted[:num_remember] + # Exchange + loss = torch.mean(loss_pick[ind_update]) + return loss + + +def train_usdnl(train_loader, epoch, model, optimizer, rate_schedule): + model.train() + train_total = 0 + train_correct = 0 + + for (data, labels, indexes) in train_loader: + + data = Variable(data).to(DEVICE) + labels = Variable(labels).to(DEVICE) + + # Forward + Backward + Optimize + logits = model(data) + prec, _ = accuracy(logits, labels, topk=(1, 5)) + train_total += 1 + train_correct += prec + + loss = loss_fn(logits, labels, rate_schedule[epoch]) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + train_acc = float(train_correct) / float(train_total) + return train_acc diff --git a/deep-learning/models/__init__.py b/deep-learning/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deep-learning/models/lenet.py b/deep-learning/models/lenet.py new file mode 100644 index 00000000..cf9f14bf --- /dev/null +++ b/deep-learning/models/lenet.py @@ -0,0 +1,69 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision import transforms + + +class LeNet(nn.Module): + transform_train = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307, ),(0.3081, )), + ]) + transform_test = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307, ),(0.3081, )), + ]) + + def __init__(self, input_channel=3, num_classes=10, k=1): + super(LeNet, self).__init__() + self.conv1 = nn.Conv2d(input_channel, 6 * k, kernel_size=5) + self.conv2 = nn.Conv2d(6 * k, 16 * k, kernel_size=5) + self.fc1 = nn.Linear(16 * 4 * 4, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, num_classes) + + + def forward(self, x): + out = F.relu(self.conv1(x)) + out = F.max_pool2d(out, 
2) + out = F.relu(self.conv2(out)) + out = F.max_pool2d(out, 2) + out = out.view(out.shape[0], -1) + out = F.relu(self.fc1(out)) + out = F.relu(self.fc2(out)) + out = self.fc3(out) + return out + + +class LeNetDO(nn.Module): + transform_train = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307, ),(0.3081, )), + ]) + transform_test = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307, ),(0.3081, )), + ]) + + def __init__(self, input_channel=3, num_classes=10, k=1, dropout_rate=0.25): + super(LeNetDO, self).__init__() + self.conv1 = nn.Conv2d(input_channel, 6 * k, kernel_size=5) + self.conv2 = nn.Conv2d(6 * k, 16 * k, kernel_size=5) + self.fc1 = nn.Linear(16 * 4 * 4, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, num_classes) + self.dropout_rate = dropout_rate + + + def forward(self, x): + out = F.relu(self.conv1(x)) + out = F.max_pool2d(out, 2) + out = F.dropout2d(out, p=self.dropout_rate) + out = F.relu(self.conv2(out)) + out = F.max_pool2d(out, 2) + out = F.dropout2d(out, p=self.dropout_rate) + out = out.view(out.shape[0], -1) + out = F.relu(self.fc1(out)) + out = F.relu(self.fc2(out)) + out = self.fc3(out) + return out diff --git a/deep-learning/models/resnet.py b/deep-learning/models/resnet.py new file mode 100644 index 00000000..3787f857 --- /dev/null +++ b/deep-learning/models/resnet.py @@ -0,0 +1,121 @@ +""" +ResNet: resnet18 and resnet34 implementation +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision import transforms + + +class Linear(nn.Module): + def __init__(self, in_features, out_features): + + super(Linear, self).__init__() + self.w = nn.Parameter(torch.randn(in_features, out_features)) + + def forward(self, x): + x = x.mm(self.w) + return x + + +class BasicBlock(nn.Module): + expansion = 1 + def __init__(self, in_planes, planes, stride=1, dropout_rate=0): + super(BasicBlock, self).__init__() + + self.dropout_rate = dropout_rate + + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + if self.dropout_rate > 0: + out = F.dropout2d(out, p=self.dropout_rate) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class ResNet(nn.Module): + transform_train = transforms.Compose([ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) + ]) + transform_test = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) + ]) + + def __init__(self, block, num_blocks, input_channel, num_classes, dropout_rate=0): + super(ResNet, self).__init__() + self.in_planes = 64 + self.conv1 = nn.Conv2d(input_channel, 64, kernel_size=3, stride=1, padding=0, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1, dropout_rate=dropout_rate) + self.layer2 = self._make_layer(block, 128, 
num_blocks[1], stride=2, dropout_rate=dropout_rate) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2, dropout_rate=dropout_rate) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2, dropout_rate=dropout_rate) + self.linear = nn.Linear(512*block.expansion, num_classes) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + + def _make_layer(self, block, planes, num_blocks, stride, dropout_rate): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride, dropout_rate)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + x = x.float() + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = self.avgpool(out) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +class ResNet18(ResNet): + def __init__(self, input_channel, num_classes): + super(ResNet18, self).__init__( + block=BasicBlock, num_blocks=[2,2,2,2], input_channel=input_channel, num_classes=num_classes + ) + +class ResNet34(ResNet): + def __init__(self, input_channel, num_classes): + super(ResNet34, self).__init__( + block=BasicBlock, num_blocks=[3,4,6,3], input_channel=input_channel, num_classes=num_classes + ) + + +class ResNet18DO(ResNet): + def __init__(self, input_channel, num_classes, dropout_rate=0.25): + super(ResNet18DO, self).__init__( + block=BasicBlock, num_blocks=[2,2,2,2], input_channel=input_channel, num_classes=num_classes, + dropout_rate=dropout_rate + ) + +class ResNet34DO(ResNet): + def __init__(self, input_channel, num_classes, dropout_rate=0.25): + super(ResNet34DO, self).__init__( + block=BasicBlock, num_blocks=[3,4,6,3], input_channel=input_channel, num_classes=num_classes, + dropout_rate=dropout_rate + ) diff --git a/deep-learning/models/resnet50.py b/deep-learning/models/resnet50.py new file mode 100644 index 00000000..c1943003 --- /dev/null +++ b/deep-learning/models/resnet50.py @@ -0,0 +1,142 @@ +""" +ResNet: resnet50, pre-trained on ImageNet +""" + +import math +import torch.nn as nn +import torch.utils.model_zoo as model_zoo + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, outplanes, stride=1, downsample=None, dropout_rate=0): + super().__init__() + + self.dropout_rate = dropout_rate + + self.conv1 = nn.Conv2d(inplanes, outplanes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(outplanes) + + self.conv2 = nn.Conv2d(outplanes, outplanes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(outplanes) + + self.conv3 = nn.Conv2d(outplanes, outplanes*self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(outplanes*self.expansion) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + if self.dropout_rate > 0: + out = nn.functional.dropout2d(out, p=self.dropout_rate) + out = self.conv3(out) + out = self.bn3(out) + if(self.downsample is not None): + residual = self.downsample(x) + out += residual + out = self.relu(out) + + return out + + + +class ResNet(nn.Module): + + def __init__(self, block, layer, num_classes, input_channel, dropout_rate=0): + super().__init__() + self.inplanes = 64 + self.conv1 = nn.Conv2d(input_channel, 64, kernel_size=7, 
stride=2, padding=3, bias = False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layer1 = self.make_layer(block, 64, layer[0], dropout_rate=dropout_rate) + self.layer2 = self.make_layer(block, 128, layer[1], stride=2, dropout_rate=dropout_rate) + self.layer3 = self.make_layer(block, 256, layer[2], stride=2, dropout_rate=dropout_rate) + self.layer4 = self.make_layer(block, 512, layer[3], stride=2, dropout_rate=dropout_rate) + + self.avgpool = nn.AvgPool2d(7, stride = 2) + + self.dropout = nn.Dropout2d(p = 0.5, inplace = True) + + self.fc = nn.Linear(512*block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2./n)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def make_layer(self, block, planes, blocks, stride=1, dropout_rate=0): + downsample = None + + if(stride !=1 or self.inplanes != planes * block.expansion): + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes*block.expansion)) + + layers = [] + + layers.append(block(self.inplanes, planes, stride, downsample, dropout_rate=dropout_rate)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + + return nn.Sequential(*layers) + + def fc_params(self): + params = [] + for name, param in self.named_parameters(): + if 'fc' in name: + params.append(param) + return params + + def backbone_params(self): + params = [] + for name, param in self.named_parameters(): + if 'fc' not in name: + params.append(param) + return params + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.avgpool(x) + x = self.dropout(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + return x + + +def resnet50(num_classes, input_channel, pretrained=True, dropout_rate=0): + model = ResNet(Bottleneck, [3, 4, 6, 3], num_classes, input_channel, dropout_rate=dropout_rate) + + if pretrained == True: + state_dict = model_zoo.load_url( + 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet50_a1h2_176-001a1197.pth', + map_location='cpu' + ) + state_dict.pop('fc.weight', None) + state_dict.pop('fc.bias', None) + model.load_state_dict(state_dict, strict=False) + print("Loaded Imagenet pretrained model") + + return model diff --git a/deep-learning/utils.py b/deep-learning/utils.py new file mode 100644 index 00000000..a007817e --- /dev/null +++ b/deep-learning/utils.py @@ -0,0 +1,92 @@ +import tabulate +import numpy as np + +import torch +from torch.nn import functional as F +from torch.autograd import Variable + + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# DEVICE = 'cpu' + + +# Multiplicative LR schedule for SGD +def get_lr_factor(epoch): + """ + Compute multiplicative factor to decay learning rate. 
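    The factor is 1.0 for the first 20% of epochs, decays linearly
    to lr_ratio = 0.01 between 20% and 40%, and stays at 0.01 afterwards.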
+ To use for MNIST within 100 epochs + """ + t = epoch / 100 + lr_ratio = 0.01 + if t <= 0.2: + factor = 1.0 + elif t <= 0.4: + factor = 1.0 - (1.0 - lr_ratio) * (t - 0.2) / 0.2 + else: + factor = lr_ratio + return factor + + +# Print log table +def output_table(epoch, n_epoch, time_ep=None, train_acc=None, test_acc=None): + columns = ['ep', 'time_ep', 'tr_acc', 'te_acc'] + + values = [f"{epoch}/{n_epoch}", time_ep, train_acc, test_acc] + table = tabulate.tabulate( + [values], columns, tablefmt='simple', + stralign='center', numalign='center', floatfmt=f'8.2f' + ) + if epoch % 40 == 0: + table = table.split('\n') + table = '\n'.join([table[1]] + table) + else: + table = table.split('\n')[2] + print(table) + + +# Evaluate a single model +def evaluate(test_loader, model): + model.eval() + correct = 0 + total = 0 + with torch.no_grad(): + for images, labels, _ in test_loader: + images = Variable(images).to(DEVICE) + labels = Variable(labels).to(DEVICE) + logits = model(images) + outputs = F.softmax(logits, dim=1) + _, pred = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (pred == labels).sum() + acc = 100 * float(correct) / float(total) + return acc + + +def accuracy(logit, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + output = F.softmax(logit, dim=1) + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +# Ratio of identified corrupted vs clean for debug +def get_ratio_corrupted(mask, noise_mask): + clean_pred =(mask == True) + clean_true = noise_mask + clean_found = np.logical_and(clean_pred, clean_true) + clean_found_ratio = np.count_nonzero(clean_found) / np.count_nonzero(clean_true) + corrupted_pred = (mask == False) + corrupted_true = (noise_mask == False) + corrupted_found = np.logical_and(corrupted_pred, corrupted_true) + corrupted_found_ratio = np.count_nonzero(corrupted_found) / np.count_nonzero(corrupted_true) + return clean_found_ratio, corrupted_found_ratio \ No newline at end of file diff --git a/img/deep-learning-synthetic.png b/img/deep-learning-synthetic.png new file mode 100644 index 00000000..f7ba5926 Binary files /dev/null and b/img/deep-learning-synthetic.png differ diff --git a/img/online-learning.png b/img/online-learning.png new file mode 100644 index 00000000..6ed42055 Binary files /dev/null and b/img/online-learning.png differ diff --git a/online-learning/humanactivity.mat b/online-learning/humanactivity.mat new file mode 100644 index 00000000..b5d808a5 Binary files /dev/null and b/online-learning/humanactivity.mat differ diff --git a/online-learning/main.py b/online-learning/main.py new file mode 100644 index 00000000..858c721f --- /dev/null +++ b/online-learning/main.py @@ -0,0 +1,348 @@ +import os +import numpy as np +from matplotlib import pyplot as plt +from matplotlib import font_manager as fm + +from scipy.optimize import minimize_scalar +from scipy.io import loadmat + +from sklearn.linear_model import SGDClassifier +from sklearn.utils import shuffle +from sklearn import preprocessing + + +random_gen = np.random.default_rng(seed=1) + +partial_fit_classifiers = { + 'SGD': SGDClassifier(loss='log_loss', random_state=1), + 'RRM': SGDClassifier(loss='log_loss', random_state=1), + 'RLVI': SGDClassifier(loss='log_loss', 
random_state=1), +} + +CLF_COLORS = { + 'SGD': '#b85460', + 'RRM': '#e69e00', + 'RLVI': '#4477aa', +} + +CLF_ALPHA = { + 'SGD': 1., + 'RRM': 1., + 'RLVI': 1., +} + +# Use an open source Roboto font from Google (download first) +if os.path.isfile("../Roboto.ttf"): + custom_font = fm.FontProperties(fname="../Roboto.ttf", size=22) +else: + custom_font = fm.FontProperties(size=22) + +# To save results +if not os.path.exists('plots'): + os.makedirs('plots') + + +def update_weights_rlvi(losses, tol=1e-3, maxiter=100): + '''Optimize Bernoulli probabilities''' + exp_loss = np.exp(-losses) + weights = 0.5 * np.ones_like(losses) + for _ in range(maxiter): + avg_weight = np.mean(weights) + ratio = avg_weight / (1 - avg_weight) + new_weights = ratio * exp_loss / (1 + ratio * exp_loss) + error = np.linalg.norm(new_weights - weights) + if error < tol: + break + weights = np.copy(new_weights) + new_weights /= np.max(new_weights) * len(new_weights) + return new_weights + + +def update_weights_rrm(residuals, eps): + '''Optimize sample weights for Robust Risk Minimization''' + res = np.copy(residuals) + t = -np.log((1 - eps) * res.shape[0]) + numeric_cutoff = 1e-16 + + def objective(xi): + phi = np.exp(-res * np.exp(-xi)) + phi[phi < numeric_cutoff] = numeric_cutoff + sum_phi = np.sum(phi) + return np.exp(xi) * (np.log(sum_phi) + t) + + opt_res = minimize_scalar(objective) + opt_xi = opt_res['x'] + opt_alpha = np.exp(opt_xi) + phi = np.exp(-res / opt_alpha) + phi[phi < numeric_cutoff] = numeric_cutoff + sum_phi = np.sum(phi) + opt_beta_over_alpha = np.log(sum_phi) - 1 + opt_weights = np.exp(-res / opt_alpha) * np.exp(-opt_beta_over_alpha - 1) + return opt_weights + + +def cross_entropy(log_proba, targets): + return -targets * log_proba - (1 - targets) * log_proba + + +def smooth_triangle(data, degree=1): + '''Moving average filter (triangle scheme)''' + triangle = np.concatenate((np.arange(degree + 1), np.arange(degree)[::-1])) + smoothed = [] + for i in range(degree, len(data) - degree * 2): + point = data[i : i + len(triangle)] * triangle + smoothed.append(np.sum(point) / np.sum(triangle)) + # Boundaries + smoothed = [smoothed[0]] * int(degree + degree / 2) + smoothed + while len(smoothed) < len(data): + smoothed.append(smoothed[-1]) + return smoothed + + +def plot_score_evolution(clf_stats, smooth_deg=1): + def plot_score(x, y, name): + '''Plot accuracy vs number of samples''' + x = np.array(x) + y = np.array(y) * 100 + y = smooth_triangle(y, smooth_deg) + plt.plot(x, y, lw=3, c=CLF_COLORS[name], alpha=CLF_ALPHA[name]) + + clf_names = list(clf_stats.keys()) + plt.figure() + for name, stats in clf_stats.items(): + # Plot accuracy evolution with #examples + score, n_examples = zip(*stats['acc_history']) + # Scale by thousands + n_examples = np.array(n_examples) / 1000. + plot_score(n_examples, score, name) + plt.gca() + + plt.xlabel(r'observed samples $\times 10^3$', font_properties=custom_font) + plt.ylabel(r'Accuracy, %', font_properties=custom_font) + plt.xlim(1, 10.8) + plt.xticks(fontsize=20) + plt.yticks(np.arange(80, 101, 5), fontsize=20) + plt.ylim(85, 100) + plt.grid(True) + plt.legend(clf_names, loc='lower left', fontsize=19) + plt.tight_layout() + plt.savefig("plots/hard_score.png", dpi=300) + plt.close() + + +def plot_tp_evolution(clf_stats, smooth_deg=1): + def plot_recall(x, y, name): + '''Plot recall for pos. 
class vs number of samples''' + x = np.array(x) + y = np.array(y) + y = smooth_triangle(y, smooth_deg) + plt.plot(x, y, lw=3, c=CLF_COLORS[name], alpha=CLF_ALPHA[name]) + + clf_names = list(clf_stats.keys()) + plt.figure() + for name, stats in clf_stats.items(): + # Plot recall evolution with #examples + recall, n_examples = zip(*stats['tp_history']) + # Scale by thousands + n_examples = np.array(n_examples) / 1000. + plot_recall(n_examples, recall, name) + plt.gca() + + plt.xlabel(r'observed samples $\times 10^3$', font_properties=custom_font) + plt.ylabel('Recall', font_properties=custom_font) + plt.xticks(fontsize=20) + plt.yticks(np.arange(0.7, 1.01, 0.1), fontsize=20) + plt.ylim(0.75, 1) + plt.xlim(1, 10.8) + plt.grid(True) + plt.legend(clf_names, loc='lower left', fontsize=19) + plt.tight_layout() + plt.savefig("plots/hard_tp.png", dpi=300) + plt.close() + + +def plot_tn_evolution(clf_stats, smooth_deg=1): + def plot_recall(x, y, name): + '''Plot recall for neg. class vs number of samples''' + x = np.array(x) + y = np.array(y) + y = smooth_triangle(y, smooth_deg) + plt.plot(x, y, lw=3, c=CLF_COLORS[name], alpha=CLF_ALPHA[name]) + + clf_names = list(clf_stats.keys()) + plt.figure() + for name, stats in clf_stats.items(): + # Plot recall evolution with #examples + recall, n_examples = zip(*stats['tn_history']) + # Scale by thousands + n_examples = np.array(n_examples) / 1000. + plot_recall(n_examples, recall, name) + plt.gca() + + plt.xlabel(r'observed samples $\times 10^3$', font_properties=custom_font) + plt.ylabel('TN rate', font_properties=custom_font) + plt.xlim(1, 10.8) + plt.xticks(fontsize=20) + plt.yticks(np.arange(0.8, 1.01, 0.05), fontsize=20) + plt.ylim(0.8, 1) + plt.grid(True) + plt.legend(clf_names, loc='lower left', fontsize=19) + plt.tight_layout() + plt.savefig("plots/hard_tn.png", dpi=300) + plt.close() + + +def plot_f1_evolution(clf_stats, smooth_deg=1): + def plot_f1(x, y, name): + '''Plot f1-score vs number of samples''' + x = np.array(x) + y = np.array(y) + y = smooth_triangle(y, smooth_deg) + plt.plot(x, y, lw=3, c=CLF_COLORS[name], alpha=CLF_ALPHA[name]) + + clf_names = list(clf_stats.keys()) + plt.figure() + for name, stats in clf_stats.items(): + # Plot recall evolution with #examples + recall, n_examples = zip(*stats['f1_history']) + # Scale by thousands + n_examples = np.array(n_examples) / 1000. 
+ plot_f1(n_examples, recall, name) + plt.gca() + + plt.xlabel(r'observed samples $\times 10^3$', font_properties=custom_font) + plt.ylabel(r'$F_1{-}$score', font_properties=custom_font) + plt.xlim(1, 10.8) + plt.xticks(fontsize=20) + plt.yticks(np.arange(0.8, 1.01, 0.05), fontsize=20) + plt.ylim(0.8, 1) + plt.grid(True) + plt.legend(clf_names, loc='lower left', fontsize=19) + plt.tight_layout() + plt.savefig("plots/hard_f1.png", dpi=300) + plt.close() + + +def read_hard_dataset(file): + data = loadmat(file) + X = data['feat'] + y = data['actid'].flatten() + y = np.array(y > 2, dtype=int) + return X, y + + +def pert(a, b, c, lmbda=4): + r = c - a + alpha = 1 + lmbda * (b - a) / r + beta = 1 + lmbda * (c - b) / r + return a + float(random_gen.beta(alpha, beta)) * r + + +def iter_minibatches(X, y, batch_size=100, test_size=0.2, noise_rate=0, dynamic=False): + size = len(X) + for idx in range(0, size, batch_size): + left = idx + right = min(idx + batch_size, size) + X_batch = X[left:right] + y_batch = y[left:right] + # Split the batch into training and test part + X_test = X_batch[:int(test_size * len(y_batch))] + y_test = y_batch[:int(test_size * len(y_batch))] + X_train = X_batch[int(test_size * len(y_batch)):] + y_train = y_batch[int(test_size * len(y_batch)):] + # Normalize features based on training data + scaler = preprocessing.StandardScaler().fit(X_train) + X_train = scaler.transform(X_train) + X_test = scaler.transform(X_test) + # Corrupt training data + if noise_rate > 0: + y_positive = np.where(y_train == 1)[0] + if dynamic: + n_corrupted = int(pert(a=0, b=min(0.1, noise_rate), c=noise_rate) * len(y_positive)) + else: + n_corrupted = int(noise_rate * len(y_positive)) + corrupted_labels = random_gen.choice(y_positive, size=n_corrupted, replace=False) + y_train[corrupted_labels] = 0 + yield (X_train, y_train), (X_test, y_test) + + +def check_fitted(clf): + return hasattr(clf, "classes_") + + +data_X, data_y = read_hard_dataset('./humanactivity.mat') +data_X, data_y = shuffle(data_X, data_y, random_state=1) + + +def main(): + clf_stats = {} + for clf_name in partial_fit_classifiers: + stats = {'n_train': 0, 'acc': 0.0, 'acc_history': [(0, 0)], + 'tp': 0, 'tn': 0, 'tp_history': [(0, 0)], 'tn_history': [(0, 0)], + 'f1': 0, 'f1_history': [(0, 0)], + } + clf_stats[clf_name] = stats + + minibatch_iterators = iter_minibatches(data_X, data_y, batch_size=200, test_size=0.5, noise_rate=0.3, dynamic=True) + # Main loop : iterate on mini-batchs of examples + for i, batch in enumerate(minibatch_iterators): + (X_batch, y_batch), (X_test, y_test) = batch + for clf_name, clf in partial_fit_classifiers.items(): + if clf_name == 'RLVI': + if (not check_fitted(clf)): + log_proba = np.log(0.5 * np.ones_like(y_batch)) + else: + log_proba = clf.predict_log_proba(X_batch)[:, 1] + residuals = cross_entropy(log_proba, y_batch) + sample_weight = update_weights_rlvi(residuals) + # update estimator with examples in the current mini-batch + clf.partial_fit(X_batch, y_batch, classes=[0, 1], sample_weight=sample_weight) + + elif clf_name == 'RRM': + if not check_fitted(clf): + log_proba = np.log(0.5 * np.ones_like(y_batch)) + else: + log_proba = clf.predict_log_proba(X_batch)[:, 1] + residuals = cross_entropy(log_proba, y_batch) + sample_weight = update_weights_rrm(residuals, eps=0.3) + # update estimator with examples in the current mini-batch + clf.partial_fit(X_batch, y_batch, classes=[0, 1], sample_weight=sample_weight) + + else: + # update estimator with examples in the current mini-batch + 
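                # (plain SGD baseline: every sample enters with equal weight)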
clf.partial_fit(X_batch, y_batch, classes=[0, 1]) + + # accumulate test accuracy stats + clf_stats[clf_name]['n_train'] += len(X_batch) + + y_pred = clf.predict(X_test) + + true_positive = np.count_nonzero(np.logical_and(y_pred == 1, y_test == 1)) + clf_stats[clf_name]['tp'] = true_positive / np.count_nonzero(y_test == 1) + tp_history = (clf_stats[clf_name]['tp'], clf_stats[clf_name]['n_train']) + clf_stats[clf_name]['tp_history'].append(tp_history) + + true_negative = np.count_nonzero(np.logical_and(y_pred == 0, y_test == 0)) + clf_stats[clf_name]['tn'] = true_negative / np.count_nonzero(y_test == 0) + tn_history = (clf_stats[clf_name]['tn'], clf_stats[clf_name]['n_train']) + clf_stats[clf_name]['tn_history'].append(tn_history) + + false_positive = np.count_nonzero(np.logical_and(y_pred == 1, y_test == 0)) + false_negative = np.count_nonzero(np.logical_and(y_pred == 0, y_test == 1)) + f1 = true_positive / (true_positive + (false_positive + false_negative) / 2.) + clf_stats[clf_name]['f1'] = f1 + f1_history = (clf_stats[clf_name]['f1'], clf_stats[clf_name]['n_train']) + clf_stats[clf_name]['f1_history'].append(f1_history) + + clf_stats[clf_name]['acc'] = clf.score(X_test, y_test) + acc_history = (clf_stats[clf_name]['acc'], clf_stats[clf_name]['n_train']) + clf_stats[clf_name]['acc_history'].append(acc_history) + + smooth_deg = 10 + plot_score_evolution(clf_stats, smooth_deg=smooth_deg) + plot_tp_evolution(clf_stats, smooth_deg=smooth_deg) + plot_tn_evolution(clf_stats, smooth_deg=smooth_deg) + plot_f1_evolution(clf_stats, smooth_deg=smooth_deg) + +if __name__ == '__main__': + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..9ee2bdd8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +numpy==1.24.0 +scipy==1.12.0 +scikit-learn==1.2.0 +torch==2.1.2 +torchvision==0.16.2 +matplotlib==3.7.0 +tabulate==0.9.0 diff --git a/standard-learning/huber.py b/standard-learning/huber.py new file mode 100644 index 00000000..1efb3525 --- /dev/null +++ b/standard-learning/huber.py @@ -0,0 +1,40 @@ +import numpy as np +from scipy.stats import chi2 +from scipy.linalg import lstsq + +def psi_huber(vec, c): + psi = np.zeros(len(vec)) + mask = np.abs(vec) <= c + psi[mask] = vec[mask] + psi[~mask] = c * np.sign(vec[~mask]) + return psi + + +def linear_regression(X, y, c=0.7317, sigma0=1): + n = X.shape[0] + alpha = c**2 * (1 - chi2.cdf(c**2, df=1)) + chi2.cdf(c**2, df=3) + + theta0 = np.linalg.solve(X.T @ X, X.T @ y) + + while True: + # update residual + res = y - X @ theta0 + # update pesudo residual + res_psi = sigma0 * psi_huber(res / sigma0, c) + # update scale + sigma = 1 / np.sqrt(n*2 * alpha) * np.linalg.norm(res_psi) + # (re)update the pseudo residual + res_psi = sigma * psi_huber(res / sigma, c) + # regress X on r_psi + # delta = np.linalg.solve(X.T @ X, X.T @ res_psi) + delta = lstsq(X, res_psi)[0] + # update theta + theta = theta0 + delta + # stopping criteria + if (np.linalg.norm(theta - theta0) / np.linalg.norm(theta0)) > 1e-6: + sigma0 = sigma + theta0 = theta + else: + break + + return theta diff --git a/standard-learning/main.py b/standard-learning/main.py new file mode 100644 index 00000000..1f8c7dcf --- /dev/null +++ b/standard-learning/main.py @@ -0,0 +1,585 @@ +import os +import numpy as np +from scipy.linalg import lstsq +from matplotlib import pyplot as plt +from matplotlib import font_manager as fm + +import huber +import sever +import rrm +import rlvi + +import utils + + +# For reproducibility +SEED = 0 +random_gen = 
np.random.default_rng(seed=SEED) + + +# Properties for boxplots +box_plot_props = dict( + widths=0.6, + whis=[0, 100], + meanprops={'marker': 'o', 'markersize': 8, 'markerfacecolor': 'red', 'markeredgewidth': 0}, + medianprops={'linestyle': '-', 'linewidth': 1, 'color': 'r'}, + whiskerprops={'linestyle': (0, (10, 10))}, + boxprops={'c': 'blue', 'lw': 0.8}, + showfliers=False, autorange=True, showmeans=True, +) +textbox_props = dict(boxstyle='round', facecolor='white') + +# Use an open source Roboto font from Google (download first) +if os.path.isfile("../Roboto.ttf"): + custom_font = fm.FontProperties(fname="../Roboto.ttf", size=22) +else: + custom_font = fm.FontProperties(size=22) + + +# To save results +if not os.path.exists('plots'): + os.makedirs('plots') + + +def generate_data_mean(size=100, eps=0.2): + n2 = random_gen.binomial(n=size, p=eps) # number of corrupted samples + n1 = size - n2 + + mean = np.array([200, 200]) + mean = mean.reshape((-1, 1)) + sigma = np.array([ + [400, 50], + [50, 400] + ]) + sigma_root = np.linalg.cholesky(sigma) + sample1 = mean + sigma_root @ random_gen.normal(size=(2, n1)) + + nu = 2.5 + C = 2 * (nu / (nu - 2)) * sigma + C_root = np.linalg.cholesky(C) + u = random_gen.chisquare(df=nu, size=n2) / nu + u = u[None, :] + y = C_root @ random_gen.normal(size=(2, n2)) + sample2 = mean + y / np.sqrt(u) + + sample = np.hstack([sample1, sample2]) + return sample.T + + +def generate_data_linear_regression(size=40, eps=0.2, nu=1.5): + X = -5 + 10 * random_gen.random(size=(size, 10)) + n2 = random_gen.binomial(n=size, p=eps) # number of corrupted samples + n1 = size - n2 + + theta = np.ones(X.shape[-1]) + sigma = 0.25 + y1 = X[:n1] @ theta + sigma * random_gen.normal(size=n1) + + C = 1 + u = random_gen.chisquare(df=nu, size=n2) / nu + u = u[None, :] + v = np.sqrt(C) * random_gen.normal(size=n2) + y2 = X[n1:] @ theta + v / np.sqrt(u) + + y = np.vstack([y1.reshape((-1, 1)), y2.reshape((-1, 1))]).flatten() + return X, y + + +def generate_data_logistic_regression(size=100, eps=0.05): + n2 = random_gen.binomial(n=size, p=eps) # number of corrupted samples + n1 = size - n2 + theta = np.array([-1, 1, 1]) + + mean1 = np.array([0.5, 0.5]) + mean1 = mean1.reshape((-1, 1)) + rho = 0.99 + sigma1 = np.array([ + [0.25, -0.25 * rho], + [-0.25 * rho, 0.25] + ]) + sigma_root1 = np.linalg.cholesky(sigma1) + X1 = mean1 + sigma_root1 @ random_gen.normal(size=(2, n1)) + X1 = X1.T + X1_aug = np.hstack([ + np.ones((X1.shape[0], 1)), + X1, + ]) + inverse_sigmoid = 1 + np.exp(-X1_aug @ theta) + y1 = np.ones(X1.shape[0]) + y1[inverse_sigmoid > 2] = 0 + + mean2 = np.array([0.5, 1.25]) + mean2 = mean2.reshape((-1, 1)) + sigma_root2 = np.array([ + [0.1, 0], + [0, 0.1] + ]) + X2 = mean2 + sigma_root2 @ random_gen.normal(size=(2, n2)) + X2 = X2.T + y2 = np.zeros(X2.shape[0]) + + X = np.vstack([X1, X2]) + y = np.vstack([y1.reshape((-1, 1)), y2.reshape((-1, 1))]).flatten() + return X, y + + +def generate_data_pca(size=200, eps=0.2): + n2 = random_gen.binomial(n=size, p=eps) # number of corrupted samples + n1 = size - n2 + + z1 = random_gen.normal(size=n1) + sigma = 0.25 + z2 = 2 * z1 + sigma * random_gen.normal(size=n1) + z_true = np.column_stack([z1, z2]) + + nu = 1.5 + u = random_gen.chisquare(df=nu, size=n2) / nu + u = u[None, :] + v = random_gen.normal(size=(2, n2)) + z_false = v / np.sqrt(u) + z_false = z_false.T + + sample = np.vstack([z_true, z_false]) + return sample + + +def generate_data_cov(size=50, eps=0.2): + n2 = random_gen.binomial(n=size, p=eps) # number of corrupted samples + n1 = 
size - n2 + + true_mean = np.zeros((2, 1)) + true_sigma = np.array([ + [1, 0.8], + [0.8, 1] + ]) + true_sigma_root = np.linalg.cholesky(true_sigma) + + sample1 = true_mean + true_sigma_root @ random_gen.normal(size=(2, n1)) + + nu = 1.5 + + u = random_gen.chisquare(df=nu, size=n2) / nu + u = u[None, :] + y = true_sigma_root @ random_gen.normal(size=(2, n2)) + sample2 = true_mean + y / np.sqrt(u) + + sample = np.hstack([sample1, sample2]) + return sample.T + + +def test_mean(): + print("Running test for mean estimation") + + true_mean = np.array([200, 200]) + MC_runs = 100 + eps_values = np.linspace(0, 0.4, 20) + ml_errors = [0 for _ in range(len(eps_values))] + rrm_errors = [0 for _ in range(len(eps_values))] + rlvi_errors = [0 for _ in range(len(eps_values))] + + for _ in range(MC_runs): + for i, eps in enumerate(eps_values): + samples = generate_data_mean(size=100, eps=eps) + mean_ml = np.mean(samples, axis=0) + mean_rrm = rrm.mean(samples, eps=0.4) + mean_rlvi = rlvi.mean(samples) + ml_errors[i] += np.linalg.norm(true_mean - mean_ml) / np.linalg.norm(true_mean) / MC_runs + rrm_errors[i] += np.linalg.norm(true_mean - mean_rrm) / np.linalg.norm(true_mean) / MC_runs + rlvi_errors[i] += np.linalg.norm(true_mean - mean_rlvi) / np.linalg.norm(true_mean) / MC_runs + + plt.rcParams["mathtext.fontset"] = "cm" + plt.plot(eps_values, ml_errors, lw=3, c='brown', label='ML') + plt.plot(eps_values, rrm_errors, lw=3, c='green', label='RRM') + plt.plot(eps_values, rlvi_errors, lw=3, c='blue', label='RLVI') + plt.xlabel(r"corruption level $\varepsilon$", font_properties=custom_font) + # plt.ylabel("$Average\ relative\ error$", fontsize=22) + plt.ylabel("Mean\navg. relative error", font_properties=custom_font) + plt.xticks(np.arange(0, 0.5, 0.1), fontsize=20) + plt.yticks(fontsize=20) + plt.xlim(0, 0.4) + # plt.ylim(0, 0.4) + plt.legend(fontsize=18) + plt.grid() + plt.tight_layout() + plt.savefig("plots/mean.png", dpi=300) + plt.close() + print("\tDone\n") + + +def test_mean_fixed_eps(): + print("Running test for mean estimation (fixed eps.)") + + true_mean = np.array([200, 200]) + MC_runs = 100 + + ml_errors = [] + rrm_errors = [] + rlvi_errors = [] + + for _ in range(MC_runs): + samples = generate_data_mean(size=100, eps=0.2) + mean_ml = np.mean(samples, axis=0) + mean_rrm = rrm.mean(samples, eps=0.4) + mean_rlvi = rlvi.mean(samples) + ml_errors.append(np.linalg.norm(true_mean - mean_ml) / np.linalg.norm(true_mean)) + rrm_errors.append(np.linalg.norm(true_mean - mean_rrm) / np.linalg.norm(true_mean)) + rlvi_errors.append(np.linalg.norm(true_mean - mean_rlvi) / np.linalg.norm(true_mean)) + + errors_to_plot = { + "ML": ml_errors, + "RRM": rrm_errors, + "RLVI": rlvi_errors + } + + plt.rcParams["mathtext.fontset"] = "cm" + for i, errors in enumerate(errors_to_plot.values()): + plt.boxplot(errors, positions=[i + 1], **box_plot_props) + plt.xticks(range(1, len(errors_to_plot) + 1), errors_to_plot.keys(), font_properties=custom_font) + plt.ylabel("Mean\nrelative error", font_properties=custom_font) + plt.ylim(0, 0.12) + plt.yticks(np.arange(0, 0.20, 0.05), fontsize=20) + plt.grid() + plt.tight_layout() + plt.savefig("plots/mean_boxplots.png", dpi=300) + plt.close() + print("\tDone\n") + + +def test_linear_regression(): + print("Running test for linear regression") + + true_theta = np.ones(10) + eps_values = np.linspace(0, 0.4, 20) + + ml_loss = [] + huber_loss = [] + sever_loss = [] + rrm_loss = [] + rlvi_loss = [] + + MC_runs = 100 + for eps in eps_values: + ml_loss_avg = 0 + huber_loss_avg = 0 + 
sever_loss_avg = 0 + rrm_loss_avg = 0 + rlvi_loss_avg = 0 + for _ in range(MC_runs): + X, y = generate_data_linear_regression(eps=eps, nu=2.5) + theta_ml = lstsq(X, y)[0] + theta_huber = huber.linear_regression(X, y) + theta_sever = sever.linear_regression(X, y, eps=0.4) + theta_rrm = rrm.linear_regression(X, y, eps=0.4) + theta_rlvi = rlvi.linear_regression(X, y) + + ml_loss_avg += np.linalg.norm(true_theta - theta_ml) + huber_loss_avg += np.linalg.norm(true_theta - theta_huber) + sever_loss_avg += np.linalg.norm(true_theta - theta_sever) + rrm_loss_avg += np.linalg.norm(true_theta - theta_rrm) + rlvi_loss_avg += np.linalg.norm(true_theta - theta_rlvi) + + ml_loss.append(ml_loss_avg / np.linalg.norm(true_theta) / MC_runs) + huber_loss.append(huber_loss_avg / np.linalg.norm(true_theta) / MC_runs) + sever_loss.append(sever_loss_avg / np.linalg.norm(true_theta) / MC_runs) + rrm_loss.append(rrm_loss_avg / np.linalg.norm(true_theta) / MC_runs) + rlvi_loss.append(rlvi_loss_avg / np.linalg.norm(true_theta) / MC_runs) + + plt.rcParams["mathtext.fontset"] = "cm" + plt.plot(eps_values, ml_loss, lw=3, c='brown', label='ML') + plt.plot(eps_values, huber_loss, lw=3, c='chocolate', label='Huber') + plt.plot(eps_values, sever_loss, lw=3, c='orange', label='SEVER') + plt.plot(eps_values, rrm_loss, lw=3, c='teal', label='RRM') + plt.plot(eps_values, rlvi_loss, lw=3, c='#4477aa', label='RLVI') + + plt.xlabel(r"corruption level, $\varepsilon$", font_properties=custom_font)#, labelpad=10) + plt.ylabel("LinReg\navg. relative error", font_properties=custom_font) + plt.xticks(np.arange(0, 0.5, 0.1), fontsize=20) + plt.xlim(0, 0.4) + plt.yticks(np.arange(0.010, 0.08, 0.01), fontsize=20) + plt.ylim(0.015, 0.08) + plt.legend(fontsize=18, framealpha=1) + plt.grid() + plt.tight_layout() + plt.savefig("plots/linear_regression.png", dpi=300) + plt.close() + print("\tDone\n") + + +def test_linear_regression_fixed_eps(): + print("Running test for linear regression (fixed eps.)") + + true_theta = np.ones(10) + + ml_errors = [] + huber_errors = [] + sever_errors = [] + rrm_errors = [] + rlvi_errors = [] + + MC_runs = 100 + for _ in range(MC_runs): + X, y = generate_data_linear_regression(eps=0.2, nu=2.5) + theta_ml = np.linalg.solve(X.T @ X, X.T @ y) + theta_huber = huber.linear_regression(X, y) + theta_sever = sever.linear_regression(X, y, eps=0.4) + theta_rrm = rrm.linear_regression(X, y, eps=0.4) + theta_rlvi = rlvi.linear_regression(X, y) + + ml_errors.append(np.linalg.norm(true_theta - theta_ml) / np.linalg.norm(true_theta)) + huber_errors.append(np.linalg.norm(true_theta - theta_huber) / np.linalg.norm(true_theta)) + sever_errors.append(np.linalg.norm(true_theta - theta_sever) / np.linalg.norm(true_theta)) + rrm_errors.append(np.linalg.norm(true_theta - theta_rrm) / np.linalg.norm(true_theta)) + rlvi_errors.append(np.linalg.norm(true_theta - theta_rlvi) / np.linalg.norm(true_theta)) + + errors_to_plot = { + "ML": ml_errors, + "Huber": huber_errors, + "SEVER": sever_errors, + "RRM": rrm_errors, + "RLVI": rlvi_errors + } + fig, ax = plt.subplots() + plt.rcParams["mathtext.fontset"] = "cm" + for i, errors in enumerate(errors_to_plot.values()): + plt.boxplot(errors, positions=[i + 1], **box_plot_props) + plt.xticks(range(1, len(errors_to_plot) + 1), errors_to_plot.keys(), fontsize=20) + plt.ylabel("LinReg\nrelative error", font_properties=custom_font) + plt.ylim(0, 0.15) + plt.yticks(np.arange(0, 0.16, 0.05), font_properties=custom_font) + + plt.text(0.95, 0.9, r"$\varepsilon=20\%$", + transform=ax.transAxes, 
fontsize=22, + ha='right', va='top', bbox=textbox_props) + plt.grid() + plt.tight_layout() + plt.savefig("plots/linear_regression_boxplots.png", dpi=300) + plt.close() + print("\tDone\n") + + +def test_logistic_regression(): + print("Running test for logistic regression") + + true_theta = np.array([-1, 1, 1]) + eps_values = np.linspace(0, 0.4, 20) + + ml_loss = [] + sever_loss = [] + rrm_loss = [] + rlvi_loss = [] + + MC_runs = 100 + for eps in eps_values: + ml_loss_avg = 0 + sever_loss_avg = 0 + rrm_loss_avg = 0 + rlvi_loss_avg = 0 + for _ in range(MC_runs): + X, y = generate_data_logistic_regression(eps=eps) + theta_ml, _ = utils.sklearn_log_reg(X, y) + # theta_ml, _ = utils.mm_log_reg(X, y) + theta_sever = sever.logistic_regression(X, y, eps=0.1) + theta_rrm = rrm.logistic_regression(X, y, eps=0.4) + theta_rlvi = rlvi.logistic_regression(X, y) + + ml_loss_avg += np.arccos( + theta_ml @ true_theta / (np.linalg.norm(theta_ml) * np.linalg.norm(true_theta)) + ) * 180 / np.pi + sever_loss_avg += np.arccos( + theta_sever @ true_theta / (np.linalg.norm(theta_sever) * np.linalg.norm(true_theta)) + ) * 180 / np.pi + rrm_loss_avg += np.arccos( + theta_rrm @ true_theta / (np.linalg.norm(theta_rrm) * np.linalg.norm(true_theta)) + ) * 180 / np.pi + rlvi_loss_avg += np.arccos( + theta_rlvi @ true_theta / (np.linalg.norm(theta_rlvi) * np.linalg.norm(true_theta)) + ) * 180 / np.pi + + ml_loss.append(ml_loss_avg / MC_runs) + sever_loss.append(sever_loss_avg / MC_runs) + rrm_loss.append(rrm_loss_avg / MC_runs) + rlvi_loss.append(rlvi_loss_avg / MC_runs) + + plt.rcParams["mathtext.fontset"] = "cm" + # plt.plot(eps_values, ml_loss, lw=3, c='brown', label='ML') + plt.plot(eps_values, sever_loss, lw=3, c='orange', label='SEVER') + plt.plot(eps_values, rrm_loss, lw=3, c='green', label='RRM') + plt.plot(eps_values, rlvi_loss, lw=3, c='blue', label='RLVI') + plt.xlabel(r"corruption level $\varepsilon$", font_properties=custom_font) + plt.ylabel("LogReg\navg. 
angle in degree", font_properties=custom_font) + + plt.xticks(np.arange(0, 0.5, 0.1), fontsize=20) + plt.xlim(0, 0.4) + plt.yticks(fontsize=20) + plt.legend(fontsize=18) + plt.grid() + plt.tight_layout() + plt.savefig("plots/logistic_regression.png", dpi=300) + plt.close() + print("\tDone\n") + + +def test_logistic_regression_fixed_eps(): + print("Running test for logistic regression (fixed eps.)") + + true_theta = np.array([-1, 1, 1]) + ml_angles = [] + sever_angles = [] + rrm_angles = [] + rlvi_angles = [] + MC_runs = 100 + for _ in range(MC_runs): + X, y = generate_data_logistic_regression(size=200, eps=0.05) + theta_ml, _ = utils.sklearn_log_reg(X, y, weights=np.ones(len(y))) + # theta_ml, _ = utils.mm_log_reg(X, y, weights=np.ones(len(y))) + theta_sever = sever.logistic_regression(X, y, eps=0.3) + theta_rrm = rrm.logistic_regression(X, y, eps=0.3) + theta_rlvi = rlvi.logistic_regression(X, y) + ml_angles.append( + np.arccos(theta_ml @ true_theta / (np.linalg.norm(theta_ml) * np.linalg.norm(true_theta))) * 180 / np.pi + ) + sever_angles.append( + np.arccos(theta_sever @ true_theta / (np.linalg.norm(theta_sever) * np.linalg.norm(true_theta))) * 180 / np.pi + ) + rrm_angles.append( + np.arccos(theta_rrm @ true_theta / (np.linalg.norm(theta_rrm) * np.linalg.norm(true_theta))) * 180 / np.pi + ) + rlvi_angles.append( + np.arccos(theta_rlvi @ true_theta / (np.linalg.norm(theta_rlvi) * np.linalg.norm(true_theta))) * 180 / np.pi + ) + + errors_to_plot = { + "ML": ml_angles, + "SEVER": sever_angles, + "RRM": rrm_angles, + "RLVI": rlvi_angles + } + fig, ax = plt.subplots() + plt.rcParams["mathtext.fontset"] = "cm" + for i, errors in enumerate(errors_to_plot.values()): + plt.boxplot(errors, positions=[i + 1], **box_plot_props) + plt.xticks(range(1, len(errors_to_plot) + 1), errors_to_plot.keys(), font_properties=custom_font) + plt.ylabel("LogReg\nangle in degree", font_properties=custom_font) + plt.ylim(0, 30) + plt.yticks(np.arange(0, 35, 10), fontsize=20) + plt.text(0.95, 0.9, r"$\varepsilon=5\%$", + transform=ax.transAxes, fontsize=22, + ha='right', va='top', bbox=textbox_props) + plt.grid() + plt.tight_layout() + plt.savefig("plots/logistic_regression_boxplots.png", dpi=300) + plt.close() + print("\tDone\n") + + +def test_pca(): + print("Running test for PCA") + + true_theta = np.array([1 / np.sqrt(5), 2 / np.sqrt(5)]) + ml_errors = [] + rrm_errors = [] + sever_errors = [] + rlvi_errors = [] + MC_runs = 100 + + # Original benchmark in Osama et al. (2020) used the initial guess for RRM – so do we. 
+ # For a fair comparison we use it for all methods: SEVER, RRM, and RLVI + theta_init = np.array([0.1, 1]) + theta_init /= np.linalg.norm(theta_init) + for _ in range(MC_runs): + sample = generate_data_pca(size=40, eps=0.2) + theta_ml, _ = utils.pca(sample, weights=np.ones(len(sample))) + theta_rrm = rrm.pca(sample, eps=0.4, theta_init=theta_init) + theta_sever = sever.pca(sample, eps=0.4, theta_init=theta_init) + theta_rlvi = rlvi.pca(sample, theta_init=theta_init) + + ml_errors.append(1 - np.abs(theta_ml @ true_theta)) + rrm_errors.append(1 - np.abs(theta_rrm @ true_theta)) + sever_errors.append(1 - np.abs(theta_sever @ true_theta)) + rlvi_errors.append(1 - np.abs(theta_rlvi @ true_theta)) + + errors_to_plot = { + "ML": ml_errors, + "SEVER": sever_errors, + "RRM": rrm_errors, + "RLVI": rlvi_errors + } + + fig, ax = plt.subplots() + plt.rcParams["mathtext.fontset"] = "cm" + for i, errors in enumerate(errors_to_plot.values()): + plt.boxplot(errors, positions=[i + 1], **box_plot_props) + plt.xticks(range(1, len(errors_to_plot) + 1), errors_to_plot.keys(), font_properties=custom_font) + plt.ylabel("PCA\nmisalignment error", font_properties=custom_font) + plt.ylim(0, 0.061) + plt.yticks(np.arange(0, 0.061, 0.02), fontsize=20) + plt.text(0.95, 0.9, r"$\varepsilon=20\%$", + transform=ax.transAxes, fontsize=22, + ha='right', va='top', bbox=textbox_props) + plt.grid() + plt.tight_layout() + plt.savefig("plots/pca_boxplots.png", dpi=300) + plt.close() + print("\tDone\n") + + +def test_covariance(): + print("Running test for covariance estimation") + + true_cov = np.array([ + [1, 0.8], + [0.8, 1] + ]) + + ml_errors = [] + rrm_errors = [] + sever_errors = [] + rlvi_errors = [] + MC_runs = 100 + for _ in range(MC_runs): + samples = generate_data_cov(size=50, eps=0.2) + cov_ml = np.cov(samples, rowvar=False) + cov_rrm = rrm.covariance(samples, eps=0.4) + cov_sever = sever.covariance(samples, eps=0.4, numiter=3) + cov_rlvi = rlvi.covariance(samples, eps=0.4) # use constraint due to unbounded likelihood + ml_errors.append(np.linalg.norm(true_cov - cov_ml, ord='fro') / np.linalg.norm(true_cov, ord='fro')) + rrm_errors.append(np.linalg.norm(true_cov - cov_rrm, ord='fro') / np.linalg.norm(true_cov, ord='fro')) + sever_errors.append(np.linalg.norm(true_cov - cov_sever, ord='fro') / np.linalg.norm(true_cov, ord='fro')) + rlvi_errors.append(np.linalg.norm(true_cov - cov_rlvi, ord='fro') / np.linalg.norm(true_cov, ord='fro')) + + errors_to_plot = { + "ML": ml_errors, + "SEVER": sever_errors, + "RRM": rrm_errors, + "RLVI": rlvi_errors + } + + fig, ax = plt.subplots() + plt.rcParams["mathtext.fontset"] = "cm" + for i, errors in enumerate(errors_to_plot.values()): + plt.boxplot(errors, positions=[i + 1], **box_plot_props) + plt.xticks(range(1, len(errors_to_plot) + 1), errors_to_plot.keys(), font_properties=custom_font) + plt.ylabel("Covariance\nrelative error", font_properties=custom_font) + plt.ylim(0, 3) + plt.yticks(list(range(4)), fontsize=20) + plt.text(0.95, 0.9, r"$\varepsilon=20\%$", + transform=ax.transAxes, fontsize=22, + ha='right', va='top', bbox=textbox_props) + plt.grid() + plt.tight_layout() + plt.savefig("plots/cov_boxplots.png", dpi=300) + plt.close() + print("\tDone\n") + + +def main(): + test_mean() + test_linear_regression() + test_linear_regression_fixed_eps() + test_logistic_regression_fixed_eps() + test_pca() + test_covariance() + + +if __name__ == "__main__": + main() diff --git a/standard-learning/rlvi.py b/standard-learning/rlvi.py new file mode 100644 index 00000000..cf88ef10 --- 
/dev/null +++ b/standard-learning/rlvi.py @@ -0,0 +1,144 @@ +import numpy as np +from scipy import optimize as opt +from scipy.linalg import lstsq + +import utils + + +def update_weights(losses, tol=1e-3, maxiter=100): + '''Optimize Bernoulli probabilities''' + weights = 0.95 * np.ones_like(losses) + new_weights = np.copy(weights) + for _ in range(maxiter): + eps = 1 - np.mean(weights) + ratio = eps / (1 - eps) + new_weights = np.exp(-losses) / (ratio + np.exp(-losses)) + error = np.linalg.norm(new_weights - weights) + weights = np.copy(new_weights) + if error < tol: + break + return new_weights + + +def update_weights_constrained(losses, n_eff, tol=1e-3, maxiter=100): + """ + Constraint optimization of Bernoulli probabilities + for the unbounded likelihood case. + Uses the constraint: `sum(weights) >= n_eff`. + """ + + n = len(losses) + # First, solve the unconstrained optimization + new_weights = update_weights(losses, tol=tol, maxiter=maxiter) + # Then, correct based on the KKT condition + def shift_obj(s): + return np.square( + np.sum( + np.exp(-losses + s) / ((n - n_eff)/n_eff + np.exp(-losses + s)) + ) - n_eff + ) + if np.sum(new_weights) < n_eff: + shift = opt.minimize_scalar(shift_obj)['x'] + new_weights = np.exp(-losses + shift) / ((n - n_eff)/n_eff + np.exp(-losses + shift)) + return new_weights + + +def mean(sample, maxiter=100, tol=1e-3): + weights = np.ones(sample.shape[0]) + theta = weights @ sample / np.sum(weights) + residuals = np.linalg.norm(theta - sample, axis=1)**2 + sigma2 = weights @ residuals / np.sum(weights) + losses = 0.5 * residuals/sigma2 + + for _ in range(maxiter): + weights = update_weights(losses) + prev_theta = np.copy(theta) + theta = weights @ sample / np.sum(weights) + residuals = np.linalg.norm(theta - sample, axis=1)**2 + sigma2 = weights @ residuals / np.sum(weights) + losses = 0.5 * residuals/sigma2 + + discrepancy = np.linalg.norm(theta - prev_theta) / np.linalg.norm(prev_theta) + if discrepancy <= tol: + break + + return theta + + +def linear_regression(X, y, maxiter=100, tol=1e-3): + weights = np.ones(X.shape[0]) + w_sqrt = np.diag(np.sqrt(weights)) + theta = lstsq(w_sqrt @ X, w_sqrt @ y)[0] + residuals = (y - X @ theta)**2 + sigma2 = weights @ residuals / np.sum(weights) + losses = 0.5 * residuals/sigma2 + + for _ in range(maxiter): + weights = update_weights(losses) + prev_theta = np.copy(theta) + w_sqrt = np.diag(np.sqrt(weights)) + theta = lstsq(w_sqrt @ X, w_sqrt @ y)[0] + residuals = (y - X @ theta)**2 + sigma2 = weights @ residuals / np.sum(weights) + losses = 0.5 * residuals/sigma2 + + discrepancy = np.linalg.norm(theta - prev_theta) / np.linalg.norm(prev_theta) + if discrepancy <= tol: + break + + return theta + + +def logistic_regression(X, y, maxiter=100, tol=1e-2): + weights = np.ones(X.shape[0]) + theta, losses = utils.sklearn_log_reg(X, y, weights) + # theta, losses = utils.mm_log_reg(X, y, weights) + + for _ in range(maxiter): + weights = update_weights(losses) + + prev_theta = np.copy(theta) + theta, losses = utils.sklearn_log_reg(X, y, weights) + # theta, losses = utils.mm_log_reg(X, y, weights) + + discrepancy = np.linalg.norm(theta - prev_theta) / np.linalg.norm(prev_theta) + if discrepancy <= tol: + break + + return theta + + +def pca(sample, maxiter=100, tol=1e-2, theta_init=None): + weights = np.ones(sample.shape[0]) + theta, losses = utils.pca(sample, weights, theta_init) + + for _ in range(maxiter): + weights = update_weights(losses) + + prev_theta = np.copy(theta) + theta, losses = utils.pca(sample, weights) + + 
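+        # Relative change of the estimated principal direction; stop once it
+        # falls below `tol` (the same convergence criterion as in the solvers above).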
discrepancy = np.linalg.norm(theta - prev_theta) / np.linalg.norm(prev_theta) + if discrepancy <= tol: + break + + return theta + + +def covariance(sample, eps, maxiter=100, tol=1e-2): + n, d = sample.shape + n_eff = n * (1 - eps) + + weights = np.ones(n) + theta, losses = utils.covariance(sample, weights) + + for _ in range(maxiter): + weights = update_weights_constrained(losses, n_eff) + prev_theta = np.copy(theta) + theta, losses = utils.covariance(sample, weights) + + discrepancy = np.linalg.norm(theta - prev_theta, ord='fro') / np.linalg.norm(prev_theta, ord='fro') + if discrepancy <= tol: + break + + return theta diff --git a/standard-learning/rrm.py b/standard-learning/rrm.py new file mode 100644 index 00000000..a7684b1b --- /dev/null +++ b/standard-learning/rrm.py @@ -0,0 +1,124 @@ +""" +Based on paper Robust Risk Minimization for Statistical Learning +Osama et al., 2020 +""" +import numpy as np +from scipy import optimize as opt +from scipy.linalg import lstsq + +import utils + + +def update_weights(losses, eps): + '''Optimize sample weights for Robust Risk Minimization''' + res = np.copy(losses) + t = -np.log((1 - eps) * res.shape[0]) + numeric_cutoff = 1e-16 + def objective(xi): + phi = np.exp(-res * np.exp(-xi)) + phi[phi < numeric_cutoff] = numeric_cutoff + sum_phi = np.sum(phi) + return np.exp(xi) * (np.log(sum_phi) + t) + + opt_res = opt.minimize_scalar(objective) + opt_xi = opt_res['x'] + opt_alpha = np.exp(opt_xi) + + phi = np.exp(-res / opt_alpha) + phi[phi < numeric_cutoff] = numeric_cutoff + sum_phi = np.sum(phi) + opt_beta_over_alpha = np.log(sum_phi) - 1 + + opt_weights = np.exp(-res / opt_alpha) * np.exp(-opt_beta_over_alpha - 1) + return opt_weights + + +def mean(sample, eps, maxiter=100, tol=1e-3): + weights = np.ones(sample.shape[0]) / sample.shape[0] + theta = weights @ sample / np.sum(weights) + losses = np.linalg.norm(theta - sample, axis=1)**2 + + for _ in range(maxiter): + weights = update_weights(losses, eps) + + prev_theta = np.copy(theta) + theta = weights @ sample / np.sum(weights) + losses = np.linalg.norm(theta - sample, axis=1)**2 + + discrepancy = np.linalg.norm(theta - prev_theta) / np.linalg.norm(prev_theta) + if discrepancy <= tol: + break + + return theta + + +def linear_regression(X, y, eps, maxiter=100, tol=1e-3): + weights = np.ones(X.shape[0]) / X.shape[0] + w_sqrt = np.diag(np.sqrt(weights)) + theta = lstsq(w_sqrt @ X, w_sqrt @ y)[0] + losses = (y - X @ theta)**2 + + for _ in range(maxiter): + weights = update_weights(losses, eps) + + prev_theta = np.copy(theta) + w_sqrt = np.diag(np.sqrt(weights)) + theta = lstsq(w_sqrt @ X, w_sqrt @ y)[0] + losses = (y - X @ theta)**2 + + discrepancy = np.linalg.norm(theta - prev_theta) / np.linalg.norm(prev_theta) + if discrepancy <= tol: + break + + return theta + + +def logistic_regression(X, y, eps, maxiter=100, tol=1e-2): + weights = np.ones(X.shape[0]) / X.shape[0] + theta, losses = utils.sklearn_log_reg(X, y, weights) + # theta, losses = utils.mm_log_reg(X, y, weights) + for _ in range(maxiter): + weights = update_weights(losses, eps) + + prev_theta = np.copy(theta) + theta, losses = utils.sklearn_log_reg(X, y, weights) + # theta, losses = utils.mm_log_reg(X, y, weights) + + discrepancy = np.linalg.norm(theta - prev_theta) / np.linalg.norm(prev_theta) + if discrepancy <= tol: + break + + return theta + + +def pca(sample, eps, maxiter=100, tol=1e-2, theta_init=None): + weights = np.ones(sample.shape[0]) / sample.shape[0] + theta, losses = utils.pca(sample, weights, theta_init) + for _ in 
range(maxiter):
+        weights = update_weights(losses, eps)
+
+        prev_theta = np.copy(theta)
+        theta, losses = utils.pca(sample, weights)
+
+        discrepancy = np.linalg.norm(theta - prev_theta) / np.linalg.norm(prev_theta)
+        if discrepancy <= tol:
+            break
+
+    return theta
+
+
+def covariance(sample, eps, maxiter=100, tol=1e-2):
+    weights = np.ones(sample.shape[0]) / sample.shape[0]
+    theta, losses = utils.covariance(sample, weights)
+
+    for _ in range(maxiter):
+        weights = update_weights(losses, eps)
+
+        prev_theta = np.copy(theta)
+        theta, losses = utils.covariance(sample, weights)
+
+        discrepancy = np.linalg.norm(theta - prev_theta, ord='fro') / np.linalg.norm(prev_theta, ord='fro')
+        if discrepancy <= tol:
+            break
+
+    return theta
diff --git a/standard-learning/sever.py b/standard-learning/sever.py
new file mode 100644
index 00000000..df5b12c8
--- /dev/null
+++ b/standard-learning/sever.py
@@ -0,0 +1,156 @@
+"""
+Based on the paper "SEVER: A Robust Meta-Algorithm for Stochastic Optimization",
+Diakonikolas et al., 2019.
+"""
+import numpy as np
+from scipy.linalg import lstsq
+import utils
+
+
+def linear_regression(X, y, eps, numiter=4):
+    n = len(y)
+    s = range(n)  # active set
+    # X and y corresponding to active set
+    Xs = X[s, :]
+    ys = y[s]
+
+    for _ in range(numiter):
+        # base learner: least squares
+        theta = lstsq(Xs, ys)[0]
+
+        # gradients of the squared loss, centered
+        G_uncen = 2 * (Xs @ theta - ys)[:, np.newaxis] * Xs
+        G_cen = G_uncen - np.mean(G_uncen, axis=0)
+
+        # top right singular vector (first row of Vh returned by numpy's SVD)
+        Vh = np.linalg.svd(G_cen)[-1]
+        v = Vh[0]
+
+        # outlier scores
+        tau = (G_cen @ v)**2
+
+        # remove p points with highest scores
+        p = int(eps / 2 * Xs.shape[0])
+        idx = np.argsort(-tau)
+
+        # new active set
+        s = idx[p:]
+        Xs = Xs[s, :]
+        ys = ys[s]
+
+    return theta
+
+
+def logistic_regression(X, y, eps, numiter=4):
+    n = len(y)
+    s = range(n)  # active set
+    X = np.hstack([np.ones((X.shape[0], 1)), X])
+    # X and y corresponding to active set
+    Xs = X[s, :]
+    ys = y[s]
+
+    for _ in range(numiter):
+        # base learner: logistic regression
+        theta, _ = utils.sklearn_log_reg(Xs[:, 1:], ys, weights=np.ones(len(ys)))
+        # theta, _ = utils.mm_log_reg(Xs[:, 1:], ys, weights=np.ones(len(ys)))
+
+        # gradients of the logistic loss, (sigmoid(x' theta) - y) * x, centered
+        phi = Xs @ theta
+        proba = utils.sigmoid(phi)
+        G_uncen = (proba - ys)[:, np.newaxis] * Xs
+        G_cen = G_uncen - np.mean(G_uncen, axis=0)
+
+        # top right singular vector
+        Vh = np.linalg.svd(G_cen)[-1]
+        v = Vh[0]
+
+        # outlier scores
+        tau = (G_cen @ v)**2
+
+        # remove p points with highest scores
+        p = int(eps / 2 * Xs.shape[0])
+        idx = np.argsort(-tau)
+
+        # new active set
+        s = idx[p:]
+        Xs = Xs[s, :]
+        ys = ys[s]
+
+    return theta
+
+
+def pca(samples, eps, numiter=4, theta_init=None):
+    n = len(samples)
+    s = range(n)  # active set
+    # samples corresponding to active set
+    samples_s = samples[s, :]
+
+    for k in range(numiter):
+        # base learner: PCA (use the provided initial guess on the first round)
+        if (theta_init is not None) and (k == 0):
+            theta = theta_init
+        else:
+            theta, _ = utils.pca(samples_s, weights=np.ones(len(samples_s)))
+
+        # gradients of the reconstruction loss, centered
+        G_uncen = -2 * ((samples_s @ theta)[:, np.newaxis] * samples_s)
+        G_cen = G_uncen - np.mean(G_uncen, axis=0)
+
+        # top right singular vector
+        Vh = np.linalg.svd(G_cen)[-1]
+        v = Vh[0]
+
+        # outlier scores
+        tau = (G_cen @ v)**2
+
+        # remove p points with highest scores
+        p = int(eps / 2 * samples_s.shape[0])
+        idx = np.argsort(-tau)
+
+        # new active set
+        s = idx[p:]
+        samples_s = samples_s[s, :]
+
+    return theta
+
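+
+# Usage sketch (illustrative, not part of the original benchmark): each estimator
+# in this module takes an assumed corruption bound `eps` and discards roughly
+# eps/2 of the current active set in each of the `numiter` filtering rounds, e.g.
+#
+#     rng = np.random.default_rng(0)
+#     X = rng.normal(size=(200, 3))
+#     y = X @ np.array([1.0, -1.0, 0.5]) + 0.1 * rng.normal(size=200)
+#     y[:20] += 10.0                     # corrupt 10% of the responses
+#     theta_hat = linear_regression(X, y, eps=0.1)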
+
+
+def covariance(samples, eps, numiter=4):
+    n, d = samples.shape
+    s = range(n)  # active set
+    # samples corresponding to active set
+    samples_s = samples[s, :]
+
+    for _ in range(numiter):
+        # base learner: sample covariance
+        mean = np.mean(samples_s, axis=0)
+        cov = np.cov(samples_s, rowvar=False)
+        cov_inv = np.linalg.inv(cov)
+
+        # gradients of the Gaussian negative log-likelihood w.r.t. (mean, cov)
+        G_uncen = np.zeros((samples_s.shape[0], d + d**2))
+        for row in range(len(samples_s)):
+            G_uncen[row, :d] = cov_inv @ (mean - samples_s[row, :])
+            centered = (samples_s[row, :] - mean).reshape((-1, 1))
+            row_cov = centered @ centered.T
+            g = cov_inv @ (np.eye(d) - row_cov @ cov_inv) / 2.0
+            g = g.reshape(-1)
+            G_uncen[row, d:] = g
+
+        G_cen = G_uncen - np.mean(G_uncen, axis=0)
+
+        # top right singular vector
+        Vh = np.linalg.svd(G_cen)[-1]
+        v = Vh[0]
+
+        # outlier scores
+        tau = (G_cen @ v)**2
+
+        # remove p points with highest scores
+        p = int(eps / 2 * samples_s.shape[0])
+        idx = np.argsort(-tau)
+
+        # new active set
+        s = idx[p:]
+        samples_s = samples_s[s, :]
+
+    return cov
\ No newline at end of file
diff --git a/standard-learning/utils.py b/standard-learning/utils.py
new file mode 100644
index 00000000..9d329ad8
--- /dev/null
+++ b/standard-learning/utils.py
@@ -0,0 +1,108 @@
+import numpy as np
+from scipy.linalg import lstsq
+from sklearn.linear_model import LogisticRegression
+from sklearn.decomposition import PCA
+
+
+def sigmoid(x):
+    '''Numerically stable logistic function'''
+    pos_mask = (x >= 0)
+    neg_mask = (x < 0)
+    z = np.zeros_like(x)
+    z[pos_mask] = np.exp(-x[pos_mask])
+    z[neg_mask] = np.exp(x[neg_mask])
+    top = np.ones_like(x)
+    top[neg_mask] = z[neg_mask]
+    return top / (1 + z)
+
+
+def cross_entropy(X, theta, y):
+    '''Per-sample logistic loss -y*phi + log(1 + exp(phi)), evaluated stably'''
+    phi = X @ theta
+    return np.logaddexp(0, phi) - y * phi
+
+
+def clf_predict(X, theta, augment=True):
+    if augment:
+        X = np.hstack([np.ones((X.shape[0], 1)), X])
+    phi = X @ theta
+    proba = sigmoid(phi)
+    return np.array(proba > 0.5, dtype=int)
+
+
+def mm_log_reg(X, y, weights):
+    '''Weighted logistic regression via majorization-minimization'''
+    X = np.hstack([np.ones((X.shape[0], 1)), X])
+    theta0 = np.zeros(X.shape[1])
+
+    X_tilde = 0.5 * X * np.sqrt(weights)[:, None]
+    Q = X_tilde.T @ X_tilde
+    Q_inv = np.linalg.inv(Q)
+
+    def grad(theta):
+        return X.T @ (weights * (sigmoid(X @ theta) - y))
+
+    def delta(theta):
+        g = grad(theta)
+        return -Q_inv @ g
+
+    theta1 = theta0 + delta(theta0)
+
+    while True:
+        if np.linalg.norm(theta1 - theta0) > 1e-2:
+            theta0 = theta1
+            theta1 = theta0 + delta(theta0)
+        else:
+            theta_best = theta1
+            break
+
+    losses = cross_entropy(X, theta_best, y)
+    return theta_best, losses
+
+
+def sklearn_log_reg(X, y, weights, reg_coeff=1e2):
+    '''Weighted logistic regression via scikit-learn'''
+    def get_losses(classifier):
+        log_proba = classifier.predict_log_proba(X)
+        # cross-entropy: -y*log p(y=1|x) - (1-y)*log p(y=0|x)
+        return -y * log_proba[:, 1] - (1 - y) * log_proba[:, 0]
+
+    weights = weights / np.max(weights)  # normalize without modifying the caller's array
+    clf = LogisticRegression(solver="liblinear", C=reg_coeff)
+    clf.fit(X, y, sample_weight=weights)
+    theta = np.hstack([clf.intercept_.flatten(), clf.coef_.flatten()])
+
+    # Compute loss on samples
+    losses = get_losses(clf)
+    return theta, losses
+
+
+def pca(samples, weights, theta=None):
+    def get_losses(princ_comp):
+        proj = samples @ princ_comp
+        return np.sum(samples**2, axis=1) - proj**2
+
+    if theta is None:
+        pca_ = PCA(n_components=1)
+        pca_.fit(np.diag(weights) @ samples)
+        theta = pca_.components_[0]
+        theta /= np.linalg.norm(theta)
+
+    # Compute loss on samples
+    losses = get_losses(theta)
+    return theta, losses
+
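+
+# Example (illustrative only): the model-fitting helpers in this module share the
+# same contract, returning the fitted parameter together with per-sample losses,
+# which the robust solvers then re-weight, e.g.
+#
+#     rng = np.random.default_rng(0)
+#     X = rng.normal(size=(100, 2))
+#     y = (rng.uniform(size=100) < sigmoid(X @ np.array([1.0, -1.0]))).astype(int)
+#     theta, losses = sklearn_log_reg(X, y, weights=np.ones(100))
+#     assert losses.shape == (100,)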
+
+
+def covariance(samples, weights, mean=None):
+    def get_losses(mean, cov, weights):
+        d = mean.shape[0]
+        centered_samples = samples - mean
+        scaled_samples = lstsq(cov, centered_samples.T)[0]
+        residuals = np.sum(centered_samples * scaled_samples.T, axis=1)
+        (sign, logabsdet) = np.linalg.slogdet(cov)
+        if sign <= 0:
+            raise ValueError("Singular covariance matrix")
+        return 0.5 * (residuals + logabsdet + d * np.log(2 * np.pi))
+
+    mean = samples.T @ weights / np.sum(weights)
+    centered_samples = samples - mean
+    cov = centered_samples.T @ np.diag(weights) @ centered_samples / np.sum(weights)
+    # Compute loss on samples
+    losses = get_losses(mean, cov, weights)
+    return cov, losses
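+
+
+if __name__ == "__main__":
+    # Minimal smoke test (an illustrative sketch, not part of the original
+    # benchmark): with equal weights, the weighted covariance estimate should be
+    # close to the true covariance used in test_covariance() of main.py.
+    rng = np.random.default_rng(0)
+    true_cov = np.array([[1.0, 0.8], [0.8, 1.0]])
+    samples = rng.multivariate_normal(np.zeros(2), true_cov, size=1000)
+    cov_hat, losses = covariance(samples, np.ones(len(samples)))
+    print("estimated covariance:\n", np.round(cov_hat, 2))
+    print("average negative log-likelihood:", round(float(np.mean(losses)), 3))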