diff --git a/configs/_base_/schedules/cifar10_bs128.py b/configs/_base_/schedules/cifar10_bs128.py index a04f89c1990..f134dbce3be 100644 --- a/configs/_base_/schedules/cifar10_bs128.py +++ b/configs/_base_/schedules/cifar10_bs128.py @@ -3,4 +3,4 @@ optimizer_config = dict(grad_clip=None) # learning policy lr_config = dict(policy='step', step=[100, 150]) -total_epochs = 200 +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/_base_/schedules/imagenet_bs1024_linearlr_bn_nowd.py b/configs/_base_/schedules/imagenet_bs1024_linearlr_bn_nowd.py index f3f6bbd1e48..99fbdda9f59 100644 --- a/configs/_base_/schedules/imagenet_bs1024_linearlr_bn_nowd.py +++ b/configs/_base_/schedules/imagenet_bs1024_linearlr_bn_nowd.py @@ -14,4 +14,4 @@ warmup='constant', warmup_iters=5000, ) -total_epochs = 300 +runner = dict(type='EpochBasedRunner', max_epochs=300) diff --git a/configs/_base_/schedules/imagenet_bs2048.py b/configs/_base_/schedules/imagenet_bs2048.py index 68a0915b716..93fdebfdd15 100644 --- a/configs/_base_/schedules/imagenet_bs2048.py +++ b/configs/_base_/schedules/imagenet_bs2048.py @@ -9,4 +9,4 @@ warmup_iters=2500, warmup_ratio=0.25, step=[30, 60, 90]) -total_epochs = 100 +runner = dict(type='EpochBasedRunner', max_epochs=100) diff --git a/configs/_base_/schedules/imagenet_bs2048_coslr.py b/configs/_base_/schedules/imagenet_bs2048_coslr.py index e65b2997cfe..b9e77f2c6ad 100644 --- a/configs/_base_/schedules/imagenet_bs2048_coslr.py +++ b/configs/_base_/schedules/imagenet_bs2048_coslr.py @@ -9,4 +9,4 @@ warmup='linear', warmup_iters=2500, warmup_ratio=0.25) -total_epochs = 100 +runner = dict(type='EpochBasedRunner', max_epochs=100) diff --git a/configs/_base_/schedules/imagenet_bs256.py b/configs/_base_/schedules/imagenet_bs256.py index 4a1dbe69f47..3b5d19847a6 100644 --- a/configs/_base_/schedules/imagenet_bs256.py +++ b/configs/_base_/schedules/imagenet_bs256.py @@ -3,4 +3,4 @@ optimizer_config = dict(grad_clip=None) # learning policy lr_config = 
dict(policy='step', step=[30, 60, 90]) -total_epochs = 100 +runner = dict(type='EpochBasedRunner', max_epochs=100) diff --git a/configs/_base_/schedules/imagenet_bs256_140e.py b/configs/_base_/schedules/imagenet_bs256_140e.py index df345ab041f..caba1577c73 100644 --- a/configs/_base_/schedules/imagenet_bs256_140e.py +++ b/configs/_base_/schedules/imagenet_bs256_140e.py @@ -3,4 +3,4 @@ optimizer_config = dict(grad_clip=None) # learning policy lr_config = dict(policy='step', step=[40, 80, 120]) -total_epochs = 140 +runner = dict(type='EpochBasedRunner', max_epochs=140) diff --git a/configs/_base_/schedules/imagenet_bs256_coslr.py b/configs/_base_/schedules/imagenet_bs256_coslr.py index 66f94939e63..779b4792eda 100644 --- a/configs/_base_/schedules/imagenet_bs256_coslr.py +++ b/configs/_base_/schedules/imagenet_bs256_coslr.py @@ -3,4 +3,4 @@ optimizer_config = dict(grad_clip=None) # learning policy lr_config = dict(policy='CosineAnnealing', min_lr=0) -total_epochs = 100 +runner = dict(type='EpochBasedRunner', max_epochs=100) diff --git a/configs/_base_/schedules/imagenet_bs256_epochstep.py b/configs/_base_/schedules/imagenet_bs256_epochstep.py index 7ea3c0e153b..2347a043544 100644 --- a/configs/_base_/schedules/imagenet_bs256_epochstep.py +++ b/configs/_base_/schedules/imagenet_bs256_epochstep.py @@ -3,4 +3,4 @@ optimizer_config = dict(grad_clip=None) # learning policy lr_config = dict(policy='step', gamma=0.98, step=1) -total_epochs = 300 +runner = dict(type='EpochBasedRunner', max_epochs=300) diff --git a/configs/mnist/lenet5.py b/configs/mnist/lenet5.py index 576ba28c2d1..af4d4085e27 100644 --- a/configs/mnist/lenet5.py +++ b/configs/mnist/lenet5.py @@ -47,7 +47,7 @@ ]) # yapf:enable # runtime settings -total_epochs = 20 +runner = dict(type='EpochBasedRunner', max_epochs=20) dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/mnist/' diff --git a/docs/install.md b/docs/install.md index e74a549bbab..8c463508e1b 100644 --- a/docs/install.md 
+++ b/docs/install.md @@ -4,7 +4,7 @@ - Python 3.6+ - PyTorch 1.3+ -- [mmcv](https://github.com/open-mmlab/mmcv) +- [mmcv](https://github.com/open-mmlab/mmcv) 1.1.4+ ### Install mmclassification diff --git a/docs/tutorials/finetune.md b/docs/tutorials/finetune.md index e0748ad3758..a1c6428b774 100644 --- a/docs/tutorials/finetune.md +++ b/docs/tutorials/finetune.md @@ -75,7 +75,7 @@ optimizer_config = dict(grad_clip=None) lr_config = dict( policy='step', step=[15]) -total_epochs = 20 +runner = dict(type='EpochBasedRunner', max_epochs=20) log_config = dict(interval=100) ``` diff --git a/mmcls/apis/train.py b/mmcls/apis/train.py index 25c4582a8e1..972109717a4 100644 --- a/mmcls/apis/train.py +++ b/mmcls/apis/train.py @@ -1,9 +1,10 @@ import random +import warnings import numpy as np import torch from mmcv.parallel import MMDataParallel, MMDistributedDataParallel -from mmcv.runner import DistSamplerSeedHook, EpochBasedRunner, build_optimizer +from mmcv.runner import DistSamplerSeedHook, build_optimizer, build_runner from mmcls.core import (DistEvalHook, DistOptimizerHook, EvalHook, Fp16OptimizerHook) @@ -70,12 +71,26 @@ def train_model(model, # build runner optimizer = build_optimizer(model, cfg.optimizer) - runner = EpochBasedRunner( - model, - optimizer=optimizer, - work_dir=cfg.work_dir, - logger=logger, - meta=meta) + + if cfg.get('runner') is None: + cfg.runner = { + 'type': 'EpochBasedRunner', + 'max_epochs': cfg.total_epochs + } + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + batch_processor=None, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp @@ -107,6 +122,7 @@ def train_model(model, shuffle=False, round_up=False) eval_cfg = cfg.get('evaluation', {}) + 
eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) @@ -114,4 +130,4 @@ def train_model(model, runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) - runner.run(data_loaders, cfg.workflow, cfg.total_epochs) + runner.run(data_loaders, cfg.workflow) diff --git a/mmcls/core/evaluation/eval_hooks.py b/mmcls/core/evaluation/eval_hooks.py index c2e7aecd019..17ea558c571 100644 --- a/mmcls/core/evaluation/eval_hooks.py +++ b/mmcls/core/evaluation/eval_hooks.py @@ -12,21 +12,30 @@ class EvalHook(Hook): interval (int): Evaluation interval (by epochs). Default: 1. """ - def __init__(self, dataloader, interval=1, **eval_kwargs): + def __init__(self, dataloader, interval=1, by_epoch=True, **eval_kwargs): if not isinstance(dataloader, DataLoader): raise TypeError('dataloader must be a pytorch DataLoader, but got' f' {type(dataloader)}') self.dataloader = dataloader self.interval = interval self.eval_kwargs = eval_kwargs + self.by_epoch = by_epoch def after_train_epoch(self, runner): - if not self.every_n_epochs(runner, self.interval): + if not self.by_epoch or not self.every_n_epochs(runner, self.interval): return from mmcls.apis import single_gpu_test results = single_gpu_test(runner.model, self.dataloader, show=False) self.evaluate(runner, results) + def after_train_iter(self, runner): + if self.by_epoch or not self.every_n_iters(runner, self.interval): + return + from mmcls.apis import single_gpu_test + runner.log_buffer.clear() + results = single_gpu_test(runner.model, self.dataloader, show=False) + self.evaluate(runner, results) + def evaluate(self, runner, results): eval_res = self.dataloader.dataset.evaluate( results, logger=runner.logger, **self.eval_kwargs) @@ -51,6 +60,7 @@ def __init__(self, dataloader, interval=1, gpu_collect=False, + by_epoch=True, **eval_kwargs): if not isinstance(dataloader, DataLoader): 
raise TypeError('dataloader must be a pytorch DataLoader, but got ' @@ -58,12 +68,27 @@ def __init__(self, self.dataloader = dataloader self.interval = interval self.gpu_collect = gpu_collect + self.by_epoch = by_epoch self.eval_kwargs = eval_kwargs def after_train_epoch(self, runner): - if not self.every_n_epochs(runner, self.interval): + if not self.by_epoch or not self.every_n_epochs(runner, self.interval): + return + from mmcls.apis import multi_gpu_test + results = multi_gpu_test( + runner.model, + self.dataloader, + tmpdir=osp.join(runner.work_dir, '.eval_hook'), + gpu_collect=self.gpu_collect) + if runner.rank == 0: + print('\n') + self.evaluate(runner, results) + + def after_train_iter(self, runner): + if self.by_epoch or not self.every_n_iters(runner, self.interval): return from mmcls.apis import multi_gpu_test + runner.log_buffer.clear() results = multi_gpu_test( runner.model, self.dataloader, diff --git a/requirements/readthedocs.txt b/requirements/readthedocs.txt index 0542bfce6df..0367917f619 100644 --- a/requirements/readthedocs.txt +++ b/requirements/readthedocs.txt @@ -1,3 +1,3 @@ -mmcv +mmcv>=1.1.4 torch torchvision diff --git a/requirements/runtime.txt b/requirements/runtime.txt index b558c13b85e..451637f2305 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,3 +1,3 @@ matplotlib -mmcv +mmcv>=1.1.4 numpy diff --git a/tests/test_eval_hook.py b/tests/test_eval_hook.py new file mode 100644 index 00000000000..823e235399c --- /dev/null +++ b/tests/test_eval_hook.py @@ -0,0 +1,197 @@ +import logging +import tempfile +from unittest.mock import MagicMock, patch + +import mmcv.runner +import pytest +import torch +import torch.nn as nn +from mmcv.runner import obj_from_dict +from torch.utils.data import DataLoader, Dataset + +from mmcls.apis import single_gpu_test +from mmcls.core import DistEvalHook, EvalHook + + +class ExampleDataset(Dataset): + + def __getitem__(self, idx): + results = dict(img=torch.tensor([1]), img_metas=dict()) 
+ return results + + def __len__(self): + return 1 + + +class ExampleModel(nn.Module): + + def __init__(self): + super(ExampleModel, self).__init__() + self.test_cfg = None + self.conv = nn.Conv2d(3, 3, 3) + + def forward(self, img, img_metas, test_mode=False, **kwargs): + return img + + def train_step(self, data_batch, optimizer): + loss = self.forward(**data_batch) + return dict(loss=loss) + + +def test_iter_eval_hook(): + with pytest.raises(TypeError): + test_dataset = ExampleModel() + data_loader = [ + DataLoader( + test_dataset, + batch_size=1, + sampler=None, + num_workers=0, + shuffle=False) + ] + EvalHook(data_loader, by_epoch=False) + + test_dataset = ExampleDataset() + test_dataset.evaluate = MagicMock(return_value=dict(test='success')) + loader = DataLoader(test_dataset, batch_size=1) + model = ExampleModel() + data_loader = DataLoader( + test_dataset, batch_size=1, sampler=None, num_workers=0, shuffle=False) + optim_cfg = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) + optimizer = obj_from_dict(optim_cfg, torch.optim, + dict(params=model.parameters())) + + # test EvalHook + with tempfile.TemporaryDirectory() as tmpdir: + eval_hook = EvalHook(data_loader, by_epoch=False) + runner = mmcv.runner.IterBasedRunner( + model=model, + optimizer=optimizer, + work_dir=tmpdir, + logger=logging.getLogger(), + max_iters=1) + runner.register_hook(eval_hook) + runner.run([loader], [('train', 1)], 1) + test_dataset.evaluate.assert_called_with([torch.tensor([1])], + logger=runner.logger) + + +def test_epoch_eval_hook(): + with pytest.raises(TypeError): + test_dataset = ExampleModel() + data_loader = [ + DataLoader( + test_dataset, + batch_size=1, + sampler=None, + num_workers=0, + shuffle=False) + ] + EvalHook(data_loader, by_epoch=True) + + test_dataset = ExampleDataset() + test_dataset.evaluate = MagicMock(return_value=dict(test='success')) + loader = DataLoader(test_dataset, batch_size=1) + model = ExampleModel() + data_loader = DataLoader( + 
test_dataset, batch_size=1, sampler=None, num_workers=0, shuffle=False) + optim_cfg = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) + optimizer = obj_from_dict(optim_cfg, torch.optim, + dict(params=model.parameters())) + + # test EvalHook with interval + with tempfile.TemporaryDirectory() as tmpdir: + eval_hook = EvalHook(data_loader, by_epoch=True, interval=2) + runner = mmcv.runner.EpochBasedRunner( + model=model, + optimizer=optimizer, + work_dir=tmpdir, + logger=logging.getLogger(), + max_epochs=2) + runner.register_hook(eval_hook) + runner.run([loader], [('train', 1)]) + test_dataset.evaluate.assert_called_once_with([torch.tensor([1])], + logger=runner.logger) + + +def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + results = single_gpu_test(model, data_loader) + return results + + +@patch('mmcls.apis.multi_gpu_test', multi_gpu_test) +def test_dist_eval_hook(): + with pytest.raises(TypeError): + test_dataset = ExampleModel() + data_loader = [ + DataLoader( + test_dataset, + batch_size=1, + sampler=None, + num_workers=0, + shuffle=False) + ] + DistEvalHook(data_loader, by_epoch=False) + + test_dataset = ExampleDataset() + test_dataset.evaluate = MagicMock(return_value=dict(test='success')) + loader = DataLoader(test_dataset, batch_size=1) + model = ExampleModel() + data_loader = DataLoader( + test_dataset, batch_size=1, sampler=None, num_workers=0, shuffle=False) + optim_cfg = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) + optimizer = obj_from_dict(optim_cfg, torch.optim, + dict(params=model.parameters())) + + # test DistEvalHook + with tempfile.TemporaryDirectory() as tmpdir: + eval_hook = DistEvalHook(data_loader, by_epoch=False) + runner = mmcv.runner.IterBasedRunner( + model=model, + optimizer=optimizer, + work_dir=tmpdir, + logger=logging.getLogger(), + max_iters=1) + runner.register_hook(eval_hook) + runner.run([loader], [('train', 1)]) + test_dataset.evaluate.assert_called_with([torch.tensor([1])], + 
+ logger=runner.logger) + + +@patch('mmcls.apis.multi_gpu_test', multi_gpu_test) +def test_dist_eval_hook_epoch(): + with pytest.raises(TypeError): + test_dataset = ExampleModel() + data_loader = [ + DataLoader( + test_dataset, + batch_size=1, + sampler=None, + num_workers=0, + shuffle=False) + ] + DistEvalHook(data_loader) + + test_dataset = ExampleDataset() + test_dataset.evaluate = MagicMock(return_value=dict(test='success')) + loader = DataLoader(test_dataset, batch_size=1) + model = ExampleModel() + data_loader = DataLoader( + test_dataset, batch_size=1, sampler=None, num_workers=0, shuffle=False) + optim_cfg = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) + optimizer = obj_from_dict(optim_cfg, torch.optim, + dict(params=model.parameters())) + + # test DistEvalHook + with tempfile.TemporaryDirectory() as tmpdir: + eval_hook = DistEvalHook(data_loader, by_epoch=True, interval=2) + runner = mmcv.runner.EpochBasedRunner( + model=model, + optimizer=optimizer, + work_dir=tmpdir, + logger=logging.getLogger(), + max_epochs=2) + runner.register_hook(eval_hook) + runner.run([loader], [('train', 1)]) + test_dataset.evaluate.assert_called_with([torch.tensor([1])], + logger=runner.logger)