# Patch summary (leading text before the first "diff --git" header; patch tools skip it).
#
# Files touched, as shown by the headers and hunks below:
#   .github/workflows/tests.yml
#       Removes the `actions/checkout@v3` step that preceded
#       `conda-incubator/setup-miniconda@v2` and re-adds it after that step,
#       together with a commented-out "clean" step.
#   .gitmodules (new)
#       Registers two submodules: benchmarks/mlperf/apex and
#       benchmarks/mlperf/training_results_v2.1.
#   benchmarks/huggingface/bench/__main__.py
#       Adds ModelWrapper (a torch.nn.Module) whose forward() calls the wrapped
#       model with `input_ids` and `labels` taken from the batch dict and
#       returns the tuple (loss, logits).
#       Runner.__init__ now builds the dataset and loader first, takes one
#       example batch, wraps the model, and only then creates the optimizer.
#       Runner.step now unpacks the loss from `self.model(data)`.
#   benchmarks/mlperf/benchfile.py, main.py, requirements.in (new)
#       benchfile.py declares the MLPerfBenchmark package (requirements.in /
#       main.py). main.py appends the pytorch-preview BERT implementation
#       directory to sys.path and imports `run_squad`.
#   benchmarks/mlperf/apex, benchmarks/mlperf/training_results_v2.1 (new, mode 160000)
#       Submodule commit pointers.
#   config/base.yaml
#       Adds an `mlperf` entry (plan method `per_gpu`).
#       resnet50: --batch-size 64 -> 256, plus `--synthetic-data: true` and
#       `--precision: 'fp16'`.
#       _bert-base: --batch-size 32 -> 48.
#
# NOTE(review): this copy of the patch has its newlines collapsed to spaces, so
#   it is unlikely to apply as-is; regenerate it from the original commits
#   before using it. The hunk line counts cannot be checked against this text.
# NOTE(review): in Runner.__init__, `jit = False` is hard-coded, so the
#   `torch.jit.trace` branch never runs and `example` is otherwise unused.
#   Confirm whether this should be a CLI option.
# NOTE(review): ModelWrapper.forward passes only `input_ids` and `labels`; the
#   previous call was `self.model(**data)`, so any other batch keys are now
#   dropped. Confirm this is intended for every model category.
# NOTE(review): benchmarks/mlperf/main.py prints sys.path before and after the
#   append; this looks like leftover debugging. Confirm.
#
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f3c315641..82fbb9b67 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -49,8 +49,6 @@ jobs: MILABENCH_DASH: "no" steps: - - uses: actions/checkout@v3 - - uses: conda-incubator/setup-miniconda@v2 with: auto-activate-base: false @@ -58,6 +56,12 @@ jobs: miniconda-version: "latest" activate-environment: test + # - name: clean + # run: | + # python -c "import shutil; shutil.rmtree('/opt/actions-runner/_work/milabench/milabench')" + + - uses: actions/checkout@v3 + - name: Pytorch Sanity run: | if [[ "${{ matrix.arch }}" == "rocm" ]]; then diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..012435f51 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,7 @@ +[submodule "benchmarks/mlperf/apex"] + path = benchmarks/mlperf/apex + url = https://github.com/NVIDIA/apex.git + +[submodule "benchmarks/mlperf/training_results_v2.1"] + path = benchmarks/mlperf/training_results_v2.1 + url = https://github.com/mlcommons/training_results_v2.1.git diff --git a/benchmarks/huggingface/bench/__main__.py b/benchmarks/huggingface/bench/__main__.py index 4e46f4ffd..3190a0b75 100644 --- a/benchmarks/huggingface/bench/__main__.py +++ b/benchmarks/huggingface/bench/__main__.py @@ -12,6 +12,7 @@ from .synth import SyntheticData, generators + def is_tf32_allowed(args): return "tf32" in args.precision @@ -20,6 +21,16 @@ def is_fp16_allowed(args): return "fp16" in args.precision +class ModelWrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, x): + out = self.model(input_ids=x['input_ids'], labels=x['labels']) + return out['loss'], out['logits'] + + class Runner: def __init__(self, args): use_cuda = not args.no_cuda and torch.cuda.is_available() @@ -32,17 +43,30 @@ def __init__(self, args): self.device = torch.device("cuda" if use_cuda else "cpu") self.batch_size = args.batch_size info = models[args.model]() - 
self.model = info.model.to(self.device) - self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr) - + + self.data = SyntheticData( n=args.batch_size, repeat=100000, generators=generators[info.category](info), ) + self.loader = DataLoader( self.data, batch_size=args.batch_size, num_workers=args.num_workers ) + + example = next(iter(self.loader)) + example = {k: x.to(self.device) for k, x in example.items()} + + model = ModelWrapper(info.model).to(self.device) + + jit = False + if jit: + model = torch.jit.trace(model, example) + + self.model = model + self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr) + self.amp_scaler = torch.cuda.amp.GradScaler(enabled=is_fp16_allowed(args)) if is_fp16_allowed(args): @@ -52,9 +76,7 @@ def __init__(self, args): def step(self, data): with self.amp_context(): - outputs = self.model(**data) - - loss = outputs.loss + loss, _ = self.model(data) self.amp_scaler.scale(loss).backward() self.amp_scaler.step(self.optimizer) diff --git a/benchmarks/mlperf/apex b/benchmarks/mlperf/apex new file mode 160000 index 000000000..05091d498 --- /dev/null +++ b/benchmarks/mlperf/apex @@ -0,0 +1 @@ +Subproject commit 05091d498d21058a0fe736b828c43431d4f0dda2 diff --git a/benchmarks/mlperf/benchfile.py b/benchmarks/mlperf/benchfile.py new file mode 100644 index 000000000..8e69d4a95 --- /dev/null +++ b/benchmarks/mlperf/benchfile.py @@ -0,0 +1,10 @@ +from milabench.pack import Package + + +class MLPerfBenchmark(Package): + base_requirements = "requirements.in" + main_script = "main.py" + + +__pack__ = MLPerfBenchmark + diff --git a/benchmarks/mlperf/main.py b/benchmarks/mlperf/main.py new file mode 100644 index 000000000..c665b653e --- /dev/null +++ b/benchmarks/mlperf/main.py @@ -0,0 +1,13 @@ + +import sys +import os + + +FOLDER = os.path.dirname(__file__) +BENCH = "training_results_v2.1/NVIDIA/benchmarks/bert/implementations/pytorch-preview" + +print(sys.path) +sys.path.append(os.path.join(FOLDER, BENCH)) +print(sys.path) + +import 
run_squad diff --git a/benchmarks/mlperf/requirements.in b/benchmarks/mlperf/requirements.in new file mode 100644 index 000000000..fef907ca0 --- /dev/null +++ b/benchmarks/mlperf/requirements.in @@ -0,0 +1,4 @@ +git+https://github.com/NVIDIA/mlperf-common.git +git+https://github.com/NVIDIA/apex.git +git+https://github.com/mlcommons/logging.git +boto3 diff --git a/benchmarks/mlperf/training_results_v2.1 b/benchmarks/mlperf/training_results_v2.1 new file mode 160000 index 000000000..158189d4c --- /dev/null +++ b/benchmarks/mlperf/training_results_v2.1 @@ -0,0 +1 @@ +Subproject commit 158189d4cbfbee366c10da1f0f086c85d8f15b5f diff --git a/config/base.yaml b/config/base.yaml index ffa1fe629..9fd435cd2 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -10,6 +10,16 @@ _defaults: gpu_load_threshold: 0.5 gpu_mem_threshold: 0.5 + +mlperf: + inherits: _defaults + definition: ../benchmarks/mlperf + group: mlperf + install_group: torch + plan: + method: per_gpu + + _torchvision: inherits: _defaults definition: ../benchmarks/torchvision @@ -92,7 +102,9 @@ resnet50: argv: --model: resnet50 - --batch-size: 64 + --batch-size: 256 + --synthetic-data: true + --precision: 'fp16' efficientnet_b4: inherits: _torchvision @@ -172,7 +184,7 @@ _bert-base: - precision-showcase argv: --model: "Bert" - --batch-size: 32 + --batch-size: 48 voir: options: stop: 30