From e5505ee0c6e0fe547af149b4ca87d0d7538cdd58 Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Thu, 5 Sep 2024 17:19:59 -0400 Subject: [PATCH] Fix llava --- benchmarks/llava/benchfile.py | 42 +++++++++++++++++++++++++ benchmarks/llava/main.py | 53 ++++++++++++++++++++++++-------- benchmarks/llava/prepare.py | 22 +++++++++++++ benchmarks/llava/requirements.in | 7 +++++ benchmarks/llava/voirfile.py | 38 +++++++++++++++++++++++ benchmarks/purejaxrl/main.py | 1 - config/base.yaml | 43 ++++++++++++++++++++++++++ milabench/_version.py | 7 ++--- scripts/article/run_cuda.sh | 2 +- 9 files changed, 196 insertions(+), 19 deletions(-) create mode 100644 benchmarks/llava/benchfile.py mode change 100644 => 100755 benchmarks/llava/main.py create mode 100755 benchmarks/llava/prepare.py create mode 100644 benchmarks/llava/requirements.in create mode 100644 benchmarks/llava/voirfile.py diff --git a/benchmarks/llava/benchfile.py b/benchmarks/llava/benchfile.py new file mode 100644 index 000000000..3bc06eaa7 --- /dev/null +++ b/benchmarks/llava/benchfile.py @@ -0,0 +1,42 @@ +from milabench.pack import Package +from milabench.commands import AccelerateAllNodes + + +class Llava(Package): + # Requirements file installed by install(). It can be empty or absent. + base_requirements = "requirements.in" + + # The preparation script called by prepare(). It must be executable, + # but it can be any type of script. It can be empty or absent. + prepare_script = "prepare.py" + + # The main script called by run(). It must be a Python file. It has to + # be present. + main_script = "main.py" + + # You can remove the functions below if you don't need to modify them. + + def make_env(self): + # Return a dict of environment variables for prepare_script and + # main_script. + return super().make_env() + + async def install(self): + await super().install() # super() call installs the requirements + + async def prepare(self): + await super().prepare() # super() call executes prepare_script + + def build_run_plan(self): + from milabench.commands import PackCommand + + main = self.dirs.code / self.main_script + plan = PackCommand(self, *self.argv, lazy=True) + + if False: + plan = VoirCommand(plan, cwd=main.parent) + + return AccelerateAllNodes(plan).use_stdout() + + +__pack__ = Llava diff --git a/benchmarks/llava/main.py b/benchmarks/llava/main.py old mode 100644 new mode 100755 index fe105ca51..ea6f5c28a --- a/benchmarks/llava/main.py +++ b/benchmarks/llava/main.py @@ -1,8 +1,7 @@ -# This is the script run by milabench run (by default) +#!/usr/bin/env python -import time +from dataclasses import dataclass -import numpy as np import torch from accelerate import Accelerator from accelerate.utils import set_seed @@ -12,6 +11,7 @@ from torch.utils.data.dataloader import default_collate from transformers import AutoProcessor, LlavaForConditionalGeneration +import argklass from benchmate.observer import BenchObserver @@ -34,22 +34,33 @@ def custom_collate(batch): return default_collate(batch) +@dataclass +class Arguments: + batch_size: int = 10 + epochs: int = 10 + seed: int = 42 + num_workers: int = 5 + gradient_accumulation_steps: int = 4 + + def main(): + parser = argklass.ArgumentParser(description="llava") + parser.add_arguments(Arguments) + args = parser.parse_args() + accelerator = Accelerator( mixed_precision="no", - gradient_accumulation_steps=4, + gradient_accumulation_steps=args.gradient_accumulation_steps, log_with="all", project_dir="logs", ) - set_seed(42) - batch_size = 1 # Set to 1 for now, but can be easily changed - num_epochs = 1 + set_seed(args.seed) # Load LLaVA model and processor with device_map="auto" model = LlavaForConditionalGeneration.from_pretrained( "llava-hf/llava-1.5-7b-hf", - torch_dtype=torch.float32, # Change to float32 + torch_dtype=torch.bfloat16, device_map="auto", ) processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") @@ -57,7 +68,11 @@ def main(): # Load dataset and create DataLoader dataset = load_dataset("HuggingFaceM4/the_cauldron", "aokvqa")["train"] dataloader = DataLoader( - dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate + dataset, + batch_size=args.batch_size, + shuffle=True, + collate_fn=custom_collate, + num_workers=args.num_workers ) def batch_size_fn(batch): @@ -68,13 +83,14 @@ def batch_size_fn(batch): ) observer = BenchObserver( - batch_size_fn=batch_size_fn, earlystop=70, raise_stop_program=True + batch_size_fn=batch_size_fn, earlystop=70, raise_stop_program=True, + stdout=True, ) optimizer = observer.optimizer(torch.optim.AdamW(model.parameters(), lr=5e-5)) model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader) - for epoch in range(num_epochs): - for batch in observer.iterate(dataloader): + for epoch in range(args.epochs): + for i, batch in enumerate(observer.iterate(dataloader)): images = batch["images"][0] # Access the first item in the list of images texts = batch["texts"] prompt = apply_chat_template(texts) @@ -93,7 +109,11 @@ def batch_size_fn(batch): ) for k, v in inputs.items() } + + inputs["labels"] = inputs["input_ids"] + outputs = model(**inputs) + loss = outputs.loss accelerator.backward(loss) @@ -111,4 +131,11 @@ def batch_size_fn(batch): if __name__ == "__main__": - main() + from voir.phase import StopProgram + from benchmate.monitor import bench_monitor + + try: + with bench_monitor(): + main() + except StopProgram: + pass \ No newline at end of file diff --git a/benchmarks/llava/prepare.py b/benchmarks/llava/prepare.py new file mode 100755 index 000000000..afa480b86 --- /dev/null +++ b/benchmarks/llava/prepare.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python + +import torch +from datasets import load_dataset +from transformers import AutoProcessor, LlavaForConditionalGeneration + + +def main(): + # Load LLaVA model and processor with device_map="auto" + _ = LlavaForConditionalGeneration.from_pretrained( + "llava-hf/llava-1.5-7b-hf", + torch_dtype=torch.float32, # Change to float32 + device_map="auto", + ) + _ = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + + # Load dataset and create DataLoader + _ = load_dataset("HuggingFaceM4/the_cauldron", "aokvqa")["train"] + + +if __name__ == "__main__": + main() diff --git a/benchmarks/llava/requirements.in b/benchmarks/llava/requirements.in new file mode 100644 index 000000000..fbddd3f15 --- /dev/null +++ b/benchmarks/llava/requirements.in @@ -0,0 +1,7 @@ +voir>=0.2.19,<0.3 +torch +numpy +accelerate +pillow +datasets +transformers diff --git a/benchmarks/llava/voirfile.py b/benchmarks/llava/voirfile.py new file mode 100644 index 000000000..d93f886cd --- /dev/null +++ b/benchmarks/llava/voirfile.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass + +from voir import configurable +from voir.instruments import dash, early_stop, log, rate +from benchmate.monitor import monitor_monogpu + +@dataclass +class Config: + """voir configuration""" + + # Whether to display the dash or not + dash: bool = False + + # How often to log the rates + interval: str = "1s" + + # Number of rates to skip before logging + skip: int = 5 + + # Number of rates to log before stopping + stop: int = 20 + + # Number of seconds between each gpu poll + gpu_poll: int = 3 + + +@configurable +def instrument_main(ov, options: Config): + yield ov.phases.init + + if options.dash: + ov.require(dash) + + ov.require( + log("value", "progress", "rate", "units", "loss", "gpudata", context="task"), + early_stop(n=options.stop, key="rate", task="train"), + monitor_monogpu(poll_interval=options.gpu_poll), + ) diff --git a/benchmarks/purejaxrl/main.py b/benchmarks/purejaxrl/main.py index 38eaf2792..f37c45e0d 100644 --- a/benchmarks/purejaxrl/main.py +++ b/benchmarks/purejaxrl/main.py @@ -4,7 +4,6 @@ # clone_subtree in the benchfile.py, in which case this file can simply # be deleted. -import argparse import argklass diff --git a/config/base.yaml b/config/base.yaml index 09cc11c85..9d71652a0 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -732,3 +732,46 @@ torchatari: --num-envs: auto({cpu_per_gpu}, 128) --total-timesteps: 1000000 --env-id: Breakout-v5 + + +llava: + inherits: _defaults + definition: ../benchmarks/llava + install_group: torch + plan: + method: per_gpu + + tags: + - llm + argv: + --batch_size: 1 + --num_workers: 4 + + +llava-single: + inherits: _defaults + definition: ../benchmarks/llava + install_group: torch + plan: + method: per_gpu + + tags: + - llm + argv: + --batch_size: 1 + --num_workers: 4 + +llava-gpus: + inherits: _defaults + definition: ../benchmarks/llava + install_group: torch + plan: + method: njobs + n: 1 + + tags: + - llm + argv: + --batch_size: 1 + --num_workers: 4 + --gradient_accumulation_steps: 1 \ No newline at end of file diff --git a/milabench/_version.py b/milabench/_version.py index 0202d13c4..591f21dbf 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,6 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v0.1.0-28-g8069946" -__commit__ = "8069946d331fb92090057d7eedd598515249521d" -__date__ = "2024-08-01 12:39:13 -0400" - +__tag__ = "v0.1.0-82-gea44ea63" +__commit__ = "ea44ea63be161bea2dd22c6dd23b1386474f09a7" +__date__ = "2024-09-05 12:03:19 -0400" diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh index 59e61a754..70c9ede87 100644 --- a/scripts/article/run_cuda.sh +++ b/scripts/article/run_cuda.sh @@ -45,7 +45,7 @@ install_prepare() { # # Install milabench's benchmarks in their venv # - # milabench pin --variant cuda --from-scratch $ARGS + milabench pin --variant cuda --from-scratch $ARGS milabench install --system $MILABENCH_WORDIR/system.yaml $ARGS which pip