From 9f5af498d88b448599ddaf7d77435953d954d9f5 Mon Sep 17 00:00:00 2001
From: bghira
Date: Mon, 23 Dec 2024 10:48:45 -0600
Subject: [PATCH 1/4] deepspeed stage 3 needs validations disabled thoroughly

---
 helpers/training/trainer.py    | 91 ++++++++++++++++++++--------------
 helpers/training/validation.py | 24 ++++++---
 2 files changed, 70 insertions(+), 45 deletions(-)

diff --git a/helpers/training/trainer.py b/helpers/training/trainer.py
index 577ccd82..6ec57a9c 100644
--- a/helpers/training/trainer.py
+++ b/helpers/training/trainer.py
@@ -470,23 +470,24 @@ def init_vae(self, move_to_accelerator: bool = True):
         else:
             from diffusers import AutoencoderKL as AutoencoderClass
 
-        try:
-            self.vae = AutoencoderClass.from_pretrained(**self.config.vae_kwargs)
-        except:
-            logger.warning(
-                "Couldn't load VAE with default path. Trying without a subfolder.."
-            )
-            self.config.vae_kwargs["subfolder"] = None
-            self.vae = AutoencoderClass.from_pretrained(**self.config.vae_kwargs)
-        if (
-            self.vae is not None
-            and self.config.vae_enable_tiling
-            and hasattr(self.vae, "enable_tiling")
-        ):
+        with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
+            try:
+                self.vae = AutoencoderClass.from_pretrained(**self.config.vae_kwargs)
+            except Exception:
+                logger.warning(
+                    "Couldn't load VAE with default path. Trying without a subfolder..."
+                )
+                self.config.vae_kwargs["subfolder"] = None
+                self.vae = AutoencoderClass.from_pretrained(**self.config.vae_kwargs)
+            if (
+                self.vae is not None
+                and self.config.vae_enable_tiling
+                and hasattr(self.vae, "enable_tiling")
+            ):
+                logger.warning(
+                    "Enabling VAE tiling for greatly reduced memory consumption due to --vae_enable_tiling, which may result in VAE tiling artifacts in encoded latents."
+                )
+                self.vae.enable_tiling()
         if not move_to_accelerator:
             logger.debug("Not moving VAE to accelerator.")
             return
@@ -530,28 +531,28 @@ def init_text_encoder(self, move_to_accelerator: bool = True):
             None,
             None,
         )
-        if self.tokenizer_1 is not None:
-            self.text_encoder_cls_1 = import_model_class_from_model_name_or_path(
-                self.config.text_encoder_path,
-                self.config.revision,
-                self.config,
-                subfolder=self.config.text_encoder_subfolder,
-            )
-        if self.tokenizer_2 is not None:
-            self.text_encoder_cls_2 = import_model_class_from_model_name_or_path(
-                self.config.pretrained_model_name_or_path,
-                self.config.revision,
-                self.config,
-                subfolder="text_encoder_2",
-            )
-        if self.tokenizer_3 is not None and self.config.model_family == "sd3":
-            self.text_encoder_cls_3 = import_model_class_from_model_name_or_path(
-                self.config.pretrained_model_name_or_path,
-                self.config.revision,
-                self.config,
-                subfolder="text_encoder_3",
-            )
         with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
+            if self.tokenizer_1 is not None:
+                self.text_encoder_cls_1 = import_model_class_from_model_name_or_path(
+                    self.config.text_encoder_path,
+                    self.config.revision,
+                    self.config,
+                    subfolder=self.config.text_encoder_subfolder,
+                )
+            if self.tokenizer_2 is not None:
+                self.text_encoder_cls_2 = import_model_class_from_model_name_or_path(
+                    self.config.pretrained_model_name_or_path,
+                    self.config.revision,
+                    self.config,
+                    subfolder="text_encoder_2",
+                )
+            if self.tokenizer_3 is not None and self.config.model_family == "sd3":
+                self.text_encoder_cls_3 = import_model_class_from_model_name_or_path(
+                    self.config.pretrained_model_name_or_path,
+                    self.config.revision,
+                    self.config,
+                    subfolder="text_encoder_3",
+                )
             tokenizers = [self.tokenizer_1, self.tokenizer_2, self.tokenizer_3]
             text_encoder_classes = [
                 self.text_encoder_cls_1,
@@ -669,7 +670,13 @@ def init_data_backend(self):
 
             raise e
 
-        self.init_validation_prompts()
+        try:
+            self.init_validation_prompts()
+        except Exception as e:
+            logger.error("Could not generate validation prompts.")
+            logger.error(e)
+            raise e
+
         # We calculate the number of steps per epoch by dividing the number of images by the effective batch divisor.
         # Gradient accumulation steps mean that we only update the model weights every /n/ steps.
         collected_data_backend_str = list(StateTracker.get_data_backends().keys())
@@ -695,6 +702,16 @@ def init_data_backend(self):
         self.accelerator.wait_for_everyone()
 
     def init_validation_prompts(self):
+        if (
+            hasattr(self.accelerator, "state")
+            and hasattr(self.accelerator.state, "deepspeed_plugin")
+            and getattr(self.accelerator.state.deepspeed_plugin, "deepspeed_config", {})
+            .get("zero_optimization", {})
+            .get("stage")
+            == 3
+        ):
+            logger.error("Cannot run validations with DeepSpeed ZeRO stage 3.")
+            return
         if self.accelerator.is_main_process:
             if self.config.model_family == "flux":
                 (
diff --git a/helpers/training/validation.py b/helpers/training/validation.py
index de8cf7cf..e16f4090 100644
--- a/helpers/training/validation.py
+++ b/helpers/training/validation.py
@@ -27,6 +27,11 @@
 from helpers.image_manipulation.brightness import calculate_luminance
 from PIL import Image, ImageDraw, ImageFont
 from diffusers import SanaPipeline
+from helpers.training.deepspeed import (
+    deepspeed_zero_init_disabled_context_manager,
+    prepare_model_for_deepspeed,
+)
+from transformers.utils import ContextManagers
 
 logger = logging.getLogger(__name__)
 logger.setLevel(os.environ.get("SIMPLETUNER_LOG_LEVEL") or "INFO")
@@ -523,14 +528,17 @@ def init_vae(self):
             self.vae = precached_vae
         if self.vae is None:
             logger.info(f"Initialising {AutoencoderClass}")
-            self.vae = AutoencoderClass.from_pretrained(
-                vae_path,
-                subfolder=(
-                    "vae" if args.pretrained_vae_model_name_or_path is None else None
-                ),
-                revision=args.revision,
-                force_upcast=False,
-            ).to(self.inference_device)
+            with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
+                self.vae = AutoencoderClass.from_pretrained(
+                    vae_path,
+                    subfolder=(
+                        "vae"
+                        if args.pretrained_vae_model_name_or_path is None
+                        else None
+                    ),
+                    revision=args.revision,
+                    force_upcast=False,
+                ).to(self.inference_device)
         StateTracker.set_vae(self.vae)
 
         return self.vae
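
Why patch 1 shapes the code this way: under ZeRO stage 3, DeepSpeed's zero.Init hooks partition parameters across ranks inside `from_pretrained()` itself, so frozen helpers such as the VAE and the text encoders come back sharded, and no single process holds full weights to run a validation pipeline with. Loads are therefore wrapped in `ContextManagers(deepspeed_zero_init_disabled_context_manager())`, and `init_validation_prompts()` bails out early when stage 3 is detected. Below is a minimal sketch of the load-time pattern; it assumes accelerate's `DeepSpeedPlugin` API as used in the patch, and the VAE repo id is a placeholder:

    import accelerate
    from accelerate.state import AcceleratorState
    from diffusers import AutoencoderKL
    from transformers.utils import ContextManagers


    def zero3_init_disabled():
        """Context managers that suspend ZeRO-3 sharded init, when DeepSpeed is active."""
        plugin = (
            AcceleratorState().deepspeed_plugin
            if accelerate.state.is_initialized()
            else None
        )
        if plugin is None:
            return []  # no DeepSpeed configured: nothing to disable
        return [plugin.zero3_init_context_manager(enable=False)]


    # With sharded init suspended, the VAE loads full, unsharded weights on this process.
    with ContextManagers(zero3_init_disabled()):
        vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")  # placeholder id
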
From 27799f8912287dfe7d5d2b68563129b0de323880 Mon Sep 17 00:00:00 2001
From: bghira
Date: Tue, 24 Dec 2024 14:02:22 -0600
Subject: [PATCH 2/4] check for latent attr or retrieve whole latent

---
 helpers/caching/vae.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py
index 4f607cfc..52e433e7 100644
--- a/helpers/caching/vae.py
+++ b/helpers/caching/vae.py
@@ -533,7 +533,8 @@ def encode_images(self, images, filepaths, load_from_cache=True):
                 ) * self.vae.config.scaling_factor
             else:
                 latents_uncached = (
-                    latents_uncached.latent * self.vae.config.scaling_factor
+                    getattr(latents_uncached, "latent", latents_uncached)
+                    * self.vae.config.scaling_factor
                 )
 
             logger.debug(f"Latents shape: {latents_uncached.shape}")
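
Patch 2 in isolation: depending on the autoencoder class, `vae.encode(...)` may hand back an output object exposing a `.latent` attribute, or the latent tensor directly, and the old code crashed on the latter. The `getattr` fallback accepts both. A self-contained illustration follows, using a stand-in wrapper class rather than a real diffusers output type:

    import torch


    class FakeEncoderOutput:
        """Stand-in for an encoder output object that wraps the latent tensor."""

        def __init__(self, latent: torch.Tensor):
            self.latent = latent


    def unwrap_latent(result):
        # Use .latent when present; otherwise assume result already is the tensor.
        return getattr(result, "latent", result)


    raw = torch.randn(1, 4, 32, 32)
    assert unwrap_latent(raw) is raw
    assert unwrap_latent(FakeEncoderOutput(raw)) is raw
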
From eb72efb9b622ba1ab8ed67f9bb29493702958a73 Mon Sep 17 00:00:00 2001
From: bghira
Date: Tue, 24 Dec 2024 14:09:02 -0600
Subject: [PATCH 3/4] set text encoders to .eval() mode

---
 helpers/training/deepspeed.py     | 61 +++++++++++++++++++++++++++++--
 helpers/training/text_encoding.py |  6 ++-
 train.py                          |  2 +-
 3 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/helpers/training/deepspeed.py b/helpers/training/deepspeed.py
index d13d7ea0..5a8fb0ea 100644
--- a/helpers/training/deepspeed.py
+++ b/helpers/training/deepspeed.py
@@ -1,9 +1,45 @@
-import accelerate, logging, os
+import accelerate, logging, os, contextlib, transformers
 from accelerate.state import AcceleratorState
+from transformers.integrations import HfDeepSpeedConfig
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger("DeepSpeed")
 logger.setLevel(os.environ.get("SIMPLETUNER_LOG_LEVEL", "INFO"))
 
+from transformers.integrations.deepspeed import (
+    is_deepspeed_zero3_enabled,
+    set_hf_deepspeed_config,
+    unset_hf_deepspeed_config,
+)
+
+
+@contextlib.contextmanager
+def temporarily_disable_deepspeed_zero3():
+    # https://github.com/huggingface/transformers/issues/28106
+    deepspeed_plugin = (
+        AcceleratorState().deepspeed_plugin
+        if accelerate.state.is_initialized()
+        else None
+    )
+    if deepspeed_plugin is None:
+        # a @contextmanager generator must yield exactly once, even when
+        # there is nothing to disable.
+        logger.debug("DeepSpeed was not enabled.")
+        yield
+        return
+
+    if deepspeed_plugin and is_deepspeed_zero3_enabled():
+        logger.debug("Temporarily disabling DeepSpeed ZeRO stage 3.")
+        _hf_deepspeed_config_weak_ref = (
+            transformers.integrations.deepspeed._hf_deepspeed_config_weak_ref
+        )
+        unset_hf_deepspeed_config()
+        yield
+        logger.debug("Re-enabling DeepSpeed ZeRO stage 3.")
+        set_hf_deepspeed_config(HfDeepSpeedConfig(deepspeed_plugin.deepspeed_config))
+        transformers.integrations.deepspeed._hf_deepspeed_config_weak_ref = (
+            _hf_deepspeed_config_weak_ref
+        )
+    else:
+        logger.debug("DeepSpeed ZeRO stage 3 was not enabled; nothing to disable.")
+        yield
+
 
 def deepspeed_zero_init_disabled_context_manager():
     """
@@ -15,9 +51,16 @@ def deepspeed_zero_init_disabled_context_manager():
         else None
     )
     if deepspeed_plugin is None:
+        logger.debug("DeepSpeed context manager disabled, no DeepSpeed detected.")
         return []
 
-    return [deepspeed_plugin.zero3_init_context_manager(enable=False)]
+    logger.debug(
+        f"DeepSpeed context manager enabled, DeepSpeed detected: {deepspeed_plugin}"
+    )
+    return [
+        deepspeed_plugin.zero3_init_context_manager(enable=False),
+        temporarily_disable_deepspeed_zero3(),
+    ]
 
 
 def prepare_model_for_deepspeed(accelerator, args):
@@ -38,9 +81,19 @@ def prepare_model_for_deepspeed(accelerator, args):
         if offload_param["nvme_path"] == "none":
             if args.offload_param_path is None:
                 raise ValueError(
-                    f"DeepSpeed is using {offload_param['device']} but nvme_path is not specified."
+                    f"DeepSpeed is using {offload_param['device']} but nvme_path is not specified. The configuration has '{offload_param['nvme_path']}' for 'nvme_path'."
                 )
             else:
+                offload_buffer = 100000000.0
+                if args.model_family in ["flux"]:
+                    # flux is big
+                    offload_buffer = 131600000.0
+                logger.info(
+                    f"Attempting to allocate a {offload_buffer}-byte offload buffer."
+                )
+                accelerator.state.deepspeed_plugin.deepspeed_config[
+                    "zero_optimization"
+                ]["offload_param"]["buffer_size"] = offload_buffer
                 accelerator.state.deepspeed_plugin.deepspeed_config[
                     "zero_optimization"
                 ]["offload_param"]["nvme_path"] = args.offload_param_path
diff --git a/helpers/training/text_encoding.py b/helpers/training/text_encoding.py
index 34e65126..4eab5fd3 100644
--- a/helpers/training/text_encoding.py
+++ b/helpers/training/text_encoding.py
@@ -261,7 +261,6 @@ def load_tes(
             "EleutherAI/pile-t5-base",
             torch_dtype=weight_dtype,
         ).encoder
-        text_encoder_1.eval()
 
     if tokenizer_2 is not None:
         if args.model_family.lower() == "flux":
@@ -287,4 +286,9 @@ def load_tes(
             variant=args.variant,
         )
 
+    for te in [text_encoder_1, text_encoder_2, text_encoder_3]:
+        if te is None:
+            continue
+        te.eval()
+
     return text_encoder_variant, text_encoder_1, text_encoder_2, text_encoder_3
diff --git a/train.py b/train.py
index c6457eaf..9cbc4438 100644
--- a/train.py
+++ b/train.py
@@ -48,7 +48,7 @@
         trainer.init_preprocessing_models()
         trainer.init_precision(preprocessing_models_only=True)
         trainer.init_data_backend()
-        trainer.init_validation_prompts()
+        # trainer.init_validation_prompts()
         trainer.init_unload_text_encoder()
         trainer.init_unload_vae()
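
Patch 3's `temporarily_disable_deepspeed_zero3()` exists because accelerate's `zero3_init_context_manager` is not enough on its own: transformers also consults a process-global HF DeepSpeed config when loading, so the helper unsets that config around the load and then restores it, module-level weakref included. A usage sketch follows, assuming the helper is importable from `helpers/training/deepspeed.py` as in the patch; the T5 model id is illustrative:

    from transformers import T5EncoderModel

    from helpers.training.deepspeed import temporarily_disable_deepspeed_zero3

    # Load with full weights even though ZeRO-3 is configured for training.
    with temporarily_disable_deepspeed_zero3():
        text_encoder = T5EncoderModel.from_pretrained("google/t5-v1_1-xxl")
    text_encoder.eval()  # patch 3 also forces every loaded text encoder into eval mode
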
From dff9b3a4fceb35332a34fd87960ffda399d422c7 Mon Sep 17 00:00:00 2001
From: bghira
Date: Tue, 24 Dec 2024 14:09:32 -0600
Subject: [PATCH 4/4] update dependencies for deepspeed integrations

---
 poetry.lock    | 10 +++++-----
 pyproject.toml |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index f4f7c2f8..625c5004 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -655,12 +655,12 @@ vision = ["Pillow (>=9.4.0)"]
 
 [[package]]
 name = "deepspeed"
-version = "0.16.1"
+version = "0.16.2"
 description = "DeepSpeed library"
 optional = false
 python-versions = "*"
 files = [
-    {file = "deepspeed-0.16.1.tar.gz", hash = "sha256:058cb748d1c39f88580343fb69633fe45586225025522c1cf9f64b608a71820c"},
+    {file = "deepspeed-0.16.2.tar.gz", hash = "sha256:a5b45fdefae65ab48aa30873d3c4d6ec85993eee232aa3a088087c9b88b22281"},
 ]
 
 [package.dependencies]
 
 [package.extras]
 1bit-mpi = ["mpi4py"]
-all = ["accelerate", "autodoc_pydantic (>=2.0.0)", "clang-format (==18.1.3)", "comet_ml (>=3.41.0)", "deepspeed-kernels", "diffusers (>=0.25.0)", "docutils (<0.18)", "future", "google", "hjson", "importlib-metadata (>=4)", "lm-eval (==0.3.0)", "mpi4py", "mup", "neural-compressor (==2.1.0)", "packaging", "pre-commit (>=2.20.0)", "protobuf", "psutil", "py-cpuinfo", "pydantic (>=2.0.0)", "pytest (>=7.2.0)", "pytest-forked", "pytest-randomly", "pytest-xdist", "qtorch", "qtorch (==0.3.0)", "recommonmark", "safetensors", "sentencepiece", "sphinx", "sphinx-rtd-theme", "sphinx_rtd_theme", "tabulate", "tensorboard", "torch", "torchvision", "tqdm", "transformers (>=4.32.1)", "transformers (>=4.39.0)", "triton (==1.0.0)", "triton (==2.1.0)", "triton (>=2.1.0)", "wandb", "xgboost"]
+all = ["accelerate", "autodoc_pydantic (>=2.0.0)", "clang-format (==18.1.3)", "comet_ml (>=3.41.0)", "deepspeed-kernels", "diffusers (>=0.25.0)", "docutils (<0.18)", "future", "google", "hjson", "importlib-metadata (>=4)", "lm-eval (==0.3.0)", "mpi4py", "mup", "neural-compressor (==2.1.0)", "packaging", "pre-commit (>=3.2.0)", "protobuf", "psutil", "py-cpuinfo", "pydantic (>=2.0.0)", "pytest (>=7.2.0)", "pytest-forked", "pytest-randomly", "pytest-xdist", "qtorch", "qtorch (==0.3.0)", "recommonmark", "safetensors", "sentencepiece", "sphinx", "sphinx-rtd-theme", "sphinx_rtd_theme", "tabulate", "tensorboard", "torch", "torchvision", "tqdm", "transformers (>=4.32.1)", "transformers (>=4.39.0)", "triton (==1.0.0)", "triton (==2.1.0)", "triton (>=2.1.0)", "wandb", "xgboost"]
 autotuning = ["tabulate"]
 autotuning-ml = ["hjson", "tabulate", "xgboost"]
-dev = ["accelerate", "clang-format (==18.1.3)", "comet_ml (>=3.41.0)", "deepspeed-kernels", "docutils (<0.18)", "future", "importlib-metadata (>=4)", "mup", "pre-commit (>=2.20.0)", "pytest (>=7.2.0)", "pytest-forked", "pytest-randomly", "pytest-xdist", "qtorch (==0.3.0)", "recommonmark", "sphinx", "sphinx-rtd-theme", "tensorboard", "torchvision", "transformers (>=4.39.0)", "wandb"]
+dev = ["accelerate", "clang-format (==18.1.3)", "comet_ml (>=3.41.0)", "deepspeed-kernels", "docutils (<0.18)", "future", "importlib-metadata (>=4)", "mup", "pre-commit (>=3.2.0)", "pytest (>=7.2.0)", "pytest-forked", "pytest-randomly", "pytest-xdist", "qtorch (==0.3.0)", "recommonmark", "sphinx", "sphinx-rtd-theme", "tensorboard", "torchvision", "transformers (>=4.39.0)", "wandb"]
 inf = ["google", "lm-eval (==0.3.0)", "protobuf", "qtorch", "safetensors", "sentencepiece", "transformers (>=4.32.1)"]
 readthedocs = ["autodoc_pydantic (>=2.0.0)", "docutils (<0.18)", "hjson", "packaging", "psutil", "py-cpuinfo", "pydantic (>=2.0.0)", "recommonmark", "sphinx_rtd_theme", "torch", "tqdm"]
 sd = ["diffusers (>=0.25.0)", "triton (>=2.1.0)"]
@@ -5327,4 +5327,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.12"
-content-hash = "3d5bf1d8e4086ff4e2c2f769b3c99eaae79fc65003de82608d1a468d588c2bf4"
+content-hash = "3318240032866a32b1657f3b375a3a92b6bfe0f6c09cba6f40932082740233d8"
diff --git a/pyproject.toml b/pyproject.toml
index 582d3ad5..96577140 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,7 @@ wandb = "^0.19.1"
 requests = "^2.32.3"
 pillow = "^11.0.0"
 opencv-python = "^4.10.0.84"
-deepspeed = "^0.16.1"
+deepspeed = "^0.16.2"
 accelerate = "^1.2.1"
 safetensors = "^0.4.5"
 compel = "^2.0.1"