From 58631bf3c2dfdbbf554667423eb64548144937e4 Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Mon, 16 Dec 2024 14:37:32 +0530 Subject: [PATCH 1/5] x Signed-off-by: SumanthRH --- docs/source/usage_guides/fsdp.md | 17 +++++++++++++++++ .../by_feature/fsdp_with_peak_mem_tracking.py | 8 ++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/source/usage_guides/fsdp.md b/docs/source/usage_guides/fsdp.md index a2f9c717a3f..5f460330299 100644 --- a/docs/source/usage_guides/fsdp.md +++ b/docs/source/usage_guides/fsdp.md @@ -186,6 +186,23 @@ accelerate merge-weights pytorch_model_fsdp_0/ output_path ## A few caveats to be aware of +- PyTorch FSDP auto wraps sub-modules. With `use_orig_params=False`, it flattens the parameters in each sub-module and shards them in place. + Due to this, any optimizer created before model wrapping gets broken and occupies more memory. Further, you might also observe correctness issues during training. + Hence, it is highly recommended and efficient to prepare the model before creating the optimizer. Example: +```diff + model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True) ++ model = accelerator.prepare(model) + + optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr) + +- model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( +- model, optimizer, train_dataloader, eval_dataloader, lr_scheduler +- ) + ++ optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( ++ optimizer, train_dataloader, eval_dataloader, lr_scheduler ++ ) +``` - In case of multiple models, pass the optimizers to the prepare call in the same order as corresponding models else `accelerator.save_state()` and `accelerator.load_state()` will result in wrong/unexpected behaviour. - This feature is incompatible with `--predict_with_generate` in the `run_translation.py` script of `Transformers` library. diff --git a/examples/by_feature/fsdp_with_peak_mem_tracking.py b/examples/by_feature/fsdp_with_peak_mem_tracking.py index 0cd3fec55d5..a788515f1c8 100644 --- a/examples/by_feature/fsdp_with_peak_mem_tracking.py +++ b/examples/by_feature/fsdp_with_peak_mem_tracking.py @@ -245,6 +245,9 @@ def collate_fn(examples): model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, return_dict=True, low_cpu_mem_usage=True ) + # In FSDP, ith `use_orig_params` as False, we need to `.prepare` the model before instantiating the optimizer. + # We prepare the model beforehand in all cases for simplicity. 
+ model = accelerator.prepare(model) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ @@ -267,8 +270,9 @@ def collate_fn(examples): num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps, ) - model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + + optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + optimizer, train_dataloader, eval_dataloader, lr_scheduler ) overall_step = 0 From ee33c1be59add6ade54384b81a2a1635f1eaf15e Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Mon, 16 Dec 2024 14:42:02 +0530 Subject: [PATCH 2/5] x Signed-off-by: SumanthRH --- examples/by_feature/fsdp_with_peak_mem_tracking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/by_feature/fsdp_with_peak_mem_tracking.py b/examples/by_feature/fsdp_with_peak_mem_tracking.py index a788515f1c8..426518988a0 100644 --- a/examples/by_feature/fsdp_with_peak_mem_tracking.py +++ b/examples/by_feature/fsdp_with_peak_mem_tracking.py @@ -245,7 +245,7 @@ def collate_fn(examples): model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, return_dict=True, low_cpu_mem_usage=True ) - # In FSDP, ith `use_orig_params` as False, we need to `.prepare` the model before instantiating the optimizer. + # In FSDP, with `use_orig_params` as False, we need to `.prepare` the model before instantiating the optimizer. # We prepare the model beforehand in all cases for simplicity. model = accelerator.prepare(model) From 32600c5bf48be86d70cc99da7620e8812157d64d Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Mon, 16 Dec 2024 20:21:05 +0530 Subject: [PATCH 3/5] x Signed-off-by: SumanthRH --- docs/source/usage_guides/fsdp.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/usage_guides/fsdp.md b/docs/source/usage_guides/fsdp.md index 5f460330299..50823142173 100644 --- a/docs/source/usage_guides/fsdp.md +++ b/docs/source/usage_guides/fsdp.md @@ -180,10 +180,11 @@ accelerate merge-weights pytorch_model_fsdp_0/ output_path ## Mapping between FSDP sharding strategies and DeepSpeed ZeRO Stages * `FULL_SHARD` maps to the DeepSpeed `ZeRO Stage-3`. Shards optimizer states, gradients and parameters. -* `SHARD_GRAD_OP` maps to the DeepSpeed `ZeRO Stage-2`. Shards optimizer states and gradients. +* `SHARD_GRAD_OP` maps to DeepSpeed `ZeRO Stage-2`. Shards optimizer states and gradients. A key difference from `ZeRO Stage-2` is that `SHARD_GRAD_OP` also shards the model parameters outside of computation (forward/backward passes). * `NO_SHARD` maps to `ZeRO Stage-0`. No sharding wherein each GPU has full copy of model, optimizer states and gradients. * `HYBRID_SHARD` maps to `ZeRO++ Stage-3` wherein `zero_hpz_partition_size=`. Here, this will shard optimizer states, gradients and parameters within each node while each node has full copy. + ## A few caveats to be aware of - PyTorch FSDP auto wraps sub-modules. With `use_orig_params=False`, it flattens the parameters in each sub-module and shards them in place. 
From 4ee8c3b9352df1d46ef217e9e5e272e563667987 Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Mon, 16 Dec 2024 20:22:05 +0530 Subject: [PATCH 4/5] x Signed-off-by: SumanthRH --- docs/source/usage_guides/fsdp.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/usage_guides/fsdp.md b/docs/source/usage_guides/fsdp.md index 50823142173..515f5db9aab 100644 --- a/docs/source/usage_guides/fsdp.md +++ b/docs/source/usage_guides/fsdp.md @@ -184,7 +184,6 @@ accelerate merge-weights pytorch_model_fsdp_0/ output_path * `NO_SHARD` maps to `ZeRO Stage-0`. No sharding wherein each GPU has full copy of model, optimizer states and gradients. * `HYBRID_SHARD` maps to `ZeRO++ Stage-3` wherein `zero_hpz_partition_size=`. Here, this will shard optimizer states, gradients and parameters within each node while each node has full copy. - ## A few caveats to be aware of - PyTorch FSDP auto wraps sub-modules. With `use_orig_params=False`, it flattens the parameters in each sub-module and shards them in place. From 00bafb4c10ed4d13e9160f93b3e7d7575a625ec4 Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Mon, 16 Dec 2024 21:07:06 +0530 Subject: [PATCH 5/5] fix ruff errors Signed-off-by: SumanthRH --- examples/by_feature/fsdp_with_peak_mem_tracking.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/by_feature/fsdp_with_peak_mem_tracking.py b/examples/by_feature/fsdp_with_peak_mem_tracking.py index 426518988a0..f2c87aa2c4e 100644 --- a/examples/by_feature/fsdp_with_peak_mem_tracking.py +++ b/examples/by_feature/fsdp_with_peak_mem_tracking.py @@ -245,7 +245,7 @@ def collate_fn(examples): model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, return_dict=True, low_cpu_mem_usage=True ) - # In FSDP, with `use_orig_params` as False, we need to `.prepare` the model before instantiating the optimizer. + # In FSDP, with `use_orig_params` as False, we need to `.prepare` the model before instantiating the optimizer. # We prepare the model beforehand in all cases for simplicity. model = accelerator.prepare(model) @@ -270,7 +270,6 @@ def collate_fn(examples): num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps, ) - optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( optimizer, train_dataloader, eval_dataloader, lr_scheduler )
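Putting the patches together, the recommended flow when `use_orig_params` is `False` is: prepare the model first, create the optimizer from the wrapped model's parameters, and only then prepare the remaining objects. The sketch below is an illustrative consolidation of that pattern, not part of the patch itself; the checkpoint name, learning rate, schedule length, and dummy dataloaders are placeholder assumptions, and it presumes the script is started via `accelerate launch` with an FSDP config where `use_orig_params` is `False`.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from accelerate import Accelerator
from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup

accelerator = Accelerator()

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

# With use_orig_params=False, FSDP flattens and shards parameters in place,
# so wrap the model *before* building the optimizer: the optimizer must hold
# references to the wrapped model's parameters, not the original ones.
model = accelerator.prepare(model)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# Dummy dataset/dataloaders purely to keep the sketch self-contained;
# a real script would use its tokenized dataset and collate_fn here.
dummy = TensorDataset(torch.randint(0, 100, (8, 16)), torch.randint(0, 2, (8,)))
train_dataloader = DataLoader(dummy, batch_size=4)
eval_dataloader = DataLoader(dummy, batch_size=4)

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3
)

# Prepare everything else afterwards; the model is intentionally not passed
# a second time, which avoids wrapping it twice.
optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
```

Keeping the model out of the second `prepare` call mirrors what `fsdp_with_peak_mem_tracking.py` does after patch 5: one `prepare(model)` before the optimizer is constructed, and a separate `prepare(...)` for the optimizer, dataloaders, and scheduler.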