From 58631bf3c2dfdbbf554667423eb64548144937e4 Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Mon, 16 Dec 2024 14:37:32 +0530 Subject: [PATCH 1/5] x Signed-off-by: SumanthRH --- docs/source/usage_guides/fsdp.md | 17 +++++++++++++++++ .../by_feature/fsdp_with_peak_mem_tracking.py | 8 ++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/source/usage_guides/fsdp.md b/docs/source/usage_guides/fsdp.md index a2f9c717a3f..5f460330299 100644 --- a/docs/source/usage_guides/fsdp.md +++ b/docs/source/usage_guides/fsdp.md @@ -186,6 +186,23 @@ accelerate merge-weights pytorch_model_fsdp_0/ output_path ## A few caveats to be aware of +- PyTorch FSDP auto wraps sub-modules. With `use_orig_params=False`, it flattens the parameters in each sub-module and shards them in place. + Due to this, any optimizer created before model wrapping gets broken and occupies more memory. Further, you might also observe correctness issues during training. + Hence, it is highly recommended and efficient to prepare the model before creating the optimizer. Example: +```diff + model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True) ++ model = accelerator.prepare(model) + + optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr) + +- model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( +- model, optimizer, train_dataloader, eval_dataloader, lr_scheduler +- ) + ++ optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( ++ optimizer, train_dataloader, eval_dataloader, lr_scheduler ++ ) +``` - In case of multiple models, pass the optimizers to the prepare call in the same order as corresponding models else `accelerator.save_state()` and `accelerator.load_state()` will result in wrong/unexpected behaviour. - This feature is incompatible with `--predict_with_generate` in the `run_translation.py` script of `Transformers` library. diff --git a/examples/by_feature/fsdp_with_peak_mem_tracking.py b/examples/by_feature/fsdp_with_peak_mem_tracking.py index 0cd3fec55d5..a788515f1c8 100644 --- a/examples/by_feature/fsdp_with_peak_mem_tracking.py +++ b/examples/by_feature/fsdp_with_peak_mem_tracking.py @@ -245,6 +245,9 @@ def collate_fn(examples): model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, return_dict=True, low_cpu_mem_usage=True ) + # In FSDP, ith `use_orig_params` as False, we need to `.prepare` the model before instantiating the optimizer. + # We prepare the model beforehand in all cases for simplicity. 
+ model = accelerator.prepare(model) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ @@ -267,8 +270,9 @@ def collate_fn(examples): num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps, ) - model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + + optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + optimizer, train_dataloader, eval_dataloader, lr_scheduler ) overall_step = 0 From ee33c1be59add6ade54384b81a2a1635f1eaf15e Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Mon, 16 Dec 2024 14:42:02 +0530 Subject: [PATCH 2/5] x Signed-off-by: SumanthRH --- examples/by_feature/fsdp_with_peak_mem_tracking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/by_feature/fsdp_with_peak_mem_tracking.py b/examples/by_feature/fsdp_with_peak_mem_tracking.py index a788515f1c8..426518988a0 100644 --- a/examples/by_feature/fsdp_with_peak_mem_tracking.py +++ b/examples/by_feature/fsdp_with_peak_mem_tracking.py @@ -245,7 +245,7 @@ def collate_fn(examples): model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, return_dict=True, low_cpu_mem_usage=True ) - # In FSDP, ith `use_orig_params` as False, we need to `.prepare` the model before instantiating the optimizer. + # In FSDP, with `use_orig_params` as False, we need to `.prepare` the model before instantiating the optimizer. # We prepare the model beforehand in all cases for simplicity. model = accelerator.prepare(model) From 32600c5bf48be86d70cc99da7620e8812157d64d Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Mon, 16 Dec 2024 20:21:05 +0530 Subject: [PATCH 3/5] x Signed-off-by: SumanthRH --- docs/source/usage_guides/fsdp.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/usage_guides/fsdp.md b/docs/source/usage_guides/fsdp.md index 5f460330299..50823142173 100644 --- a/docs/source/usage_guides/fsdp.md +++ b/docs/source/usage_guides/fsdp.md @@ -180,10 +180,11 @@ accelerate merge-weights pytorch_model_fsdp_0/ output_path ## Mapping between FSDP sharding strategies and DeepSpeed ZeRO Stages * `FULL_SHARD` maps to the DeepSpeed `ZeRO Stage-3`. Shards optimizer states, gradients and parameters. -* `SHARD_GRAD_OP` maps to the DeepSpeed `ZeRO Stage-2`. Shards optimizer states and gradients. +* `SHARD_GRAD_OP` maps to DeepSpeed `ZeRO Stage-2`. Shards optimizer states and gradients. A key difference from `ZeRO Stage-2` is that `SHARD_GRAD_OP` also shards the model parameters outside of computation (forward/backward passes). * `NO_SHARD` maps to `ZeRO Stage-0`. No sharding wherein each GPU has full copy of model, optimizer states and gradients. * `HYBRID_SHARD` maps to `ZeRO++ Stage-3` wherein `zero_hpz_partition_size=`. Here, this will shard optimizer states, gradients and parameters within each node while each node has full copy. + ## A few caveats to be aware of - PyTorch FSDP auto wraps sub-modules. With `use_orig_params=False`, it flattens the parameters in each sub-module and shards them in place. 
From 4ee8c3b9352df1d46ef217e9e5e272e563667987 Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Mon, 16 Dec 2024 20:22:05 +0530 Subject: [PATCH 4/5] x Signed-off-by: SumanthRH --- docs/source/usage_guides/fsdp.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/usage_guides/fsdp.md b/docs/source/usage_guides/fsdp.md index 50823142173..515f5db9aab 100644 --- a/docs/source/usage_guides/fsdp.md +++ b/docs/source/usage_guides/fsdp.md @@ -184,7 +184,6 @@ accelerate merge-weights pytorch_model_fsdp_0/ output_path * `NO_SHARD` maps to `ZeRO Stage-0`. No sharding wherein each GPU has full copy of model, optimizer states and gradients. * `HYBRID_SHARD` maps to `ZeRO++ Stage-3` wherein `zero_hpz_partition_size=`. Here, this will shard optimizer states, gradients and parameters within each node while each node has full copy. - ## A few caveats to be aware of - PyTorch FSDP auto wraps sub-modules. With `use_orig_params=False`, it flattens the parameters in each sub-module and shards them in place. From 00bafb4c10ed4d13e9160f93b3e7d7575a625ec4 Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Mon, 16 Dec 2024 21:07:06 +0530 Subject: [PATCH 5/5] fix ruff errors Signed-off-by: SumanthRH --- examples/by_feature/fsdp_with_peak_mem_tracking.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/by_feature/fsdp_with_peak_mem_tracking.py b/examples/by_feature/fsdp_with_peak_mem_tracking.py index 426518988a0..f2c87aa2c4e 100644 --- a/examples/by_feature/fsdp_with_peak_mem_tracking.py +++ b/examples/by_feature/fsdp_with_peak_mem_tracking.py @@ -245,7 +245,7 @@ def collate_fn(examples): model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, return_dict=True, low_cpu_mem_usage=True ) - # In FSDP, with `use_orig_params` as False, we need to `.prepare` the model before instantiating the optimizer. + # In FSDP, with `use_orig_params` as False, we need to `.prepare` the model before instantiating the optimizer. # We prepare the model beforehand in all cases for simplicity. model = accelerator.prepare(model) @@ -270,7 +270,6 @@ def collate_fn(examples): num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps, ) - optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( optimizer, train_dataloader, eval_dataloader, lr_scheduler )
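Putting the patches together, the recommended flow when `use_orig_params` is `False` is: prepare the model first, create the optimizer from the wrapped model's parameters, and only then prepare the remaining objects. The sketch below is an illustrative consolidation of that pattern, not part of the patch itself; the checkpoint name, learning rate, schedule length, and dummy dataloaders are placeholder assumptions, and it presumes the script is started via `accelerate launch` with an FSDP config where `use_orig_params` is `False`.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from accelerate import Accelerator
from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup

accelerator = Accelerator()

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

# With use_orig_params=False, FSDP flattens and shards parameters in place,
# so wrap the model *before* building the optimizer: the optimizer must hold
# references to the wrapped model's parameters, not the original ones.
model = accelerator.prepare(model)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# Dummy dataset/dataloaders purely to keep the sketch self-contained;
# a real script would use its tokenized dataset and collate_fn here.
dummy = TensorDataset(torch.randint(0, 100, (8, 16)), torch.randint(0, 2, (8,)))
train_dataloader = DataLoader(dummy, batch_size=4)
eval_dataloader = DataLoader(dummy, batch_size=4)

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3
)

# Prepare everything else afterwards; the model is intentionally not passed
# a second time, which avoids wrapping it twice.
optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
```

Keeping the model out of the second `prepare` call mirrors what `fsdp_with_peak_mem_tracking.py` does after patch 5: one `prepare(model)` before the optimizer is constructed, and a separate `prepare(...)` for the optimizer, dataloaders, and scheduler.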