From 83239b0dec01be30842cdde53cb19da2bd3697a7 Mon Sep 17 00:00:00 2001 From: Szymon Duchniewicz Date: Wed, 18 Sep 2024 12:51:10 +0100 Subject: [PATCH 1/3] Add special case for saving model when running with ZERO-3 optimisation. Signed-off-by: Szymon Duchniewicz --- llm_unlearn_ucl/unlearn_harm.py | 67 +++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 7 deletions(-) diff --git a/llm_unlearn_ucl/unlearn_harm.py b/llm_unlearn_ucl/unlearn_harm.py index 95ddd28..794092f 100644 --- a/llm_unlearn_ucl/unlearn_harm.py +++ b/llm_unlearn_ucl/unlearn_harm.py @@ -132,14 +132,14 @@ def main(args) -> None: ) % args.batch_size == 0, "samples in each 'sequence' (--samples_count / --sequential) should be a multiple of batch_size." if args.wandb_log: - accelerator = Accelerator(log_with="wandb") + accelerator: Accelerator = Accelerator(log_with="wandb") accelerator.init_trackers( project_name=args.wandb_project_name, config=vars(args), init_kwargs={"wandb": {"name": args.wandb_run_name}}, ) else: - accelerator = Accelerator() + accelerator: Accelerator = Accelerator() device = accelerator.device # setup logging @@ -365,8 +365,24 @@ def main(args) -> None: optimizer.zero_grad() # NOTE: This only handles deepspeed zero and zero2, zero3 will require change - if accelerator.is_local_main_process: - if args.sequential == 1 and epoch_num % args.save_every == 0: + if args.sequential == 1 and epoch_num % args.save_every == 0: + # NOTE: special case for zero 3 + if accelerator.deepspeed_config is not None and accelerator.deepspeed_config['zero_optimization']['stage'] == 3: + print("Zero 3 optim: Saving model shards from all GPUs!") + model_tokenizer_save_dir = Path( + os.path.join(args.model_save_dir, f"idx_{epoch_num}") + ) + model_tokenizer_save_dir.mkdir(parents=True, exist_ok=True) + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + model_tokenizer_save_dir, + is_main_process=accelerator.is_main_process, + 
save_function=accelerator.save, + state_dict=accelerator.get_state_dict(model), + ) + tokenizer.save_pretrained(model_tokenizer_save_dir) + print(f"Saved zero-3 model at step {epoch_num}.") + elif accelerator.is_local_main_process: accelerator.wait_for_everyone() # for model saving # NOTE: Batch unlearning, save for every epoch model_tokenizer_save_dir = Path( @@ -424,8 +440,25 @@ def main(args) -> None: optimizer.zero_grad() idx += 1 final_model_tag = idx - if accelerator.is_local_main_process: - if idx % args.save_every == 0: + if idx % args.save_every == 0: + # NOTE: special case for zero 3 + if accelerator.deepspeed_config is not None and accelerator.deepspeed_config['zero_optimization']['stage'] == 3: + print("Zero 3 optim: Saving model shards from all GPUs!") + model_tokenizer_save_dir = Path( + os.path.join(args.model_save_dir, f"idx_{epoch_num}") + ) + model_tokenizer_save_dir.mkdir(parents=True, exist_ok=True) + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + model_tokenizer_save_dir, + is_main_process=accelerator.is_main_process, + save_function=accelerator.save, + state_dict=accelerator.get_state_dict(model), + ) + tokenizer.save_pretrained(model_tokenizer_save_dir) + print(f"Saved zero-3 model at step {epoch_num}.") + elif accelerator.is_local_main_process: + # If not using zero 3, just save the entire model on the main process (it's not sharded) accelerator.wait_for_everyone() # for model saving # Save model and tokenizer. model_tokenizer_save_dir = Path( @@ -468,7 +501,27 @@ def main(args) -> None: model = model.merge_and_unload() # Save final model. 
- if accelerator.is_local_main_process: + # NOTE: special case for zero 3 + if accelerator.deepspeed_config is not None and accelerator.deepspeed_config['zero_optimization']['stage'] == 3: + print("Zero 3 optim: Saving model shards from all GPUs!") + model_tokenizer_save_dir = Path( + os.path.join(args.model_save_dir, f"idx_{epoch_num}") + ) + model_tokenizer_save_dir.mkdir(parents=True, exist_ok=True) + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + model_tokenizer_save_dir, + is_main_process=accelerator.is_main_process, + save_function=accelerator.save, + state_dict=accelerator.get_state_dict(model), + ) + tokenizer.save_pretrained(model_tokenizer_save_dir) + print(f"Saved final zero-3 model at step {epoch_num}.") + print("Unlearning finished") + logger.info("Unlearning finished") + if bool(args.wandb_log): + accelerator.end_training() + elif accelerator.is_local_main_process: model_tokenizer_save_dir = Path( os.path.join(args.model_save_dir, f"idx_{final_model_tag}") ) From 7ac0b9cc94200e69d3f39603ce39c88b4e460b31 Mon Sep 17 00:00:00 2001 From: Willmish Date: Wed, 18 Sep 2024 12:00:43 +0000 Subject: [PATCH 2/3] :art: Format Python code with psf/black --- llm_unlearn_ucl/unlearn_harm.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/llm_unlearn_ucl/unlearn_harm.py b/llm_unlearn_ucl/unlearn_harm.py index 794092f..b8169f0 100644 --- a/llm_unlearn_ucl/unlearn_harm.py +++ b/llm_unlearn_ucl/unlearn_harm.py @@ -367,7 +367,11 @@ def main(args) -> None: # NOTE: This only handles deepspeed zero and zero2, zero3 will require change if args.sequential == 1 and epoch_num % args.save_every == 0: # NOTE: special case for zero 3 - if accelerator.deepspeed_config is not None and accelerator.deepspeed_config['zero_optimization']['stage'] == 3: + if ( + accelerator.deepspeed_config is not None + and accelerator.deepspeed_config["zero_optimization"]["stage"] + == 3 + ): print("Zero 3 optim: Saving model 
shards from all GPUs!") model_tokenizer_save_dir = Path( os.path.join(args.model_save_dir, f"idx_{epoch_num}") @@ -442,7 +446,11 @@ def main(args) -> None: final_model_tag = idx if idx % args.save_every == 0: # NOTE: special case for zero 3 - if accelerator.deepspeed_config is not None and accelerator.deepspeed_config['zero_optimization']['stage'] == 3: + if ( + accelerator.deepspeed_config is not None + and accelerator.deepspeed_config["zero_optimization"]["stage"] + == 3 + ): print("Zero 3 optim: Saving model shards from all GPUs!") model_tokenizer_save_dir = Path( os.path.join(args.model_save_dir, f"idx_{epoch_num}") @@ -502,7 +510,10 @@ def main(args) -> None: # Save final model. # NOTE: special case for zero 3 - if accelerator.deepspeed_config is not None and accelerator.deepspeed_config['zero_optimization']['stage'] == 3: + if ( + accelerator.deepspeed_config is not None + and accelerator.deepspeed_config["zero_optimization"]["stage"] == 3 + ): print("Zero 3 optim: Saving model shards from all GPUs!") model_tokenizer_save_dir = Path( os.path.join(args.model_save_dir, f"idx_{epoch_num}") From 837dd6efdcbc4cbe7df313eab2c54b1f0509db36 Mon Sep 17 00:00:00 2001 From: Szymon Duchniewicz Date: Wed, 18 Sep 2024 15:13:31 +0100 Subject: [PATCH 3/3] Add deepspeed 4 gpu config. 
Signed-off-by: Szymon Duchniewicz --- llm_unlearn_ucl/deepspeed_4gpu.yaml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 llm_unlearn_ucl/deepspeed_4gpu.yaml diff --git a/llm_unlearn_ucl/deepspeed_4gpu.yaml b/llm_unlearn_ucl/deepspeed_4gpu.yaml new file mode 100644 index 0000000..018beb9 --- /dev/null +++ b/llm_unlearn_ucl/deepspeed_4gpu.yaml @@ -0,0 +1,23 @@ +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + gradient_accumulation_steps: 1 + offload_optimizer_device: none + offload_param_device: none + zero3_init_flag: false + zero3_save_16bit_model: true + zero_stage: 3 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +enable_cpu_affinity: false +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 4 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false