From a2b6581493a4439e67bed4c385c9180bcfad81d6 Mon Sep 17 00:00:00 2001
From: Mehran Maghoumi <Maghoumi@users.noreply.github.com>
Date: Wed, 9 Oct 2024 11:19:30 -0700
Subject: [PATCH] [Tutorials] Improve the usability of the SDG tutorial's
 command (#286)

Address feedback from an internal review.

Signed-off-by: Mehran Maghoumi <Maghoumi@users.noreply.github.com>
Signed-off-by: Vinay Raman <viraman@nvidia.com>
---
 tutorials/peft-curation-with-sdg/README.md | 28 +++++++++++++---------
 tutorials/peft-curation-with-sdg/main.py   |  3 +++
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/tutorials/peft-curation-with-sdg/README.md b/tutorials/peft-curation-with-sdg/README.md
index e356d5ca..8963d74b 100644
--- a/tutorials/peft-curation-with-sdg/README.md
+++ b/tutorials/peft-curation-with-sdg/README.md
@@ -66,21 +66,27 @@ python tutorials/peft-curation-with-sdg/main.py \
     --api-key YOUR_BUILD.NVIDIA.COM_API_KEY \
     --device gpu
 
-# To control the amount of synthetic data to generate using LLaMa 3.1 405B
+# Here are some examples that:
+# - Use the GPU and enable semantic deduplication
+# - Do 1 round of synthetic data generation
+# - Generate synthetic data using 0.1% of the real data
+# - Use the specified model from build.nvidia.com for synthetic data generation
+
+# Using LLaMa 3.1 405B:
 python tutorials/peft-curation-with-sdg/main.py \
     --api-key YOUR_BUILD.NVIDIA.COM_API_KEY \
-    --device gpu \  # Use the GPU and enable semantic deduplication
-    --synth-gen-rounds 1 \ # Do 1 round of synthetic data generation
-    --synth-gen-ratio 0.001 \  # Generate synthetic data using 0.1% of the real data
-    --synth-gen-model "meta/llama-3.1-405b-instruct" # Use LLaMa 3.1 405B
+    --device gpu \
+    --synth-gen-rounds 1 \
+    --synth-gen-ratio 0.001 \
+    --synth-gen-model "meta/llama-3.1-405b-instruct"
 
-# To control the amount of synthetic data to generate using Nemotron-4 340B
+# Using Nemotron-4 340B:
 python tutorials/peft-curation-with-sdg/main.py \
     --api-key YOUR_BUILD.NVIDIA.COM_API_KEY \
-    --device gpu \  # Use the GPU and enable semantic deduplication
-    --synth-gen-rounds 1 \ # Do 1 round of synthetic data generation
-    --synth-gen-ratio 0.001 \  # Generate synthetic data using 0.1% of the real data
-    --synth-gen-model "nvidia/nemotron-4-340b-instruct" # Use Nemotron-4 340B
+    --device gpu \
+    --synth-gen-rounds 1 \
+    --synth-gen-ratio 0.001 \
+    --synth-gen-model "nvidia/nemotron-4-340b-instruct"
 ```
 
 By default, this tutorial will use at most 8 workers to run the curation pipeline. If you face any
@@ -91,4 +97,4 @@ Once the code finishes executing, the curated dataset will be available under `d
 By default, the script outputs splits for training (80%), validation (10%) and testing (10%).
 
 ## Next Step: Fine-tune Your Own Model
-The curated dataset from this tutorial can be readily used for model customization and fine-tuning using the [NeMo Framework](https://github.com/NVIDIA/NeMo). Please refer to the [law title generation tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/llm/llama-3/sdg-law-title-generation/llama3-sdg-lora-nemofw.ipynb) in the NeMo Framework repository to learn more.
+The curated dataset from this tutorial can be readily used for model customization and fine-tuning using the [NeMo Framework](https://github.com/NVIDIA/NeMo). Please refer to the [law title generation tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/llm/llama-3/sdg-law-title-generation/llama3-sdg-lora-nemofw.ipynb) in the NeMo Framework repository to learn more. That tutorial shows how to use the data you just curated to fine-tune a model that can read a legal question and generate a title for it.
diff --git a/tutorials/peft-curation-with-sdg/main.py b/tutorials/peft-curation-with-sdg/main.py
index 5e134666..e51512c8 100644
--- a/tutorials/peft-curation-with-sdg/main.py
+++ b/tutorials/peft-curation-with-sdg/main.py
@@ -402,6 +402,9 @@ def main():
     curated_dir = os.path.dirname(train_fp_curated)
     os.system(f"cp {val_fp} {curated_dir}")
     os.system(f"cp {test_fp} {curated_dir}")
+    print(
+        "--------------------------------------------------------------------------------"
+    )
     print(f"Curated files are saved in '{curated_dir}'.")