From a2b6581493a4439e67bed4c385c9180bcfad81d6 Mon Sep 17 00:00:00 2001 From: Mehran Maghoumi Date: Wed, 9 Oct 2024 11:19:30 -0700 Subject: [PATCH] [Tutorials] Improve the usability of the SDG tutorial's command (#286) Addressing some feedback from an internal review. Signed-off-by: Mehran Maghoumi Signed-off-by: Vinay Raman --- tutorials/peft-curation-with-sdg/README.md | 28 +++++++++++++--------- tutorials/peft-curation-with-sdg/main.py | 3 +++ 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/tutorials/peft-curation-with-sdg/README.md b/tutorials/peft-curation-with-sdg/README.md index e356d5ca..8963d74b 100644 --- a/tutorials/peft-curation-with-sdg/README.md +++ b/tutorials/peft-curation-with-sdg/README.md @@ -66,21 +66,27 @@ python tutorials/peft-curation-with-sdg/main.py \ --api-key YOUR_BUILD.NVIDIA.COM_API_KEY \ --device gpu -# To control the amount of synthetic data to generate using LLaMa 3.1 405B +# Here are some examples that: +# - Use the GPU and enable semantic deduplication +# - Do 1 round of synthetic data generation +# - Generate synthetic data using 0.1% of the real data +# - Use the specified model from build.nvidia.com for synthetic data generation + +# Using LLaMa 3.1 405B: python tutorials/peft-curation-with-sdg/main.py \ --api-key YOUR_BUILD.NVIDIA.COM_API_KEY \ - --device gpu \ # Use the GPU and enable semantic deduplication - --synth-gen-rounds 1 \ # Do 1 round of synthetic data generation - --synth-gen-ratio 0.001 \ # Generate synthetic data using 0.1% of the real data - --synth-gen-model "meta/llama-3.1-405b-instruct" # Use LLaMa 3.1 405B + --device gpu \ + --synth-gen-rounds 1 \ + --synth-gen-ratio 0.001 \ + --synth-gen-model "meta/llama-3.1-405b-instruct" -# To control the amount of synthetic data to generate using Nemotron-4 340B +# Using Nemotron-4 340B: python tutorials/peft-curation-with-sdg/main.py \ --api-key YOUR_BUILD.NVIDIA.COM_API_KEY \ - --device gpu \ # Use the GPU and enable semantic deduplication - 
--synth-gen-rounds 1 \ # Do 1 round of synthetic data generation - --synth-gen-ratio 0.001 \ # Generate synthetic data using 0.1% of the real data - --synth-gen-model "nvidia/nemotron-4-340b-instruct" # Use Nemotron-4 340B + --device gpu \ + --synth-gen-rounds 1 \ + --synth-gen-ratio 0.001 \ + --synth-gen-model "nvidia/nemotron-4-340b-instruct" ``` By default, this tutorial will use at most 8 workers to run the curation pipeline. If you face any @@ -91,4 +97,4 @@ Once the code finishes executing, the curated dataset will be available under `d By default, the script outputs splits for training (80%), validation (10%) and testing (10%). ## Next Step: Fine-tune Your Own Model -The curated dataset from this tutorial can be readily used for model customization and fine-tuning using the [NeMo Framework](https://github.com/NVIDIA/NeMo). Please refer to the [law title generation tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/llm/llama-3/sdg-law-title-generation/llama3-sdg-lora-nemofw.ipynb) in the NeMo Framework repository to learn more. +The curated dataset from this tutorial can be readily used for model customization and fine-tuning using the [NeMo Framework](https://github.com/NVIDIA/NeMo). Please refer to the [law title generation tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/llm/llama-3/sdg-law-title-generation/llama3-sdg-lora-nemofw.ipynb) in the NeMo Framework repository to learn more. In that tutorial, you will learn more about using the data you just curated to fine-tune a model that can read a legal question and generate a title for that question. 
diff --git a/tutorials/peft-curation-with-sdg/main.py b/tutorials/peft-curation-with-sdg/main.py index 5e134666..e51512c8 100644 --- a/tutorials/peft-curation-with-sdg/main.py +++ b/tutorials/peft-curation-with-sdg/main.py @@ -402,6 +402,9 @@ def main(): curated_dir = os.path.dirname(train_fp_curated) os.system(f"cp {val_fp} {curated_dir}") os.system(f"cp {test_fp} {curated_dir}") + print( + "--------------------------------------------------------------------------------" + ) print(f"Curated files are saved in '{curated_dir}'.")