From 611570405aa1f6114a251b7fe9c48f1a58d96ace Mon Sep 17 00:00:00 2001
From: James Kunstle
Date: Wed, 16 Oct 2024 22:59:53 -0700
Subject: [PATCH] adds basic smoketests for main_ds and data_process CLI args

During development it's convenient to be able to run full distributed
training, even on a smaller dataset, just to make sure that nothing
obviously fails. This will also capture support for flash attention on
the machine that it's run on, and for granite models.

Signed-off-by: James Kunstle
---
 tests/README.md    |  17 ++++++
 tests/smoketest.sh | 145 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+)
 create mode 100644 tests/README.md
 create mode 100755 tests/smoketest.sh

diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 00000000..0d4af588
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,17 @@
+# Overview
+
+`smoketest.sh` cd's into the source directory and runs the script-entrypoint for `main_ds.py` and `data_process.py`. Testing will break if file names or locations in the source tree change.
+
+Existing tests are "smoke tests," meant to demonstrate that training completes (returns 0) or not. This is helpful to check if all required dependencies are installed.
+
+Tests add features over time:
+
+1. No Flash Attention or Granite
+2. No Granite but Flash Attention enabled
+3. Granite and Flash Attention enabled
+
+# Usage
+
+The testing script can be run without parameters as `./smoketest.sh`. By default, this will run all tests with `FSDP` as the distributed training backend. To change the distributed training backend to the other available option, one can run the script as `./smoketest.sh deepspeed`.
+
+> NOTE: You'll need to install the training library to run the test. Inside a virtual environment and inside the repo, please run `pip3 install -e .` to install the package in editable mode.
\ No newline at end of file
diff --git a/tests/smoketest.sh b/tests/smoketest.sh
new file mode 100755
index 00000000..1b482e5a
--- /dev/null
+++ b/tests/smoketest.sh
@@ -0,0 +1,147 @@
+#!/usr/bin/env bash
+# Smoke tests for the training library: runs data preprocessing and three
+# variants of distributed training (with/without Granite and Flash Attention)
+# and fails loudly if any step returns non-zero.
+set -euxo pipefail
+
+# ############### User-modifiable parameters ###############
+# Change these as needed
+NUM_GPUS=8
+MAX_BATCH_LEN=60000
+NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size.
+
+# ############### Read-only parameters ###############
+MODEL_NAME="instructlab/granite-7b-lab"
+# gets directory of current file.
+SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+CORRECT_WORKING_DIR="$SCRIPT_DIR/../src/instructlab/training/"
+SAMPLE_DATA_PATH="$SCRIPT_DIR/../sample-data/train_all_pruned_SDG.jsonl"
+TMP_DIR=$(mktemp -d)
+CHECKPOINTS_DIR="$TMP_DIR/checkpoints"
+DATA_DIR="$TMP_DIR/data"
+COMPUTED_DATA_PATH="$DATA_DIR/data.jsonl"
+DEFAULT_DISTRIB_FRAMEWORK='fsdp'
+DISTRIB_FRAMEWORK="${1:-$DEFAULT_DISTRIB_FRAMEWORK}" # defaults to FSDP
+
+# ############### Test Functions ###############
+
+function setup_tmpdir () {
+    mkdir "$CHECKPOINTS_DIR"
+    mkdir "$DATA_DIR"
+}
+
+function prepare_data () {
+    # preprocesses .jsonl messages data so that it's a valid
+    # input to the model (inputs tokenized, formatted with mask, etc.)
+    # then, data is trimmed to a determined length to make training
+    # go faster.
+
+    python3 data_process.py \
+        --data_path="$SAMPLE_DATA_PATH" \
+        --data_output_path="$DATA_DIR" \
+        --max_seq_len=4096 \
+        --model_name_or_path="$MODEL_NAME"
+
+    # trim data so we only keep the first 'n' samples.
+    # should be enough data for training to be meaningful but not enough
+    # that training takes a large amount of time. Write through a temp
+    # file so we never truncate the file we're still reading from.
+    head -n "$NUM_SAMPLES_TRAINED_ON" "$COMPUTED_DATA_PATH" > "$COMPUTED_DATA_PATH.tmp"
+    mv "$COMPUTED_DATA_PATH.tmp" "$COMPUTED_DATA_PATH"
+
+    echo "TRAINING ON $(wc -l "$COMPUTED_DATA_PATH") SAMPLES"
+}
+
+function _cleanup_saved_checkpoints() {
+    echo "CLEARING CHECKPOINTS: $CHECKPOINTS_DIR"
+    rm -rf "$CHECKPOINTS_DIR"
+    mkdir "$CHECKPOINTS_DIR"
+}
+
+function test_standard_loop () {
+    # Tests most common training parameters.
+    # - FSDP
+    # - Save full state
+    # - Save hf_format
+    # - padding-free
+    # - flash attention
+
+    # NOTE: last flag must not end with '\' or the closing '}' would be
+    # consumed as a torchrun argument and break the function definition.
+    torchrun \
+        --standalone \
+        --nproc_per_node="$NUM_GPUS" \
+        main_ds.py \
+        --model_name_or_path="$MODEL_NAME" \
+        --data_path="$COMPUTED_DATA_PATH" \
+        --output_dir="$CHECKPOINTS_DIR" \
+        --num_epochs=1 \
+        --effective_batch_size=128 \
+        --save_samples=0 \
+        --checkpoint_at_epoch \
+        --accelerate_full_state_at_epoch \
+        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
+        --max_batch_len="$MAX_BATCH_LEN" \
+        --is_granite
+}
+
+function test_standard_loop_nongranite () {
+    # Tests most common training parameters without
+    # using Granite
+    # - FSDP
+    # - Save full state
+    # - Save hf_format
+    # - flash attention
+
+    torchrun \
+        --standalone \
+        --nproc_per_node="$NUM_GPUS" \
+        main_ds.py \
+        --model_name_or_path="$MODEL_NAME" \
+        --data_path="$COMPUTED_DATA_PATH" \
+        --output_dir="$CHECKPOINTS_DIR" \
+        --num_epochs=1 \
+        --effective_batch_size=128 \
+        --save_samples=0 \
+        --checkpoint_at_epoch \
+        --accelerate_full_state_at_epoch \
+        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
+        --max_batch_len="$MAX_BATCH_LEN"
+        # --is_granite intentionally omitted
+}
+
+function test_standard_loop_noflashattention_nogranite () {
+    # Tests most common training parameters without
+    # using Granite or Flash Attention
+    # - FSDP
+    # - Save full state
+    # - Save hf_format
+
+    torchrun \
+        --standalone \
+        --nproc_per_node="$NUM_GPUS" \
+        main_ds.py \
+        --model_name_or_path="$MODEL_NAME" \
+        --data_path="$COMPUTED_DATA_PATH" \
+        --output_dir="$CHECKPOINTS_DIR" \
+        --num_epochs=1 \
+        --effective_batch_size=128 \
+        --save_samples=0 \
+        --checkpoint_at_epoch \
+        --accelerate_full_state_at_epoch \
+        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
+        --max_batch_len="$MAX_BATCH_LEN" \
+        --disable_flash_attn
+        # --is_granite intentionally omitted
+}
+
+# ############### Setup and tests ###############
+setup_tmpdir
+# single-quoted so TMP_DIR is expanded (quoted) when the trap fires,
+# not when it's registered.
+trap 'rm -rf "$TMP_DIR"' EXIT
+
+#NOTE (jkunstle): script is run as though it's
+# in the same source dir as main_ds and data_process.
+cd "$CORRECT_WORKING_DIR"
+echo "CURRENT WORKING DIRECTORY: $(pwd)"
+
+prepare_data
+test_standard_loop_noflashattention_nogranite
+_cleanup_saved_checkpoints
+test_standard_loop_nongranite
+_cleanup_saved_checkpoints
+test_standard_loop