adds basic smoketests for main_ds and data_process CLI args
During development it's convenient to be able to run full distributed training, even on a smaller dataset, just to make sure that nothing obviously fails. This also exercises flash-attention support on the machine it's run on, as well as support for Granite models. Signed-off-by: James Kunstle <[email protected]>
Commit 6115704 · 1 parent e680bd8 · Showing 2 changed files with 162 additions and 0 deletions.
tests/README.md
@@ -0,0 +1,17 @@
# Overview

`smoketest.sh` cd's into the source directory and runs the script entrypoints for `main_ds.py` and `data_process.py`. Testing will break if file names or locations in the source tree change.

Existing tests are "smoke tests," meant to demonstrate that training completes (returns 0) or not. This is helpful for checking that all required dependencies are installed.

The tests add features incrementally (the corresponding functions and flags are sketched below):

1. No Flash Attention or Granite
2. No Granite, but Flash Attention enabled
3. Granite and Flash Attention enabled
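For orientation, these tiers map onto the three test functions defined in `smoketest.sh`; the sketch below uses the function and flag names from that script.

```bash
# Tier 1: test_standard_loop_noflashattention_nogranite  (passes --disable_flash_attn, no --is_granite)
# Tier 2: test_standard_loop_nongranite                   (flash attention left enabled, no --is_granite)
# Tier 3: test_standard_loop                              (flash attention left enabled, passes --is_granite)
```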
# Usage

The testing script can be run without parameters as `./smoketest.sh`. By default, this runs all tests with `FSDP` as the distributed training backend. To use the other available backend, DeepSpeed, run the script as `./smoketest.sh deepspeed`.

> NOTE: You'll need to install the training library to run the test. Inside a virtual environment and from inside the repo, run `pip3 install -e .` to install the package in editable mode.
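As a quick reference, a typical session might look like the sketch below; it assumes the script lives in `tests/` and that the commands are run from the repository root.

```bash
# install the training library in editable mode (inside a virtual environment)
pip3 install -e .

# run all smoke tests with the default FSDP backend
./tests/smoketest.sh

# or run the same tests with the DeepSpeed backend
./tests/smoketest.sh deepspeed
```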
tests/smoketest.sh
@@ -0,0 +1,145 @@
#!/usr/bin/env bash
set -eux

# ############### User-modifiable parameters ###############
# Change these as needed
NUM_GPUS=8
MAX_BATCH_LEN=60000
NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size.

# ############### Read-only parameters ###############
MODEL_NAME="instructlab/granite-7b-lab"
# gets directory of current file.
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
CORRECT_WORKING_DIR="$SCRIPT_DIR/../src/instructlab/training/"
SAMPLE_DATA_PATH="$SCRIPT_DIR/../sample-data/train_all_pruned_SDG.jsonl"
TMP_DIR=$(mktemp -d)
CHECKPOINTS_DIR="$TMP_DIR/checkpoints"
DATA_DIR="$TMP_DIR/data"
COMPUTED_DATA_PATH="$DATA_DIR/data.jsonl"
DEFAULT_DISTRIB_FRAMEWORK='fsdp'
DISTRIB_FRAMEWORK="${1:-$DEFAULT_DISTRIB_FRAMEWORK}" # defaults to FSDP
# ############### Test Functions ###############

function setup_tmpdir () {
    mkdir "$CHECKPOINTS_DIR"
    mkdir "$DATA_DIR"
}

function prepare_data () {
    # preprocesses .jsonl messages data so that it's a valid
    # input to the model (inputs tokenized, formatted with mask, etc.)
    # then, data is trimmed to a determined length to make training
    # go faster.

    python3 data_process.py \
        --data_path="$SAMPLE_DATA_PATH" \
        --data_output_path="$DATA_DIR" \
        --max_seq_len=4096 \
        --model_name_or_path="$MODEL_NAME"

    # trim data so we only keep the first 'n' samples.
    # should be enough data for training to be meaningful but not enough
    # that training takes a large amount of time.
    # (the command substitution runs before the redirection truncates the
    # file, so this rewrites the data file in place.)
    echo "$(head -"$NUM_SAMPLES_TRAINED_ON" "$COMPUTED_DATA_PATH")" > "$COMPUTED_DATA_PATH"

    echo "TRAINING ON $(wc -l "$COMPUTED_DATA_PATH") SAMPLES"
}

function _cleanup_saved_checkpoints() {
    echo "CLEARING CHECKPOINTS: $CHECKPOINTS_DIR"
    rm -rf "$CHECKPOINTS_DIR"
    mkdir "$CHECKPOINTS_DIR"
}
function test_standard_loop () {
    # Tests most common training parameters.
    # - FSDP
    # - Save full state
    # - Save hf_format
    # - padding-free
    # - flash attention

    torchrun \
        --standalone \
        --nproc_per_node="$NUM_GPUS" \
        main_ds.py \
        --model_name_or_path="$MODEL_NAME" \
        --data_path="$COMPUTED_DATA_PATH" \
        --output_dir="$CHECKPOINTS_DIR" \
        --num_epochs=1 \
        --effective_batch_size=128 \
        --save_samples=0 \
        --checkpoint_at_epoch \
        --accelerate_full_state_at_epoch \
        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
        --max_batch_len="$MAX_BATCH_LEN" \
        --is_granite
}
function test_standard_loop_nongranite () {
    # Tests most common training parameters without
    # using Granite
    # - FSDP
    # - Save full state
    # - Save hf_format
    # - flash attention

    torchrun \
        --standalone \
        --nproc_per_node="$NUM_GPUS" \
        main_ds.py \
        --model_name_or_path="$MODEL_NAME" \
        --data_path="$COMPUTED_DATA_PATH" \
        --output_dir="$CHECKPOINTS_DIR" \
        --num_epochs=1 \
        --effective_batch_size=128 \
        --save_samples=0 \
        --checkpoint_at_epoch \
        --accelerate_full_state_at_epoch \
        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
        --max_batch_len="$MAX_BATCH_LEN"
        # --is_granite
}
function test_standard_loop_noflashattention_nogranite () {
    # Tests most common training parameters without
    # using Granite or Flash Attention
    # - FSDP
    # - Save full state
    # - Save hf_format

    torchrun \
        --standalone \
        --nproc_per_node="$NUM_GPUS" \
        main_ds.py \
        --model_name_or_path="$MODEL_NAME" \
        --data_path="$COMPUTED_DATA_PATH" \
        --output_dir="$CHECKPOINTS_DIR" \
        --num_epochs=1 \
        --effective_batch_size=128 \
        --save_samples=0 \
        --checkpoint_at_epoch \
        --accelerate_full_state_at_epoch \
        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
        --max_batch_len="$MAX_BATCH_LEN" \
        --disable_flash_attn
        # --is_granite
}
# ############### Setup and tests ###############
setup_tmpdir
trap "rm -rf $TMP_DIR" EXIT

# NOTE (jkunstle): script is run as though it's
# in the same source dir as main_ds and data_process.
cd "$CORRECT_WORKING_DIR"
echo "CURRENT WORKING DIRECTORY: $(pwd)"

prepare_data
test_standard_loop_noflashattention_nogranite
_cleanup_saved_checkpoints
test_standard_loop_nongranite
_cleanup_saved_checkpoints
test_standard_loop