From 611570405aa1f6114a251b7fe9c48f1a58d96ace Mon Sep 17 00:00:00 2001
From: James Kunstle
Date: Wed, 16 Oct 2024 22:59:53 -0700
Subject: [PATCH] adds basic smoketests for main_ds and data_process CLI args

During development it's convenient to be able to run full distributed
training, even on a smaller dataset, just to make sure that nothing
obviously fails. This will also capture support for flash attention on
the machine that it's run on, and for granite models.

Signed-off-by: James Kunstle
---
 tests/README.md    |  17 ++++++
 tests/smoketest.sh | 145 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+)
 create mode 100644 tests/README.md
 create mode 100755 tests/smoketest.sh

diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 00000000..0d4af588
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,17 @@
+# Overview
+
+`smoketest.sh` cd's into the source directory and runs the script-entrypoint for `main_ds.py` and `data_process.py`. Testing will break if file names or locations in the source tree change.
+
+Existing tests are "smoke tests," meant to demonstrate that training completes (returns 0) or not. This is helpful to check if all required dependencies are installed.
+
+Tests add features over time:
+
+1. No Flash Attention or Granite
+2. No Granite but Flash Attention enabled
+3. Granite and Flash Attention enabled
+
+# Usage
+
+The testing script can be run without parameters as `./smoketest.sh`. By default, this will run all tests with `FSDP` as the distributed training backend. To change the distributed training backend to the other available option, one can run the script as `./smoketest.sh deepspeed`.
+
+> NOTE: You'll need to install the training library to run the test. Inside a virtual environment and inside the repo, please run `pip3 install -e .` to install the package in editable mode.
\ No newline at end of file
diff --git a/tests/smoketest.sh b/tests/smoketest.sh
new file mode 100755
index 00000000..1b482e5a
--- /dev/null
+++ b/tests/smoketest.sh
@@ -0,0 +1,147 @@
+#!/usr/bin/env bash
+# Smoke tests for the training library: runs data preprocessing and three
+# variants of distributed training (with/without Granite and Flash Attention)
+# and fails loudly if any step returns non-zero.
+set -euxo pipefail
+
+# ############### User-modifiable parameters ###############
+# Change these as needed
+NUM_GPUS=8
+MAX_BATCH_LEN=60000
+NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size.
+
+# ############### Read-only parameters ###############
+MODEL_NAME="instructlab/granite-7b-lab"
+# gets directory of current file.
+SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+CORRECT_WORKING_DIR="$SCRIPT_DIR/../src/instructlab/training/"
+SAMPLE_DATA_PATH="$SCRIPT_DIR/../sample-data/train_all_pruned_SDG.jsonl"
+TMP_DIR=$(mktemp -d)
+CHECKPOINTS_DIR="$TMP_DIR/checkpoints"
+DATA_DIR="$TMP_DIR/data"
+COMPUTED_DATA_PATH="$DATA_DIR/data.jsonl"
+DEFAULT_DISTRIB_FRAMEWORK='fsdp'
+DISTRIB_FRAMEWORK="${1:-$DEFAULT_DISTRIB_FRAMEWORK}" # defaults to FSDP
+
+# ############### Test Functions ###############
+
+function setup_tmpdir () {
+    mkdir "$CHECKPOINTS_DIR"
+    mkdir "$DATA_DIR"
+}
+
+function prepare_data () {
+    # preprocesses .jsonl messages data so that it's a valid
+    # input to the model (inputs tokenized, formatted with mask, etc.)
+    # then, data is trimmed to a determined length to make training
+    # go faster.
+
+    python3 data_process.py \
+        --data_path="$SAMPLE_DATA_PATH" \
+        --data_output_path="$DATA_DIR" \
+        --max_seq_len=4096 \
+        --model_name_or_path="$MODEL_NAME"
+
+    # trim data so we only keep the first 'n' samples.
+    # should be enough data for training to be meaningful but not enough
+    # that training takes a large amount of time. Write through a temp
+    # file so we never truncate the file we're still reading from.
+    head -n "$NUM_SAMPLES_TRAINED_ON" "$COMPUTED_DATA_PATH" > "$COMPUTED_DATA_PATH.tmp"
+    mv "$COMPUTED_DATA_PATH.tmp" "$COMPUTED_DATA_PATH"
+
+    echo "TRAINING ON $(wc -l "$COMPUTED_DATA_PATH") SAMPLES"
+}
+
+function _cleanup_saved_checkpoints() {
+    echo "CLEARING CHECKPOINTS: $CHECKPOINTS_DIR"
+    rm -rf "$CHECKPOINTS_DIR"
+    mkdir "$CHECKPOINTS_DIR"
+}
+
+function test_standard_loop () {
+    # Tests most common training parameters.
+    # - FSDP
+    # - Save full state
+    # - Save hf_format
+    # - padding-free
+    # - flash attention
+
+    # NOTE: last flag must not end with '\' or the closing '}' would be
+    # consumed as a torchrun argument and break the function definition.
+    torchrun \
+        --standalone \
+        --nproc_per_node="$NUM_GPUS" \
+        main_ds.py \
+        --model_name_or_path="$MODEL_NAME" \
+        --data_path="$COMPUTED_DATA_PATH" \
+        --output_dir="$CHECKPOINTS_DIR" \
+        --num_epochs=1 \
+        --effective_batch_size=128 \
+        --save_samples=0 \
+        --checkpoint_at_epoch \
+        --accelerate_full_state_at_epoch \
+        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
+        --max_batch_len="$MAX_BATCH_LEN" \
+        --is_granite
+}
+
+function test_standard_loop_nongranite () {
+    # Tests most common training parameters without
+    # using Granite
+    # - FSDP
+    # - Save full state
+    # - Save hf_format
+    # - flash attention
+
+    torchrun \
+        --standalone \
+        --nproc_per_node="$NUM_GPUS" \
+        main_ds.py \
+        --model_name_or_path="$MODEL_NAME" \
+        --data_path="$COMPUTED_DATA_PATH" \
+        --output_dir="$CHECKPOINTS_DIR" \
+        --num_epochs=1 \
+        --effective_batch_size=128 \
+        --save_samples=0 \
+        --checkpoint_at_epoch \
+        --accelerate_full_state_at_epoch \
+        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
+        --max_batch_len="$MAX_BATCH_LEN"
+        # --is_granite intentionally omitted
+}
+
+function test_standard_loop_noflashattention_nogranite () {
+    # Tests most common training parameters without
+    # using Granite or Flash Attention
+    # - FSDP
+    # - Save full state
+    # - Save hf_format
+
+    torchrun \
+        --standalone \
+        --nproc_per_node="$NUM_GPUS" \
+        main_ds.py \
+        --model_name_or_path="$MODEL_NAME" \
+        --data_path="$COMPUTED_DATA_PATH" \
+        --output_dir="$CHECKPOINTS_DIR" \
+        --num_epochs=1 \
+        --effective_batch_size=128 \
+        --save_samples=0 \
+        --checkpoint_at_epoch \
+        --accelerate_full_state_at_epoch \
+        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
+        --max_batch_len="$MAX_BATCH_LEN" \
+        --disable_flash_attn
+        # --is_granite intentionally omitted
+}
+
+# ############### Setup and tests ###############
+setup_tmpdir
+# single-quoted so TMP_DIR is expanded (quoted) when the trap fires,
+# not when it's registered.
+trap 'rm -rf "$TMP_DIR"' EXIT
+
+#NOTE (jkunstle): script is run as though it's
+# in the same source dir as main_ds and data_process.
+cd "$CORRECT_WORKING_DIR"
+echo "CURRENT WORKING DIRECTORY: $(pwd)"
+
+prepare_data
+test_standard_loop_noflashattention_nogranite
+_cleanup_saved_checkpoints
+test_standard_loop_nongranite
+_cleanup_saved_checkpoints
+test_standard_loop