Release Candidate v0.0.1 #88

Merged 2 commits on Feb 5, 2025
9 changes: 5 additions & 4 deletions benchmarking/benchmark_summary.py
@@ -45,13 +45,14 @@ def parse_args():

def extract_params_from_filename(filename: str) -> Dict[str, Any]:
pattern = r"""
benchmark_
.*?benchmark_ # Any prefix before benchmark_
(?P<timestamp>\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp
(_(?P<mesh_device>N150|N300|T3K_LINE|T3K_RING|TG))? # MESH_DEVICE
_isl-(?P<isl>\d+) # Input sequence length
_osl-(?P<osl>\d+) # Output sequence length
_bsz-(?P<bsz>\d+) # Batch size
_n-(?P<n>\d+) # Number of requests
_maxcon-(?P<maxcon>\d+) # Max concurrency
_n-(?P<n>\d+) # Number of requests
\.json$
"""
match = re.search(pattern, filename, re.VERBOSE)
if not match:
@@ -67,7 +68,7 @@ def extract_params_from_filename(filename: str) -> Dict[str, Any]:
"mesh_device": match.group("mesh_device"),
"input_sequence_length": int(match.group("isl")),
"output_sequence_length": int(match.group("osl")),
"batch_size": int(match.group("bsz")),
"batch_size": int(match.group("maxcon")),
"num_requests": int(match.group("n")),
}

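For reference, a minimal sketch (not part of the diff) of how the updated pattern behaves: the filename below is hypothetical, and `batch_size` is now taken from the `maxcon` group rather than `bsz`.

```python
import re
from typing import Any, Dict

PATTERN = r"""
.*?benchmark_                                        # Any prefix before benchmark_
(?P<timestamp>\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2})   # Timestamp
(_(?P<mesh_device>N150|N300|T3K_LINE|T3K_RING|TG))?  # MESH_DEVICE
_isl-(?P<isl>\d+)                                    # Input sequence length
_osl-(?P<osl>\d+)                                    # Output sequence length
_bsz-(?P<bsz>\d+)                                    # Batch size
_maxcon-(?P<maxcon>\d+)                              # Max concurrency
_n-(?P<n>\d+)                                        # Number of requests
\.json$
"""

def parse(filename: str) -> Dict[str, Any]:
    match = re.search(PATTERN, filename, re.VERBOSE)
    assert match is not None, f"unexpected filename: {filename}"
    return {
        "timestamp": match.group("timestamp"),
        "mesh_device": match.group("mesh_device"),
        "input_sequence_length": int(match.group("isl")),
        "output_sequence_length": int(match.group("osl")),
        "batch_size": int(match.group("maxcon")),  # maxcon, not bsz, per this change
        "num_requests": int(match.group("n")),
    }

# Hypothetical output file name, matching the new field order (_maxcon before _n)
print(parse("benchmark_2025-02-05_12-00-00_N300_isl-128_osl-128_bsz-32_maxcon-32_n-100.json"))
```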
11 changes: 9 additions & 2 deletions benchmarking/prompt_client_online_benchmark.py
@@ -102,8 +102,15 @@ def run_sequence_length_test(
tokenizer = AutoTokenizer.from_pretrained(model)

# pre-capture traces so benchmark does not include 1st run trace capture time
# TODO: add support for image input to capture_traces
prompt_client.capture_traces(context_lens=[(input_len, output_len)])
image_resolutions = []
if images:
image_resolutions = [
(prompt_config.image_width, prompt_config.image_height)
]

prompt_client.capture_traces(
context_lens=[(input_len, output_len)], image_resolutions=image_resolutions
)
# Process batches
try:
responses = batch_processor.process_batch(
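For illustration, a sketch (values hypothetical) of the two shapes the pre-capture call can now take with the added `image_resolutions` argument:

```python
# Sketch only: assumes a prompt_client with the updated capture_traces signature.
# Vision run: pass one (width, height) pair per image size used in the benchmark.
prompt_client.capture_traces(
    context_lens=[(128, 128)],
    image_resolutions=[(560, 560)],  # hypothetical resolution
)

# Text-only run: no image resolutions.
prompt_client.capture_traces(
    context_lens=[(128, 128)],
    image_resolutions=[],
)
```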
4 changes: 3 additions & 1 deletion benchmarking/vllm_online_benchmark.py
@@ -126,8 +126,10 @@ def main():
/ f"vllm_online_benchmark_{run_timestamp}_{mesh_device}_isl-{isl}_osl-{osl}_maxcon-{max_concurrent}_n-{num_prompts}.json"
)
logger.info(f"\nRunning benchmark {i}/{len(combinations)}")
vllm_dir = os.environ.get("vllm_dir")
assert vllm_dir is not None, "vllm_dir must be set."
run_benchmark(
benchmark_script="/home/user/vllm/benchmarks/benchmark_serving.py",
benchmark_script=f"{vllm_dir}/benchmarks/benchmark_serving.py",
params=params,
model=env_config.vllm_model,
port=env_config.service_port,
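With this change the benchmark script path comes from the `vllm_dir` environment variable rather than a hard-coded `/home/user/vllm`; a minimal sketch of the expected setup (the path is hypothetical):

```bash
# vllm_dir must point at a vLLM checkout containing benchmarks/benchmark_serving.py
export vllm_dir=/path/to/vllm
python benchmarking/vllm_online_benchmark.py
```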
21 changes: 3 additions & 18 deletions evals/README.md
@@ -13,24 +13,9 @@ For instructions on building the Docker image see: [vllm-tt-metal-llama3/docs/de

## Step 2: Run Docker container for LM evals development

note: this requires running `setup.sh` to set up the weights for a particular model, in this example `llama-3.1-70b-instruct`.
Follow run guide: [vllm-tt-metal-llama3/README.md](../vllm-tt-metal-llama3/README.md)

```bash
cd tt-inference-server
export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b-instructv0.0.1/
docker run \
--rm \
-it \
--env-file persistent_volume/model_envs/llama-3.1-70b-instruct.env \
--cap-add ALL \
--device /dev/tenstorrent:/dev/tenstorrent \
--volume /dev/hugepages-1G:/dev/hugepages-1G:rw \
--volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/user/cache_root:rw \
--shm-size 32G \
ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:${IMAGE_VERSION}-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG}
```

The default Docker image command will start the vLLM server.
note: this requires running `setup.sh` to set up the weights for a particular model, in this example `llama-3.1-70b-instruct`.

## Step 3: Inside container set up llama-recipes LM evaluation harness templates

@@ -44,7 +29,7 @@ To access Meta Llama 3.1 evals, you must:
#### Hugging Face authentication - option 1: HF_TOKEN (if not already passed into Docker container)
```bash
# set up HF Token if not already set up in .env, needed for datasets
echo "HF_TOKEN=hf_<your_token>" >> vllm-tt-metal-llama3/.env
echo "HF_TOKEN=hf_<your_token>"
```
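As a sketch (not from this PR), the token can also be exported directly in the container shell so dataset downloads can authenticate:

```bash
export HF_TOKEN=hf_<your_token>
```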

#### Hugging Face authentication - option 2: huggingface_hub login
4 changes: 2 additions & 2 deletions evals/run_evals.sh
@@ -45,7 +45,7 @@ lm_eval \
--gen_kwargs model=${HF_MODEL_REPO_ID},stop="<|eot_id|>",stream=False \
--tasks meta_gpqa \
--batch_size auto \
--output_path /home/user/cache_root/eval_output \
--output_path ${CACHE_ROOT}/eval_output \
--include_path ./work_dir \
--seed 42 \
--log_samples
@@ -57,7 +57,7 @@ lm_eval \
--gen_kwargs model=${HF_MODEL_REPO_ID},stop="<|eot_id|>",stream=False \
--tasks meta_ifeval \
--batch_size auto \
--output_path /home/user/cache_root/eval_output \
--output_path ${CACHE_ROOT}/eval_output \
--include_path ./work_dir \
--seed 42 \
--log_samples
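With the eval output path now derived from `CACHE_ROOT`, a sketch of the expected environment before invoking the script (the path is an assumption based on the container layout used elsewhere in this PR):

```bash
# CACHE_ROOT is expected to already be set inside the container; if not:
export CACHE_ROOT=/home/container_app_user/cache_root  # assumed path
bash run_evals.sh
```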
2 changes: 1 addition & 1 deletion evals/run_evals_vision.sh
@@ -35,7 +35,7 @@ lm_eval \
--gen_kwargs model=${HF_MODEL_REPO_ID},stop="<|eot_id|>",stream=False \
--tasks mmmu_val \
--batch_size auto \
--output_path /home/user/cache_root/eval_output \
--output_path /home/container_app_user/cache_root/eval_output \
--seed 42 \
--log_samples

116 changes: 69 additions & 47 deletions setup.sh
@@ -9,18 +9,19 @@ set -euo pipefail # Exit on error, print commands, unset variables treated as e
usage() {
echo "Usage: $0 <model_type>"
echo "Available model types:"
echo " llama-3.3-70b-instruct"
echo " llama-3.2-11b-vision-instruct"
echo " llama-3.2-3b-instruct"
echo " llama-3.2-1b-instruct"
echo " llama-3.1-70b-instruct"
echo " llama-3.1-70b"
echo " llama-3.1-8b-instruct"
echo " llama-3.1-8b"
echo " llama-3-70b-instruct"
echo " llama-3-70b"
echo " llama-3-8b-instruct"
echo " llama-3-8b"
echo " DeepSeek-R1-Distill-Llama-70B"
echo " Llama-3.3-70B-Instruct"
echo " Llama-3.2-11B-Vision-Instruct"
echo " Llama-3.2-3B-Instruct"
echo " Llama-3.2-1B-Instruct"
echo " Llama-3.1-70B-Instruct"
echo " Llama-3.1-70B"
echo " Llama-3.1-8B-Instruct"
echo " Llama-3.1-8B"
echo " Llama-3-70B-Instruct"
echo " Llama-3-70B"
echo " Llama-3-8B-Instruct"
echo " Llama-3-8B"
echo
exit 1
}
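The accepted model identifiers now match the `MODEL_NAME` values exactly, and a DeepSeek distill has been added; a couple of example invocations:

```bash
# New-style invocations (the DeepSeek distill is new in this release candidate)
./setup.sh Llama-3.1-70B-Instruct
./setup.sh DeepSeek-R1-Distill-Llama-70B
```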
@@ -74,6 +75,7 @@ get_hf_env_vars() {
echo "HF_TOKEN environment variable is not set. Please set it before running the script."
read -r -s -p "Enter your HF_TOKEN: " input_hf_token
echo
echo "entered HF_TOKEN contains: ${#input_hf_token} characters, expected 37."
if [ -z "${input_hf_token:-}" ]; then
echo "⛔ HF_TOKEN cannot be empty. Please try again."
exit 1
@@ -111,84 +113,104 @@ setup_model_environment() {
# Set environment variables based on the model selection
# note: MODEL_NAME is the directory name for the model weights
case "$1" in
"llama-3.3-70b-instruct")
"DeepSeek-R1-Distill-Llama-70B")
IMPL_ID="tt-metal"
Review comment (Collaborator): What is IMPL? I would assume it's the implementation of the model, but since it's "tt-metal", I'm not sure?

Reply (Contributor, author): Yes, that's the intent: these are tt-metal implementations (ttnn, to be more precise). This is to distinguish them from tt-forge implementations or others in the future.

MODEL_NAME="DeepSeek-R1-Distill-Llama-70B"
HF_MODEL_REPO_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
META_MODEL_NAME=""
META_DIR_FILTER=""
REPACKED=1
;;
"Llama-3.3-70B-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.3-70B-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3.3-70B-Instruct"
META_MODEL_NAME=""
META_DIR_FILTER=""
REPACKED=1
;;
"llama-3.2-11b-vision-instruct")
"Llama-3.2-11B-Vision-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.2-11B-Vision-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3.2-11B-Vision-Instruct"
META_MODEL_NAME=""
META_DIR_FILTER=""
REPACKED=0
;;
"llama-3.2-3b-instruct")
"Llama-3.2-3B-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.2-3B-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3.2-3B-Instruct"
META_MODEL_NAME=""
META_DIR_FILTER=""
REPACKED=0
;;
"llama-3.2-1b-instruct")
"Llama-3.2-1B-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.2-1B-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3.2-1B-Instruct"
META_MODEL_NAME=""
META_DIR_FILTER=""
REPACKED=0
;;
"llama-3.1-70b-instruct")
"Llama-3.1-70B-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.1-70B-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct"
META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct"
META_DIR_FILTER="llama3_1"
REPACKED=1
;;
"llama-3.1-70b")
"Llama-3.1-70B")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.1-70B"
HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B"
META_MODEL_NAME="Meta-Llama-3.1-70B"
META_DIR_FILTER="llama3_1"
REPACKED=1
;;
"llama-3.1-8b-instruct")
"Llama-3.1-8B-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.1-8B-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B-Instruct"
META_MODEL_NAME="Meta-Llama-3.1-8B-Instruct"
META_DIR_FILTER="llama3_1"
REPACKED=0
;;
"llama-3.1-8b")
"Llama-3.1-8B")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.1-8B"
HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B"
META_MODEL_NAME="Meta-Llama-3.1-8B"
META_DIR_FILTER="llama3_1"
REPACKED=0
;;
"llama-3-70b-instruct")
"Llama-3-70B-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3-70B-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3-70B-Instruct"
META_MODEL_NAME="Meta-Llama-3-70B-Instruct"
META_DIR_FILTER="llama3"
REPACKED=1
;;
"llama-3-70b")
"Llama-3-70B")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3-70B"
HF_MODEL_REPO_ID="meta-llama/Llama-3-70B"
META_MODEL_NAME="Meta-Llama-3-70B"
META_DIR_FILTER="llama3"
REPACKED=1
;;
"llama-3-8b-instruct")
"Llama-3-8B-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3-8B-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3-8B-Instruct"
META_MODEL_NAME="Meta-Llama-3-8B-Instruct"
META_DIR_FILTER="llama3"
REPACKED=0
;;
"llama-3-8b")
"Llama-3-8B")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3-8B"
HF_MODEL_REPO_ID="meta-llama/Llama-3-8B"
META_MODEL_NAME="Meta-Llama-3-8B"
@@ -201,32 +223,32 @@ setup_model_environment() {
exit 1
;;
esac
# Initialize OVERWRITE_ENV
OVERWRITE_ENV=false

# Set default values for environment variables
DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume
MODEL_ENV_DIR="${DEFAULT_PERSISTENT_VOLUME_ROOT}/model_envs"

# Safely handle potentially unset environment variables using default values
PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT}
# Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default
read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${DEFAULT_PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT
PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT}
echo # move to a new line after input
# Set environment variables with defaults if not already set
MODEL_VERSION="0.0.1"
MODEL_ID="id_${IMPL_ID}-${MODEL_NAME}-v${MODEL_VERSION}"
PERSISTENT_VOLUME="${PERSISTENT_VOLUME_ROOT}/volume_${MODEL_ID}"

# Initialize OVERWRITE_ENV
OVERWRITE_ENV=false
MODEL_ENV_DIR="${PERSISTENT_VOLUME_ROOT}/model_envs"
mkdir -p ${MODEL_ENV_DIR}
ENV_FILE="${MODEL_ENV_DIR}/${MODEL_NAME}.env"
export ENV_FILE
check_and_prompt_env_file


if [ "$OVERWRITE_ENV" = false ]; then
echo "✅ using existing .env file: ${ENV_FILE}."
return 0
fi
# Safely handle potentially unset environment variables using default values
PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT}
# Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default
read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${DEFAULT_PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT
PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT}
echo # move to a new line after input
# Set environment variables with defaults if not already set
PERSISTENT_VOLUME=${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1


read -p "Use 🤗 Hugging Face authorization token for downloading models? Alternative is direct authorization from Meta. (y/n) [default: y]: " input_use_hf_token
choice_use_hf_token=${input_use_hf_token:-"y"}
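To make the new volume naming concrete, a worked sketch with illustrative values:

```bash
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.1-70B-Instruct"
MODEL_VERSION="0.0.1"
MODEL_ID="id_${IMPL_ID}-${MODEL_NAME}-v${MODEL_VERSION}"
# -> id_tt-metal-Llama-3.1-70B-Instruct-v0.0.1
PERSISTENT_VOLUME="${PERSISTENT_VOLUME_ROOT}/volume_${MODEL_ID}"
# -> ${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-Llama-3.1-70B-Instruct-v0.0.1
```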
@@ -283,15 +305,15 @@ setup_model_environment() {
cat > ${ENV_FILE} <<EOF
# Environment variables for the model setup
USE_HF_DOWNLOAD=$choice_use_hf_token
HF_MODEL_REPO_ID=$HF_MODEL_REPO_ID
MODEL_NAME=$MODEL_NAME
MODEL_VERSION=${MODEL_VERSION}
IMPL_ID=${IMPL_ID}
MODEL_ID=${MODEL_ID}
META_MODEL_NAME=$META_MODEL_NAME
HF_MODEL_REPO_ID=$HF_MODEL_REPO_ID
REPACKED=${REPACKED}
REPACKED_STR=${REPACKED_STR}
# model runtime variables
LLAMA_VERSION=llama3
TT_METAL_ASYNC_DEVICE_QUEUE=1
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml
SERVICE_PORT=7000
# host paths
HOST_HF_HOME=${HF_HOME:-""}
@@ -467,17 +489,17 @@ setup_weights_huggingface() {
mv "${WEIGHTS_DIR}/consolidated.pth" "${WEIGHTS_DIR}/consolidated.00.pth"
fi

# Step 6: Process and copy weights
# Step 6: Cleanup HF setup venv
deactivate
rm -rf ${VENV_NAME}

# Step 7: Process and copy weights
if [ "${REPACKED}" -eq 1 ]; then
REPACKED_WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}"
mkdir -p "${REPACKED_WEIGHTS_DIR}"
repack_weights "${WEIGHTS_DIR}" "${REPACKED_WEIGHTS_DIR}"
fi

# Step 7: Cleanup
deactivate
rm -rf ${VENV_NAME}

echo "using weights directory: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}"
echo "✅ setup_weights_huggingface completed!"
}
4 changes: 2 additions & 2 deletions tests/README.md
@@ -18,15 +18,15 @@ export VLLM_COMMIT_SHA=<vllm-commit>
Add a volume mounting the `test` directory in the container before running with the following in the docker run command:

```bash
--volume $PWD/tests:/home/user/tests
--volume $PWD/tests:/home/container_app_user/tests
```

## 3. Run The Mock Model

Once in the docker container, run the mock script with:

```bash
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml python /home/user/tests/mock_vllm_offline_inference_tt.py
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml python /home/container_app_user/tests/mock_vllm_offline_inference_tt.py
```

# Build mock model container
2 changes: 1 addition & 1 deletion tests/benchmark_vllm_offline_inference.py
@@ -30,7 +30,7 @@ def parse_args():
parser.add_argument(
"--prompts_json",
type=str,
default="/home/user/vllm/tt_metal/prompts.json",
default="/home/container_app_user/vllm/tt_metal/prompts.json",
help="Path to JSON file containing prompts",
)
parser.add_argument(