From 5f876c648e76245b5be05f835858f7c3285d6326 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 14 Jan 2025 16:38:10 -0500 Subject: [PATCH 1/3] Llama 3.x model support, setup.sh script multiple model support using HF download change log: - add multiple model support using persistent_volume/model_envs/*.env - setup using Hugging Face huggingface-cli to download models: llama model install script support for llama CLI and huggingface hub #14 - add model setup for llama 3.x - address Initial vLLM setup fails due to missing HuggingFace permissions #37 - address Docker run support for HF_TOKEN authentication using env var pass in #23 - renamed vllm-tt-metal-llama3-70 to vllm-tt-metal-llama3 for all llama 3.x models - updated documentation for v0 drop - add Docker Ubuntu 22.04 option for vLLM llama 3.x --- README.md | 2 +- scripts/add_spdx_header.py | 2 +- setup.sh | 562 ++++++++++++++++++ tests/mock.vllm.openai.api.dockerfile | 4 +- vllm-tt-metal-llama3-70b/setup.sh | 388 ------------ .../README.md | 16 +- .../docs/development.md | 25 +- .../requirements.txt | 0 .../src/__init__.py | 0 .../src/example_openai_client_alpaca_eval.py | 0 .../example_requests_client_alpaca_eval.py | 0 .../src/run_vllm_api_server.py | 60 +- ...m.llama3.src.ubuntu-20.04-amd64.Dockerfile | 11 +- ...m.llama3.src.ubuntu-22.04-amd64.Dockerfile | 112 ++++ 14 files changed, 750 insertions(+), 432 deletions(-) create mode 100755 setup.sh delete mode 100755 vllm-tt-metal-llama3-70b/setup.sh rename {vllm-tt-metal-llama3-70b => vllm-tt-metal-llama3}/README.md (90%) rename {vllm-tt-metal-llama3-70b => vllm-tt-metal-llama3}/docs/development.md (78%) rename {vllm-tt-metal-llama3-70b => vllm-tt-metal-llama3}/requirements.txt (100%) rename {vllm-tt-metal-llama3-70b => vllm-tt-metal-llama3}/src/__init__.py (100%) rename {vllm-tt-metal-llama3-70b => vllm-tt-metal-llama3}/src/example_openai_client_alpaca_eval.py (100%) rename {vllm-tt-metal-llama3-70b => vllm-tt-metal-llama3}/src/example_requests_client_alpaca_eval.py (100%) rename {vllm-tt-metal-llama3-70b => vllm-tt-metal-llama3}/src/run_vllm_api_server.py (62%) rename vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile => vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile (89%) create mode 100644 vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile diff --git a/README.md b/README.md index 42b0aaef..fcc8e524 100644 --- a/README.md +++ b/README.md @@ -15,5 +15,5 @@ Please follow setup instructions found in each model folder's README.md doc ## Model Implementations | Model | Hardware | |----------------|-----------------------------| -| [LLaMa 3.1 70B](vllm-tt-metal-llama3-70b/README.md) | TT-QuietBox & TT-LoudBox | +| [LLaMa 3.1 70B](vllm-tt-metal-llama3/README.md) | TT-QuietBox & TT-LoudBox | | [Mistral 7B](tt-metal-mistral-7b/README.md) | n150 and n300| \ No newline at end of file diff --git a/scripts/add_spdx_header.py b/scripts/add_spdx_header.py index ad3d7e9c..3d1a63df 100644 --- a/scripts/add_spdx_header.py +++ b/scripts/add_spdx_header.py @@ -30,7 +30,7 @@ def add_spdx_header(file_path): repo_root = Path(__file__).resolve().parent.parent directories_to_process = [ repo_root / "tt-metal-llama3-70b", - repo_root / "vllm-tt-metal-llama3-70b", + repo_root / "vllm-tt-metal-llama3", repo_root / "tt-metal-mistral-7b", repo_root / "tt-metal-yolov4", repo_root / "tests", diff --git a/setup.sh b/setup.sh new file mode 100755 index 00000000..3b062ada --- /dev/null +++ b/setup.sh @@ -0,0 +1,562 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# +# 
SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +set -euo pipefail # Exit on error, print commands, unset variables treated as errors, and exit on pipeline failure + +# Function to display usage information +usage() { + echo "Usage: $0 " + echo "Available model types:" + echo " llama-3.3-70b-instruct" + echo " llama-3.2-11b-vision-instruct" + echo " llama-3.2-3b-instruct" + echo " llama-3.2-1b-instruct" + echo " llama-3.1-70b-instruct" + echo " llama-3.1-70b" + echo " llama-3.1-8b-instruct" + echo " llama-3.1-8b" + echo " llama-3-70b-instruct" + echo " llama-3-70b" + echo " llama-3-8b-instruct" + echo " llama-3-8b" + echo + exit 1 +} + +# globals +readonly REPO_ROOT=$(dirname "$(realpath "$0")") + +check_and_prompt_env_file() { + local MODEL_NAME_KEY="MODEL_NAME" + local MODEL_NAME="" + # Check if .env file exists + if [[ -f "${ENV_FILE}" ]]; then + # Extract the MODEL_NAME value from .env + echo "found ENV_FILE: ${ENV_FILE}" + FOUND_MODEL_NAME=$(grep "^$MODEL_NAME_KEY=" "$ENV_FILE" | cut -d '=' -f2) || FOUND_MODEL_NAME="" + # If MODEL_NAME is found, display it + if [[ -n "$FOUND_MODEL_NAME" ]]; then + echo "The existing file ${ENV_FILE} contains MODEL_NAME: $FOUND_MODEL_NAME" + # Prompt the user to overwrite or exit + local choice="" + read -p "Do you want to overwrite the existing file ${ENV_FILE}? (y/n) [default: y]:" choice + choice=${choice:-y} + # Handle user's choice + case "$choice" in + y|Y ) + echo "Overwriting the ${ENV_FILE} file ..." + # Logic to overwrite .env goes here + OVERWRITE_ENV=true + ;; + n|N ) + OVERWRITE_ENV=false + ;; + * ) + echo "⛔ Invalid option. Exiting." + exit 1 + ;; + esac + else + echo "MODEL_NAME not found in ${ENV_FILE}. Overwritting." + OVERWRITE_ENV=true + fi + else + echo "${ENV_FILE} does not exist. Proceeding to create a new one." + OVERWRITE_ENV=true + fi +} + +get_hf_env_vars() { + # get HF_TOKEN + if [ -z "${HF_TOKEN:-}" ]; then + echo "HF_TOKEN environment variable is not set. Please set it before running the script." + read -r -s -p "Enter your HF_TOKEN: " input_hf_token + echo + if [ -z "${input_hf_token:-}" ]; then + echo "⛔ HF_TOKEN cannot be empty. Please try again." + exit 1 + elif [[ ! "$input_hf_token" == hf_* ]]; then + echo "⛔ HF_TOKEN must start with 'hf_'. Please try again." + exit 1 + fi + HF_TOKEN=${input_hf_token} + echo "✅ HF_TOKEN set." + fi + # get HF_HOME + if [ -z "${HF_HOME:-}" ]; then + echo "HF_HOME environment variable is not set. Please set it before running the script." + read -r -p "Enter your HF_HOME [default: $HOME/.cache/huggingface]:" input_hf_home + echo + input_hf_home=${input_hf_home:-"$HOME/.cache/huggingface"} + if [ ! -d "$input_hf_home" ] || [ ! -w "$input_hf_home" ]; then + echo "⛔ HF_HOME must be a valid directory and writable by the user. Please try again." + exit 1 + fi + HF_HOME=${input_hf_home} + echo "✅ HF_HOME set." 
+ fi +} + +# Function to set environment variables based on the model selection and write them to .env +setup_model_environment() { + # Set environment variables based on the model selection + # note: MODEL_NAME is the lower cased basename of the HF repo ID + case "$1" in + "llama-3.3-70b-instruct") + MODEL_NAME="llama-3.3-70b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.3-70B-Instruct" + META_MODEL_NAME="" + META_DIR_FILTER="" + REPACKED=1 + ;; + "llama-3.2-11b-vision-instruct") + MODEL_NAME="llama-3.2-11b-vision-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.2-11B-Vision-Instruct" + META_MODEL_NAME="" + META_DIR_FILTER="" + REPACKED=0 + ;; + "llama-3.2-3b-instruct") + MODEL_NAME="llama-3.2-3b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.2-3B-Instruct" + META_MODEL_NAME="" + META_DIR_FILTER="" + REPACKED=0 + ;; + "llama-3.2-1b-instruct") + MODEL_NAME="llama-3.2-1b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.2-1B-Instruct" + META_MODEL_NAME="" + META_DIR_FILTER="" + REPACKED=0 + ;; + "llama-3.1-70b-instruct") + MODEL_NAME="llama-3.1-70b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct" + META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" + META_DIR_FILTER="llama3_1" + REPACKED=1 + ;; + "llama-3.1-70b") + MODEL_NAME="llama-3.1-70b" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B" + META_MODEL_NAME="Meta-Llama-3.1-70B" + META_DIR_FILTER="llama3_1" + REPACKED=1 + ;; + "llama-3.1-8b-instruct") + MODEL_NAME="llama-3.1-8b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B-Instruct" + META_MODEL_NAME="Meta-Llama-3.1-8B-Instruct" + META_DIR_FILTER="llama3_1" + REPACKED=0 + ;; + "llama-3.1-8b") + MODEL_NAME="llama-3.1-8b" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B" + META_MODEL_NAME="Meta-Llama-3.1-8B" + META_DIR_FILTER="llama3_1" + REPACKED=0 + ;; + "llama-3-70b-instruct") + MODEL_NAME="llama-3-70b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3-70B-Instruct" + META_MODEL_NAME="Meta-Llama-3-70B-Instruct" + META_DIR_FILTER="llama3" + REPACKED=1 + ;; + "llama-3-70b") + MODEL_NAME="llama-3-70b" + HF_MODEL_REPO_ID="meta-llama/Llama-3-70B" + META_MODEL_NAME="Meta-Llama-3-70B" + META_DIR_FILTER="llama3" + REPACKED=1 + ;; + "llama-3-8b-instruct") + MODEL_NAME="llama-3-8b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3-8B-Instruct" + META_MODEL_NAME="Meta-Llama-3-8B-Instruct" + META_DIR_FILTER="llama3" + REPACKED=0 + ;; + "llama-3-8b") + MODEL_NAME="llama-3-8b" + HF_MODEL_REPO_ID="meta-llama/Llama-3-8B" + META_MODEL_NAME="Meta-Llama-3-8B" + META_DIR_FILTER="llama3" + REPACKED=0 + ;; + *) + echo "⛔ Invalid model choice." + usage + exit 1 + ;; + esac + # Initialize OVERWRITE_ENV + OVERWRITE_ENV=false + + # Set default values for environment variables + DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume + MODEL_ENV_DIR="${DEFAULT_PERSISTENT_VOLUME_ROOT}/model_envs" + + mkdir -p ${MODEL_ENV_DIR} + ENV_FILE="${MODEL_ENV_DIR}/${MODEL_NAME}.env" + export ENV_FILE + check_and_prompt_env_file + + + if [ "$OVERWRITE_ENV" = false ]; then + echo "✅ using existing .env file: ${ENV_FILE}." 
+ return 0 + fi + # Safely handle potentially unset environment variables using default values + PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT} + # Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default + read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${DEFAULT_PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT + PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT} + echo # move to a new line after input + # Set environment variables with defaults if not already set + PERSISTENT_VOLUME=${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1 + + + read -p "Use 🤗 Hugging Face authorization token for downloading models? Alternative is direct authorization from Meta. (y/n) [default: y]: " input_use_hf_token + choice_use_hf_token=${input_use_hf_token:-"y"} + echo # move to a new line after input + # Handle user's choice + case "$choice_use_hf_token" in + y|Y ) + echo "Using 🤗 Hugging Face Token." + get_hf_env_vars + # default location for HF e.g. ~/.cache/huggingface/models/meta-llama/Llama-3.3-70B-Instruct + # LLAMA_WEIGHTS_DIR=${HF_HOME}/local_dir/${HF_MODEL_REPO_ID} + WEIGHTS_DIR=${PERSISTENT_VOLUME}/model_weights/${MODEL_NAME} + ;; + n|N ) + if [ -z "${META_DIR_FILTER:-}" ]; then + echo "⛔ MODEL_NAME=${MODEL_NAME} does not support using direct Meta authorization model download. Please use Hugging Face method." + fi + echo "Using direct authorization from Meta. You will need their URL Authorization token, typically from their website or email." + # Prompt user for LLAMA_REPO if not already set or use default + read -r -p "Enter the path where you want to clone the Llama model repository [default: ${LLAMA_REPO}]: " INPUT_LLAMA_REPO + LLAMA_REPO=${INPUT_LLAMA_REPO:-$LLAMA_REPO} + LLAMA_DIR=${LLAMA_DIR:-${LLAMA_REPO}/models/${META_DIR_FILTER}} + LLAMA_WEIGHTS_DIR=${LLAMA_WEIGHTS_DIR:-${LLAMA_DIR}/${META_MODEL_NAME}} + echo # move to a new line after input + ;; + * ) + echo "⛔ Invalid option. Exiting." + exit 1 + ;; + esac + + # Prompt user for JWT_SECRET securely + read -sp "Enter your JWT_SECRET: " JWT_SECRET + echo # move to a new line after input + # Verify the JWT_SECRET is not empty + if [ -z "${JWT_SECRET:-}" ]; then + echo "⛔ JWT_SECRET cannot be empty. Please try again." + exit 1 + fi + + if [ "${REPACKED}" -eq 1 ]; then + echo "REPACKED is enabled." + REPACKED_STR="repacked-" + else + echo "REPACKED is disabled." + REPACKED_STR="" + fi + + # Write environment variables to .env file + echo "Writing environment variables to ${ENV_FILE} ..." + cat > ${ENV_FILE} < /dev/null 2>&1; then + echo "Creating group 'dockermount' ..." + sudo groupadd dockermount + else + echo "Group 'dockermount' already exists." + fi + + # Add host user to 'dockermount' group + echo "Adding user: '$USER' to 'dockermount' group ..." + sudo usermod -aG dockermount "$USER" + + # Get container user with UID 1000 and add to group + CONTAINER_UID=1000 + CONTAINER_USER=$(getent passwd ${CONTAINER_UID} | cut -d: -f1) + if [ -n "$CONTAINER_USER" ]; then + echo "Adding container user: '$CONTAINER_USER' (UID ${CONTAINER_UID}) to 'dockermount' group ..." + sudo usermod -aG dockermount "$CONTAINER_USER" + else + echo "No user found with UID ${CONTAINER_UID}." + fi + + # Set file ownership and permissions + echo "Setting file ownership and permissions for container and host access ..." + if [ ! 
-d "${PERSISTENT_VOLUME}" ]; then + # if the user point the PERSISTENT_VOLUME + sudo mkdir -p "${PERSISTENT_VOLUME}" + fi + sudo chown -R ${CONTAINER_UID}:dockermount "${PERSISTENT_VOLUME}" + sudo chmod -R 775 "${PERSISTENT_VOLUME}" + + echo "✅ setup_permissions completed!" +} + +# Shared function for repacking weights +repack_weights() { + local source_dir="$1" + local target_dir="$2" + + # Create target directory if it doesn't exist + mkdir -p "${target_dir}" + + # Copy required files + cp "${source_dir}/tokenizer.model" "${target_dir}/tokenizer.model" + cp "${source_dir}/params.json" "${target_dir}/params.json" + + # Set up Python environment for repacking + VENV_NAME=".venv_repack" + echo "Setting up python venv for repacking: ${VENV_NAME}" + python3 -m venv ${VENV_NAME} + source ${VENV_NAME}/bin/activate + pip install --upgrade setuptools wheel pip==21.2.4 tqdm + pip install --index-url https://download.pytorch.org/whl/cpu torch==2.2.1 + + # Download repacking script + curl -O https://raw.githubusercontent.com/tenstorrent/tt-metal/refs/heads/main/models/demos/t3000/llama2_70b/scripts/repack_weights.py + + echo "Repacking weights..." + python repack_weights.py "${source_dir}" "${target_dir}" 5 + + # Cleanup + deactivate + rm -rf ${VENV_NAME} repack_weights.py + + echo "✅ Weight repacking completed!" +} + +setup_weights_meta() { + # Step 1: Set up Llama model repository path + echo "Using repository path: $LLAMA_REPO" + + # Step 2: Clone the repository (if it doesn't already exist) + if [ ! -d "$LLAMA_REPO" ]; then + echo "Cloning the Llama repository to: $LLAMA_REPO" + git clone https://github.com/meta-llama/llama-models.git "$LLAMA_REPO" + cd "$LLAMA_REPO" + # checkout commit before ./download.sh was removed + git checkout 685ac4c107c75ce8c291248710bf990a876e1623 + else + echo "🔔 Llama repository already exists at $LLAMA_REPO" + fi + + # Step 3: Check if weights are already downloaded + if [ -d "${LLAMA_WEIGHTS_DIR}" ] && [ "$(ls -A "${LLAMA_WEIGHTS_DIR}")" ]; then + echo "Weights already downloaded at ${LLAMA_WEIGHTS_DIR}" + echo "Skipping download." + else + echo "Running the download script to download models at ${LLAMA_DIR}/download.sh ..." + cd "$LLAMA_DIR" + ./download.sh + cd - + fi + + if [ "${REPACKED}" -eq 1 ]; then + WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + repack_weights "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" + else + WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${MODEL_NAME}" + cp -rf "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" + fi + + echo "using weights directory: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + echo "✅ setup_weights_meta completed!" +} + +setup_weights_huggingface() { + # Step 1: Verify HF_TOKEN and HF_HOME are set + if [ -z "${HF_TOKEN:-}" ] || [ -z "${HOST_HF_HOME:-}" ]; then + echo "⛔ HF_TOKEN or HF_HOME not set. Please ensure both environment variables are set." + exit 1 + fi + + # Step 3: Create python virtual environment for huggingface downloads + VENV_NAME=".venv_hf_setup" + echo "Setting up python venv for Hugging Face downloads: ${VENV_NAME}" + python3 -m venv ${VENV_NAME} + source ${VENV_NAME}/bin/activate + + # Step 4: Install required packages + pip install --upgrade pip setuptools wheel + pip install "huggingface_hub[cli]" + + # Step 5: Download model using huggingface-cli + echo "Downloading model from Hugging Face Hub..." 
+ # stop timeout issue: https://huggingface.co/docs/huggingface_hub/en/guides/cli#download-timeout + export HF_HUB_DOWNLOAD_TIMEOUT=60 + # using default HF naming convention for model weights + huggingface-cli download "${HF_MODEL_REPO_ID}" \ + original/params.json \ + original/tokenizer.model \ + original/consolidated.* \ + --cache-dir="${HOST_HF_HOME}" \ + --token="${HF_TOKEN}" + + if [ $? -ne 0 ]; then + echo "⛔ Error occured during: huggingface-cli download ${HF_MODEL_REPO_ID}" + echo "🔔 check for common issues:" + echo " 1. 401 Unauthorized error occurred." + echo " For example:" + echo " huggingface_hub.errors.GatedRepoError: 401 Client Error. Cannot access gated repo" + echo " ❗ In this case, go to the repo URL in your web browser and click through the access request form." + echo " 2. check correct HF_TOKEN is set in the .env file: ${ENV_FILE}" + exit 1 + fi + + # symlinks are broken for huggingface-cli download with --local-dir option + # see: https://github.com/huggingface/huggingface_hub/pull/2223 + # to use symlinks, find most recent snapshot and create symlink to that + mkdir -p "${WEIGHTS_DIR}" + LOCAL_REPO_NAME=$(echo "${HF_MODEL_REPO_ID}" | sed 's|/|--|g') + SNAPSHOT_DIR="${HOST_HF_HOME}/models--${LOCAL_REPO_NAME}/snapshots" + # note: ls -td will sort by modification date descending, potential edge case + # if desired snapshot is not most recent modified or ls sorts differently + MOST_RECENT_SNAPSHOT=$(ls -td -- ${SNAPSHOT_DIR}/* | head -n 1) + echo "create symlink: ${MOST_RECENT_SNAPSHOT}/original/ -> ${WEIGHTS_DIR}" + for item in ${MOST_RECENT_SNAPSHOT}/original/*; do + ln -s "$item" "${WEIGHTS_DIR}" + done + + # Step 6: Process and copy weights + if [ "${REPACKED}" -eq 1 ]; then + REPACKED_WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + mkdir -p "${REPACKED_WEIGHTS_DIR}" + repack_weights "${WEIGHTS_DIR}" "${REPACKED_WEIGHTS_DIR}" + fi + + # Step 7: Cleanup + deactivate + rm -rf ${VENV_NAME} + + echo "using weights directory: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + echo "✅ setup_weights_huggingface completed!" +} + +setup_tt_metal_cache() { + # check if tt_metal_cache already exists + TT_METAL_CACHE_DIR="${PERSISTENT_VOLUME}/tt_metal_cache/cache_${REPACKED_STR}$MODEL_NAME" + if [ -d "${TT_METAL_CACHE_DIR}" ]; then + echo "✅ tt_metal_cache already exists at: ${TT_METAL_CACHE_DIR}." + return 0 + fi + + # create tt_metal_cache directory + mkdir -p "${TT_METAL_CACHE_DIR}" + echo "✅ setup_tt_metal_cache completed!" +} + +setup_weights() { + load_env + + # check if model weights already exist + if [ -d "${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" ]; then + echo "Model weights already exist at: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + echo "🔔 check if directory contents are correct." + echo "contents:" + echo "ls -lh ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + echo "$(ls -lh ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME})" + echo + echo "If directory does not have correct weights, to re-download or copy the model weights delete the directory." 
+ else + echo "Setting up persistent volume root: ${PERSISTENT_VOLUME}" + mkdir -p "${PERSISTENT_VOLUME}/model_weights/" + # Determine which setup method to use based on HF_TOKEN presence + if [ "${USE_HF_DOWNLOAD}" == "y" ]; then + setup_weights_huggingface + else + setup_weights_meta + fi + fi + + setup_tt_metal_cache +} + +# ============================================================================== +# Main script logic +# ============================================================================== + +# Ensure script is being executed, not sourced +if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then + echo "⛔ Error: This script is being sourced. Please make execute it:" + echo "chmod +x ./setup.sh && ./setup.sh" + set +euo pipefail # Unset 'set -euo pipefail' when sourcing so it doesnt exit or mess up sourcing shell + return 1; # 'return' works when sourced; 'exit' would terminate the shell +fi + +if [ $# -lt 1 ]; then + usage +fi + +# Set up environment variables for the chosen model +MODEL_TYPE=$1 +setup_model_environment "$MODEL_TYPE" +setup_weights +# Call the script again with sudo to execute the sudo-required commands +echo "Switching to sudo portion to set file permissions and complete setup." +setup_permissions diff --git a/tests/mock.vllm.openai.api.dockerfile b/tests/mock.vllm.openai.api.dockerfile index 2f027be0..9b4e94e6 100644 --- a/tests/mock.vllm.openai.api.dockerfile +++ b/tests/mock.vllm.openai.api.dockerfile @@ -95,8 +95,8 @@ RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install compresse ARG APP_DIR="${HOME_DIR}/app" WORKDIR ${APP_DIR} ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR} -COPY --chown=user:user "vllm-tt-metal-llama3-70b/src" "${APP_DIR}/src" -COPY --chown=user:user "vllm-tt-metal-llama3-70b/requirements.txt" "${APP_DIR}/requirements.txt" +COPY --chown=user:user "vllm-tt-metal-llama3/src" "${APP_DIR}/src" +COPY --chown=user:user "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt" COPY --chown=user:user "utils" "${APP_DIR}/utils" COPY --chown=user:user "tests" "${APP_DIR}/tests" RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ diff --git a/vllm-tt-metal-llama3-70b/setup.sh b/vllm-tt-metal-llama3-70b/setup.sh deleted file mode 100755 index ee102dff..00000000 --- a/vllm-tt-metal-llama3-70b/setup.sh +++ /dev/null @@ -1,388 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: Apache-2.0 -# -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -set -euo pipefail # Exit on error, print commands, unset variables treated as errors, and exit on pipeline failure - -# Function to display usage information -usage() { - echo "Usage: $0 " - echo "Available model types:" - echo " llama-3.1-70b-instruct" - echo " llama-3.1-70b" - echo " llama-3.1-8b-instruct" - echo " llama-3.1-8b" - echo " llama-3-70b-instruct" - echo " llama-3-70b" - echo " llama-3-8b-instruct" - echo " llama-3-8b" - echo - echo "Options:" - echo " setup_permissions Run the script to set file permissions after first run setup (requires sudo)." 
- exit 1 -} - -# globals -readonly MODEL_PATH=$(dirname "$(realpath "$0")") -readonly REPO_ROOT=$(dirname "${MODEL_PATH}") -readonly ENV_FILE="${MODEL_PATH}/.env" -echo "REPO_ROOT: ${REPO_ROOT}" -echo "MODEL_PATH: ${MODEL_PATH}" -echo "ENV_FILE: ${ENV_FILE}" - -check_and_prompt_env_file() { - local MODEL_NAME_KEY="MODEL_NAME" - local MODEL_NAME="" - - # Check if .env file exists - if [[ -f "$ENV_FILE" ]]; then - # Extract the MODEL_NAME value from .env - FOUND_MODEL_NAME=$(grep "^$MODEL_NAME_KEY=" "$ENV_FILE" | cut -d '=' -f2) - - # If MODEL_NAME is found, display it - if [[ -n "$FOUND_MODEL_NAME" ]]; then - echo "The existing file ${ENV_FILE} contains MODEL_NAME: $FOUND_MODEL_NAME" - # Prompt the user to overwrite or exit - local choice="" - read -p "Do you want to overwrite the existing file ${ENV_FILE}? (y/n) [default: y]:" choice - choice=${choice:-y} - # Handle user's choice - case "$choice" in - y|Y ) - echo "Overwriting the ${ENV_FILE} file ..." - # Logic to overwrite .env goes here - OVERWRITE_ENV=true - ;; - n|N ) - OVERWRITE_ENV=false - ;; - * ) - echo "⛔ Invalid option. Exiting." - exit 1 - ;; - esac - else - echo "MODEL_NAME not found in ${ENV_FILE}. Overwritting." - OVERWRITE_ENV=true - fi - - else - echo "${ENV_FILE} does not exist. Proceeding to create a new one." - OVERWRITE_ENV=true - fi -} - - -# Function to set environment variables based on the model selection and write them to .env -setup_model_environment() { - # Set default values for environment variables - DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume - DEFAULT_LLAMA_REPO=~/llama-models - # Set environment variables based on the model selection - case "$1" in - "llama-3.1-70b-instruct") - MODEL_NAME="llama-3.1-70b-instruct" - META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" - META_DIR_FILTER="llama3_1" - REPACKED=1 - ;; - "llama-3.1-70b") - MODEL_NAME="llama-3.1-70b" - META_MODEL_NAME="Meta-Llama-3.1-70B" - META_DIR_FILTER="llama3_1" - REPACKED=1 - ;; - "llama-3.1-8b-instruct") - MODEL_NAME="llama-3.1-8b-instruct" - META_MODEL_NAME="Meta-Llama-3.1-8B-Instruct" - META_DIR_FILTER="llama3_1" - REPACKED=0 - ;; - "llama-3.1-8b") - MODEL_NAME="llama-3.1-8b" - META_MODEL_NAME="Meta-Llama-3.1-8B" - META_DIR_FILTER="llama3_1" - REPACKED=0 - ;; - "llama-3-70b-instruct") - MODEL_NAME="llama-3-70b-instruct" - META_MODEL_NAME="Meta-Llama-3-70B-Instruct" - META_DIR_FILTER="llama3" - REPACKED=1 - ;; - "llama-3-70b") - MODEL_NAME="llama-3-70b" - META_MODEL_NAME="Meta-Llama-3-70B" - META_DIR_FILTER="llama3" - REPACKED=1 - ;; - "llama-3-8b-instruct") - MODEL_NAME="llama-3-8b-instruct" - META_MODEL_NAME="Meta-Llama-3-8B-Instruct" - META_DIR_FILTER="llama3" - REPACKED=0 - ;; - "llama-3-8b") - MODEL_NAME="llama-3-8b" - META_MODEL_NAME="Meta-Llama-3-8B" - META_DIR_FILTER="llama3" - REPACKED=0 - ;; - *) - echo "⛔ Invalid model choice." - usage - exit 1 - ;; - esac - - # Initialize OVERWRITE_ENV - OVERWRITE_ENV=false - - check_and_prompt_env_file - - if [ "$OVERWRITE_ENV" = false ]; then - echo "✅ using existing .env file: ${ENV_FILE}." 
- return 0 - fi - - # Safely handle potentially unset environment variables using default values - PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT} - LLAMA_REPO=${LLAMA_REPO:-$DEFAULT_LLAMA_REPO} - - # Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default - read -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT - PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT} - echo - # Prompt user for LLAMA_REPO if not already set or use default - read -p "Enter the path where you want to clone the Llama model repository [default: ${LLAMA_REPO}]: " INPUT_LLAMA_REPO - LLAMA_REPO=${INPUT_LLAMA_REPO:-$LLAMA_REPO} - echo # move to a new line after input - - # Set environment variables with defaults if not already set - LLAMA_DIR=${LLAMA_DIR:-${LLAMA_REPO}/models/${META_DIR_FILTER}} - LLAMA_WEIGHTS_DIR=${LLAMA_WEIGHTS_DIR:-${LLAMA_DIR}/${META_MODEL_NAME}} - PERSISTENT_VOLUME=${PERSISTENT_VOLUME:-${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1} - - # Prompt user for JWT_SECRET securely - read -sp "Enter your JWT_SECRET: " JWT_SECRET - echo # move to a new line after input - # Verify the JWT_SECRET is not empty - if [ -z "$JWT_SECRET" ]; then - echo "⛔ JWT_SECRET cannot be empty. Please try again." - exit 1 - fi - - if [ "${REPACKED}" -eq 1 ]; then - echo "REPACKED is enabled." - REPACKED_STR="repacked-" - else - echo "REPACKED is disabled." - REPACKED_STR="" - fi - - # Write environment variables to .env file - echo "Writing environment variables to ${ENV_FILE} ..." - cat > ${ENV_FILE} < /dev/null 2>&1; then - echo "Creating group 'dockermount' ..." - sudo groupadd dockermount - else - echo "Group 'dockermount' already exists." - fi - - # Add host user to 'dockermount' group - echo "Adding user: '$USER' to 'dockermount' group ..." - sudo usermod -aG dockermount "$USER" - - # Get container user with UID 1000 and add to group - CONTAINER_USER=$(getent passwd 1000 | cut -d: -f1) - if [ -n "$CONTAINER_USER" ]; then - echo "Adding container user: '$CONTAINER_USER' (UID 1000) to 'dockermount' group ..." - sudo usermod -aG dockermount "$CONTAINER_USER" - else - echo "No user found with UID 1000." - fi - - # Set file ownership and permissions - echo "Setting file ownership and permissions for container and host access ..." - if [ ! -d "${PERSISTENT_VOLUME}" ]; then - # if the user point the PERSISTENT_VOLUME - sudo mkdir -p "${PERSISTENT_VOLUME}" - fi - sudo chown -R ${CONTAINER_USER}:dockermount "${PERSISTENT_VOLUME}" - sudo chmod -R 775 "${PERSISTENT_VOLUME}" - - echo "✅ setup_permissions completed!" -} - -setup_weights() { - # St`ep 1: Load environment variables from .env file - load_env - - # check if model weights already exist - if [ -d "${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" ]; then - echo "Model weights already exist at: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}." - echo "contents:" - echo - echo "$(ls -lh ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME})" - echo - echo "If directory does not have correct weigths, to re-download or copy the model weights delete the directory." - echo "✅ Model weights setup is already complete, check if directory contents are correct." 
- return 0 - fi - - # TODO: support HF_TOKEN for downloading models - # Step 2: Set up Llama model repository path - echo "Using repository path: $LLAMA_REPO" - - # Step 3: Clone the repository (if it doesn't already exist) - if [ ! -d "$LLAMA_REPO" ]; then - echo "Cloning the Llama repository to: $LLAMA_REPO" - git clone https://github.com/meta-llama/llama-models.git "$LLAMA_REPO" - cd "$LLAMA_REPO" - # checkout commit before ./download.sh was removed - git checkout 685ac4c107c75ce8c291248710bf990a876e1623 - else - echo "🔔 Llama repository already exists at $LLAMA_REPO" - fi - - # Step 4: Check if weights are already downloaded - if [ -d "${LLAMA_WEIGHTS_DIR}" ] && [ "$(ls -A "${LLAMA_WEIGHTS_DIR}")" ]; then - echo "Weights already downloaded at ${LLAMA_WEIGHTS_DIR}" - echo "Skipping download." - else - # Step 5: Run the download script and select models - echo "Running the download script to download models at ${LLAMA_DIR}/download.sh ..." - cd "$LLAMA_DIR" - ./download.sh - cd - - fi - - # Step 6: Set up persistent volume root - echo "Setting up persistent volume root: ${PERSISTENT_VOLUME}" - mkdir -p "${PERSISTENT_VOLUME}/model_weights/" - - # Step 7: Create directories for weights, tokenizer, and params - echo "Create directories for weights, tokenizer, and params." - - if [ "${REPACKED}" -eq 1 ]; then - WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" - mkdir -p "${WEIGHTS_DIR}" - cp "${LLAMA_WEIGHTS_DIR}/tokenizer.model" "${WEIGHTS_DIR}/tokenizer.model" - cp "${LLAMA_WEIGHTS_DIR}/params.json" "${WEIGHTS_DIR}/params.json" - # Step 8: repack weights into repacked dir once instead of copying them - VENV_NAME="venv_setup" - echo "setting up repacking python venv: ${VENV_NAME}" - python3 -m venv ${VENV_NAME} - source ${VENV_NAME}/bin/activate - # pip==21.2.4 is needed to avoid the following error: - # ERROR: Package 'networkx' requires a different Python: 3.8.10 not in '>=3.9' - pip install --upgrade setuptools wheel pip==21.2.4 tqdm - # repack script dependency - # pip does not support +cpu build variant qualifier, need to specify cpu index url - pip install --index-url https://download.pytorch.org/whl/cpu torch==2.2.1 - curl -O https://raw.githubusercontent.com/tenstorrent/tt-metal/refs/heads/main/models/demos/t3000/llama2_70b/scripts/repack_weights.py - echo "repacking weights..." - python repack_weights.py "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" 5 - deactivate - rm -rf ${VENV_NAME} repack_weights.py - else - WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${MODEL_NAME}" - cp -rf "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" - - fi - - echo "using weights directory: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" - - # create a tmp python venv with dependencies to run repack script - echo "✅ setup_weights completed!" -} - -setup_tt_metal_cache() { - # check if tt_metal_cache already exists - TT_METAL_CACHE_DIR="${PERSISTENT_VOLUME}/tt_metal_cache/cache_${REPACKED_STR}$MODEL_NAME" - if [ -d "${TT_METAL_CACHE_DIR}" ]; then - echo "✅ tt_metal_cache already exists at: ${TT_METAL_CACHE_DIR}." - return 0 - fi - - # create tt_metal_cache directory - mkdir -p "${TT_METAL_CACHE_DIR}" - echo "✅ setup_tt_metal_cache completed!" -} - -# Ensure script is being executed, not sourced -if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then - echo "⛔ Error: This script is being sourced. 
Please make execute it:" - echo "chmod +x ./setup.sh && ./setup.sh" - set +euo pipefail # Unset 'set -euo pipefail' when sourcing so it doesnt exit or mess up sourcing shell - return 1; # 'return' works when sourced; 'exit' would terminate the shell -fi - -# Main script logic -if [ $# -lt 1 ]; then - usage -fi - -if [ "$1" == "setup_permissions" ]; then - setup_permissions - exit 0 -fi - -# Set up environment variables for the chosen model -MODEL_TYPE=$1 -setup_model_environment "$MODEL_TYPE" -setup_weights -setup_tt_metal_cache -# Call the script again with sudo to execute the sudo-required commands -echo "Switching to sudo portion to set file permissions and complete setup." -setup_permissions diff --git a/vllm-tt-metal-llama3-70b/README.md b/vllm-tt-metal-llama3/README.md similarity index 90% rename from vllm-tt-metal-llama3-70b/README.md rename to vllm-tt-metal-llama3/README.md index 3fa10b39..525281f6 100644 --- a/vllm-tt-metal-llama3-70b/README.md +++ b/vllm-tt-metal-llama3/README.md @@ -25,18 +25,18 @@ Run the container from the project root at `tt-inference-server`: ```bash cd tt-inference-server # make sure if you already set up the model weights and cache you use the correct persistent volume -export PERSISTENT_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b-instructv0.0.1/ +export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b-instructv0.0.1/ docker run \ --rm \ -it \ - --env-file vllm-tt-metal-llama3-70b/.env \ + --env-file persistent_volume/model_envs/llama-3.1-70b-instruct.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ - --volume ${PERSISTENT_VOLUME?ERROR env var PERSISTENT_VOLUME must be set}:/home/user/cache_root:rw \ + --volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/user/cache_root:rw \ --shm-size 32G \ --publish 7000:7000 \ - ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.2-tt-metal-385904186f81-384f1790c3be + ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-v0.54.0-rc2-953161188c50 ``` By default the Docker container will start running the entrypoint command wrapped in `src/run_vllm_api_server.py`. @@ -106,16 +106,16 @@ Either download the Docker image from GitHub Container Registry (recommended for ```bash # pull image from GHCR -docker pull ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.2-tt-metal-385904186f81-384f1790c3be +docker pull ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-v0.54.0-rc2-953161188c50 ``` #### Option B: Build Docker Image -For instructions on building the Docker imagem locally see: [vllm-tt-metal-llama3-70b/docs/development](../vllm-tt-metal-llama3-70b/docs/development.md#step-1-build-docker-image) +For instructions on building the Docker imagem locally see: [vllm-tt-metal-llama3/docs/development](../vllm-tt-metal-llama3/docs/development.md#step-1-build-docker-image) ### 5. Automated Setup: environment variables and weights files -The script `vllm-tt-metal-llama3-70b/setup.sh` automates: +The script `setup.sh` automates: 1. interactively creating the .env file, 2. downloading the Llama model weights, @@ -123,7 +123,7 @@ The script `vllm-tt-metal-llama3-70b/setup.sh` automates: 4. creating the default persistent storage directory structure and permissions. 
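As a hedged convenience sketch (not part of this patch): the script only prompts for Hugging Face values it cannot find in the environment, so exporting `HF_TOKEN` and `HF_HOME` beforehand skips those two prompts. The other prompts (persistent volume root, download method, and `JWT_SECRET`) remain interactive. The token value below is a placeholder.

```bash
# Sketch: pre-set the Hugging Face variables so setup.sh does not prompt for them.
# Placeholder token; the script's prompt expects real tokens to start with "hf_".
export HF_TOKEN=hf_xxxxxxxxxxxxxxxx
# HF_HOME should be an existing, writable directory (default: ~/.cache/huggingface).
export HF_HOME="$HOME/.cache/huggingface"
mkdir -p "$HF_HOME"
```

The script itself is then invoked as shown in the next block.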
```bash -cd tt-inference-server/vllm-tt-metal-llama3-70b +cd tt-inference-server chmod +x setup.sh ./setup.sh llama-3.1-70b-instruct ``` diff --git a/vllm-tt-metal-llama3-70b/docs/development.md b/vllm-tt-metal-llama3/docs/development.md similarity index 78% rename from vllm-tt-metal-llama3-70b/docs/development.md rename to vllm-tt-metal-llama3/docs/development.md index 55d8b1d3..f7dd6c15 100644 --- a/vllm-tt-metal-llama3-70b/docs/development.md +++ b/vllm-tt-metal-llama3/docs/development.md @@ -1,6 +1,6 @@ -# Development vllm-tt-metal-llama3-70B +# Development vllm-tt-metal-llama3 -Containerization in: https://github.com/tenstorrent/tt-inference-server/blob/tstesco/vllm-llama3-70b/vllm-tt-metal-llama3-70b/vllm.llama3.src.base.inference.v0.52.0.Dockerfile +Containerization in: https://github.com/tenstorrent/tt-inference-server/blob/tstesco/vllm-llama3-70b/vllm-tt-metal-llama3/vllm.llama3.src.base.inference.v0.52.0.Dockerfile tt-metal and vLLM are under active development in lock-step: https://github.com/tenstorrent/vllm/tree/dev/tt_metal @@ -13,23 +13,24 @@ When building, update the commit SHA and get correct SHA from model developers o # set build context to repo root cd tt-inference-server # build image -export TT_METAL_DOCKERFILE_VERSION=v0.53.0-rc34 -export TT_METAL_COMMIT_SHA_OR_TAG=385904186f81fed15d5c87c162221d4f34387164 +export TT_METAL_DOCKERFILE_VERSION=v0.53.0 +export TT_METAL_COMMIT_SHA_OR_TAG=v0.54.0-rc2 export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12} -export TT_VLLM_COMMIT_SHA_OR_TAG=384f1790c3be16e1d1b10de07252be2e66d00935 +export TT_VLLM_COMMIT_SHA_OR_TAG=953161188c50f10da95a88ab305e23977ebd3750 export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12} -export IMAGE_VERSION=v0.0.3 +export OS_VERSION=ubuntu-20.04-amd64 +export IMAGE_VERSION=v0.0.1 docker build \ - -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:${IMAGE_VERSION}-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \ + -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm-${OS_VERSION}:${IMAGE_VERSION}-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \ --build-arg TT_METAL_DOCKERFILE_VERSION=${TT_METAL_DOCKERFILE_VERSION} \ --build-arg TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} \ --build-arg TT_VLLM_COMMIT_SHA_OR_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG} \ - . -f vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile + . 
-f vllm-tt-metal-llama3/vllm.llama3.src.${OS_VERSION}.Dockerfile ``` ### push image (only for admin deployment to GHCR) ```bash -docker push ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:${IMAGE_VERSION}-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} +docker push ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm-${OS_VERSION}:${IMAGE_VERSION}-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} ``` ## Step 2: Run container for LM evals development @@ -38,15 +39,15 @@ note: this requires running `setup.sh` to set up the weights for a particular mo ```bash cd tt-inference-server -export PERSISTENT_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b-instructv0.0.1/ +export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b-instructv0.0.1/ docker run \ --rm \ -it \ - --env-file tt-metal-llama3-70b/.env \ + --env-file persistent_volume/model_envs/llama-3.1-70b-instruct.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ - --volume ${PERSISTENT_VOLUME?ERROR env var PERSISTENT_VOLUME must be set}:/home/user/cache_root:rw \ + --volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/user/cache_root:rw \ --shm-size 32G \ ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} bash ``` diff --git a/vllm-tt-metal-llama3-70b/requirements.txt b/vllm-tt-metal-llama3/requirements.txt similarity index 100% rename from vllm-tt-metal-llama3-70b/requirements.txt rename to vllm-tt-metal-llama3/requirements.txt diff --git a/vllm-tt-metal-llama3-70b/src/__init__.py b/vllm-tt-metal-llama3/src/__init__.py similarity index 100% rename from vllm-tt-metal-llama3-70b/src/__init__.py rename to vllm-tt-metal-llama3/src/__init__.py diff --git a/vllm-tt-metal-llama3-70b/src/example_openai_client_alpaca_eval.py b/vllm-tt-metal-llama3/src/example_openai_client_alpaca_eval.py similarity index 100% rename from vllm-tt-metal-llama3-70b/src/example_openai_client_alpaca_eval.py rename to vllm-tt-metal-llama3/src/example_openai_client_alpaca_eval.py diff --git a/vllm-tt-metal-llama3-70b/src/example_requests_client_alpaca_eval.py b/vllm-tt-metal-llama3/src/example_requests_client_alpaca_eval.py similarity index 100% rename from vllm-tt-metal-llama3-70b/src/example_requests_client_alpaca_eval.py rename to vllm-tt-metal-llama3/src/example_requests_client_alpaca_eval.py diff --git a/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py b/vllm-tt-metal-llama3/src/run_vllm_api_server.py similarity index 62% rename from vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py rename to vllm-tt-metal-llama3/src/run_vllm_api_server.py index 992874b1..dfe08930 100644 --- a/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py +++ b/vllm-tt-metal-llama3/src/run_vllm_api_server.py @@ -12,11 +12,14 @@ from utils.logging_utils import set_vllm_logging_config -# importing from tt-metal install path -from models.demos.t3000.llama2_70b.tt.llama_generation import TtLlamaModelForGeneration +# Import and register models from tt-metal +from models.demos.t3000.llama2_70b.tt.generator_vllm import TtLlamaForCausalLM +from models.demos.llama3.tt.generator_vllm import TtMllamaForConditionalGeneration -# register the model -ModelRegistry.register_model("TTLlamaForCausalLM", TtLlamaModelForGeneration) +ModelRegistry.register_model("TTLlamaForCausalLM", TtLlamaForCausalLM) 
+ModelRegistry.register_model( + "TTMllamaForConditionalGeneration", TtMllamaForConditionalGeneration +) def get_encoded_api_key(jwt_secret): @@ -27,7 +30,43 @@ def get_encoded_api_key(jwt_secret): return encoded_jwt +def get_hf_model_id(): + model = os.environ.get("HF_MODEL_REPO_ID") + if not model: + print("Must set environment variable: HF_MODEL_REPO_ID") + sys.exit() + return model + + +def model_setup(hf_model_id): + # TODO: check HF repo access with HF_TOKEN supplied + print(f"using model: {hf_model_id}") + args = { + "model": hf_model_id, + "block_size": "64", + "max_num_seqs": "32", + "max_model_len": "131072", + "max_num_batched_tokens": "131072", + "num_scheduler_steps": "10", + "max-log-len": "64", + "port": os.getenv("SERVICE_PORT", "7000"), + "download-dir": os.getenv("CACHE_DIR", None), + "api-key": get_encoded_api_key(os.getenv("JWT_SECRET", None)), + } + if hf_model_id == "meta-llama/Llama-3.2-11B-Vision-Instruct": + if os.environ.get("MESH_DEVICE") is None: + os.environ["MESH_DEVICE"] = "N300" + else: + assert os.environ["MESH_DEVICE"] in [ + "N300", + "T3K_LINE", + ], "Invalid MESH_DEVICE for multi-modal inference" + + return args + + def main(): + hf_model_id = get_hf_model_id() # set up logging config_path, log_path = set_vllm_logging_config(level="DEBUG") print(f"setting vllm logging config at: {config_path}") @@ -41,18 +80,7 @@ def main(): # timeout is 3x VLLM_RPC_TIMEOUT os.environ["VLLM_RPC_TIMEOUT"] = "200000" # 200000ms = 200s # vLLM CLI arguments - args = { - "model": "meta-llama/Llama-3.1-70B-Instruct", - "block_size": "64", - "max_num_seqs": "32", - "max_model_len": "131072", - "max_num_batched_tokens": "131072", - "num_scheduler_steps": "10", - "max-log-len": "32", - "port": os.getenv("SERVICE_PORT", "7000"), - "download-dir": os.getenv("CACHE_DIR", None), - "api-key": get_encoded_api_key(os.getenv("JWT_SECRET", None)), - } + args = model_setup(hf_model_id) for key, value in args.items(): if value is not None: sys.argv.extend(["--" + key, value]) diff --git a/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile similarity index 89% rename from vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile rename to vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile index 2184d356..4f2dce30 100644 --- a/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile +++ b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile @@ -90,18 +90,21 @@ RUN git clone https://github.com/tenstorrent/vllm.git ${vllm_dir}\ && cd ${vllm_dir} && git checkout ${TT_VLLM_COMMIT_SHA_OR_TAG} \ && /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install -e ." 
-# extra vllm dependencies -RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install compressed-tensors" +# extra vllm and model dependencies +RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ + && pip install compressed-tensors \ + && pip install -r /tt-metal/models/demos/llama3/requirements.txt" ARG APP_DIR="${HOME_DIR}/app" WORKDIR ${APP_DIR} ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR} -COPY --chown=user:user "vllm-tt-metal-llama3-70b/src" "${APP_DIR}/src" -COPY --chown=user:user "vllm-tt-metal-llama3-70b/requirements.txt" "${APP_DIR}/requirements.txt" +COPY --chown=user:user "vllm-tt-metal-llama3/src" "${APP_DIR}/src" +COPY --chown=user:user "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt" COPY --chown=user:user "utils" "${APP_DIR}/utils" COPY --chown=user:user "benchmarking" "${APP_DIR}/benchmarking" COPY --chown=user:user "evals" "${APP_DIR}/evals" COPY --chown=user:user "tests" "${APP_DIR}/tests" +COPY --chown=user:user "locust" "${APP_DIR}/locust" RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ && pip install --default-timeout=240 --no-cache-dir -r requirements.txt" diff --git a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile new file mode 100644 index 00000000..2e87a84c --- /dev/null +++ b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# default base image, override with --build-arg TT_METAL_DOCKERFILE_VERSION= +ARG TT_METAL_DOCKERFILE_VERSION + +FROM local/tt-metal/tt-metalium/ubuntu-22.04-amd64:$TT_METAL_DOCKERFILE_VERSION + +# Build stage +LABEL maintainer="Tom Stesco " +# connect Github repo with package +LABEL org.opencontainers.image.source=https://github.com/tenstorrent/tt-inference-server + +ARG DEBIAN_FRONTEND=noninteractive +# default commit sha, override with --build-arg TT_METAL_COMMIT_SHA_OR_TAG= +ARG TT_METAL_COMMIT_SHA_OR_TAG +ARG TT_VLLM_COMMIT_SHA_OR_TAG=dev + +# make build commit SHA available in the image for reference and debugging +ENV TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} +ENV SHELL=/bin/bash +ENV TZ=America/Los_Angeles +# tt-metal build vars +ENV ARCH_NAME=wormhole_b0 +ENV TT_METAL_HOME=/tt-metal +ENV CONFIG=Release +ENV TT_METAL_ENV=dev +ENV LOGURU_LEVEL=INFO +# derived vars +ENV PYTHONPATH=${TT_METAL_HOME} +# note: PYTHON_ENV_DIR is used by create_venv.sh +ENV PYTHON_ENV_DIR=${TT_METAL_HOME}/python_env +ENV LD_LIBRARY_PATH=${TT_METAL_HOME}/build/lib + +# extra system deps +RUN apt-get update && apt-get install -y \ + libsndfile1 \ + wget \ + nano \ + acl \ + jq \ + vim \ + # user deps + htop \ + screen \ + tmux \ + unzip \ + zip \ + curl \ + iputils-ping \ + rsync \ + && rm -rf /var/lib/apt/lists/* + +# build tt-metal +RUN git clone https://github.com/tenstorrent-metal/tt-metal.git ${TT_METAL_HOME} \ + && cd ${TT_METAL_HOME} \ + && git checkout ${TT_METAL_COMMIT_SHA_OR_TAG} \ + && git submodule update --init --recursive \ + && git submodule foreach 'git lfs fetch --all && git lfs pull' \ + && bash ./build_metal.sh \ + && bash ./create_venv.sh + +# user setup +ARG HOME_DIR=/home/user +RUN useradd -u 1000 -s /bin/bash -d ${HOME_DIR} user \ + && mkdir -p ${HOME_DIR} \ + && chown -R user:user ${HOME_DIR} \ + && chown -R user:user ${TT_METAL_HOME} + +USER user + +# tt-metal python env default +RUN echo "source ${PYTHON_ENV_DIR}/bin/activate" >> ${HOME_DIR}/.bashrc + +# 
install tt-smi +RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ + && pip3 install --upgrade pip \ + && pip3 install git+https://github.com/tenstorrent/tt-smi" + +# runtime required for tt-metal on WH +ENV WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml + +WORKDIR ${HOME_DIR} +# vllm install, see: https://github.com/tenstorrent/vllm/blob/dev/tt_metal/README.md +ENV vllm_dir=${HOME_DIR}/vllm +ENV PYTHONPATH=${PYTHONPATH}:${vllm_dir} +ENV VLLM_TARGET_DEVICE="tt" +RUN git clone https://github.com/tenstorrent/vllm.git ${vllm_dir}\ + && cd ${vllm_dir} && git checkout ${TT_VLLM_COMMIT_SHA_OR_TAG} \ + && /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install -e ." + +# extra vllm and model dependencies +RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ + && pip install compressed-tensors \ + && pip install -r /tt-metal/models/demos/llama3/requirements.txt" + +ARG APP_DIR="${HOME_DIR}/app" +WORKDIR ${APP_DIR} +ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR} +COPY --chown=user:user "vllm-tt-metal-llama3/src" "${APP_DIR}/src" +COPY --chown=user:user "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt" +COPY --chown=user:user "utils" "${APP_DIR}/utils" +COPY --chown=user:user "benchmarking" "${APP_DIR}/benchmarking" +COPY --chown=user:user "evals" "${APP_DIR}/evals" +COPY --chown=user:user "tests" "${APP_DIR}/tests" +COPY --chown=user:user "locust" "${APP_DIR}/locust" +RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ +&& pip install --default-timeout=240 --no-cache-dir -r requirements.txt" + +WORKDIR "${APP_DIR}/src" +CMD ["/bin/bash", "-c", "source ${PYTHON_ENV_DIR}/bin/activate && python run_vllm_api_server.py"] From d850b73e90e6ea2c3f3fa399c1382d0aa425bd3b Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 14 Jan 2025 19:17:20 -0500 Subject: [PATCH 2/3] use vllm.llama3.src.shared.Dockerfile for shared build steps for ubuntu 22.04 and 20.04 Dockerfiles --- .../vllm.llama3.src.shared.Dockerfile | 107 +++++++++++++++++ ...m.llama3.src.ubuntu-20.04-amd64.Dockerfile | 106 +---------------- ...m.llama3.src.ubuntu-22.04-amd64.Dockerfile | 109 +----------------- 3 files changed, 115 insertions(+), 207 deletions(-) create mode 100644 vllm-tt-metal-llama3/vllm.llama3.src.shared.Dockerfile diff --git a/vllm-tt-metal-llama3/vllm.llama3.src.shared.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.shared.Dockerfile new file mode 100644 index 00000000..c9532dcc --- /dev/null +++ b/vllm-tt-metal-llama3/vllm.llama3.src.shared.Dockerfile @@ -0,0 +1,107 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# shared build stage, FROM is set by the OS specific Dockerfiles +LABEL maintainer="Tom Stesco " +# connect Github repo with package +LABEL org.opencontainers.image.source=https://github.com/tenstorrent/tt-inference-server + +ARG DEBIAN_FRONTEND=noninteractive +# default commit sha, override with --build-arg TT_METAL_COMMIT_SHA_OR_TAG= +ARG TT_METAL_COMMIT_SHA_OR_TAG +ARG TT_VLLM_COMMIT_SHA_OR_TAG=dev + +# make build commit SHA available in the image for reference and debugging +ENV TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} +ENV SHELL=/bin/bash +ENV TZ=America/Los_Angeles +# tt-metal build vars +ENV ARCH_NAME=wormhole_b0 +ENV TT_METAL_HOME=/tt-metal +ENV CONFIG=Release +ENV TT_METAL_ENV=dev +ENV LOGURU_LEVEL=INFO +# derived vars +ENV PYTHONPATH=${TT_METAL_HOME} +# note: PYTHON_ENV_DIR is used by create_venv.sh +ENV PYTHON_ENV_DIR=${TT_METAL_HOME}/python_env +ENV 
LD_LIBRARY_PATH=${TT_METAL_HOME}/build/lib + +# extra system deps +RUN apt-get update && apt-get install -y \ + libsndfile1 \ + wget \ + nano \ + acl \ + jq \ + vim \ + # user deps + htop \ + screen \ + tmux \ + unzip \ + zip \ + curl \ + iputils-ping \ + rsync \ + && rm -rf /var/lib/apt/lists/* + +# build tt-metal +RUN git clone https://github.com/tenstorrent-metal/tt-metal.git ${TT_METAL_HOME} \ + && cd ${TT_METAL_HOME} \ + && git checkout ${TT_METAL_COMMIT_SHA_OR_TAG} \ + && git submodule update --init --recursive \ + && git submodule foreach 'git lfs fetch --all && git lfs pull' \ + && bash ./build_metal.sh \ + && bash ./create_venv.sh + +# user setup +ARG HOME_DIR=/home/user +RUN useradd -u 1000 -s /bin/bash -d ${HOME_DIR} user \ + && mkdir -p ${HOME_DIR} \ + && chown -R user:user ${HOME_DIR} \ + && chown -R user:user ${TT_METAL_HOME} + +USER user + +# tt-metal python env default +RUN echo "source ${PYTHON_ENV_DIR}/bin/activate" >> ${HOME_DIR}/.bashrc + +# install tt-smi +RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ + && pip3 install --upgrade pip \ + && pip3 install git+https://github.com/tenstorrent/tt-smi" + +# runtime required for tt-metal on WH +ENV WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml + +WORKDIR ${HOME_DIR} +# vllm install, see: https://github.com/tenstorrent/vllm/blob/dev/tt_metal/README.md +ENV vllm_dir=${HOME_DIR}/vllm +ENV PYTHONPATH=${PYTHONPATH}:${vllm_dir} +ENV VLLM_TARGET_DEVICE="tt" +RUN git clone https://github.com/tenstorrent/vllm.git ${vllm_dir}\ + && cd ${vllm_dir} && git checkout ${TT_VLLM_COMMIT_SHA_OR_TAG} \ + && /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install -e ." + +# extra vllm and model dependencies +RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ + && pip install compressed-tensors \ + && pip install -r /tt-metal/models/demos/llama3/requirements.txt" + +ARG APP_DIR="${HOME_DIR}/app" +WORKDIR ${APP_DIR} +ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR} +COPY --chown=user:user "vllm-tt-metal-llama3/src" "${APP_DIR}/src" +COPY --chown=user:user "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt" +COPY --chown=user:user "utils" "${APP_DIR}/utils" +COPY --chown=user:user "benchmarking" "${APP_DIR}/benchmarking" +COPY --chown=user:user "evals" "${APP_DIR}/evals" +COPY --chown=user:user "tests" "${APP_DIR}/tests" +COPY --chown=user:user "locust" "${APP_DIR}/locust" +RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ +&& pip install --default-timeout=240 --no-cache-dir -r requirements.txt" + +WORKDIR "${APP_DIR}/src" +CMD ["/bin/bash", "-c", "source ${PYTHON_ENV_DIR}/bin/activate && python run_vllm_api_server.py"] diff --git a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile index 4f2dce30..49e0fc43 100644 --- a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile +++ b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile @@ -7,106 +7,6 @@ ARG TT_METAL_DOCKERFILE_VERSION=v0.53.0-rc34 FROM ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:$TT_METAL_DOCKERFILE_VERSION-dev -# Build stage -LABEL maintainer="Tom Stesco " -# connect Github repo with package -LABEL org.opencontainers.image.source=https://github.com/tenstorrent/tt-inference-server - -ARG DEBIAN_FRONTEND=noninteractive -# default commit sha, override with --build-arg TT_METAL_COMMIT_SHA_OR_TAG= -ARG TT_METAL_COMMIT_SHA_OR_TAG -ARG TT_VLLM_COMMIT_SHA_OR_TAG=dev - -# make build commit SHA available in the image for 
reference and debugging -ENV TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} -ENV SHELL=/bin/bash -ENV TZ=America/Los_Angeles -# tt-metal build vars -ENV ARCH_NAME=wormhole_b0 -ENV TT_METAL_HOME=/tt-metal -ENV CONFIG=Release -ENV TT_METAL_ENV=dev -ENV LOGURU_LEVEL=INFO -# derived vars -ENV PYTHONPATH=${TT_METAL_HOME} -# note: PYTHON_ENV_DIR is used by create_venv.sh -ENV PYTHON_ENV_DIR=${TT_METAL_HOME}/python_env -ENV LD_LIBRARY_PATH=${TT_METAL_HOME}/build/lib - -# extra system deps -RUN apt-get update && apt-get install -y \ - libsndfile1 \ - wget \ - nano \ - acl \ - jq \ - vim \ - # user deps - htop \ - screen \ - tmux \ - unzip \ - zip \ - curl \ - iputils-ping \ - rsync \ - && rm -rf /var/lib/apt/lists/* - -# build tt-metal -RUN git clone https://github.com/tenstorrent-metal/tt-metal.git ${TT_METAL_HOME} \ - && cd ${TT_METAL_HOME} \ - && git checkout ${TT_METAL_COMMIT_SHA_OR_TAG} \ - && git submodule update --init --recursive \ - && git submodule foreach 'git lfs fetch --all && git lfs pull' \ - && bash ./build_metal.sh \ - && bash ./create_venv.sh - -# user setup -ARG HOME_DIR=/home/user -RUN useradd -u 1000 -s /bin/bash -d ${HOME_DIR} user \ - && mkdir -p ${HOME_DIR} \ - && chown -R user:user ${HOME_DIR} \ - && chown -R user:user ${TT_METAL_HOME} - -USER user - -# tt-metal python env default -RUN echo "source ${PYTHON_ENV_DIR}/bin/activate" >> ${HOME_DIR}/.bashrc - -# install tt-smi -RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ - && pip3 install --upgrade pip \ - && pip3 install git+https://github.com/tenstorrent/tt-smi" - -# runtime required for tt-metal on WH -ENV WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml - -WORKDIR ${HOME_DIR} -# vllm install, see: https://github.com/tenstorrent/vllm/blob/dev/tt_metal/README.md -ENV vllm_dir=${HOME_DIR}/vllm -ENV PYTHONPATH=${PYTHONPATH}:${vllm_dir} -ENV VLLM_TARGET_DEVICE="tt" -RUN git clone https://github.com/tenstorrent/vllm.git ${vllm_dir}\ - && cd ${vllm_dir} && git checkout ${TT_VLLM_COMMIT_SHA_OR_TAG} \ - && /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install -e ." - -# extra vllm and model dependencies -RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ - && pip install compressed-tensors \ - && pip install -r /tt-metal/models/demos/llama3/requirements.txt" - -ARG APP_DIR="${HOME_DIR}/app" -WORKDIR ${APP_DIR} -ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR} -COPY --chown=user:user "vllm-tt-metal-llama3/src" "${APP_DIR}/src" -COPY --chown=user:user "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt" -COPY --chown=user:user "utils" "${APP_DIR}/utils" -COPY --chown=user:user "benchmarking" "${APP_DIR}/benchmarking" -COPY --chown=user:user "evals" "${APP_DIR}/evals" -COPY --chown=user:user "tests" "${APP_DIR}/tests" -COPY --chown=user:user "locust" "${APP_DIR}/locust" -RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ -&& pip install --default-timeout=240 --no-cache-dir -r requirements.txt" - -WORKDIR "${APP_DIR}/src" -CMD ["/bin/bash", "-c", "source ${PYTHON_ENV_DIR}/bin/activate && python run_vllm_api_server.py"] +# include shared instructions +COPY vllm.llama3.src.shared.Dockerfile /vllm.llama3.src.shared.Dockerfile +RUN cat /vllm.llama3.src.shared.Dockerfile | docker build - < . 
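For reference, a hedged sketch of building the Ubuntu 22.04 variant, mirroring the 20.04 recipe from `docs/development.md` above. This assumes the tt-metal 22.04 base image has already been built and tagged locally as `local/tt-metal/tt-metalium/ubuntu-22.04-amd64:<tag>` (it is not published); the tag and commit values below are illustrative placeholders, not values mandated by this patch.

```bash
# Sketch only: mirrors the 20.04 build example with OS_VERSION swapped to 22.04.
cd tt-inference-server
export TT_METAL_DOCKERFILE_VERSION=v0.54.0-rc2   # tag used for the locally built 22.04 tt-metal image (placeholder)
export TT_METAL_COMMIT_SHA_OR_TAG=v0.54.0-rc2
export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12}
export TT_VLLM_COMMIT_SHA_OR_TAG=953161188c50f10da95a88ab305e23977ebd3750
export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12}
export OS_VERSION=ubuntu-22.04-amd64
export IMAGE_VERSION=v0.0.1
docker build \
  -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm-${OS_VERSION}:${IMAGE_VERSION}-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \
  --build-arg TT_METAL_DOCKERFILE_VERSION=${TT_METAL_DOCKERFILE_VERSION} \
  --build-arg TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} \
  --build-arg TT_VLLM_COMMIT_SHA_OR_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG} \
  . -f vllm-tt-metal-llama3/vllm.llama3.src.${OS_VERSION}.Dockerfile
```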
diff --git a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile
index 2e87a84c..dfcdce9d 100644
--- a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile
+++ b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile
@@ -2,111 +2,12 @@
 #
 # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 
-# default base image, override with --build-arg TT_METAL_DOCKERFILE_VERSION=
+# set with --build-arg TT_METAL_DOCKERFILE_VERSION=
+# NOTE: tt-metal Ubuntu 22.04 Dockerfile must be built locally until release images are published
 ARG TT_METAL_DOCKERFILE_VERSION
 
 FROM local/tt-metal/tt-metalium/ubuntu-22.04-amd64:$TT_METAL_DOCKERFILE_VERSION
 
-# Build stage
-LABEL maintainer="Tom Stesco "
-# connect Github repo with package
-LABEL org.opencontainers.image.source=https://github.com/tenstorrent/tt-inference-server
-
-ARG DEBIAN_FRONTEND=noninteractive
-# default commit sha, override with --build-arg TT_METAL_COMMIT_SHA_OR_TAG=
-ARG TT_METAL_COMMIT_SHA_OR_TAG
-ARG TT_VLLM_COMMIT_SHA_OR_TAG=dev
-
-# make build commit SHA available in the image for reference and debugging
-ENV TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG}
-ENV SHELL=/bin/bash
-ENV TZ=America/Los_Angeles
-# tt-metal build vars
-ENV ARCH_NAME=wormhole_b0
-ENV TT_METAL_HOME=/tt-metal
-ENV CONFIG=Release
-ENV TT_METAL_ENV=dev
-ENV LOGURU_LEVEL=INFO
-# derived vars
-ENV PYTHONPATH=${TT_METAL_HOME}
-# note: PYTHON_ENV_DIR is used by create_venv.sh
-ENV PYTHON_ENV_DIR=${TT_METAL_HOME}/python_env
-ENV LD_LIBRARY_PATH=${TT_METAL_HOME}/build/lib
-
-# extra system deps
-RUN apt-get update && apt-get install -y \
-    libsndfile1 \
-    wget \
-    nano \
-    acl \
-    jq \
-    vim \
-    # user deps
-    htop \
-    screen \
-    tmux \
-    unzip \
-    zip \
-    curl \
-    iputils-ping \
-    rsync \
-    && rm -rf /var/lib/apt/lists/*
-
-# build tt-metal
-RUN git clone https://github.com/tenstorrent-metal/tt-metal.git ${TT_METAL_HOME} \
-    && cd ${TT_METAL_HOME} \
-    && git checkout ${TT_METAL_COMMIT_SHA_OR_TAG} \
-    && git submodule update --init --recursive \
-    && git submodule foreach 'git lfs fetch --all && git lfs pull' \
-    && bash ./build_metal.sh \
-    && bash ./create_venv.sh
-
-# user setup
-ARG HOME_DIR=/home/user
-RUN useradd -u 1000 -s /bin/bash -d ${HOME_DIR} user \
-    && mkdir -p ${HOME_DIR} \
-    && chown -R user:user ${HOME_DIR} \
-    && chown -R user:user ${TT_METAL_HOME}
-
-USER user
-
-# tt-metal python env default
-RUN echo "source ${PYTHON_ENV_DIR}/bin/activate" >> ${HOME_DIR}/.bashrc
-
-# install tt-smi
-RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \
-    && pip3 install --upgrade pip \
-    && pip3 install git+https://github.com/tenstorrent/tt-smi"
-
-# runtime required for tt-metal on WH
-ENV WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml
-
-WORKDIR ${HOME_DIR}
-# vllm install, see: https://github.com/tenstorrent/vllm/blob/dev/tt_metal/README.md
-ENV vllm_dir=${HOME_DIR}/vllm
-ENV PYTHONPATH=${PYTHONPATH}:${vllm_dir}
-ENV VLLM_TARGET_DEVICE="tt"
-RUN git clone https://github.com/tenstorrent/vllm.git ${vllm_dir}\
-    && cd ${vllm_dir} && git checkout ${TT_VLLM_COMMIT_SHA_OR_TAG} \
-    && /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install -e ."
-
-# extra vllm and model dependencies
-RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \
-    && pip install compressed-tensors \
-    && pip install -r /tt-metal/models/demos/llama3/requirements.txt"
-
-ARG APP_DIR="${HOME_DIR}/app"
-WORKDIR ${APP_DIR}
-ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR}
-COPY --chown=user:user "vllm-tt-metal-llama3/src" "${APP_DIR}/src"
-COPY --chown=user:user "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt"
-COPY --chown=user:user "utils" "${APP_DIR}/utils"
-COPY --chown=user:user "benchmarking" "${APP_DIR}/benchmarking"
-COPY --chown=user:user "evals" "${APP_DIR}/evals"
-COPY --chown=user:user "tests" "${APP_DIR}/tests"
-COPY --chown=user:user "locust" "${APP_DIR}/locust"
-RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \
-&& pip install --default-timeout=240 --no-cache-dir -r requirements.txt"
-
-WORKDIR "${APP_DIR}/src"
-CMD ["/bin/bash", "-c", "source ${PYTHON_ENV_DIR}/bin/activate && python run_vllm_api_server.py"]
+# include shared instructions
+COPY vllm.llama3.src.shared.Dockerfile /vllm.llama3.src.shared.Dockerfile
+RUN cat /vllm.llama3.src.shared.Dockerfile | docker build - < .
From 11141de96438fc2d61823a6bbf5ace4a0cc955d9 Mon Sep 17 00:00:00 2001
From: Tom Stesco 
Date: Wed, 15 Jan 2025 00:58:43 +0000
Subject: [PATCH 3/3] use full url TT_METAL_DOCKERFILE_URL to allow for 1
 Dockerfile for multiple base images

---
 tests/README.md                               |  4 ++--
 tests/mock.vllm.openai.api.dockerfile         |  7 +++---
 vllm-tt-metal-llama3/docs/development.md      | 23 +++++++++++++++----
 ....Dockerfile => vllm.llama3.src.Dockerfile} |  6 +++++
 ...m.llama3.src.ubuntu-20.04-amd64.Dockerfile | 12 ----------
 ...m.llama3.src.ubuntu-22.04-amd64.Dockerfile | 13 -----------
 6 files changed, 31 insertions(+), 34 deletions(-)
 rename vllm-tt-metal-llama3/{vllm.llama3.src.shared.Dockerfile => vllm.llama3.src.Dockerfile} (91%)
 delete mode 100644 vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile
 delete mode 100644 vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile

diff --git a/tests/README.md b/tests/README.md
index 0d0ac168..d4240bd9 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -35,14 +35,14 @@ WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml python /home/user/tests/mock_
 # set build context to repo root
 cd tt-inference-server
 # build image
-export TT_METAL_DOCKERFILE_VERSION=v0.53.0-rc34
+export TT_METAL_DOCKERFILE_URL=ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:v0.53.0-rc34-dev
 export TT_METAL_COMMIT_SHA_OR_TAG=385904186f81fed15d5c87c162221d4f34387164
 export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12}
 export TT_VLLM_COMMIT_SHA_OR_TAG=384f1790c3be16e1d1b10de07252be2e66d00935
 export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12}
 docker build \
   -t ghcr.io/tenstorrent/tt-inference-server/mock.vllm.openai.api:v0.0.1-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \
-  --build-arg TT_METAL_DOCKERFILE_VERSION=${TT_METAL_DOCKERFILE_VERSION} \
+  --build-arg TT_METAL_DOCKERFILE_URL=${TT_METAL_DOCKERFILE_URL} \
   --build-arg TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} \
   --build-arg TT_VLLM_COMMIT_SHA_OR_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG} \
   . -f tests/mock.vllm.openai.api.dockerfile
diff --git a/tests/mock.vllm.openai.api.dockerfile b/tests/mock.vllm.openai.api.dockerfile
index 9b4e94e6..8cf9f718 100644
--- a/tests/mock.vllm.openai.api.dockerfile
+++ b/tests/mock.vllm.openai.api.dockerfile
@@ -2,10 +2,11 @@
 #
 # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 
-# default base image, override with --build-arg TT_METAL_DOCKERFILE_VERSION=
-ARG TT_METAL_DOCKERFILE_VERSION=v0.53.0-rc34
+# default base image, override with --build-arg TT_METAL_DOCKERFILE_URL=
+# NOTE: tt-metal Ubuntu 22.04 Dockerfile must be built locally until release images are published
+ARG TT_METAL_DOCKERFILE_URL=ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:v0.53.0-rc34-dev
 
-FROM ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:$TT_METAL_DOCKERFILE_VERSION-dev
+FROM ${TT_METAL_DOCKERFILE_URL}
 
 # Build stage
 LABEL maintainer="Tom Stesco "
diff --git a/vllm-tt-metal-llama3/docs/development.md b/vllm-tt-metal-llama3/docs/development.md
index f7dd6c15..3a1b9a5e 100644
--- a/vllm-tt-metal-llama3/docs/development.md
+++ b/vllm-tt-metal-llama3/docs/development.md
@@ -13,21 +13,36 @@ When building, update the commit SHA and get correct SHA from model developers o
 # set build context to repo root
 cd tt-inference-server
 # build image
-export TT_METAL_DOCKERFILE_VERSION=v0.53.0
+export TT_METAL_DOCKERFILE_URL=ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:v0.53.0-rc34-dev
 export TT_METAL_COMMIT_SHA_OR_TAG=v0.54.0-rc2
 export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12}
 export TT_VLLM_COMMIT_SHA_OR_TAG=953161188c50f10da95a88ab305e23977ebd3750
 export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12}
-export OS_VERSION=ubuntu-20.04-amd64
 export IMAGE_VERSION=v0.0.1
 docker build \
   -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm-${OS_VERSION}:${IMAGE_VERSION}-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \
-  --build-arg TT_METAL_DOCKERFILE_VERSION=${TT_METAL_DOCKERFILE_VERSION} \
+  --build-arg TT_METAL_DOCKERFILE_URL=${TT_METAL_DOCKERFILE_URL} \
   --build-arg TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} \
   --build-arg TT_VLLM_COMMIT_SHA_OR_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG} \
-  . -f vllm-tt-metal-llama3/vllm.llama3.src.${OS_VERSION}.Dockerfile
+  . -f vllm-tt-metal-llama3/vllm.llama3.src.Dockerfile
 ```
+
+### Ubuntu 22.04 base image
+
+In the tt-metal repo there is an Ubuntu 22.04 Dockerfile: https://github.com/tenstorrent/tt-metal/blob/main/dockerfile/ubuntu-22.04-amd64.Dockerfile
+This Dockerfile installs the Python dependencies for Ubuntu 22.04 running Python 3.10: https://github.com/tenstorrent/tt-metal/blob/main/scripts/docker/requirements-22.04.txt
+
+The Ubuntu 22.04 images are not yet published to GHCR the way the Ubuntu 20.04 images are (https://github.com/tenstorrent/tt-metal/pkgs/container/tt-metal%2Ftt-metalium%2Fubuntu-20.04-amd64).
+
+You can build a local tt-metal Ubuntu 22.04 base image:
+```bash
+git clone --depth 1 --branch ${TT_METAL_COMMIT_SHA_OR_TAG} https://github.com/tenstorrent/tt-metal.git
+cd tt-metal
+docker build -t local/tt-metal/tt-metalium/ubuntu-22.04-amd64:latest -f dockerfile/ubuntu-22.04-amd64.Dockerfile .
+```
+
+You can then repeat the steps above to build with, e.g. `TT_METAL_DOCKERFILE_URL=local/tt-metal/tt-metalium/ubuntu-22.04-amd64:latest`.
+
 ### push image (only for admin deployment to GHCR)
 ```bash
 docker push ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm-${OS_VERSION}:${IMAGE_VERSION}-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG}
diff --git a/vllm-tt-metal-llama3/vllm.llama3.src.shared.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.Dockerfile
similarity index 91%
rename from vllm-tt-metal-llama3/vllm.llama3.src.shared.Dockerfile
rename to vllm-tt-metal-llama3/vllm.llama3.src.Dockerfile
index c9532dcc..aa556360 100644
--- a/vllm-tt-metal-llama3/vllm.llama3.src.shared.Dockerfile
+++ b/vllm-tt-metal-llama3/vllm.llama3.src.Dockerfile
@@ -2,6 +2,12 @@
 #
 # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 
+# default base image, override with --build-arg TT_METAL_DOCKERFILE_URL=
+# NOTE: tt-metal Ubuntu 22.04 Dockerfile must be built locally until release images are published
+ARG TT_METAL_DOCKERFILE_URL=ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:v0.53.0-rc34-dev
+
+FROM ${TT_METAL_DOCKERFILE_URL}
+
 # shared build stage, FROM is set by the OS specific Dockerfiles
 LABEL maintainer="Tom Stesco "
 # connect Github repo with package
diff --git a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile
deleted file mode 100644
index 49e0fc43..00000000
--- a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile
+++ /dev/null
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
-
-# default base image, override with --build-arg TT_METAL_DOCKERFILE_VERSION=
-ARG TT_METAL_DOCKERFILE_VERSION=v0.53.0-rc34
-
-FROM ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:$TT_METAL_DOCKERFILE_VERSION-dev
-
-# include shared instructions
-COPY vllm.llama3.src.shared.Dockerfile /vllm.llama3.src.shared.Dockerfile
-RUN cat /vllm.llama3.src.shared.Dockerfile | docker build - < .
diff --git a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile
deleted file mode 100644
index dfcdce9d..00000000
--- a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile
+++ /dev/null
@@ -1,13 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
-
-# set with --build-arg TT_METAL_DOCKERFILE_VERSION=
-# NOTE: tt-metal Ubuntu 22.04 Dockerfile must be built locally until release images are published
-ARG TT_METAL_DOCKERFILE_VERSION
-
-FROM local/tt-metal/tt-metalium/ubuntu-22.04-amd64:$TT_METAL_DOCKERFILE_VERSION
-
-# include shared instructions
-COPY vllm.llama3.src.shared.Dockerfile /vllm.llama3.src.shared.Dockerfile
-RUN cat /vllm.llama3.src.shared.Dockerfile | docker build - < .
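For reference, a minimal end-to-end sketch combining the locally built Ubuntu 22.04 base image with the development.md build steps above. The OS_VERSION value here is an assumption (the image tag in the docs still interpolates ${OS_VERSION} even though its export was dropped); adjust the tag as needed:

```bash
# Sketch only: assumes the local base image was tagged as shown in development.md above.
export TT_METAL_DOCKERFILE_URL=local/tt-metal/tt-metalium/ubuntu-22.04-amd64:latest
export TT_METAL_COMMIT_SHA_OR_TAG=v0.54.0-rc2
export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12}
export TT_VLLM_COMMIT_SHA_OR_TAG=953161188c50f10da95a88ab305e23977ebd3750
export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12}
export OS_VERSION=ubuntu-22.04-amd64  # assumption: kept only for the image tag below
export IMAGE_VERSION=v0.0.1
docker build \
  -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm-${OS_VERSION}:${IMAGE_VERSION}-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \
  --build-arg TT_METAL_DOCKERFILE_URL=${TT_METAL_DOCKERFILE_URL} \
  --build-arg TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} \
  --build-arg TT_VLLM_COMMIT_SHA_OR_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG} \
  . -f vllm-tt-metal-llama3/vllm.llama3.src.Dockerfile
```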