diff --git a/benchmarking/benchmark_summary.py b/benchmarking/benchmark_summary.py index e580046..76679b0 100644 --- a/benchmarking/benchmark_summary.py +++ b/benchmarking/benchmark_summary.py @@ -45,13 +45,14 @@ def parse_args(): def extract_params_from_filename(filename: str) -> Dict[str, Any]: pattern = r""" - benchmark_ + .*?benchmark_ # Any prefix before benchmark_ (?P\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp (_(?PN150|N300|T3K_LINE|T3K_RING|TG))? # MESH_DEVICE _isl-(?P\d+) # Input sequence length _osl-(?P\d+) # Output sequence length - _bsz-(?P\d+) # Batch size - _n-(?P\d+) # Number of requests + _maxcon-(?P\d+) # Max concurrency + _n-(?P\d+) # Number of requests + \.json$ """ match = re.search(pattern, filename, re.VERBOSE) if not match: @@ -67,7 +68,7 @@ def extract_params_from_filename(filename: str) -> Dict[str, Any]: "mesh_device": match.group("mesh_device"), "input_sequence_length": int(match.group("isl")), "output_sequence_length": int(match.group("osl")), - "batch_size": int(match.group("bsz")), + "batch_size": int(match.group("maxcon")), "num_requests": int(match.group("n")), } diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index b5e9edc..27ba806 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -102,8 +102,15 @@ def run_sequence_length_test( tokenizer = AutoTokenizer.from_pretrained(model) # pre-capture traces so benchmark does not include 1st run trace capture time - # TODO: add support for image input to capture_traces - prompt_client.capture_traces(context_lens=[(input_len, output_len)]) + image_resolutions = [] + if images: + image_resolutions = [ + (prompt_config.image_width, prompt_config.image_height) + ] + + prompt_client.capture_traces( + context_lens=[(input_len, output_len)], image_resolutions=image_resolutions + ) # Process batches try: responses = batch_processor.process_batch( diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 338755b..096ae6c 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -126,8 +126,10 @@ def main(): / f"vllm_online_benchmark_{run_timestamp}_{mesh_device}_isl-{isl}_osl-{osl}_maxcon-{max_concurrent}_n-{num_prompts}.json" ) logger.info(f"\nRunning benchmark {i}/{len(combinations)}") + vllm_dir = os.environ.get("vllm_dir") + assert vllm_dir is not None, "vllm_dir must be set." run_benchmark( - benchmark_script="/home/user/vllm/benchmarks/benchmark_serving.py", + benchmark_script=f"{vllm_dir}/benchmarks/benchmark_serving.py", params=params, model=env_config.vllm_model, port=env_config.service_port, diff --git a/evals/README.md b/evals/README.md index 8ad95bb..9e7d407 100644 --- a/evals/README.md +++ b/evals/README.md @@ -13,24 +13,9 @@ For instructions on building the Docker image see: [vllm-tt-metal-llama3/docs/de ## Step 2: Run Docker container for LM evals development -note: this requires running `setup.sh` to set up the weights for a particular model, in this example `llama-3.1-70b-instruct`. 
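For reference, a minimal sketch of how the updated `extract_params_from_filename()` pattern in `benchmarking/benchmark_summary.py` parses a result file produced by `vllm_online_benchmark.py`. The group names (`timestamp`, `mesh_device`, `isl`, `osl`, `maxcon`, `n`) follow the `match.group(...)` lookups in that function; the sample filename and its values are hypothetical.

```python
import re
from typing import Any, Dict

# Pattern mirroring extract_params_from_filename(); group names follow the
# match.group(...) lookups in that function.
PATTERN = re.compile(
    r"""
    .*?benchmark_                                        # any prefix before benchmark_
    (?P<timestamp>\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2})   # timestamp
    (_(?P<mesh_device>N150|N300|T3K_LINE|T3K_RING|TG))?  # MESH_DEVICE
    _isl-(?P<isl>\d+)                                    # input sequence length
    _osl-(?P<osl>\d+)                                    # output sequence length
    _maxcon-(?P<maxcon>\d+)                              # max concurrency
    _n-(?P<n>\d+)                                        # number of requests
    \.json$
    """,
    re.VERBOSE,
)

# Hypothetical filename in the format written by vllm_online_benchmark.py.
filename = "vllm_online_benchmark_2025-01-15_12-30-00_N300_isl-128_osl-128_maxcon-32_n-320.json"
match = PATTERN.search(filename)
assert match is not None
params: Dict[str, Any] = {
    "timestamp": match.group("timestamp"),
    "mesh_device": match.group("mesh_device"),
    "input_sequence_length": int(match.group("isl")),
    "output_sequence_length": int(match.group("osl")),
    "batch_size": int(match.group("maxcon")),
    "num_requests": int(match.group("n")),
}
print(params)
```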
+Follow run guide: [vllm-tt-metal-llama3/README.md](../vllm-tt-metal-llama3/README.md) -```bash -cd tt-inference-server -export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b-instructv0.0.1/ -docker run \ - --rm \ - -it \ - --env-file persistent_volume/model_envs/llama-3.1-70b-instruct.env \ - --cap-add ALL \ - --device /dev/tenstorrent:/dev/tenstorrent \ - --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ - --volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/user/cache_root:rw \ - --shm-size 32G \ - ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:${IMAGE_VERSION}-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} -``` - -The default Docker image command will start the vLLM server. +note: this requires running `setup.sh` to set up the weights for a particular model, in this example `llama-3.1-70b-instruct`. ## Step 3: Inside container set up llama-recipes LM evalulation harness templates @@ -44,7 +29,7 @@ To access Meta Llama 3.1 evals, you must: #### Hugging Face authentication - option 1: HF_TOKEN (if not already passed into Docker container) ```bash # set up HF Token if not already set up in .env, needed for datasets -echo "HF_TOKEN=hf_" >> vllm-tt-metal-llama3/.env +echo "HF_TOKEN=hf_" ``` #### Hugging Face authentication - option 2: huggingface_hub login diff --git a/evals/run_evals.sh b/evals/run_evals.sh index 640d6df..6470267 100644 --- a/evals/run_evals.sh +++ b/evals/run_evals.sh @@ -45,7 +45,7 @@ lm_eval \ --gen_kwargs model=${HF_MODEL_REPO_ID},stop="<|eot_id|>",stream=False \ --tasks meta_gpqa \ --batch_size auto \ ---output_path /home/user/cache_root/eval_output \ +--output_path ${CACHE_ROOT}/eval_output \ --include_path ./work_dir \ --seed 42 \ --log_samples @@ -57,7 +57,7 @@ lm_eval \ --gen_kwargs model=${HF_MODEL_REPO_ID},stop="<|eot_id|>",stream=False \ --tasks meta_ifeval \ --batch_size auto \ ---output_path /home/user/cache_root/eval_output \ +--output_path ${CACHE_ROOT}/eval_output \ --include_path ./work_dir \ --seed 42 \ --log_samples diff --git a/evals/run_evals_vision.sh b/evals/run_evals_vision.sh index f4615ec..ee1b4d8 100644 --- a/evals/run_evals_vision.sh +++ b/evals/run_evals_vision.sh @@ -35,7 +35,7 @@ lm_eval \ --gen_kwargs model=${HF_MODEL_REPO_ID},stop="<|eot_id|>",stream=False \ --tasks mmmu_val \ --batch_size auto \ ---output_path /home/user/cache_root/eval_output \ +--output_path /home/container_app_user/cache_root/eval_output \ --seed 42 \ --log_samples diff --git a/setup.sh b/setup.sh index df7cf64..60e0c77 100755 --- a/setup.sh +++ b/setup.sh @@ -9,18 +9,19 @@ set -euo pipefail # Exit on error, print commands, unset variables treated as e usage() { echo "Usage: $0 " echo "Available model types:" - echo " llama-3.3-70b-instruct" - echo " llama-3.2-11b-vision-instruct" - echo " llama-3.2-3b-instruct" - echo " llama-3.2-1b-instruct" - echo " llama-3.1-70b-instruct" - echo " llama-3.1-70b" - echo " llama-3.1-8b-instruct" - echo " llama-3.1-8b" - echo " llama-3-70b-instruct" - echo " llama-3-70b" - echo " llama-3-8b-instruct" - echo " llama-3-8b" + echo " DeepSeek-R1-Distill-Llama-70B" + echo " Llama-3.3-70B-Instruct" + echo " Llama-3.2-11B-Vision-Instruct" + echo " Llama-3.2-3B-Instruct" + echo " Llama-3.2-1B-Instruct" + echo " Llama-3.1-70B-Instruct" + echo " Llama-3.1-70B" + echo " Llama-3.1-8B-Instruct" + echo " Llama-3.1-8B" + echo " Llama-3-70B-Instruct" + echo " Llama-3-70B" + echo " Llama-3-8B-Instruct" + echo " Llama-3-8B" echo exit 1 } @@ -74,6 +75,7 @@ 
get_hf_env_vars() { echo "HF_TOKEN environment variable is not set. Please set it before running the script." read -r -s -p "Enter your HF_TOKEN: " input_hf_token echo + echo "entered HF_TOKEN contains: ${#input_hf_token} characters, expected 37." if [ -z "${input_hf_token:-}" ]; then echo "⛔ HF_TOKEN cannot be empty. Please try again." exit 1 @@ -111,84 +113,104 @@ setup_model_environment() { # Set environment variables based on the model selection # note: MODEL_NAME is the directory name for the model weights case "$1" in - "llama-3.3-70b-instruct") + "DeepSeek-R1-Distill-Llama-70B") + IMPL_ID="tt-metal" + MODEL_NAME="DeepSeek-R1-Distill-Llama-70B" + HF_MODEL_REPO_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-70B" + META_MODEL_NAME="" + META_DIR_FILTER="" + REPACKED=1 + ;; + "Llama-3.3-70B-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.3-70B-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.3-70B-Instruct" META_MODEL_NAME="" META_DIR_FILTER="" REPACKED=1 ;; - "llama-3.2-11b-vision-instruct") + "Llama-3.2-11B-Vision-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.2-11B-Vision-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.2-11B-Vision-Instruct" META_MODEL_NAME="" META_DIR_FILTER="" REPACKED=0 ;; - "llama-3.2-3b-instruct") + "Llama-3.2-3B-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.2-3B-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.2-3B-Instruct" META_MODEL_NAME="" META_DIR_FILTER="" REPACKED=0 ;; - "llama-3.2-1b-instruct") + "Llama-3.2-1B-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.2-1B-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.2-1B-Instruct" META_MODEL_NAME="" META_DIR_FILTER="" REPACKED=0 ;; - "llama-3.1-70b-instruct") + "Llama-3.1-70B-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.1-70B-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct" META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" META_DIR_FILTER="llama3_1" REPACKED=1 ;; - "llama-3.1-70b") + "Llama-3.1-70B") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.1-70B" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B" META_MODEL_NAME="Meta-Llama-3.1-70B" META_DIR_FILTER="llama3_1" REPACKED=1 ;; - "llama-3.1-8b-instruct") + "Llama-3.1-8B-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.1-8B-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B-Instruct" META_MODEL_NAME="Meta-Llama-3.1-8B-Instruct" META_DIR_FILTER="llama3_1" REPACKED=0 ;; - "llama-3.1-8b") + "Llama-3.1-8B") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.1-8B" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B" META_MODEL_NAME="Meta-Llama-3.1-8B" META_DIR_FILTER="llama3_1" REPACKED=0 ;; - "llama-3-70b-instruct") + "Llama-3-70B-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3-70B-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3-70B-Instruct" META_MODEL_NAME="Meta-Llama-3-70B-Instruct" META_DIR_FILTER="llama3" REPACKED=1 ;; - "llama-3-70b") + "Llama-3-70B") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3-70B" HF_MODEL_REPO_ID="meta-llama/Llama-3-70B" META_MODEL_NAME="Meta-Llama-3-70B" META_DIR_FILTER="llama3" REPACKED=1 ;; - "llama-3-8b-instruct") + "Llama-3-8B-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3-8B-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3-8B-Instruct" META_MODEL_NAME="Meta-Llama-3-8B-Instruct" META_DIR_FILTER="llama3" REPACKED=0 ;; - "llama-3-8b") + "Llama-3-8B") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3-8B" HF_MODEL_REPO_ID="meta-llama/Llama-3-8B" META_MODEL_NAME="Meta-Llama-3-8B" @@ -201,32 +223,32 @@ setup_model_environment() { exit 1 ;; esac - # Initialize OVERWRITE_ENV - OVERWRITE_ENV=false # Set default values for environment variables 
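For orientation, a hedged Python sketch of how the per-model variables set above combine with the persistent-volume logic that follows into on-disk paths; the directory root in the example is hypothetical.

```python
def model_paths(persistent_volume_root: str, model_name: str,
                impl_id: str = "tt-metal", model_version: str = "0.0.1") -> dict:
    # Mirrors setup.sh: MODEL_ID="id_${IMPL_ID}-${MODEL_NAME}-v${MODEL_VERSION}"
    model_id = f"id_{impl_id}-{model_name}-v{model_version}"
    return {
        # PERSISTENT_VOLUME="${PERSISTENT_VOLUME_ROOT}/volume_${MODEL_ID}"
        "persistent_volume": f"{persistent_volume_root}/volume_{model_id}",
        # ENV_FILE="${PERSISTENT_VOLUME_ROOT}/model_envs/${MODEL_NAME}.env"
        "env_file": f"{persistent_volume_root}/model_envs/{model_name}.env",
    }

# Hypothetical root; setup.sh prompts for PERSISTENT_VOLUME_ROOT interactively.
print(model_paths("./persistent_volume", "Llama-3.3-70B-Instruct"))
# {'persistent_volume': './persistent_volume/volume_id_tt-metal-Llama-3.3-70B-Instruct-v0.0.1',
#  'env_file': './persistent_volume/model_envs/Llama-3.3-70B-Instruct.env'}
```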
DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume - MODEL_ENV_DIR="${DEFAULT_PERSISTENT_VOLUME_ROOT}/model_envs" - + # Safely handle potentially unset environment variables using default values + PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT} + # Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default + read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${DEFAULT_PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT + PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT} + echo # move to a new line after input + # Set environment variables with defaults if not already set + MODEL_VERSION="0.0.1" + MODEL_ID="id_${IMPL_ID}-${MODEL_NAME}-v${MODEL_VERSION}" + PERSISTENT_VOLUME="${PERSISTENT_VOLUME_ROOT}/volume_${MODEL_ID}" + + # Initialize OVERWRITE_ENV + OVERWRITE_ENV=false + MODEL_ENV_DIR="${PERSISTENT_VOLUME_ROOT}/model_envs" mkdir -p ${MODEL_ENV_DIR} ENV_FILE="${MODEL_ENV_DIR}/${MODEL_NAME}.env" export ENV_FILE check_and_prompt_env_file - if [ "$OVERWRITE_ENV" = false ]; then echo "✅ using existing .env file: ${ENV_FILE}." return 0 fi - # Safely handle potentially unset environment variables using default values - PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT} - # Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default - read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${DEFAULT_PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT - PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT} - echo # move to a new line after input - # Set environment variables with defaults if not already set - PERSISTENT_VOLUME=${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1 - read -p "Use 🤗 Hugging Face authorization token for downloading models? Alternative is direct authorization from Meta. (y/n) [default: y]: " input_use_hf_token choice_use_hf_token=${input_use_hf_token:-"y"} @@ -283,15 +305,15 @@ setup_model_environment() { cat > ${ENV_FILE} < Add a volume mounting the `test` directory in the container before running with the following in the docker run command: ```bash ---volume $PWD/tests:/home/user/tests +--volume $PWD/tests:/home/container_app_user/tests ``` ## 3. 
Run The Mock Model @@ -26,7 +26,7 @@ Add a volume mounting the `test` directory in the container before running with Once in the docker container, run the mock script with: ```bash -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml python /home/user/tests/mock_vllm_offline_inference_tt.py +WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml python /home/container_app_user/tests/mock_vllm_offline_inference_tt.py ``` # Build mock model container diff --git a/tests/benchmark_vllm_offline_inference.py b/tests/benchmark_vllm_offline_inference.py index 73ef98f..e59c115 100644 --- a/tests/benchmark_vllm_offline_inference.py +++ b/tests/benchmark_vllm_offline_inference.py @@ -30,7 +30,7 @@ def parse_args(): parser.add_argument( "--prompts_json", type=str, - default="/home/user/vllm/tt_metal/prompts.json", + default="/home/container_app_user/vllm/tt_metal/prompts.json", help="Path to JSON file containing prompts", ) parser.add_argument( diff --git a/tests/mock.vllm.openai.api.dockerfile b/tests/mock.vllm.openai.api.dockerfile index 3806700..57039fc 100644 --- a/tests/mock.vllm.openai.api.dockerfile +++ b/tests/mock.vllm.openai.api.dockerfile @@ -17,6 +17,8 @@ ARG DEBIAN_FRONTEND=noninteractive # default commit sha, override with --build-arg TT_METAL_COMMIT_SHA_OR_TAG= ARG TT_METAL_COMMIT_SHA_OR_TAG ARG TT_VLLM_COMMIT_SHA_OR_TAG=dev +# CONTAINER_APP_UID is a random ID, change this and rebuild if it collides with host +ARG CONTAINER_APP_UID=15863 # make build commit SHA available in the image for reference and debugging ENV TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} @@ -62,13 +64,14 @@ RUN git clone --depth 1 https://github.com/tenstorrent-metal/tt-metal.git ${TT_M && bash ./create_venv.sh # user setup -ARG HOME_DIR=/home/user -RUN useradd -u 1000 -s /bin/bash -d ${HOME_DIR} user \ +ENV CONTAINER_APP_USERNAME=container_app_user +ARG HOME_DIR=/home/${CONTAINER_APP_USERNAME} +RUN useradd -u ${CONTAINER_APP_UID} -s /bin/bash -d ${HOME_DIR} ${CONTAINER_APP_USERNAME} \ && mkdir -p ${HOME_DIR} \ - && chown -R user:user ${HOME_DIR} \ - && chown -R user:user ${TT_METAL_HOME} - -USER user + && chown -R ${CONTAINER_APP_USERNAME}:${CONTAINER_APP_USERNAME} ${HOME_DIR} \ + && chown -R ${CONTAINER_APP_USERNAME}:${CONTAINER_APP_USERNAME} ${TT_METAL_HOME} + +USER ${CONTAINER_APP_USERNAME} # tt-metal python env default RUN echo "source ${PYTHON_ENV_DIR}/bin/activate" >> ${HOME_DIR}/.bashrc @@ -96,21 +99,27 @@ RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install compresse ARG APP_DIR="${HOME_DIR}/app" WORKDIR ${APP_DIR} ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR} -COPY --chown=user:user "vllm-tt-metal-llama3/src" "${APP_DIR}/src" -COPY --chown=user:user "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt" -COPY --chown=user:user "utils" "${APP_DIR}/utils" -COPY --chown=user:user "tests" "${APP_DIR}/tests" +COPY --chown=${CONTAINER_APP_USERNAME}:${CONTAINER_APP_USERNAME} "vllm-tt-metal-llama3/src" "${APP_DIR}/src" +COPY --chown=${CONTAINER_APP_USERNAME}:${CONTAINER_APP_USERNAME} "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt" +COPY --chown=${CONTAINER_APP_USERNAME}:${CONTAINER_APP_USERNAME} "utils" "${APP_DIR}/utils" +COPY --chown=${CONTAINER_APP_USERNAME}:${CONTAINER_APP_USERNAME} "tests" "${APP_DIR}/tests" RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ && pip install --default-timeout=240 --no-cache-dir -r requirements.txt" -WORKDIR "${APP_DIR}/tests" -CMD ["/bin/bash", "-c", "source ${PYTHON_ENV_DIR}/bin/activate && python 
mock_vllm_api_server.py"] - # Default environment variables for the Llama-3.1-70b-instruct inference server # Note: LLAMA3_CKPT_DIR and similar variables get set by mock_vllm_api_server.py -ENV CACHE_ROOT=/home/user/cache_root -ENV HF_HOME=/home/user/cache_root/huggingface +ENV CACHE_ROOT=${HOME_DIR}/cache_root +ENV HF_HOME=${CACHE_ROOT}/huggingface ENV MODEL_WEIGHTS_ID=id_repacked-Llama-3.1-70B-Instruct -ENV MODEL_WEIGHTS_PATH=/home/user/cache_root/model_weights/repacked-Llama-3.1-70B-Instruct +ENV MODEL_WEIGHTS_PATH=${CACHE_ROOT}/model_weights/repacked-Llama-3.1-70B-Instruct ENV LLAMA_VERSION=llama3 ENV SERVICE_PORT=7000 + +# Switch back to root for entrypoint +USER root + +COPY docker-entrypoint.sh /usr/local/bin/ +RUN chmod +x /usr/local/bin/docker-entrypoint.sh + +ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] +CMD ["/bin/bash", "-c", "source ${PYTHON_ENV_DIR}/bin/activate && python ${APP_DIR}/tests/mock_vllm_api_server.py"] diff --git a/tests/mock_vllm_offline_inference_tt.py b/tests/mock_vllm_offline_inference_tt.py index 71c3de9..dc2ae9a 100644 --- a/tests/mock_vllm_offline_inference_tt.py +++ b/tests/mock_vllm_offline_inference_tt.py @@ -214,7 +214,7 @@ async def generate_tokens_async( parser.add_argument( "--prompts_json", type=str, - default="/home/user/vllm/tt_metal/prompts.json", + default="/home/container_app_user/vllm/tt_metal/prompts.json", help="Path to JSON file containing prompts", ) parser.add_argument( diff --git a/utils/capture_traces.py b/utils/capture_traces.py index f3703b1..699cfdf 100644 --- a/utils/capture_traces.py +++ b/utils/capture_traces.py @@ -2,6 +2,7 @@ # # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +import argparse import logging from utils.prompt_configs import EnvironmentConfig @@ -14,11 +15,38 @@ logger.setLevel(logging.INFO) -def capture_input_sizes(): +def add_cli_args(parser): + parser.add_argument( + "--include_images", + action="store_true", + help="Include randomly generated images with prompts", + ) + parser.add_argument( + "--image_width", + type=int, + default=256, + help="Width of generated images", + ) + parser.add_argument( + "--image_height", + type=int, + default=256, + help="Height of generated images", + ) + return parser + + +def capture_input_sizes(arg): env_config = EnvironmentConfig() prompt_client = PromptClient(env_config) - prompt_client.capture_traces() + image_resolutions = [] + if args.include_images: + image_resolutions = [(args.image_width, args.image_height)] + prompt_client.capture_traces(image_resolutions=image_resolutions) if __name__ == "__main__": - capture_input_sizes() + parser = argparse.ArgumentParser() + parser = add_cli_args(parser) + args = parser.parse_args() + capture_input_sizes(args) diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 53d70d0..4b7d2ee 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -98,13 +98,18 @@ def wait_for_healthy(self, timeout: int = 300, interval: int = 10) -> bool: def capture_traces( self, context_lens: List[Tuple[int, int]] = None, - prompts_per_size: int = 1, + image_resolutions: List[Tuple[int, int]] = None, ) -> None: - logger.info("Capturing input sizes ...") + """Capture traces for text and/or image inputs at different sizes. 
- # Default input sizes based on get_padded_prefill_len() + Args: + context_lens: List of (input_seq_len, output_seq_len) tuples for text lengths + image_resolutions: List of (width, height) tuples for image resolutions + """ + logger.info("Capturing traces for input configurations...") + + # Default input sizes if none provided if context_lens is None: - # generate 4 osl tokens by default for each isl context_lens = [ (32, 4), (64, 4), @@ -121,49 +126,106 @@ def capture_traces( if not self.wait_for_healthy(): raise RuntimeError("vLLM did not start correctly!") + # Import image generation only if needed + if image_resolutions: + from utils.prompt_generation import generate_random_images + + # Process each text length configuration for isl, osl in context_lens: - logger.info(f"Capture trace: isl={isl}, osl={osl}") + logger.info( + f"Capturing traces for input_seq_len={isl}, output_seq_len={osl}" + ) - # Create prompt config for current size + # Create prompt config prompt_config = PromptConfig( input_seq_len=isl, max_prompt_length=isl, - num_prompts=prompts_per_size, + num_prompts=1, distribution="fixed", dataset="random", tokenizer_model=self.env_config.vllm_model, template=None, save_path=None, print_prompts=False, + use_chat_api=bool(image_resolutions), # Use chat API if we have images ) # Generate prompts for current size prompts, prompt_lengths = generate_prompts(prompt_config) - # Process each prompt - for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)): - try: - logger.info( - f"Starting trace capture for: input_seq_len:={prompt_len}, output_seq_len:={osl}" - ) - response_data = self.call_inference( - prompt=prompt, - images=[], - response_idx=i, - prompt_len=prompt_len, - max_tokens=osl, - stream=True, - vll_model=self.env_config.vllm_model, - tokenizer=None, - force_max_tokens=True, - ) - logger.info( - f"tokens generated: {response_data['output_seq_len']}, " - f"TTFT: {response_data['ttft_ms']:.3f} ms, " - f"TPOT: {response_data['tpot_ms']:.3f} ms" - ) - except Exception as e: - logger.error(f"Error processing prompt: {e}") + # If no image resolutions specified, do text-only traces + if not image_resolutions: + for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)): + try: + logger.info( + f"Starting text trace capture: " + f"input_seq_len={prompt_len}, output_seq_len={osl}" + ) + response_data = self.call_inference( + prompt=prompt, + images=[], + response_idx=i, + prompt_len=prompt_len, + max_tokens=osl, + stream=True, + vll_model=self.env_config.vllm_model, + tokenizer=None, + force_max_tokens=True, + use_chat_api=False, + ) + logger.info( + f"Text trace completed: " + f"tokens_generated={response_data['output_seq_len']}, " + f"TTFT={response_data['ttft_ms']:.3f}ms, " + f"TPOT={response_data['tpot_ms']:.3f}ms\n" + ) + except Exception as e: + logger.error(f"Error processing text prompt: {e}") + continue + else: + # Process each image resolution with the current text length + for width, height in image_resolutions: + for i, (prompt, prompt_len) in enumerate( + zip(prompts, prompt_lengths) + ): + try: + # Generate random image at current resolution + image_data = generate_random_images( + width=width, + height=height, + base64_encoded=True, + ) + + logger.info( + f"Starting image + text trace capture: " + f"input_seq_len={prompt_len}, output_seq_len={osl}, " + f"image_size={width}x{height}" + ) + + response_data = self.call_inference( + prompt=prompt, + images=[image_data], + response_idx=i, + prompt_len=prompt_len, + max_tokens=osl, + 
stream=True, + vll_model=self.env_config.vllm_model, + tokenizer=None, + force_max_tokens=True, + use_chat_api=True, + ) + + logger.info( + f"Image + Text trace completed: " + f"tokens_generated={response_data['output_seq_len']}, " + f"TTFT={response_data['ttft_ms']:.3f}ms, " + f"TPOT={response_data['tpot_ms']:.3f}ms\n" + ) + except Exception as e: + logger.error( + f"Error processing prompt with image {width}x{height}: {e}" + ) + continue def call_inference( self, diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index 63b451a..f8ce8a9 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -214,8 +214,14 @@ def main(): if not args.skip_trace_precapture: # pre-capture traces to not include 1st run trace capture time + image_resolutions = [] + if images: + image_resolutions = [ + (prompt_config.image_width, prompt_config.image_height) + ] prompt_client.capture_traces( - context_lens=[(args.input_seq_len, args.output_seq_len)] + context_lens=[(args.input_seq_len, args.output_seq_len)], + image_resolutions=image_resolutions, ) # Process batches diff --git a/utils/prompt_configs.py b/utils/prompt_configs.py index f43cb89..f35a9c5 100644 --- a/utils/prompt_configs.py +++ b/utils/prompt_configs.py @@ -44,7 +44,7 @@ def get_mesh_device(): # need record of what MESH_DEVICE configuration is running raise ValueError( "environment variable MESH_DEVICE must be set.\n", - "Possible values: N150, N300, T3K_LINE", + "Possible values: N150, N300, T3K_LINE, T3K_RING", ) return mesh_device diff --git a/vllm-tt-metal-llama3/README.md b/vllm-tt-metal-llama3/README.md index 62f6079..6d5de44 100644 --- a/vllm-tt-metal-llama3/README.md +++ b/vllm-tt-metal-llama3/README.md @@ -24,18 +24,19 @@ Run the container from the project root at `tt-inference-server`: ```bash cd tt-inference-server # make sure if you already set up the model weights and cache you use the correct persistent volume -export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-Llama-3.3-70B-Instructv0.0.1/ +export MODEL_NAME=Llama-3.3-70B-Instruct +export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-${MODEL_NAME}-v0.0.1/ docker run \ --rm \ -it \ - --env-file persistent_volume/model_envs/Llama-3.3-70B-Instruct.env \ + --env-file persistent_volume/model_envs/${MODEL_NAME}.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ - --volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/user/cache_root:rw \ + --volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/container_app_user/cache_root:rw \ --shm-size 32G \ --publish 7000:7000 \ - ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-v0.54.0-rc2-953161188c50 + ghcr.io/tenstorrent/tt-inference-server/vllm-llama3-src-dev-ubuntu-20.04-amd64:v0.0.1-47fb1a2fb6e0-2f33504bad49 ``` By default the Docker container will start running the entrypoint command wrapped in `src/run_vllm_api_server.py`. @@ -65,7 +66,6 @@ python example_requests_client_alpaca_eval.py --stream True --n_samples 1 --num_ python example_requests_client_alpaca_eval.py --stream True --n_samples 805 --num_full_iterations 1 --batch_size 32 ``` - ## First run setup Tested starting condition is from a fresh installation of Ubuntu 20.04 with Tenstorrent system dependencies installed. @@ -78,11 +78,17 @@ Recommended to follow postinstall guide to allow $USER to run docker without sud ### 2. 
Ensure system dependencies installed +Follow the TT starting guide for software installation at: https://docs.tenstorrent.com/quickstart.html + +Ensure all of the following are set up: +- firmware: tt-firmware (https://github.com/tenstorrent/tt-firmware) +- drivers: tt-kmd (https://github.com/tenstorrent/tt-kmd) +- hugepages: see https://docs.tenstorrent.com/quickstart.html#step-4-setup-hugepages and https://github.com/tenstorrent/tt-system-tools - tt-smi: https://github.com/tenstorrent/tt-smi -- firmware: bundle 80.10.1.0 (https://github.com/tenstorrent/tt-firmware/blob/02b4b6ed49b6ea2fb9a8664e99d4fed25e443bd6/experiments/fw_pack-80.10.1.0.fwbundle) -- drivers: tt-kmd version 1.29 (https://github.com/tenstorrent/tt-kmd/tree/ttkmd-1.29) -- topology: ensure mesh topology https://github.com/tenstorrent/tt-topology -- hugepages: https://github.com/tenstorrent/tt-system-tools + +If running on a TT-LoudBox or TT-QuietBox, you will also need: +- topology: tt-topology https://github.com/tenstorrent/tt-topology + - set up mesh topology, see https://github.com/tenstorrent/tt-topology?tab=readme-ov-file#mesh ### 3. CPU performance setting @@ -105,7 +111,7 @@ Either download the Docker image from GitHub Container Registry (recommended for ```bash # pull image from GHCR -docker pull ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-v0.54.0-rc2-953161188c50 +docker pull ghcr.io/tenstorrent/tt-inference-server/vllm-llama3-src-dev-ubuntu-20.04-amd64:v0.0.1-47fb1a2fb6e0-2f33504bad49 ``` #### Option B: Build Docker Image @@ -124,7 +130,7 @@ The script `setup.sh` automates: ```bash cd tt-inference-server chmod +x setup.sh -./setup.sh llama-3.1-70b-instruct +./setup.sh Llama-3.3-70B-Instruct ``` # Additional Documentation diff --git a/vllm-tt-metal-llama3/docs/development.md b/vllm-tt-metal-llama3/docs/development.md index 3537af2..0756e8f 100644 --- a/vllm-tt-metal-llama3/docs/development.md +++ b/vllm-tt-metal-llama3/docs/development.md @@ -54,27 +54,29 @@ docker push ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base ## Step 2: Run container for LM evals development -note: this requires running `setup.sh` to set up the weights for a particular model, in this example `llama-3.1-70b-instruct`. +note: this requires running `setup.sh` to set up the weights for a particular model, in this example `Llama-3.3-70B-Instruct`.
```bash cd tt-inference-server -export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-Llama-3.3-70B-Instructv0.0.1/ +export MODEL_NAME=Llama-3.3-70B-Instruct +export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-${MODEL_NAME}-v0.0.1/ + docker run \ --rm \ -it \ - --env-file persistent_volume/model_envs/Llama-3.1-70B-Instruct.env \ + --env-file persistent_volume/model_envs/${MODEL_NAME}.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ - --volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/user/cache_root:rw \ + --volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/container_app_user/cache_root:rw \ --shm-size 32G \ - ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} bash + ghcr.io/tenstorrent/tt-inference-server/vllm-llama3-src-dev-ubuntu-20.04-amd64:v0.0.1-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} bash ``` additionally for development you can mount the volumes: ```bash - --volume $PWD/../vllm:/home/user/vllm \ - --volume $PWD/../lm-evaluation-harness:/home/user/lm-evaluation-harness \ + --volume $PWD/../vllm:/home/container_app_user/vllm \ + --volume $PWD/../lm-evaluation-harness:/home/container_app_user/lm-evaluation-harness \ ``` ## Step 3: Inside container setup and run vLLM @@ -93,7 +95,7 @@ Already built into Docker image, continue to run vLLM. ```bash # option 2: install from github -cd /home/user/vllm +cd /home/container_app_user/vllm git fetch git checkout git pull @@ -104,7 +106,7 @@ echo "done vllm install." ```bash # option 3: install edittable (for development) - mount from outside container -cd /home/user/vllm +cd /home/container_app_user/vllm pip install -e . echo "done vllm install." ``` @@ -113,7 +115,7 @@ echo "done vllm install." 
```bash # run vllm serving -cd /home/user/vllm +cd /home/container_app_user/vllm python examples/server_example_tt.py ``` diff --git a/vllm-tt-metal-llama3/src/run_vllm_api_server.py b/vllm-tt-metal-llama3/src/run_vllm_api_server.py index 98b95a4..527d482 100644 --- a/vllm-tt-metal-llama3/src/run_vllm_api_server.py +++ b/vllm-tt-metal-llama3/src/run_vllm_api_server.py @@ -32,6 +32,8 @@ def register_vllm_models(): hf_model_id = get_hf_model_id() if hf_model_id in legacy_impl_models: from models.demos.t3000.llama2_70b.tt.generator_vllm import TtLlamaForCausalLM + + ModelRegistry.register_model("TTLlamaForCausalLM", TtLlamaForCausalLM) else: from models.demos.llama3.tt.generator_vllm import ( TtMllamaForConditionalGeneration, @@ -95,10 +97,33 @@ def ensure_mesh_device(hf_model_id): print(f"using MESH_DEVICE:={os.environ.get('MESH_DEVICE')}") +def runtime_settings(hf_model_id): + # default runtime env vars + env_vars = { + "TT_METAL_ASYNC_DEVICE_QUEUE": 1, + "WH_ARCH_YAML": "wormhole_b0_80_arch_eth_dispatch.yaml", + } + env_var_map = { + "meta-llama/Llama-3.1-70B-Instruct": { + "LLAMA_VERSION": "llama3", + }, + "meta-llama/Llama-3.3-70B-Instruct": { + "LLAMA_VERSION": "llama3", + }, + } + env_vars.update(env_var_map.get(hf_model_id, {})) + # Set each environment variable + print("setting runtime environment variables:") + for key, value in env_vars.items(): + print(f"{key}={value}") + os.environ[key] = str(value) + + def model_setup(hf_model_id): # TODO: check HF repo access with HF_TOKEN supplied print(f"using model: {hf_model_id}") ensure_mesh_device(hf_model_id) + runtime_settings(hf_model_id) args = { "model": hf_model_id, "block_size": "64",
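As a usage note for the extended `PromptClient.capture_traces()` introduced above, a minimal sketch of pre-capturing text-only and text-plus-image traces. The sequence lengths and the 512x512 resolution are hypothetical, and it assumes the environment variables read by `EnvironmentConfig` (e.g. service port, model, MESH_DEVICE) are already set.

```python
from utils.prompt_configs import EnvironmentConfig
from utils.prompt_client import PromptClient

env_config = EnvironmentConfig()
client = PromptClient(env_config)

# Text-only traces for two (input_seq_len, output_seq_len) pairs.
client.capture_traces(context_lens=[(128, 128), (2048, 128)])

# Text + image traces: passing image_resolutions makes capture_traces()
# generate a random image per resolution and route requests via the chat API.
client.capture_traces(
    context_lens=[(128, 128)],
    image_resolutions=[(512, 512)],
)
```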