From 5f876c648e76245b5be05f835858f7c3285d6326 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 14 Jan 2025 16:38:10 -0500 Subject: [PATCH 1/3] Llama 3.x model support, setup.sh script multiple model support using HF download change log: - add multiple model support using persistent_volume/model_envs/*.env - setup using Hugging Face huggingface-cli to download models: llama model install script support for llama CLI and huggingface hub #14 - add model setup for llama 3.x - address Initial vLLM setup fails due to missing HuggingFace permissions #37 - address Docker run support for HF_TOKEN authentication using env var pass in #23 - renamed vllm-tt-metal-llama3-70 to vllm-tt-metal-llama3 for all llama 3.x models - updated documentation for v0 drop - add Docker Ubuntu 22.04 option for vLLM llama 3.x --- README.md | 2 +- scripts/add_spdx_header.py | 2 +- setup.sh | 562 ++++++++++++++++++ tests/mock.vllm.openai.api.dockerfile | 4 +- vllm-tt-metal-llama3-70b/setup.sh | 388 ------------ .../README.md | 16 +- .../docs/development.md | 25 +- .../requirements.txt | 0 .../src/__init__.py | 0 .../src/example_openai_client_alpaca_eval.py | 0 .../example_requests_client_alpaca_eval.py | 0 .../src/run_vllm_api_server.py | 60 +- ...m.llama3.src.ubuntu-20.04-amd64.Dockerfile | 11 +- ...m.llama3.src.ubuntu-22.04-amd64.Dockerfile | 112 ++++ 14 files changed, 750 insertions(+), 432 deletions(-) create mode 100755 setup.sh delete mode 100755 vllm-tt-metal-llama3-70b/setup.sh rename {vllm-tt-metal-llama3-70b => vllm-tt-metal-llama3}/README.md (90%) rename {vllm-tt-metal-llama3-70b => vllm-tt-metal-llama3}/docs/development.md (78%) rename {vllm-tt-metal-llama3-70b => vllm-tt-metal-llama3}/requirements.txt (100%) rename {vllm-tt-metal-llama3-70b => vllm-tt-metal-llama3}/src/__init__.py (100%) rename {vllm-tt-metal-llama3-70b => vllm-tt-metal-llama3}/src/example_openai_client_alpaca_eval.py (100%) rename {vllm-tt-metal-llama3-70b => vllm-tt-metal-llama3}/src/example_requests_client_alpaca_eval.py (100%) rename {vllm-tt-metal-llama3-70b => vllm-tt-metal-llama3}/src/run_vllm_api_server.py (62%) rename vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile => vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile (89%) create mode 100644 vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile diff --git a/README.md b/README.md index 42b0aaef..fcc8e524 100644 --- a/README.md +++ b/README.md @@ -15,5 +15,5 @@ Please follow setup instructions found in each model folder's README.md doc ## Model Implementations | Model | Hardware | |----------------|-----------------------------| -| [LLaMa 3.1 70B](vllm-tt-metal-llama3-70b/README.md) | TT-QuietBox & TT-LoudBox | +| [LLaMa 3.1 70B](vllm-tt-metal-llama3/README.md) | TT-QuietBox & TT-LoudBox | | [Mistral 7B](tt-metal-mistral-7b/README.md) | n150 and n300| \ No newline at end of file diff --git a/scripts/add_spdx_header.py b/scripts/add_spdx_header.py index ad3d7e9c..3d1a63df 100644 --- a/scripts/add_spdx_header.py +++ b/scripts/add_spdx_header.py @@ -30,7 +30,7 @@ def add_spdx_header(file_path): repo_root = Path(__file__).resolve().parent.parent directories_to_process = [ repo_root / "tt-metal-llama3-70b", - repo_root / "vllm-tt-metal-llama3-70b", + repo_root / "vllm-tt-metal-llama3", repo_root / "tt-metal-mistral-7b", repo_root / "tt-metal-yolov4", repo_root / "tests", diff --git a/setup.sh b/setup.sh new file mode 100755 index 00000000..3b062ada --- /dev/null +++ b/setup.sh @@ -0,0 +1,562 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# +# 
SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +set -euo pipefail # Exit on error, print commands, unset variables treated as errors, and exit on pipeline failure + +# Function to display usage information +usage() { + echo "Usage: $0 " + echo "Available model types:" + echo " llama-3.3-70b-instruct" + echo " llama-3.2-11b-vision-instruct" + echo " llama-3.2-3b-instruct" + echo " llama-3.2-1b-instruct" + echo " llama-3.1-70b-instruct" + echo " llama-3.1-70b" + echo " llama-3.1-8b-instruct" + echo " llama-3.1-8b" + echo " llama-3-70b-instruct" + echo " llama-3-70b" + echo " llama-3-8b-instruct" + echo " llama-3-8b" + echo + exit 1 +} + +# globals +readonly REPO_ROOT=$(dirname "$(realpath "$0")") + +check_and_prompt_env_file() { + local MODEL_NAME_KEY="MODEL_NAME" + local MODEL_NAME="" + # Check if .env file exists + if [[ -f "${ENV_FILE}" ]]; then + # Extract the MODEL_NAME value from .env + echo "found ENV_FILE: ${ENV_FILE}" + FOUND_MODEL_NAME=$(grep "^$MODEL_NAME_KEY=" "$ENV_FILE" | cut -d '=' -f2) || FOUND_MODEL_NAME="" + # If MODEL_NAME is found, display it + if [[ -n "$FOUND_MODEL_NAME" ]]; then + echo "The existing file ${ENV_FILE} contains MODEL_NAME: $FOUND_MODEL_NAME" + # Prompt the user to overwrite or exit + local choice="" + read -p "Do you want to overwrite the existing file ${ENV_FILE}? (y/n) [default: y]:" choice + choice=${choice:-y} + # Handle user's choice + case "$choice" in + y|Y ) + echo "Overwriting the ${ENV_FILE} file ..." + # Logic to overwrite .env goes here + OVERWRITE_ENV=true + ;; + n|N ) + OVERWRITE_ENV=false + ;; + * ) + echo "⛔ Invalid option. Exiting." + exit 1 + ;; + esac + else + echo "MODEL_NAME not found in ${ENV_FILE}. Overwritting." + OVERWRITE_ENV=true + fi + else + echo "${ENV_FILE} does not exist. Proceeding to create a new one." + OVERWRITE_ENV=true + fi +} + +get_hf_env_vars() { + # get HF_TOKEN + if [ -z "${HF_TOKEN:-}" ]; then + echo "HF_TOKEN environment variable is not set. Please set it before running the script." + read -r -s -p "Enter your HF_TOKEN: " input_hf_token + echo + if [ -z "${input_hf_token:-}" ]; then + echo "⛔ HF_TOKEN cannot be empty. Please try again." + exit 1 + elif [[ ! "$input_hf_token" == hf_* ]]; then + echo "⛔ HF_TOKEN must start with 'hf_'. Please try again." + exit 1 + fi + HF_TOKEN=${input_hf_token} + echo "✅ HF_TOKEN set." + fi + # get HF_HOME + if [ -z "${HF_HOME:-}" ]; then + echo "HF_HOME environment variable is not set. Please set it before running the script." + read -r -p "Enter your HF_HOME [default: $HOME/.cache/huggingface]:" input_hf_home + echo + input_hf_home=${input_hf_home:-"$HOME/.cache/huggingface"} + if [ ! -d "$input_hf_home" ] || [ ! -w "$input_hf_home" ]; then + echo "⛔ HF_HOME must be a valid directory and writable by the user. Please try again." + exit 1 + fi + HF_HOME=${input_hf_home} + echo "✅ HF_HOME set." 
+ fi +} + +# Function to set environment variables based on the model selection and write them to .env +setup_model_environment() { + # Set environment variables based on the model selection + # note: MODEL_NAME is the lower cased basename of the HF repo ID + case "$1" in + "llama-3.3-70b-instruct") + MODEL_NAME="llama-3.3-70b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.3-70B-Instruct" + META_MODEL_NAME="" + META_DIR_FILTER="" + REPACKED=1 + ;; + "llama-3.2-11b-vision-instruct") + MODEL_NAME="llama-3.2-11b-vision-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.2-11B-Vision-Instruct" + META_MODEL_NAME="" + META_DIR_FILTER="" + REPACKED=0 + ;; + "llama-3.2-3b-instruct") + MODEL_NAME="llama-3.2-3b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.2-3B-Instruct" + META_MODEL_NAME="" + META_DIR_FILTER="" + REPACKED=0 + ;; + "llama-3.2-1b-instruct") + MODEL_NAME="llama-3.2-1b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.2-1B-Instruct" + META_MODEL_NAME="" + META_DIR_FILTER="" + REPACKED=0 + ;; + "llama-3.1-70b-instruct") + MODEL_NAME="llama-3.1-70b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct" + META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" + META_DIR_FILTER="llama3_1" + REPACKED=1 + ;; + "llama-3.1-70b") + MODEL_NAME="llama-3.1-70b" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B" + META_MODEL_NAME="Meta-Llama-3.1-70B" + META_DIR_FILTER="llama3_1" + REPACKED=1 + ;; + "llama-3.1-8b-instruct") + MODEL_NAME="llama-3.1-8b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B-Instruct" + META_MODEL_NAME="Meta-Llama-3.1-8B-Instruct" + META_DIR_FILTER="llama3_1" + REPACKED=0 + ;; + "llama-3.1-8b") + MODEL_NAME="llama-3.1-8b" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B" + META_MODEL_NAME="Meta-Llama-3.1-8B" + META_DIR_FILTER="llama3_1" + REPACKED=0 + ;; + "llama-3-70b-instruct") + MODEL_NAME="llama-3-70b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3-70B-Instruct" + META_MODEL_NAME="Meta-Llama-3-70B-Instruct" + META_DIR_FILTER="llama3" + REPACKED=1 + ;; + "llama-3-70b") + MODEL_NAME="llama-3-70b" + HF_MODEL_REPO_ID="meta-llama/Llama-3-70B" + META_MODEL_NAME="Meta-Llama-3-70B" + META_DIR_FILTER="llama3" + REPACKED=1 + ;; + "llama-3-8b-instruct") + MODEL_NAME="llama-3-8b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3-8B-Instruct" + META_MODEL_NAME="Meta-Llama-3-8B-Instruct" + META_DIR_FILTER="llama3" + REPACKED=0 + ;; + "llama-3-8b") + MODEL_NAME="llama-3-8b" + HF_MODEL_REPO_ID="meta-llama/Llama-3-8B" + META_MODEL_NAME="Meta-Llama-3-8B" + META_DIR_FILTER="llama3" + REPACKED=0 + ;; + *) + echo "⛔ Invalid model choice." + usage + exit 1 + ;; + esac + # Initialize OVERWRITE_ENV + OVERWRITE_ENV=false + + # Set default values for environment variables + DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume + MODEL_ENV_DIR="${DEFAULT_PERSISTENT_VOLUME_ROOT}/model_envs" + + mkdir -p ${MODEL_ENV_DIR} + ENV_FILE="${MODEL_ENV_DIR}/${MODEL_NAME}.env" + export ENV_FILE + check_and_prompt_env_file + + + if [ "$OVERWRITE_ENV" = false ]; then + echo "✅ using existing .env file: ${ENV_FILE}." 
+ return 0 + fi + # Safely handle potentially unset environment variables using default values + PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT} + # Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default + read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${DEFAULT_PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT + PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT} + echo # move to a new line after input + # Set environment variables with defaults if not already set + PERSISTENT_VOLUME=${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1 + + + read -p "Use 🤗 Hugging Face authorization token for downloading models? Alternative is direct authorization from Meta. (y/n) [default: y]: " input_use_hf_token + choice_use_hf_token=${input_use_hf_token:-"y"} + echo # move to a new line after input + # Handle user's choice + case "$choice_use_hf_token" in + y|Y ) + echo "Using 🤗 Hugging Face Token." + get_hf_env_vars + # default location for HF e.g. ~/.cache/huggingface/models/meta-llama/Llama-3.3-70B-Instruct + # LLAMA_WEIGHTS_DIR=${HF_HOME}/local_dir/${HF_MODEL_REPO_ID} + WEIGHTS_DIR=${PERSISTENT_VOLUME}/model_weights/${MODEL_NAME} + ;; + n|N ) + if [ -z "${META_DIR_FILTER:-}" ]; then + echo "⛔ MODEL_NAME=${MODEL_NAME} does not support using direct Meta authorization model download. Please use Hugging Face method." + fi + echo "Using direct authorization from Meta. You will need their URL Authorization token, typically from their website or email." + # Prompt user for LLAMA_REPO if not already set or use default + read -r -p "Enter the path where you want to clone the Llama model repository [default: ${LLAMA_REPO}]: " INPUT_LLAMA_REPO + LLAMA_REPO=${INPUT_LLAMA_REPO:-$LLAMA_REPO} + LLAMA_DIR=${LLAMA_DIR:-${LLAMA_REPO}/models/${META_DIR_FILTER}} + LLAMA_WEIGHTS_DIR=${LLAMA_WEIGHTS_DIR:-${LLAMA_DIR}/${META_MODEL_NAME}} + echo # move to a new line after input + ;; + * ) + echo "⛔ Invalid option. Exiting." + exit 1 + ;; + esac + + # Prompt user for JWT_SECRET securely + read -sp "Enter your JWT_SECRET: " JWT_SECRET + echo # move to a new line after input + # Verify the JWT_SECRET is not empty + if [ -z "${JWT_SECRET:-}" ]; then + echo "⛔ JWT_SECRET cannot be empty. Please try again." + exit 1 + fi + + if [ "${REPACKED}" -eq 1 ]; then + echo "REPACKED is enabled." + REPACKED_STR="repacked-" + else + echo "REPACKED is disabled." + REPACKED_STR="" + fi + + # Write environment variables to .env file + echo "Writing environment variables to ${ENV_FILE} ..." + cat > ${ENV_FILE} < /dev/null 2>&1; then + echo "Creating group 'dockermount' ..." + sudo groupadd dockermount + else + echo "Group 'dockermount' already exists." + fi + + # Add host user to 'dockermount' group + echo "Adding user: '$USER' to 'dockermount' group ..." + sudo usermod -aG dockermount "$USER" + + # Get container user with UID 1000 and add to group + CONTAINER_UID=1000 + CONTAINER_USER=$(getent passwd ${CONTAINER_UID} | cut -d: -f1) + if [ -n "$CONTAINER_USER" ]; then + echo "Adding container user: '$CONTAINER_USER' (UID ${CONTAINER_UID}) to 'dockermount' group ..." + sudo usermod -aG dockermount "$CONTAINER_USER" + else + echo "No user found with UID ${CONTAINER_UID}." + fi + + # Set file ownership and permissions + echo "Setting file ownership and permissions for container and host access ..." + if [ ! 
-d "${PERSISTENT_VOLUME}" ]; then + # if the user point the PERSISTENT_VOLUME + sudo mkdir -p "${PERSISTENT_VOLUME}" + fi + sudo chown -R ${CONTAINER_UID}:dockermount "${PERSISTENT_VOLUME}" + sudo chmod -R 775 "${PERSISTENT_VOLUME}" + + echo "✅ setup_permissions completed!" +} + +# Shared function for repacking weights +repack_weights() { + local source_dir="$1" + local target_dir="$2" + + # Create target directory if it doesn't exist + mkdir -p "${target_dir}" + + # Copy required files + cp "${source_dir}/tokenizer.model" "${target_dir}/tokenizer.model" + cp "${source_dir}/params.json" "${target_dir}/params.json" + + # Set up Python environment for repacking + VENV_NAME=".venv_repack" + echo "Setting up python venv for repacking: ${VENV_NAME}" + python3 -m venv ${VENV_NAME} + source ${VENV_NAME}/bin/activate + pip install --upgrade setuptools wheel pip==21.2.4 tqdm + pip install --index-url https://download.pytorch.org/whl/cpu torch==2.2.1 + + # Download repacking script + curl -O https://raw.githubusercontent.com/tenstorrent/tt-metal/refs/heads/main/models/demos/t3000/llama2_70b/scripts/repack_weights.py + + echo "Repacking weights..." + python repack_weights.py "${source_dir}" "${target_dir}" 5 + + # Cleanup + deactivate + rm -rf ${VENV_NAME} repack_weights.py + + echo "✅ Weight repacking completed!" +} + +setup_weights_meta() { + # Step 1: Set up Llama model repository path + echo "Using repository path: $LLAMA_REPO" + + # Step 2: Clone the repository (if it doesn't already exist) + if [ ! -d "$LLAMA_REPO" ]; then + echo "Cloning the Llama repository to: $LLAMA_REPO" + git clone https://github.com/meta-llama/llama-models.git "$LLAMA_REPO" + cd "$LLAMA_REPO" + # checkout commit before ./download.sh was removed + git checkout 685ac4c107c75ce8c291248710bf990a876e1623 + else + echo "🔔 Llama repository already exists at $LLAMA_REPO" + fi + + # Step 3: Check if weights are already downloaded + if [ -d "${LLAMA_WEIGHTS_DIR}" ] && [ "$(ls -A "${LLAMA_WEIGHTS_DIR}")" ]; then + echo "Weights already downloaded at ${LLAMA_WEIGHTS_DIR}" + echo "Skipping download." + else + echo "Running the download script to download models at ${LLAMA_DIR}/download.sh ..." + cd "$LLAMA_DIR" + ./download.sh + cd - + fi + + if [ "${REPACKED}" -eq 1 ]; then + WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + repack_weights "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" + else + WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${MODEL_NAME}" + cp -rf "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" + fi + + echo "using weights directory: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + echo "✅ setup_weights_meta completed!" +} + +setup_weights_huggingface() { + # Step 1: Verify HF_TOKEN and HF_HOME are set + if [ -z "${HF_TOKEN:-}" ] || [ -z "${HOST_HF_HOME:-}" ]; then + echo "⛔ HF_TOKEN or HF_HOME not set. Please ensure both environment variables are set." + exit 1 + fi + + # Step 3: Create python virtual environment for huggingface downloads + VENV_NAME=".venv_hf_setup" + echo "Setting up python venv for Hugging Face downloads: ${VENV_NAME}" + python3 -m venv ${VENV_NAME} + source ${VENV_NAME}/bin/activate + + # Step 4: Install required packages + pip install --upgrade pip setuptools wheel + pip install "huggingface_hub[cli]" + + # Step 5: Download model using huggingface-cli + echo "Downloading model from Hugging Face Hub..." 
+ # stop timeout issue: https://huggingface.co/docs/huggingface_hub/en/guides/cli#download-timeout + export HF_HUB_DOWNLOAD_TIMEOUT=60 + # using default HF naming convention for model weights + huggingface-cli download "${HF_MODEL_REPO_ID}" \ + original/params.json \ + original/tokenizer.model \ + original/consolidated.* \ + --cache-dir="${HOST_HF_HOME}" \ + --token="${HF_TOKEN}" + + if [ $? -ne 0 ]; then + echo "⛔ Error occured during: huggingface-cli download ${HF_MODEL_REPO_ID}" + echo "🔔 check for common issues:" + echo " 1. 401 Unauthorized error occurred." + echo " For example:" + echo " huggingface_hub.errors.GatedRepoError: 401 Client Error. Cannot access gated repo" + echo " ❗ In this case, go to the repo URL in your web browser and click through the access request form." + echo " 2. check correct HF_TOKEN is set in the .env file: ${ENV_FILE}" + exit 1 + fi + + # symlinks are broken for huggingface-cli download with --local-dir option + # see: https://github.com/huggingface/huggingface_hub/pull/2223 + # to use symlinks, find most recent snapshot and create symlink to that + mkdir -p "${WEIGHTS_DIR}" + LOCAL_REPO_NAME=$(echo "${HF_MODEL_REPO_ID}" | sed 's|/|--|g') + SNAPSHOT_DIR="${HOST_HF_HOME}/models--${LOCAL_REPO_NAME}/snapshots" + # note: ls -td will sort by modification date descending, potential edge case + # if desired snapshot is not most recent modified or ls sorts differently + MOST_RECENT_SNAPSHOT=$(ls -td -- ${SNAPSHOT_DIR}/* | head -n 1) + echo "create symlink: ${MOST_RECENT_SNAPSHOT}/original/ -> ${WEIGHTS_DIR}" + for item in ${MOST_RECENT_SNAPSHOT}/original/*; do + ln -s "$item" "${WEIGHTS_DIR}" + done + + # Step 6: Process and copy weights + if [ "${REPACKED}" -eq 1 ]; then + REPACKED_WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + mkdir -p "${REPACKED_WEIGHTS_DIR}" + repack_weights "${WEIGHTS_DIR}" "${REPACKED_WEIGHTS_DIR}" + fi + + # Step 7: Cleanup + deactivate + rm -rf ${VENV_NAME} + + echo "using weights directory: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + echo "✅ setup_weights_huggingface completed!" +} + +setup_tt_metal_cache() { + # check if tt_metal_cache already exists + TT_METAL_CACHE_DIR="${PERSISTENT_VOLUME}/tt_metal_cache/cache_${REPACKED_STR}$MODEL_NAME" + if [ -d "${TT_METAL_CACHE_DIR}" ]; then + echo "✅ tt_metal_cache already exists at: ${TT_METAL_CACHE_DIR}." + return 0 + fi + + # create tt_metal_cache directory + mkdir -p "${TT_METAL_CACHE_DIR}" + echo "✅ setup_tt_metal_cache completed!" +} + +setup_weights() { + load_env + + # check if model weights already exist + if [ -d "${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" ]; then + echo "Model weights already exist at: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + echo "🔔 check if directory contents are correct." + echo "contents:" + echo "ls -lh ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + echo "$(ls -lh ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME})" + echo + echo "If directory does not have correct weights, to re-download or copy the model weights delete the directory." 
+ else + echo "Setting up persistent volume root: ${PERSISTENT_VOLUME}" + mkdir -p "${PERSISTENT_VOLUME}/model_weights/" + # Determine which setup method to use based on HF_TOKEN presence + if [ "${USE_HF_DOWNLOAD}" == "y" ]; then + setup_weights_huggingface + else + setup_weights_meta + fi + fi + + setup_tt_metal_cache +} + +# ============================================================================== +# Main script logic +# ============================================================================== + +# Ensure script is being executed, not sourced +if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then + echo "⛔ Error: This script is being sourced. Please make execute it:" + echo "chmod +x ./setup.sh && ./setup.sh" + set +euo pipefail # Unset 'set -euo pipefail' when sourcing so it doesnt exit or mess up sourcing shell + return 1; # 'return' works when sourced; 'exit' would terminate the shell +fi + +if [ $# -lt 1 ]; then + usage +fi + +# Set up environment variables for the chosen model +MODEL_TYPE=$1 +setup_model_environment "$MODEL_TYPE" +setup_weights +# Call the script again with sudo to execute the sudo-required commands +echo "Switching to sudo portion to set file permissions and complete setup." +setup_permissions diff --git a/tests/mock.vllm.openai.api.dockerfile b/tests/mock.vllm.openai.api.dockerfile index 2f027be0..9b4e94e6 100644 --- a/tests/mock.vllm.openai.api.dockerfile +++ b/tests/mock.vllm.openai.api.dockerfile @@ -95,8 +95,8 @@ RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install compresse ARG APP_DIR="${HOME_DIR}/app" WORKDIR ${APP_DIR} ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR} -COPY --chown=user:user "vllm-tt-metal-llama3-70b/src" "${APP_DIR}/src" -COPY --chown=user:user "vllm-tt-metal-llama3-70b/requirements.txt" "${APP_DIR}/requirements.txt" +COPY --chown=user:user "vllm-tt-metal-llama3/src" "${APP_DIR}/src" +COPY --chown=user:user "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt" COPY --chown=user:user "utils" "${APP_DIR}/utils" COPY --chown=user:user "tests" "${APP_DIR}/tests" RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ diff --git a/vllm-tt-metal-llama3-70b/setup.sh b/vllm-tt-metal-llama3-70b/setup.sh deleted file mode 100755 index ee102dff..00000000 --- a/vllm-tt-metal-llama3-70b/setup.sh +++ /dev/null @@ -1,388 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: Apache-2.0 -# -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -set -euo pipefail # Exit on error, print commands, unset variables treated as errors, and exit on pipeline failure - -# Function to display usage information -usage() { - echo "Usage: $0 " - echo "Available model types:" - echo " llama-3.1-70b-instruct" - echo " llama-3.1-70b" - echo " llama-3.1-8b-instruct" - echo " llama-3.1-8b" - echo " llama-3-70b-instruct" - echo " llama-3-70b" - echo " llama-3-8b-instruct" - echo " llama-3-8b" - echo - echo "Options:" - echo " setup_permissions Run the script to set file permissions after first run setup (requires sudo)." 
- exit 1 -} - -# globals -readonly MODEL_PATH=$(dirname "$(realpath "$0")") -readonly REPO_ROOT=$(dirname "${MODEL_PATH}") -readonly ENV_FILE="${MODEL_PATH}/.env" -echo "REPO_ROOT: ${REPO_ROOT}" -echo "MODEL_PATH: ${MODEL_PATH}" -echo "ENV_FILE: ${ENV_FILE}" - -check_and_prompt_env_file() { - local MODEL_NAME_KEY="MODEL_NAME" - local MODEL_NAME="" - - # Check if .env file exists - if [[ -f "$ENV_FILE" ]]; then - # Extract the MODEL_NAME value from .env - FOUND_MODEL_NAME=$(grep "^$MODEL_NAME_KEY=" "$ENV_FILE" | cut -d '=' -f2) - - # If MODEL_NAME is found, display it - if [[ -n "$FOUND_MODEL_NAME" ]]; then - echo "The existing file ${ENV_FILE} contains MODEL_NAME: $FOUND_MODEL_NAME" - # Prompt the user to overwrite or exit - local choice="" - read -p "Do you want to overwrite the existing file ${ENV_FILE}? (y/n) [default: y]:" choice - choice=${choice:-y} - # Handle user's choice - case "$choice" in - y|Y ) - echo "Overwriting the ${ENV_FILE} file ..." - # Logic to overwrite .env goes here - OVERWRITE_ENV=true - ;; - n|N ) - OVERWRITE_ENV=false - ;; - * ) - echo "⛔ Invalid option. Exiting." - exit 1 - ;; - esac - else - echo "MODEL_NAME not found in ${ENV_FILE}. Overwritting." - OVERWRITE_ENV=true - fi - - else - echo "${ENV_FILE} does not exist. Proceeding to create a new one." - OVERWRITE_ENV=true - fi -} - - -# Function to set environment variables based on the model selection and write them to .env -setup_model_environment() { - # Set default values for environment variables - DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume - DEFAULT_LLAMA_REPO=~/llama-models - # Set environment variables based on the model selection - case "$1" in - "llama-3.1-70b-instruct") - MODEL_NAME="llama-3.1-70b-instruct" - META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" - META_DIR_FILTER="llama3_1" - REPACKED=1 - ;; - "llama-3.1-70b") - MODEL_NAME="llama-3.1-70b" - META_MODEL_NAME="Meta-Llama-3.1-70B" - META_DIR_FILTER="llama3_1" - REPACKED=1 - ;; - "llama-3.1-8b-instruct") - MODEL_NAME="llama-3.1-8b-instruct" - META_MODEL_NAME="Meta-Llama-3.1-8B-Instruct" - META_DIR_FILTER="llama3_1" - REPACKED=0 - ;; - "llama-3.1-8b") - MODEL_NAME="llama-3.1-8b" - META_MODEL_NAME="Meta-Llama-3.1-8B" - META_DIR_FILTER="llama3_1" - REPACKED=0 - ;; - "llama-3-70b-instruct") - MODEL_NAME="llama-3-70b-instruct" - META_MODEL_NAME="Meta-Llama-3-70B-Instruct" - META_DIR_FILTER="llama3" - REPACKED=1 - ;; - "llama-3-70b") - MODEL_NAME="llama-3-70b" - META_MODEL_NAME="Meta-Llama-3-70B" - META_DIR_FILTER="llama3" - REPACKED=1 - ;; - "llama-3-8b-instruct") - MODEL_NAME="llama-3-8b-instruct" - META_MODEL_NAME="Meta-Llama-3-8B-Instruct" - META_DIR_FILTER="llama3" - REPACKED=0 - ;; - "llama-3-8b") - MODEL_NAME="llama-3-8b" - META_MODEL_NAME="Meta-Llama-3-8B" - META_DIR_FILTER="llama3" - REPACKED=0 - ;; - *) - echo "⛔ Invalid model choice." - usage - exit 1 - ;; - esac - - # Initialize OVERWRITE_ENV - OVERWRITE_ENV=false - - check_and_prompt_env_file - - if [ "$OVERWRITE_ENV" = false ]; then - echo "✅ using existing .env file: ${ENV_FILE}." 
- return 0 - fi - - # Safely handle potentially unset environment variables using default values - PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT} - LLAMA_REPO=${LLAMA_REPO:-$DEFAULT_LLAMA_REPO} - - # Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default - read -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT - PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT} - echo - # Prompt user for LLAMA_REPO if not already set or use default - read -p "Enter the path where you want to clone the Llama model repository [default: ${LLAMA_REPO}]: " INPUT_LLAMA_REPO - LLAMA_REPO=${INPUT_LLAMA_REPO:-$LLAMA_REPO} - echo # move to a new line after input - - # Set environment variables with defaults if not already set - LLAMA_DIR=${LLAMA_DIR:-${LLAMA_REPO}/models/${META_DIR_FILTER}} - LLAMA_WEIGHTS_DIR=${LLAMA_WEIGHTS_DIR:-${LLAMA_DIR}/${META_MODEL_NAME}} - PERSISTENT_VOLUME=${PERSISTENT_VOLUME:-${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1} - - # Prompt user for JWT_SECRET securely - read -sp "Enter your JWT_SECRET: " JWT_SECRET - echo # move to a new line after input - # Verify the JWT_SECRET is not empty - if [ -z "$JWT_SECRET" ]; then - echo "⛔ JWT_SECRET cannot be empty. Please try again." - exit 1 - fi - - if [ "${REPACKED}" -eq 1 ]; then - echo "REPACKED is enabled." - REPACKED_STR="repacked-" - else - echo "REPACKED is disabled." - REPACKED_STR="" - fi - - # Write environment variables to .env file - echo "Writing environment variables to ${ENV_FILE} ..." - cat > ${ENV_FILE} < /dev/null 2>&1; then - echo "Creating group 'dockermount' ..." - sudo groupadd dockermount - else - echo "Group 'dockermount' already exists." - fi - - # Add host user to 'dockermount' group - echo "Adding user: '$USER' to 'dockermount' group ..." - sudo usermod -aG dockermount "$USER" - - # Get container user with UID 1000 and add to group - CONTAINER_USER=$(getent passwd 1000 | cut -d: -f1) - if [ -n "$CONTAINER_USER" ]; then - echo "Adding container user: '$CONTAINER_USER' (UID 1000) to 'dockermount' group ..." - sudo usermod -aG dockermount "$CONTAINER_USER" - else - echo "No user found with UID 1000." - fi - - # Set file ownership and permissions - echo "Setting file ownership and permissions for container and host access ..." - if [ ! -d "${PERSISTENT_VOLUME}" ]; then - # if the user point the PERSISTENT_VOLUME - sudo mkdir -p "${PERSISTENT_VOLUME}" - fi - sudo chown -R ${CONTAINER_USER}:dockermount "${PERSISTENT_VOLUME}" - sudo chmod -R 775 "${PERSISTENT_VOLUME}" - - echo "✅ setup_permissions completed!" -} - -setup_weights() { - # St`ep 1: Load environment variables from .env file - load_env - - # check if model weights already exist - if [ -d "${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" ]; then - echo "Model weights already exist at: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}." - echo "contents:" - echo - echo "$(ls -lh ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME})" - echo - echo "If directory does not have correct weigths, to re-download or copy the model weights delete the directory." - echo "✅ Model weights setup is already complete, check if directory contents are correct." 
- return 0 - fi - - # TODO: support HF_TOKEN for downloading models - # Step 2: Set up Llama model repository path - echo "Using repository path: $LLAMA_REPO" - - # Step 3: Clone the repository (if it doesn't already exist) - if [ ! -d "$LLAMA_REPO" ]; then - echo "Cloning the Llama repository to: $LLAMA_REPO" - git clone https://github.com/meta-llama/llama-models.git "$LLAMA_REPO" - cd "$LLAMA_REPO" - # checkout commit before ./download.sh was removed - git checkout 685ac4c107c75ce8c291248710bf990a876e1623 - else - echo "🔔 Llama repository already exists at $LLAMA_REPO" - fi - - # Step 4: Check if weights are already downloaded - if [ -d "${LLAMA_WEIGHTS_DIR}" ] && [ "$(ls -A "${LLAMA_WEIGHTS_DIR}")" ]; then - echo "Weights already downloaded at ${LLAMA_WEIGHTS_DIR}" - echo "Skipping download." - else - # Step 5: Run the download script and select models - echo "Running the download script to download models at ${LLAMA_DIR}/download.sh ..." - cd "$LLAMA_DIR" - ./download.sh - cd - - fi - - # Step 6: Set up persistent volume root - echo "Setting up persistent volume root: ${PERSISTENT_VOLUME}" - mkdir -p "${PERSISTENT_VOLUME}/model_weights/" - - # Step 7: Create directories for weights, tokenizer, and params - echo "Create directories for weights, tokenizer, and params." - - if [ "${REPACKED}" -eq 1 ]; then - WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" - mkdir -p "${WEIGHTS_DIR}" - cp "${LLAMA_WEIGHTS_DIR}/tokenizer.model" "${WEIGHTS_DIR}/tokenizer.model" - cp "${LLAMA_WEIGHTS_DIR}/params.json" "${WEIGHTS_DIR}/params.json" - # Step 8: repack weights into repacked dir once instead of copying them - VENV_NAME="venv_setup" - echo "setting up repacking python venv: ${VENV_NAME}" - python3 -m venv ${VENV_NAME} - source ${VENV_NAME}/bin/activate - # pip==21.2.4 is needed to avoid the following error: - # ERROR: Package 'networkx' requires a different Python: 3.8.10 not in '>=3.9' - pip install --upgrade setuptools wheel pip==21.2.4 tqdm - # repack script dependency - # pip does not support +cpu build variant qualifier, need to specify cpu index url - pip install --index-url https://download.pytorch.org/whl/cpu torch==2.2.1 - curl -O https://raw.githubusercontent.com/tenstorrent/tt-metal/refs/heads/main/models/demos/t3000/llama2_70b/scripts/repack_weights.py - echo "repacking weights..." - python repack_weights.py "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" 5 - deactivate - rm -rf ${VENV_NAME} repack_weights.py - else - WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${MODEL_NAME}" - cp -rf "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" - - fi - - echo "using weights directory: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" - - # create a tmp python venv with dependencies to run repack script - echo "✅ setup_weights completed!" -} - -setup_tt_metal_cache() { - # check if tt_metal_cache already exists - TT_METAL_CACHE_DIR="${PERSISTENT_VOLUME}/tt_metal_cache/cache_${REPACKED_STR}$MODEL_NAME" - if [ -d "${TT_METAL_CACHE_DIR}" ]; then - echo "✅ tt_metal_cache already exists at: ${TT_METAL_CACHE_DIR}." - return 0 - fi - - # create tt_metal_cache directory - mkdir -p "${TT_METAL_CACHE_DIR}" - echo "✅ setup_tt_metal_cache completed!" -} - -# Ensure script is being executed, not sourced -if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then - echo "⛔ Error: This script is being sourced. 
Please make execute it:" - echo "chmod +x ./setup.sh && ./setup.sh" - set +euo pipefail # Unset 'set -euo pipefail' when sourcing so it doesnt exit or mess up sourcing shell - return 1; # 'return' works when sourced; 'exit' would terminate the shell -fi - -# Main script logic -if [ $# -lt 1 ]; then - usage -fi - -if [ "$1" == "setup_permissions" ]; then - setup_permissions - exit 0 -fi - -# Set up environment variables for the chosen model -MODEL_TYPE=$1 -setup_model_environment "$MODEL_TYPE" -setup_weights -setup_tt_metal_cache -# Call the script again with sudo to execute the sudo-required commands -echo "Switching to sudo portion to set file permissions and complete setup." -setup_permissions diff --git a/vllm-tt-metal-llama3-70b/README.md b/vllm-tt-metal-llama3/README.md similarity index 90% rename from vllm-tt-metal-llama3-70b/README.md rename to vllm-tt-metal-llama3/README.md index 3fa10b39..525281f6 100644 --- a/vllm-tt-metal-llama3-70b/README.md +++ b/vllm-tt-metal-llama3/README.md @@ -25,18 +25,18 @@ Run the container from the project root at `tt-inference-server`: ```bash cd tt-inference-server # make sure if you already set up the model weights and cache you use the correct persistent volume -export PERSISTENT_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b-instructv0.0.1/ +export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b-instructv0.0.1/ docker run \ --rm \ -it \ - --env-file vllm-tt-metal-llama3-70b/.env \ + --env-file persistent_volume/model_envs/llama-3.1-70b-instruct.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ - --volume ${PERSISTENT_VOLUME?ERROR env var PERSISTENT_VOLUME must be set}:/home/user/cache_root:rw \ + --volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/user/cache_root:rw \ --shm-size 32G \ --publish 7000:7000 \ - ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.2-tt-metal-385904186f81-384f1790c3be + ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-v0.54.0-rc2-953161188c50 ``` By default the Docker container will start running the entrypoint command wrapped in `src/run_vllm_api_server.py`. @@ -106,16 +106,16 @@ Either download the Docker image from GitHub Container Registry (recommended for ```bash # pull image from GHCR -docker pull ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.2-tt-metal-385904186f81-384f1790c3be +docker pull ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-v0.54.0-rc2-953161188c50 ``` #### Option B: Build Docker Image -For instructions on building the Docker imagem locally see: [vllm-tt-metal-llama3-70b/docs/development](../vllm-tt-metal-llama3-70b/docs/development.md#step-1-build-docker-image) +For instructions on building the Docker imagem locally see: [vllm-tt-metal-llama3/docs/development](../vllm-tt-metal-llama3/docs/development.md#step-1-build-docker-image) ### 5. Automated Setup: environment variables and weights files -The script `vllm-tt-metal-llama3-70b/setup.sh` automates: +The script `setup.sh` automates: 1. interactively creating the .env file, 2. downloading the Llama model weights, @@ -123,7 +123,7 @@ The script `vllm-tt-metal-llama3-70b/setup.sh` automates: 4. creating the default persistent storage directory structure and permissions. 
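As a hedged convenience sketch (not part of this patch): the script only prompts for Hugging Face values it cannot find in the environment, so exporting `HF_TOKEN` and `HF_HOME` beforehand skips those two prompts. The other prompts (persistent volume root, download method, and `JWT_SECRET`) remain interactive. The token value below is a placeholder.

```bash
# Sketch: pre-set the Hugging Face variables so setup.sh does not prompt for them.
# Placeholder token; the script's prompt expects real tokens to start with "hf_".
export HF_TOKEN=hf_xxxxxxxxxxxxxxxx
# HF_HOME should be an existing, writable directory (default: ~/.cache/huggingface).
export HF_HOME="$HOME/.cache/huggingface"
mkdir -p "$HF_HOME"
```

The script itself is then invoked as shown in the next block.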
```bash -cd tt-inference-server/vllm-tt-metal-llama3-70b +cd tt-inference-server chmod +x setup.sh ./setup.sh llama-3.1-70b-instruct ``` diff --git a/vllm-tt-metal-llama3-70b/docs/development.md b/vllm-tt-metal-llama3/docs/development.md similarity index 78% rename from vllm-tt-metal-llama3-70b/docs/development.md rename to vllm-tt-metal-llama3/docs/development.md index 55d8b1d3..f7dd6c15 100644 --- a/vllm-tt-metal-llama3-70b/docs/development.md +++ b/vllm-tt-metal-llama3/docs/development.md @@ -1,6 +1,6 @@ -# Development vllm-tt-metal-llama3-70B +# Development vllm-tt-metal-llama3 -Containerization in: https://github.com/tenstorrent/tt-inference-server/blob/tstesco/vllm-llama3-70b/vllm-tt-metal-llama3-70b/vllm.llama3.src.base.inference.v0.52.0.Dockerfile +Containerization in: https://github.com/tenstorrent/tt-inference-server/blob/tstesco/vllm-llama3-70b/vllm-tt-metal-llama3/vllm.llama3.src.base.inference.v0.52.0.Dockerfile tt-metal and vLLM are under active development in lock-step: https://github.com/tenstorrent/vllm/tree/dev/tt_metal @@ -13,23 +13,24 @@ When building, update the commit SHA and get correct SHA from model developers o # set build context to repo root cd tt-inference-server # build image -export TT_METAL_DOCKERFILE_VERSION=v0.53.0-rc34 -export TT_METAL_COMMIT_SHA_OR_TAG=385904186f81fed15d5c87c162221d4f34387164 +export TT_METAL_DOCKERFILE_VERSION=v0.53.0 +export TT_METAL_COMMIT_SHA_OR_TAG=v0.54.0-rc2 export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12} -export TT_VLLM_COMMIT_SHA_OR_TAG=384f1790c3be16e1d1b10de07252be2e66d00935 +export TT_VLLM_COMMIT_SHA_OR_TAG=953161188c50f10da95a88ab305e23977ebd3750 export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12} -export IMAGE_VERSION=v0.0.3 +export OS_VERSION=ubuntu-20.04-amd64 +export IMAGE_VERSION=v0.0.1 docker build \ - -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:${IMAGE_VERSION}-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \ + -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm-${OS_VERSION}:${IMAGE_VERSION}-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \ --build-arg TT_METAL_DOCKERFILE_VERSION=${TT_METAL_DOCKERFILE_VERSION} \ --build-arg TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} \ --build-arg TT_VLLM_COMMIT_SHA_OR_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG} \ - . -f vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile + . 
-f vllm-tt-metal-llama3/vllm.llama3.src.${OS_VERSION}.Dockerfile ``` ### push image (only for admin deployment to GHCR) ```bash -docker push ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:${IMAGE_VERSION}-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} +docker push ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm-${OS_VERSION}:${IMAGE_VERSION}-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} ``` ## Step 2: Run container for LM evals development @@ -38,15 +39,15 @@ note: this requires running `setup.sh` to set up the weights for a particular mo ```bash cd tt-inference-server -export PERSISTENT_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b-instructv0.0.1/ +export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b-instructv0.0.1/ docker run \ --rm \ -it \ - --env-file tt-metal-llama3-70b/.env \ + --env-file persistent_volume/model_envs/llama-3.1-70b-instruct.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ - --volume ${PERSISTENT_VOLUME?ERROR env var PERSISTENT_VOLUME must be set}:/home/user/cache_root:rw \ + --volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/user/cache_root:rw \ --shm-size 32G \ ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} bash ``` diff --git a/vllm-tt-metal-llama3-70b/requirements.txt b/vllm-tt-metal-llama3/requirements.txt similarity index 100% rename from vllm-tt-metal-llama3-70b/requirements.txt rename to vllm-tt-metal-llama3/requirements.txt diff --git a/vllm-tt-metal-llama3-70b/src/__init__.py b/vllm-tt-metal-llama3/src/__init__.py similarity index 100% rename from vllm-tt-metal-llama3-70b/src/__init__.py rename to vllm-tt-metal-llama3/src/__init__.py diff --git a/vllm-tt-metal-llama3-70b/src/example_openai_client_alpaca_eval.py b/vllm-tt-metal-llama3/src/example_openai_client_alpaca_eval.py similarity index 100% rename from vllm-tt-metal-llama3-70b/src/example_openai_client_alpaca_eval.py rename to vllm-tt-metal-llama3/src/example_openai_client_alpaca_eval.py diff --git a/vllm-tt-metal-llama3-70b/src/example_requests_client_alpaca_eval.py b/vllm-tt-metal-llama3/src/example_requests_client_alpaca_eval.py similarity index 100% rename from vllm-tt-metal-llama3-70b/src/example_requests_client_alpaca_eval.py rename to vllm-tt-metal-llama3/src/example_requests_client_alpaca_eval.py diff --git a/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py b/vllm-tt-metal-llama3/src/run_vllm_api_server.py similarity index 62% rename from vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py rename to vllm-tt-metal-llama3/src/run_vllm_api_server.py index 992874b1..dfe08930 100644 --- a/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py +++ b/vllm-tt-metal-llama3/src/run_vllm_api_server.py @@ -12,11 +12,14 @@ from utils.logging_utils import set_vllm_logging_config -# importing from tt-metal install path -from models.demos.t3000.llama2_70b.tt.llama_generation import TtLlamaModelForGeneration +# Import and register models from tt-metal +from models.demos.t3000.llama2_70b.tt.generator_vllm import TtLlamaForCausalLM +from models.demos.llama3.tt.generator_vllm import TtMllamaForConditionalGeneration -# register the model -ModelRegistry.register_model("TTLlamaForCausalLM", TtLlamaModelForGeneration) +ModelRegistry.register_model("TTLlamaForCausalLM", TtLlamaForCausalLM) 
+ModelRegistry.register_model( + "TTMllamaForConditionalGeneration", TtMllamaForConditionalGeneration +) def get_encoded_api_key(jwt_secret): @@ -27,7 +30,43 @@ def get_encoded_api_key(jwt_secret): return encoded_jwt +def get_hf_model_id(): + model = os.environ.get("HF_MODEL_REPO_ID") + if not model: + print("Must set environment variable: HF_MODEL_REPO_ID") + sys.exit() + return model + + +def model_setup(hf_model_id): + # TODO: check HF repo access with HF_TOKEN supplied + print(f"using model: {hf_model_id}") + args = { + "model": hf_model_id, + "block_size": "64", + "max_num_seqs": "32", + "max_model_len": "131072", + "max_num_batched_tokens": "131072", + "num_scheduler_steps": "10", + "max-log-len": "64", + "port": os.getenv("SERVICE_PORT", "7000"), + "download-dir": os.getenv("CACHE_DIR", None), + "api-key": get_encoded_api_key(os.getenv("JWT_SECRET", None)), + } + if hf_model_id == "meta-llama/Llama-3.2-11B-Vision-Instruct": + if os.environ.get("MESH_DEVICE") is None: + os.environ["MESH_DEVICE"] = "N300" + else: + assert os.environ["MESH_DEVICE"] in [ + "N300", + "T3K_LINE", + ], "Invalid MESH_DEVICE for multi-modal inference" + + return args + + def main(): + hf_model_id = get_hf_model_id() # set up logging config_path, log_path = set_vllm_logging_config(level="DEBUG") print(f"setting vllm logging config at: {config_path}") @@ -41,18 +80,7 @@ def main(): # timeout is 3x VLLM_RPC_TIMEOUT os.environ["VLLM_RPC_TIMEOUT"] = "200000" # 200000ms = 200s # vLLM CLI arguments - args = { - "model": "meta-llama/Llama-3.1-70B-Instruct", - "block_size": "64", - "max_num_seqs": "32", - "max_model_len": "131072", - "max_num_batched_tokens": "131072", - "num_scheduler_steps": "10", - "max-log-len": "32", - "port": os.getenv("SERVICE_PORT", "7000"), - "download-dir": os.getenv("CACHE_DIR", None), - "api-key": get_encoded_api_key(os.getenv("JWT_SECRET", None)), - } + args = model_setup(hf_model_id) for key, value in args.items(): if value is not None: sys.argv.extend(["--" + key, value]) diff --git a/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile similarity index 89% rename from vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile rename to vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile index 2184d356..4f2dce30 100644 --- a/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile +++ b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile @@ -90,18 +90,21 @@ RUN git clone https://github.com/tenstorrent/vllm.git ${vllm_dir}\ && cd ${vllm_dir} && git checkout ${TT_VLLM_COMMIT_SHA_OR_TAG} \ && /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install -e ." 
-# extra vllm dependencies -RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install compressed-tensors" +# extra vllm and model dependencies +RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ + && pip install compressed-tensors \ + && pip install -r /tt-metal/models/demos/llama3/requirements.txt" ARG APP_DIR="${HOME_DIR}/app" WORKDIR ${APP_DIR} ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR} -COPY --chown=user:user "vllm-tt-metal-llama3-70b/src" "${APP_DIR}/src" -COPY --chown=user:user "vllm-tt-metal-llama3-70b/requirements.txt" "${APP_DIR}/requirements.txt" +COPY --chown=user:user "vllm-tt-metal-llama3/src" "${APP_DIR}/src" +COPY --chown=user:user "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt" COPY --chown=user:user "utils" "${APP_DIR}/utils" COPY --chown=user:user "benchmarking" "${APP_DIR}/benchmarking" COPY --chown=user:user "evals" "${APP_DIR}/evals" COPY --chown=user:user "tests" "${APP_DIR}/tests" +COPY --chown=user:user "locust" "${APP_DIR}/locust" RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ && pip install --default-timeout=240 --no-cache-dir -r requirements.txt" diff --git a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile new file mode 100644 index 00000000..2e87a84c --- /dev/null +++ b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# default base image, override with --build-arg TT_METAL_DOCKERFILE_VERSION= +ARG TT_METAL_DOCKERFILE_VERSION + +FROM local/tt-metal/tt-metalium/ubuntu-22.04-amd64:$TT_METAL_DOCKERFILE_VERSION + +# Build stage +LABEL maintainer="Tom Stesco " +# connect Github repo with package +LABEL org.opencontainers.image.source=https://github.com/tenstorrent/tt-inference-server + +ARG DEBIAN_FRONTEND=noninteractive +# default commit sha, override with --build-arg TT_METAL_COMMIT_SHA_OR_TAG= +ARG TT_METAL_COMMIT_SHA_OR_TAG +ARG TT_VLLM_COMMIT_SHA_OR_TAG=dev + +# make build commit SHA available in the image for reference and debugging +ENV TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} +ENV SHELL=/bin/bash +ENV TZ=America/Los_Angeles +# tt-metal build vars +ENV ARCH_NAME=wormhole_b0 +ENV TT_METAL_HOME=/tt-metal +ENV CONFIG=Release +ENV TT_METAL_ENV=dev +ENV LOGURU_LEVEL=INFO +# derived vars +ENV PYTHONPATH=${TT_METAL_HOME} +# note: PYTHON_ENV_DIR is used by create_venv.sh +ENV PYTHON_ENV_DIR=${TT_METAL_HOME}/python_env +ENV LD_LIBRARY_PATH=${TT_METAL_HOME}/build/lib + +# extra system deps +RUN apt-get update && apt-get install -y \ + libsndfile1 \ + wget \ + nano \ + acl \ + jq \ + vim \ + # user deps + htop \ + screen \ + tmux \ + unzip \ + zip \ + curl \ + iputils-ping \ + rsync \ + && rm -rf /var/lib/apt/lists/* + +# build tt-metal +RUN git clone https://github.com/tenstorrent-metal/tt-metal.git ${TT_METAL_HOME} \ + && cd ${TT_METAL_HOME} \ + && git checkout ${TT_METAL_COMMIT_SHA_OR_TAG} \ + && git submodule update --init --recursive \ + && git submodule foreach 'git lfs fetch --all && git lfs pull' \ + && bash ./build_metal.sh \ + && bash ./create_venv.sh + +# user setup +ARG HOME_DIR=/home/user +RUN useradd -u 1000 -s /bin/bash -d ${HOME_DIR} user \ + && mkdir -p ${HOME_DIR} \ + && chown -R user:user ${HOME_DIR} \ + && chown -R user:user ${TT_METAL_HOME} + +USER user + +# tt-metal python env default +RUN echo "source ${PYTHON_ENV_DIR}/bin/activate" >> ${HOME_DIR}/.bashrc + +# 
install tt-smi +RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ + && pip3 install --upgrade pip \ + && pip3 install git+https://github.com/tenstorrent/tt-smi" + +# runtime required for tt-metal on WH +ENV WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml + +WORKDIR ${HOME_DIR} +# vllm install, see: https://github.com/tenstorrent/vllm/blob/dev/tt_metal/README.md +ENV vllm_dir=${HOME_DIR}/vllm +ENV PYTHONPATH=${PYTHONPATH}:${vllm_dir} +ENV VLLM_TARGET_DEVICE="tt" +RUN git clone https://github.com/tenstorrent/vllm.git ${vllm_dir}\ + && cd ${vllm_dir} && git checkout ${TT_VLLM_COMMIT_SHA_OR_TAG} \ + && /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install -e ." + +# extra vllm and model dependencies +RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ + && pip install compressed-tensors \ + && pip install -r /tt-metal/models/demos/llama3/requirements.txt" + +ARG APP_DIR="${HOME_DIR}/app" +WORKDIR ${APP_DIR} +ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR} +COPY --chown=user:user "vllm-tt-metal-llama3/src" "${APP_DIR}/src" +COPY --chown=user:user "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt" +COPY --chown=user:user "utils" "${APP_DIR}/utils" +COPY --chown=user:user "benchmarking" "${APP_DIR}/benchmarking" +COPY --chown=user:user "evals" "${APP_DIR}/evals" +COPY --chown=user:user "tests" "${APP_DIR}/tests" +COPY --chown=user:user "locust" "${APP_DIR}/locust" +RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ +&& pip install --default-timeout=240 --no-cache-dir -r requirements.txt" + +WORKDIR "${APP_DIR}/src" +CMD ["/bin/bash", "-c", "source ${PYTHON_ENV_DIR}/bin/activate && python run_vllm_api_server.py"] From d850b73e90e6ea2c3f3fa399c1382d0aa425bd3b Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 14 Jan 2025 19:17:20 -0500 Subject: [PATCH 2/3] use vllm.llama3.src.shared.Dockerfile for shared build steps for ubuntu 22.04 and 20.04 Dockerfiles --- .../vllm.llama3.src.shared.Dockerfile | 107 +++++++++++++++++ ...m.llama3.src.ubuntu-20.04-amd64.Dockerfile | 106 +---------------- ...m.llama3.src.ubuntu-22.04-amd64.Dockerfile | 109 +----------------- 3 files changed, 115 insertions(+), 207 deletions(-) create mode 100644 vllm-tt-metal-llama3/vllm.llama3.src.shared.Dockerfile diff --git a/vllm-tt-metal-llama3/vllm.llama3.src.shared.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.shared.Dockerfile new file mode 100644 index 00000000..c9532dcc --- /dev/null +++ b/vllm-tt-metal-llama3/vllm.llama3.src.shared.Dockerfile @@ -0,0 +1,107 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# shared build stage, FROM is set by the OS specific Dockerfiles +LABEL maintainer="Tom Stesco " +# connect Github repo with package +LABEL org.opencontainers.image.source=https://github.com/tenstorrent/tt-inference-server + +ARG DEBIAN_FRONTEND=noninteractive +# default commit sha, override with --build-arg TT_METAL_COMMIT_SHA_OR_TAG= +ARG TT_METAL_COMMIT_SHA_OR_TAG +ARG TT_VLLM_COMMIT_SHA_OR_TAG=dev + +# make build commit SHA available in the image for reference and debugging +ENV TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} +ENV SHELL=/bin/bash +ENV TZ=America/Los_Angeles +# tt-metal build vars +ENV ARCH_NAME=wormhole_b0 +ENV TT_METAL_HOME=/tt-metal +ENV CONFIG=Release +ENV TT_METAL_ENV=dev +ENV LOGURU_LEVEL=INFO +# derived vars +ENV PYTHONPATH=${TT_METAL_HOME} +# note: PYTHON_ENV_DIR is used by create_venv.sh +ENV PYTHON_ENV_DIR=${TT_METAL_HOME}/python_env +ENV 
LD_LIBRARY_PATH=${TT_METAL_HOME}/build/lib + +# extra system deps +RUN apt-get update && apt-get install -y \ + libsndfile1 \ + wget \ + nano \ + acl \ + jq \ + vim \ + # user deps + htop \ + screen \ + tmux \ + unzip \ + zip \ + curl \ + iputils-ping \ + rsync \ + && rm -rf /var/lib/apt/lists/* + +# build tt-metal +RUN git clone https://github.com/tenstorrent-metal/tt-metal.git ${TT_METAL_HOME} \ + && cd ${TT_METAL_HOME} \ + && git checkout ${TT_METAL_COMMIT_SHA_OR_TAG} \ + && git submodule update --init --recursive \ + && git submodule foreach 'git lfs fetch --all && git lfs pull' \ + && bash ./build_metal.sh \ + && bash ./create_venv.sh + +# user setup +ARG HOME_DIR=/home/user +RUN useradd -u 1000 -s /bin/bash -d ${HOME_DIR} user \ + && mkdir -p ${HOME_DIR} \ + && chown -R user:user ${HOME_DIR} \ + && chown -R user:user ${TT_METAL_HOME} + +USER user + +# tt-metal python env default +RUN echo "source ${PYTHON_ENV_DIR}/bin/activate" >> ${HOME_DIR}/.bashrc + +# install tt-smi +RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ + && pip3 install --upgrade pip \ + && pip3 install git+https://github.com/tenstorrent/tt-smi" + +# runtime required for tt-metal on WH +ENV WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml + +WORKDIR ${HOME_DIR} +# vllm install, see: https://github.com/tenstorrent/vllm/blob/dev/tt_metal/README.md +ENV vllm_dir=${HOME_DIR}/vllm +ENV PYTHONPATH=${PYTHONPATH}:${vllm_dir} +ENV VLLM_TARGET_DEVICE="tt" +RUN git clone https://github.com/tenstorrent/vllm.git ${vllm_dir}\ + && cd ${vllm_dir} && git checkout ${TT_VLLM_COMMIT_SHA_OR_TAG} \ + && /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install -e ." + +# extra vllm and model dependencies +RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ + && pip install compressed-tensors \ + && pip install -r /tt-metal/models/demos/llama3/requirements.txt" + +ARG APP_DIR="${HOME_DIR}/app" +WORKDIR ${APP_DIR} +ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR} +COPY --chown=user:user "vllm-tt-metal-llama3/src" "${APP_DIR}/src" +COPY --chown=user:user "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt" +COPY --chown=user:user "utils" "${APP_DIR}/utils" +COPY --chown=user:user "benchmarking" "${APP_DIR}/benchmarking" +COPY --chown=user:user "evals" "${APP_DIR}/evals" +COPY --chown=user:user "tests" "${APP_DIR}/tests" +COPY --chown=user:user "locust" "${APP_DIR}/locust" +RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ +&& pip install --default-timeout=240 --no-cache-dir -r requirements.txt" + +WORKDIR "${APP_DIR}/src" +CMD ["/bin/bash", "-c", "source ${PYTHON_ENV_DIR}/bin/activate && python run_vllm_api_server.py"] diff --git a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile index 4f2dce30..49e0fc43 100644 --- a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile +++ b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile @@ -7,106 +7,6 @@ ARG TT_METAL_DOCKERFILE_VERSION=v0.53.0-rc34 FROM ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:$TT_METAL_DOCKERFILE_VERSION-dev -# Build stage -LABEL maintainer="Tom Stesco " -# connect Github repo with package -LABEL org.opencontainers.image.source=https://github.com/tenstorrent/tt-inference-server - -ARG DEBIAN_FRONTEND=noninteractive -# default commit sha, override with --build-arg TT_METAL_COMMIT_SHA_OR_TAG= -ARG TT_METAL_COMMIT_SHA_OR_TAG -ARG TT_VLLM_COMMIT_SHA_OR_TAG=dev - -# make build commit SHA available in the image for 
reference and debugging -ENV TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} -ENV SHELL=/bin/bash -ENV TZ=America/Los_Angeles -# tt-metal build vars -ENV ARCH_NAME=wormhole_b0 -ENV TT_METAL_HOME=/tt-metal -ENV CONFIG=Release -ENV TT_METAL_ENV=dev -ENV LOGURU_LEVEL=INFO -# derived vars -ENV PYTHONPATH=${TT_METAL_HOME} -# note: PYTHON_ENV_DIR is used by create_venv.sh -ENV PYTHON_ENV_DIR=${TT_METAL_HOME}/python_env -ENV LD_LIBRARY_PATH=${TT_METAL_HOME}/build/lib - -# extra system deps -RUN apt-get update && apt-get install -y \ - libsndfile1 \ - wget \ - nano \ - acl \ - jq \ - vim \ - # user deps - htop \ - screen \ - tmux \ - unzip \ - zip \ - curl \ - iputils-ping \ - rsync \ - && rm -rf /var/lib/apt/lists/* - -# build tt-metal -RUN git clone https://github.com/tenstorrent-metal/tt-metal.git ${TT_METAL_HOME} \ - && cd ${TT_METAL_HOME} \ - && git checkout ${TT_METAL_COMMIT_SHA_OR_TAG} \ - && git submodule update --init --recursive \ - && git submodule foreach 'git lfs fetch --all && git lfs pull' \ - && bash ./build_metal.sh \ - && bash ./create_venv.sh - -# user setup -ARG HOME_DIR=/home/user -RUN useradd -u 1000 -s /bin/bash -d ${HOME_DIR} user \ - && mkdir -p ${HOME_DIR} \ - && chown -R user:user ${HOME_DIR} \ - && chown -R user:user ${TT_METAL_HOME} - -USER user - -# tt-metal python env default -RUN echo "source ${PYTHON_ENV_DIR}/bin/activate" >> ${HOME_DIR}/.bashrc - -# install tt-smi -RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ - && pip3 install --upgrade pip \ - && pip3 install git+https://github.com/tenstorrent/tt-smi" - -# runtime required for tt-metal on WH -ENV WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml - -WORKDIR ${HOME_DIR} -# vllm install, see: https://github.com/tenstorrent/vllm/blob/dev/tt_metal/README.md -ENV vllm_dir=${HOME_DIR}/vllm -ENV PYTHONPATH=${PYTHONPATH}:${vllm_dir} -ENV VLLM_TARGET_DEVICE="tt" -RUN git clone https://github.com/tenstorrent/vllm.git ${vllm_dir}\ - && cd ${vllm_dir} && git checkout ${TT_VLLM_COMMIT_SHA_OR_TAG} \ - && /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install -e ." - -# extra vllm and model dependencies -RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ - && pip install compressed-tensors \ - && pip install -r /tt-metal/models/demos/llama3/requirements.txt" - -ARG APP_DIR="${HOME_DIR}/app" -WORKDIR ${APP_DIR} -ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR} -COPY --chown=user:user "vllm-tt-metal-llama3/src" "${APP_DIR}/src" -COPY --chown=user:user "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt" -COPY --chown=user:user "utils" "${APP_DIR}/utils" -COPY --chown=user:user "benchmarking" "${APP_DIR}/benchmarking" -COPY --chown=user:user "evals" "${APP_DIR}/evals" -COPY --chown=user:user "tests" "${APP_DIR}/tests" -COPY --chown=user:user "locust" "${APP_DIR}/locust" -RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ -&& pip install --default-timeout=240 --no-cache-dir -r requirements.txt" - -WORKDIR "${APP_DIR}/src" -CMD ["/bin/bash", "-c", "source ${PYTHON_ENV_DIR}/bin/activate && python run_vllm_api_server.py"] +# include shared instructions +COPY vllm.llama3.src.shared.Dockerfile /vllm.llama3.src.shared.Dockerfile +RUN cat /vllm.llama3.src.shared.Dockerfile | docker build - < . 
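For reference, a hedged sketch of building the Ubuntu 22.04 variant, mirroring the 20.04 recipe from `docs/development.md` above. This assumes the tt-metal 22.04 base image has already been built and tagged locally as `local/tt-metal/tt-metalium/ubuntu-22.04-amd64:<tag>` (it is not published); the tag and commit values below are illustrative placeholders, not values mandated by this patch.

```bash
# Sketch only: mirrors the 20.04 build example with OS_VERSION swapped to 22.04.
cd tt-inference-server
export TT_METAL_DOCKERFILE_VERSION=v0.54.0-rc2   # tag used for the locally built 22.04 tt-metal image (placeholder)
export TT_METAL_COMMIT_SHA_OR_TAG=v0.54.0-rc2
export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12}
export TT_VLLM_COMMIT_SHA_OR_TAG=953161188c50f10da95a88ab305e23977ebd3750
export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12}
export OS_VERSION=ubuntu-22.04-amd64
export IMAGE_VERSION=v0.0.1
docker build \
  -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm-${OS_VERSION}:${IMAGE_VERSION}-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \
  --build-arg TT_METAL_DOCKERFILE_VERSION=${TT_METAL_DOCKERFILE_VERSION} \
  --build-arg TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} \
  --build-arg TT_VLLM_COMMIT_SHA_OR_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG} \
  . -f vllm-tt-metal-llama3/vllm.llama3.src.${OS_VERSION}.Dockerfile
```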
diff --git a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile
index 2e87a84c..dfcdce9d 100644
--- a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile
+++ b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile
@@ -2,111 +2,12 @@
 #
 # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 
-# default base image, override with --build-arg TT_METAL_DOCKERFILE_VERSION=
+# set with --build-arg TT_METAL_DOCKERFILE_VERSION=
+# NOTE: tt-metal Ubuntu 22.04 Dockerfile must be built locally until release images are published
 ARG TT_METAL_DOCKERFILE_VERSION
 
 FROM local/tt-metal/tt-metalium/ubuntu-22.04-amd64:$TT_METAL_DOCKERFILE_VERSION
 
-# Build stage
-LABEL maintainer="Tom Stesco "
-# connect Github repo with package
-LABEL org.opencontainers.image.source=https://github.com/tenstorrent/tt-inference-server
-
-ARG DEBIAN_FRONTEND=noninteractive
-# default commit sha, override with --build-arg TT_METAL_COMMIT_SHA_OR_TAG=
-ARG TT_METAL_COMMIT_SHA_OR_TAG
-ARG TT_VLLM_COMMIT_SHA_OR_TAG=dev
-
-# make build commit SHA available in the image for reference and debugging
-ENV TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG}
-ENV SHELL=/bin/bash
-ENV TZ=America/Los_Angeles
-# tt-metal build vars
-ENV ARCH_NAME=wormhole_b0
-ENV TT_METAL_HOME=/tt-metal
-ENV CONFIG=Release
-ENV TT_METAL_ENV=dev
-ENV LOGURU_LEVEL=INFO
-# derived vars
-ENV PYTHONPATH=${TT_METAL_HOME}
-# note: PYTHON_ENV_DIR is used by create_venv.sh
-ENV PYTHON_ENV_DIR=${TT_METAL_HOME}/python_env
-ENV LD_LIBRARY_PATH=${TT_METAL_HOME}/build/lib
-
-# extra system deps
-RUN apt-get update && apt-get install -y \
-    libsndfile1 \
-    wget \
-    nano \
-    acl \
-    jq \
-    vim \
-    # user deps
-    htop \
-    screen \
-    tmux \
-    unzip \
-    zip \
-    curl \
-    iputils-ping \
-    rsync \
-    && rm -rf /var/lib/apt/lists/*
-
-# build tt-metal
-RUN git clone https://github.com/tenstorrent-metal/tt-metal.git ${TT_METAL_HOME} \
-    && cd ${TT_METAL_HOME} \
-    && git checkout ${TT_METAL_COMMIT_SHA_OR_TAG} \
-    && git submodule update --init --recursive \
-    && git submodule foreach 'git lfs fetch --all && git lfs pull' \
-    && bash ./build_metal.sh \
-    && bash ./create_venv.sh
-
-# user setup
-ARG HOME_DIR=/home/user
-RUN useradd -u 1000 -s /bin/bash -d ${HOME_DIR} user \
-    && mkdir -p ${HOME_DIR} \
-    && chown -R user:user ${HOME_DIR} \
-    && chown -R user:user ${TT_METAL_HOME}
-
-USER user
-
-# tt-metal python env default
-RUN echo "source ${PYTHON_ENV_DIR}/bin/activate" >> ${HOME_DIR}/.bashrc
-
-# install tt-smi
-RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \
-    && pip3 install --upgrade pip \
-    && pip3 install git+https://github.com/tenstorrent/tt-smi"
-
-# runtime required for tt-metal on WH
-ENV WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml
-
-WORKDIR ${HOME_DIR}
-# vllm install, see: https://github.com/tenstorrent/vllm/blob/dev/tt_metal/README.md
-ENV vllm_dir=${HOME_DIR}/vllm
-ENV PYTHONPATH=${PYTHONPATH}:${vllm_dir}
-ENV VLLM_TARGET_DEVICE="tt"
-RUN git clone https://github.com/tenstorrent/vllm.git ${vllm_dir}\
-    && cd ${vllm_dir} && git checkout ${TT_VLLM_COMMIT_SHA_OR_TAG} \
-    && /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install -e ."
-
-# extra vllm and model dependencies
-RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \
-    && pip install compressed-tensors \
-    && pip install -r /tt-metal/models/demos/llama3/requirements.txt"
-
-ARG APP_DIR="${HOME_DIR}/app"
-WORKDIR ${APP_DIR}
-ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR}
-COPY --chown=user:user "vllm-tt-metal-llama3/src" "${APP_DIR}/src"
-COPY --chown=user:user "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt"
-COPY --chown=user:user "utils" "${APP_DIR}/utils"
-COPY --chown=user:user "benchmarking" "${APP_DIR}/benchmarking"
-COPY --chown=user:user "evals" "${APP_DIR}/evals"
-COPY --chown=user:user "tests" "${APP_DIR}/tests"
-COPY --chown=user:user "locust" "${APP_DIR}/locust"
-RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \
-&& pip install --default-timeout=240 --no-cache-dir -r requirements.txt"
-
-WORKDIR "${APP_DIR}/src"
-CMD ["/bin/bash", "-c", "source ${PYTHON_ENV_DIR}/bin/activate && python run_vllm_api_server.py"]
+# include shared instructions
+COPY vllm.llama3.src.shared.Dockerfile /vllm.llama3.src.shared.Dockerfile
+RUN cat /vllm.llama3.src.shared.Dockerfile | docker build - < .
From 11141de96438fc2d61823a6bbf5ace4a0cc955d9 Mon Sep 17 00:00:00 2001
From: Tom Stesco 
Date: Wed, 15 Jan 2025 00:58:43 +0000
Subject: [PATCH 3/3] use full url TT_METAL_DOCKERFILE_URL to allow for 1
 Dockerfile for multiple base images

---
 tests/README.md                               |  4 ++--
 tests/mock.vllm.openai.api.dockerfile         |  7 +++---
 vllm-tt-metal-llama3/docs/development.md      | 23 +++++++++++++++----
 ....Dockerfile => vllm.llama3.src.Dockerfile} |  6 +++++
 ...m.llama3.src.ubuntu-20.04-amd64.Dockerfile | 12 ----------
 ...m.llama3.src.ubuntu-22.04-amd64.Dockerfile | 13 -----------
 6 files changed, 31 insertions(+), 34 deletions(-)
 rename vllm-tt-metal-llama3/{vllm.llama3.src.shared.Dockerfile => vllm.llama3.src.Dockerfile} (91%)
 delete mode 100644 vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile
 delete mode 100644 vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile

diff --git a/tests/README.md b/tests/README.md
index 0d0ac168..d4240bd9 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -35,14 +35,14 @@ WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml python /home/user/tests/mock_
 # set build context to repo root
 cd tt-inference-server
 # build image
-export TT_METAL_DOCKERFILE_VERSION=v0.53.0-rc34
+export TT_METAL_DOCKERFILE_URL=ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:v0.53.0-rc34-dev
 export TT_METAL_COMMIT_SHA_OR_TAG=385904186f81fed15d5c87c162221d4f34387164
 export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12}
 export TT_VLLM_COMMIT_SHA_OR_TAG=384f1790c3be16e1d1b10de07252be2e66d00935
 export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12}
 docker build \
   -t ghcr.io/tenstorrent/tt-inference-server/mock.vllm.openai.api:v0.0.1-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \
-  --build-arg TT_METAL_DOCKERFILE_VERSION=${TT_METAL_DOCKERFILE_VERSION} \
+  --build-arg TT_METAL_DOCKERFILE_URL=${TT_METAL_DOCKERFILE_URL} \
   --build-arg TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} \
   --build-arg TT_VLLM_COMMIT_SHA_OR_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG} \
   . -f tests/mock.vllm.openai.api.dockerfile
diff --git a/tests/mock.vllm.openai.api.dockerfile b/tests/mock.vllm.openai.api.dockerfile
index 9b4e94e6..8cf9f718 100644
--- a/tests/mock.vllm.openai.api.dockerfile
+++ b/tests/mock.vllm.openai.api.dockerfile
@@ -2,10 +2,11 @@
 #
 # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 
-# default base image, override with --build-arg TT_METAL_DOCKERFILE_VERSION=
-ARG TT_METAL_DOCKERFILE_VERSION=v0.53.0-rc34
+# default base image, override with --build-arg TT_METAL_DOCKERFILE_URL=
+# NOTE: tt-metal Ubuntu 22.04 Dockerfile must be built locally until release images are published
+ARG TT_METAL_DOCKERFILE_URL=ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:v0.53.0-rc34-dev
 
-FROM ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:$TT_METAL_DOCKERFILE_VERSION-dev
+FROM ${TT_METAL_DOCKERFILE_URL}
 
 # Build stage
 LABEL maintainer="Tom Stesco "
diff --git a/vllm-tt-metal-llama3/docs/development.md b/vllm-tt-metal-llama3/docs/development.md
index f7dd6c15..3a1b9a5e 100644
--- a/vllm-tt-metal-llama3/docs/development.md
+++ b/vllm-tt-metal-llama3/docs/development.md
@@ -13,21 +13,36 @@ When building, update the commit SHA and get correct SHA from model developers o
 # set build context to repo root
 cd tt-inference-server
 # build image
-export TT_METAL_DOCKERFILE_VERSION=v0.53.0
+export TT_METAL_DOCKERFILE_URL=ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:v0.53.0-rc34-dev
 export TT_METAL_COMMIT_SHA_OR_TAG=v0.54.0-rc2
 export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12}
 export TT_VLLM_COMMIT_SHA_OR_TAG=953161188c50f10da95a88ab305e23977ebd3750
 export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12}
-export OS_VERSION=ubuntu-20.04-amd64
 export IMAGE_VERSION=v0.0.1
 docker build \
   -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm-${OS_VERSION}:${IMAGE_VERSION}-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \
-  --build-arg TT_METAL_DOCKERFILE_VERSION=${TT_METAL_DOCKERFILE_VERSION} \
+  --build-arg TT_METAL_DOCKERFILE_URL=${TT_METAL_DOCKERFILE_URL} \
   --build-arg TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} \
   --build-arg TT_VLLM_COMMIT_SHA_OR_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG} \
-  . -f vllm-tt-metal-llama3/vllm.llama3.src.${OS_VERSION}.Dockerfile
+  . -f vllm-tt-metal-llama3/vllm.llama3.src.Dockerfile
 ```
+
+### Ubuntu 22.04 base image
+
+In the tt-metal repo there is an Ubuntu 22.04 Dockerfile: https://github.com/tenstorrent/tt-metal/blob/main/dockerfile/ubuntu-22.04-amd64.Dockerfile
+This Dockerfile installs the Python dependencies for Ubuntu 22.04 running Python 3.10: https://github.com/tenstorrent/tt-metal/blob/main/scripts/docker/requirements-22.04.txt
+
+The Ubuntu 22.04 images are not yet published to GHCR the way the Ubuntu 20.04 images are (https://github.com/tenstorrent/tt-metal/pkgs/container/tt-metal%2Ftt-metalium%2Fubuntu-20.04-amd64).
+
+You can build a local tt-metal Ubuntu 22.04 base image:
+```bash
+git clone --depth 1 --branch ${TT_METAL_COMMIT_SHA_OR_TAG} https://github.com/tenstorrent/tt-metal.git
+cd tt-metal
+docker build -t local/tt-metal/tt-metalium/ubuntu-22.04-amd64:latest -f dockerfile/ubuntu-22.04-amd64.Dockerfile .
+```
+
+You can then repeat the steps above to build with, e.g. `TT_METAL_DOCKERFILE_URL=local/tt-metal/tt-metalium/ubuntu-22.04-amd64:latest`.
+
 ### push image (only for admin deployment to GHCR)
 ```bash
 docker push ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm-${OS_VERSION}:${IMAGE_VERSION}-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG}
diff --git a/vllm-tt-metal-llama3/vllm.llama3.src.shared.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.Dockerfile
similarity index 91%
rename from vllm-tt-metal-llama3/vllm.llama3.src.shared.Dockerfile
rename to vllm-tt-metal-llama3/vllm.llama3.src.Dockerfile
index c9532dcc..aa556360 100644
--- a/vllm-tt-metal-llama3/vllm.llama3.src.shared.Dockerfile
+++ b/vllm-tt-metal-llama3/vllm.llama3.src.Dockerfile
@@ -2,6 +2,12 @@
 #
 # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 
+# default base image, override with --build-arg TT_METAL_DOCKERFILE_URL=
+# NOTE: tt-metal Ubuntu 22.04 Dockerfile must be built locally until release images are published
+ARG TT_METAL_DOCKERFILE_URL=ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:v0.53.0-rc34-dev
+
+FROM ${TT_METAL_DOCKERFILE_URL}
+
 # shared build stage, FROM is set by the OS specific Dockerfiles
 LABEL maintainer="Tom Stesco "
 # connect Github repo with package
diff --git a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile
deleted file mode 100644
index 49e0fc43..00000000
--- a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-20.04-amd64.Dockerfile
+++ /dev/null
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
-
-# default base image, override with --build-arg TT_METAL_DOCKERFILE_VERSION=
-ARG TT_METAL_DOCKERFILE_VERSION=v0.53.0-rc34
-
-FROM ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:$TT_METAL_DOCKERFILE_VERSION-dev
-
-# include shared instructions
-COPY vllm.llama3.src.shared.Dockerfile /vllm.llama3.src.shared.Dockerfile
-RUN cat /vllm.llama3.src.shared.Dockerfile | docker build - < .
diff --git a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile b/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile
deleted file mode 100644
index dfcdce9d..00000000
--- a/vllm-tt-metal-llama3/vllm.llama3.src.ubuntu-22.04-amd64.Dockerfile
+++ /dev/null
@@ -1,13 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
-
-# set with --build-arg TT_METAL_DOCKERFILE_VERSION=
-# NOTE: tt-metal Ubuntu 22.04 Dockerfile must be built locally until release images are published
-ARG TT_METAL_DOCKERFILE_VERSION
-
-FROM local/tt-metal/tt-metalium/ubuntu-22.04-amd64:$TT_METAL_DOCKERFILE_VERSION
-
-# include shared instructions
-COPY vllm.llama3.src.shared.Dockerfile /vllm.llama3.src.shared.Dockerfile
-RUN cat /vllm.llama3.src.shared.Dockerfile | docker build - < .
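For reference, a minimal end-to-end sketch combining the locally built Ubuntu 22.04 base image with the development.md build steps above. The OS_VERSION value here is an assumption (the image tag in the docs still interpolates ${OS_VERSION} even though its export was dropped); adjust the tag as needed:

```bash
# Sketch only: assumes the local base image was tagged as shown in development.md above.
export TT_METAL_DOCKERFILE_URL=local/tt-metal/tt-metalium/ubuntu-22.04-amd64:latest
export TT_METAL_COMMIT_SHA_OR_TAG=v0.54.0-rc2
export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12}
export TT_VLLM_COMMIT_SHA_OR_TAG=953161188c50f10da95a88ab305e23977ebd3750
export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12}
export OS_VERSION=ubuntu-22.04-amd64  # assumption: kept only for the image tag below
export IMAGE_VERSION=v0.0.1
docker build \
  -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm-${OS_VERSION}:${IMAGE_VERSION}-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \
  --build-arg TT_METAL_DOCKERFILE_URL=${TT_METAL_DOCKERFILE_URL} \
  --build-arg TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} \
  --build-arg TT_VLLM_COMMIT_SHA_OR_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG} \
  . -f vllm-tt-metal-llama3/vllm.llama3.src.Dockerfile
```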