diff --git a/benchmarking/benchmark_summary.py b/benchmarking/benchmark_summary.py index e580046..76679b0 100644 --- a/benchmarking/benchmark_summary.py +++ b/benchmarking/benchmark_summary.py @@ -45,13 +45,14 @@ def parse_args(): def extract_params_from_filename(filename: str) -> Dict[str, Any]: pattern = r""" - benchmark_ + .*?benchmark_ # Any prefix before benchmark_ (?P\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp (_(?PN150|N300|T3K_LINE|T3K_RING|TG))? # MESH_DEVICE _isl-(?P\d+) # Input sequence length _osl-(?P\d+) # Output sequence length - _bsz-(?P\d+) # Batch size - _n-(?P\d+) # Number of requests + _maxcon-(?P\d+) # Max concurrency + _n-(?P\d+) # Number of requests + \.json$ """ match = re.search(pattern, filename, re.VERBOSE) if not match: @@ -67,7 +68,7 @@ def extract_params_from_filename(filename: str) -> Dict[str, Any]: "mesh_device": match.group("mesh_device"), "input_sequence_length": int(match.group("isl")), "output_sequence_length": int(match.group("osl")), - "batch_size": int(match.group("bsz")), + "batch_size": int(match.group("maxcon")), "num_requests": int(match.group("n")), } diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index b5e9edc..27ba806 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -102,8 +102,15 @@ def run_sequence_length_test( tokenizer = AutoTokenizer.from_pretrained(model) # pre-capture traces so benchmark does not include 1st run trace capture time - # TODO: add support for image input to capture_traces - prompt_client.capture_traces(context_lens=[(input_len, output_len)]) + image_resolutions = [] + if images: + image_resolutions = [ + (prompt_config.image_width, prompt_config.image_height) + ] + + prompt_client.capture_traces( + context_lens=[(input_len, output_len)], image_resolutions=image_resolutions + ) # Process batches try: responses = batch_processor.process_batch( diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 338755b..096ae6c 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -126,8 +126,10 @@ def main(): / f"vllm_online_benchmark_{run_timestamp}_{mesh_device}_isl-{isl}_osl-{osl}_maxcon-{max_concurrent}_n-{num_prompts}.json" ) logger.info(f"\nRunning benchmark {i}/{len(combinations)}") + vllm_dir = os.environ.get("vllm_dir") + assert vllm_dir is not None, "vllm_dir must be set." run_benchmark( - benchmark_script="/home/user/vllm/benchmarks/benchmark_serving.py", + benchmark_script=f"{vllm_dir}/benchmarks/benchmark_serving.py", params=params, model=env_config.vllm_model, port=env_config.service_port, diff --git a/evals/README.md b/evals/README.md index 8ad95bb..9e7d407 100644 --- a/evals/README.md +++ b/evals/README.md @@ -13,24 +13,9 @@ For instructions on building the Docker image see: [vllm-tt-metal-llama3/docs/de ## Step 2: Run Docker container for LM evals development -note: this requires running `setup.sh` to set up the weights for a particular model, in this example `llama-3.1-70b-instruct`. 
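For reference, a minimal sketch of how the updated `extract_params_from_filename()` pattern in `benchmarking/benchmark_summary.py` parses a result file produced by `vllm_online_benchmark.py`. The group names (`timestamp`, `mesh_device`, `isl`, `osl`, `maxcon`, `n`) follow the `match.group(...)` lookups in that function; the sample filename and its values are hypothetical.

```python
import re
from typing import Any, Dict

# Pattern mirroring extract_params_from_filename(); group names follow the
# match.group(...) lookups in that function.
PATTERN = re.compile(
    r"""
    .*?benchmark_                                        # any prefix before benchmark_
    (?P<timestamp>\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2})   # timestamp
    (_(?P<mesh_device>N150|N300|T3K_LINE|T3K_RING|TG))?  # MESH_DEVICE
    _isl-(?P<isl>\d+)                                    # input sequence length
    _osl-(?P<osl>\d+)                                    # output sequence length
    _maxcon-(?P<maxcon>\d+)                              # max concurrency
    _n-(?P<n>\d+)                                        # number of requests
    \.json$
    """,
    re.VERBOSE,
)

# Hypothetical filename in the format written by vllm_online_benchmark.py.
filename = "vllm_online_benchmark_2025-01-15_12-30-00_N300_isl-128_osl-128_maxcon-32_n-320.json"
match = PATTERN.search(filename)
assert match is not None
params: Dict[str, Any] = {
    "timestamp": match.group("timestamp"),
    "mesh_device": match.group("mesh_device"),
    "input_sequence_length": int(match.group("isl")),
    "output_sequence_length": int(match.group("osl")),
    "batch_size": int(match.group("maxcon")),
    "num_requests": int(match.group("n")),
}
print(params)
```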
+Follow run guide: [vllm-tt-metal-llama3/README.md](../vllm-tt-metal-llama3/README.md) -```bash -cd tt-inference-server -export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b-instructv0.0.1/ -docker run \ - --rm \ - -it \ - --env-file persistent_volume/model_envs/llama-3.1-70b-instruct.env \ - --cap-add ALL \ - --device /dev/tenstorrent:/dev/tenstorrent \ - --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ - --volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/user/cache_root:rw \ - --shm-size 32G \ - ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:${IMAGE_VERSION}-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} -``` - -The default Docker image command will start the vLLM server. +note: this requires running `setup.sh` to set up the weights for a particular model, in this example `llama-3.1-70b-instruct`. ## Step 3: Inside container set up llama-recipes LM evalulation harness templates @@ -44,7 +29,7 @@ To access Meta Llama 3.1 evals, you must: #### Hugging Face authentication - option 1: HF_TOKEN (if not already passed into Docker container) ```bash # set up HF Token if not already set up in .env, needed for datasets -echo "HF_TOKEN=hf_" >> vllm-tt-metal-llama3/.env +echo "HF_TOKEN=hf_" ``` #### Hugging Face authentication - option 2: huggingface_hub login diff --git a/evals/run_evals.sh b/evals/run_evals.sh index 640d6df..6470267 100644 --- a/evals/run_evals.sh +++ b/evals/run_evals.sh @@ -45,7 +45,7 @@ lm_eval \ --gen_kwargs model=${HF_MODEL_REPO_ID},stop="<|eot_id|>",stream=False \ --tasks meta_gpqa \ --batch_size auto \ ---output_path /home/user/cache_root/eval_output \ +--output_path ${CACHE_ROOT}/eval_output \ --include_path ./work_dir \ --seed 42 \ --log_samples @@ -57,7 +57,7 @@ lm_eval \ --gen_kwargs model=${HF_MODEL_REPO_ID},stop="<|eot_id|>",stream=False \ --tasks meta_ifeval \ --batch_size auto \ ---output_path /home/user/cache_root/eval_output \ +--output_path ${CACHE_ROOT}/eval_output \ --include_path ./work_dir \ --seed 42 \ --log_samples diff --git a/evals/run_evals_vision.sh b/evals/run_evals_vision.sh index f4615ec..ee1b4d8 100644 --- a/evals/run_evals_vision.sh +++ b/evals/run_evals_vision.sh @@ -35,7 +35,7 @@ lm_eval \ --gen_kwargs model=${HF_MODEL_REPO_ID},stop="<|eot_id|>",stream=False \ --tasks mmmu_val \ --batch_size auto \ ---output_path /home/user/cache_root/eval_output \ +--output_path /home/container_app_user/cache_root/eval_output \ --seed 42 \ --log_samples diff --git a/setup.sh b/setup.sh index df7cf64..60e0c77 100755 --- a/setup.sh +++ b/setup.sh @@ -9,18 +9,19 @@ set -euo pipefail # Exit on error, print commands, unset variables treated as e usage() { echo "Usage: $0 " echo "Available model types:" - echo " llama-3.3-70b-instruct" - echo " llama-3.2-11b-vision-instruct" - echo " llama-3.2-3b-instruct" - echo " llama-3.2-1b-instruct" - echo " llama-3.1-70b-instruct" - echo " llama-3.1-70b" - echo " llama-3.1-8b-instruct" - echo " llama-3.1-8b" - echo " llama-3-70b-instruct" - echo " llama-3-70b" - echo " llama-3-8b-instruct" - echo " llama-3-8b" + echo " DeepSeek-R1-Distill-Llama-70B" + echo " Llama-3.3-70B-Instruct" + echo " Llama-3.2-11B-Vision-Instruct" + echo " Llama-3.2-3B-Instruct" + echo " Llama-3.2-1B-Instruct" + echo " Llama-3.1-70B-Instruct" + echo " Llama-3.1-70B" + echo " Llama-3.1-8B-Instruct" + echo " Llama-3.1-8B" + echo " Llama-3-70B-Instruct" + echo " Llama-3-70B" + echo " Llama-3-8B-Instruct" + echo " Llama-3-8B" echo exit 1 } @@ -74,6 +75,7 @@ 
get_hf_env_vars() { echo "HF_TOKEN environment variable is not set. Please set it before running the script." read -r -s -p "Enter your HF_TOKEN: " input_hf_token echo + echo "entered HF_TOKEN contains: ${#input_hf_token} characters, expected 37." if [ -z "${input_hf_token:-}" ]; then echo "⛔ HF_TOKEN cannot be empty. Please try again." exit 1 @@ -111,84 +113,104 @@ setup_model_environment() { # Set environment variables based on the model selection # note: MODEL_NAME is the directory name for the model weights case "$1" in - "llama-3.3-70b-instruct") + "DeepSeek-R1-Distill-Llama-70B") + IMPL_ID="tt-metal" + MODEL_NAME="DeepSeek-R1-Distill-Llama-70B" + HF_MODEL_REPO_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-70B" + META_MODEL_NAME="" + META_DIR_FILTER="" + REPACKED=1 + ;; + "Llama-3.3-70B-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.3-70B-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.3-70B-Instruct" META_MODEL_NAME="" META_DIR_FILTER="" REPACKED=1 ;; - "llama-3.2-11b-vision-instruct") + "Llama-3.2-11B-Vision-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.2-11B-Vision-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.2-11B-Vision-Instruct" META_MODEL_NAME="" META_DIR_FILTER="" REPACKED=0 ;; - "llama-3.2-3b-instruct") + "Llama-3.2-3B-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.2-3B-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.2-3B-Instruct" META_MODEL_NAME="" META_DIR_FILTER="" REPACKED=0 ;; - "llama-3.2-1b-instruct") + "Llama-3.2-1B-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.2-1B-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.2-1B-Instruct" META_MODEL_NAME="" META_DIR_FILTER="" REPACKED=0 ;; - "llama-3.1-70b-instruct") + "Llama-3.1-70B-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.1-70B-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct" META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" META_DIR_FILTER="llama3_1" REPACKED=1 ;; - "llama-3.1-70b") + "Llama-3.1-70B") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.1-70B" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B" META_MODEL_NAME="Meta-Llama-3.1-70B" META_DIR_FILTER="llama3_1" REPACKED=1 ;; - "llama-3.1-8b-instruct") + "Llama-3.1-8B-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.1-8B-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B-Instruct" META_MODEL_NAME="Meta-Llama-3.1-8B-Instruct" META_DIR_FILTER="llama3_1" REPACKED=0 ;; - "llama-3.1-8b") + "Llama-3.1-8B") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3.1-8B" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B" META_MODEL_NAME="Meta-Llama-3.1-8B" META_DIR_FILTER="llama3_1" REPACKED=0 ;; - "llama-3-70b-instruct") + "Llama-3-70B-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3-70B-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3-70B-Instruct" META_MODEL_NAME="Meta-Llama-3-70B-Instruct" META_DIR_FILTER="llama3" REPACKED=1 ;; - "llama-3-70b") + "Llama-3-70B") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3-70B" HF_MODEL_REPO_ID="meta-llama/Llama-3-70B" META_MODEL_NAME="Meta-Llama-3-70B" META_DIR_FILTER="llama3" REPACKED=1 ;; - "llama-3-8b-instruct") + "Llama-3-8B-Instruct") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3-8B-Instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3-8B-Instruct" META_MODEL_NAME="Meta-Llama-3-8B-Instruct" META_DIR_FILTER="llama3" REPACKED=0 ;; - "llama-3-8b") + "Llama-3-8B") + IMPL_ID="tt-metal" MODEL_NAME="Llama-3-8B" HF_MODEL_REPO_ID="meta-llama/Llama-3-8B" META_MODEL_NAME="Meta-Llama-3-8B" @@ -201,32 +223,32 @@ setup_model_environment() { exit 1 ;; esac - # Initialize OVERWRITE_ENV - OVERWRITE_ENV=false # Set default values for environment variables 
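For orientation, a hedged Python sketch of how the per-model variables set above combine with the persistent-volume logic that follows into on-disk paths; the directory root in the example is hypothetical.

```python
def model_paths(persistent_volume_root: str, model_name: str,
                impl_id: str = "tt-metal", model_version: str = "0.0.1") -> dict:
    # Mirrors setup.sh: MODEL_ID="id_${IMPL_ID}-${MODEL_NAME}-v${MODEL_VERSION}"
    model_id = f"id_{impl_id}-{model_name}-v{model_version}"
    return {
        # PERSISTENT_VOLUME="${PERSISTENT_VOLUME_ROOT}/volume_${MODEL_ID}"
        "persistent_volume": f"{persistent_volume_root}/volume_{model_id}",
        # ENV_FILE="${PERSISTENT_VOLUME_ROOT}/model_envs/${MODEL_NAME}.env"
        "env_file": f"{persistent_volume_root}/model_envs/{model_name}.env",
    }

# Hypothetical root; setup.sh prompts for PERSISTENT_VOLUME_ROOT interactively.
print(model_paths("./persistent_volume", "Llama-3.3-70B-Instruct"))
# {'persistent_volume': './persistent_volume/volume_id_tt-metal-Llama-3.3-70B-Instruct-v0.0.1',
#  'env_file': './persistent_volume/model_envs/Llama-3.3-70B-Instruct.env'}
```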
DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume - MODEL_ENV_DIR="${DEFAULT_PERSISTENT_VOLUME_ROOT}/model_envs" - + # Safely handle potentially unset environment variables using default values + PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT} + # Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default + read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${DEFAULT_PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT + PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT} + echo # move to a new line after input + # Set environment variables with defaults if not already set + MODEL_VERSION="0.0.1" + MODEL_ID="id_${IMPL_ID}-${MODEL_NAME}-v${MODEL_VERSION}" + PERSISTENT_VOLUME="${PERSISTENT_VOLUME_ROOT}/volume_${MODEL_ID}" + + # Initialize OVERWRITE_ENV + OVERWRITE_ENV=false + MODEL_ENV_DIR="${PERSISTENT_VOLUME_ROOT}/model_envs" mkdir -p ${MODEL_ENV_DIR} ENV_FILE="${MODEL_ENV_DIR}/${MODEL_NAME}.env" export ENV_FILE check_and_prompt_env_file - if [ "$OVERWRITE_ENV" = false ]; then echo "✅ using existing .env file: ${ENV_FILE}." return 0 fi - # Safely handle potentially unset environment variables using default values - PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT} - # Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default - read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${DEFAULT_PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT - PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT} - echo # move to a new line after input - # Set environment variables with defaults if not already set - PERSISTENT_VOLUME=${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1 - read -p "Use 🤗 Hugging Face authorization token for downloading models? Alternative is direct authorization from Meta. (y/n) [default: y]: " input_use_hf_token choice_use_hf_token=${input_use_hf_token:-"y"} @@ -283,15 +305,15 @@ setup_model_environment() { cat > ${ENV_FILE} < Add a volume mounting the `test` directory in the container before running with the following in the docker run command: ```bash ---volume $PWD/tests:/home/user/tests +--volume $PWD/tests:/home/container_app_user/tests ``` ## 3. 
Run The Mock Model @@ -26,7 +26,7 @@ Add a volume mounting the `test` directory in the container before running with Once in the docker container, run the mock script with: ```bash -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml python /home/user/tests/mock_vllm_offline_inference_tt.py +WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml python /home/container_app_user/tests/mock_vllm_offline_inference_tt.py ``` # Build mock model container diff --git a/tests/benchmark_vllm_offline_inference.py b/tests/benchmark_vllm_offline_inference.py index 73ef98f..e59c115 100644 --- a/tests/benchmark_vllm_offline_inference.py +++ b/tests/benchmark_vllm_offline_inference.py @@ -30,7 +30,7 @@ def parse_args(): parser.add_argument( "--prompts_json", type=str, - default="/home/user/vllm/tt_metal/prompts.json", + default="/home/container_app_user/vllm/tt_metal/prompts.json", help="Path to JSON file containing prompts", ) parser.add_argument( diff --git a/tests/mock.vllm.openai.api.dockerfile b/tests/mock.vllm.openai.api.dockerfile index 3806700..57039fc 100644 --- a/tests/mock.vllm.openai.api.dockerfile +++ b/tests/mock.vllm.openai.api.dockerfile @@ -17,6 +17,8 @@ ARG DEBIAN_FRONTEND=noninteractive # default commit sha, override with --build-arg TT_METAL_COMMIT_SHA_OR_TAG= ARG TT_METAL_COMMIT_SHA_OR_TAG ARG TT_VLLM_COMMIT_SHA_OR_TAG=dev +# CONTAINER_APP_UID is a random ID, change this and rebuild if it collides with host +ARG CONTAINER_APP_UID=15863 # make build commit SHA available in the image for reference and debugging ENV TT_METAL_COMMIT_SHA_OR_TAG=${TT_METAL_COMMIT_SHA_OR_TAG} @@ -62,13 +64,14 @@ RUN git clone --depth 1 https://github.com/tenstorrent-metal/tt-metal.git ${TT_M && bash ./create_venv.sh # user setup -ARG HOME_DIR=/home/user -RUN useradd -u 1000 -s /bin/bash -d ${HOME_DIR} user \ +ENV CONTAINER_APP_USERNAME=container_app_user +ARG HOME_DIR=/home/${CONTAINER_APP_USERNAME} +RUN useradd -u ${CONTAINER_APP_UID} -s /bin/bash -d ${HOME_DIR} ${CONTAINER_APP_USERNAME} \ && mkdir -p ${HOME_DIR} \ - && chown -R user:user ${HOME_DIR} \ - && chown -R user:user ${TT_METAL_HOME} - -USER user + && chown -R ${CONTAINER_APP_USERNAME}:${CONTAINER_APP_USERNAME} ${HOME_DIR} \ + && chown -R ${CONTAINER_APP_USERNAME}:${CONTAINER_APP_USERNAME} ${TT_METAL_HOME} + +USER ${CONTAINER_APP_USERNAME} # tt-metal python env default RUN echo "source ${PYTHON_ENV_DIR}/bin/activate" >> ${HOME_DIR}/.bashrc @@ -96,21 +99,27 @@ RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate && pip install compresse ARG APP_DIR="${HOME_DIR}/app" WORKDIR ${APP_DIR} ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR} -COPY --chown=user:user "vllm-tt-metal-llama3/src" "${APP_DIR}/src" -COPY --chown=user:user "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt" -COPY --chown=user:user "utils" "${APP_DIR}/utils" -COPY --chown=user:user "tests" "${APP_DIR}/tests" +COPY --chown=${CONTAINER_APP_USERNAME}:${CONTAINER_APP_USERNAME} "vllm-tt-metal-llama3/src" "${APP_DIR}/src" +COPY --chown=${CONTAINER_APP_USERNAME}:${CONTAINER_APP_USERNAME} "vllm-tt-metal-llama3/requirements.txt" "${APP_DIR}/requirements.txt" +COPY --chown=${CONTAINER_APP_USERNAME}:${CONTAINER_APP_USERNAME} "utils" "${APP_DIR}/utils" +COPY --chown=${CONTAINER_APP_USERNAME}:${CONTAINER_APP_USERNAME} "tests" "${APP_DIR}/tests" RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ && pip install --default-timeout=240 --no-cache-dir -r requirements.txt" -WORKDIR "${APP_DIR}/tests" -CMD ["/bin/bash", "-c", "source ${PYTHON_ENV_DIR}/bin/activate && python 
mock_vllm_api_server.py"] - # Default environment variables for the Llama-3.1-70b-instruct inference server # Note: LLAMA3_CKPT_DIR and similar variables get set by mock_vllm_api_server.py -ENV CACHE_ROOT=/home/user/cache_root -ENV HF_HOME=/home/user/cache_root/huggingface +ENV CACHE_ROOT=${HOME_DIR}/cache_root +ENV HF_HOME=${CACHE_ROOT}/huggingface ENV MODEL_WEIGHTS_ID=id_repacked-Llama-3.1-70B-Instruct -ENV MODEL_WEIGHTS_PATH=/home/user/cache_root/model_weights/repacked-Llama-3.1-70B-Instruct +ENV MODEL_WEIGHTS_PATH=${CACHE_ROOT}/model_weights/repacked-Llama-3.1-70B-Instruct ENV LLAMA_VERSION=llama3 ENV SERVICE_PORT=7000 + +# Switch back to root for entrypoint +USER root + +COPY docker-entrypoint.sh /usr/local/bin/ +RUN chmod +x /usr/local/bin/docker-entrypoint.sh + +ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] +CMD ["/bin/bash", "-c", "source ${PYTHON_ENV_DIR}/bin/activate && python ${APP_DIR}/tests/mock_vllm_api_server.py"] diff --git a/tests/mock_vllm_offline_inference_tt.py b/tests/mock_vllm_offline_inference_tt.py index 71c3de9..dc2ae9a 100644 --- a/tests/mock_vllm_offline_inference_tt.py +++ b/tests/mock_vllm_offline_inference_tt.py @@ -214,7 +214,7 @@ async def generate_tokens_async( parser.add_argument( "--prompts_json", type=str, - default="/home/user/vllm/tt_metal/prompts.json", + default="/home/container_app_user/vllm/tt_metal/prompts.json", help="Path to JSON file containing prompts", ) parser.add_argument( diff --git a/utils/capture_traces.py b/utils/capture_traces.py index f3703b1..699cfdf 100644 --- a/utils/capture_traces.py +++ b/utils/capture_traces.py @@ -2,6 +2,7 @@ # # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +import argparse import logging from utils.prompt_configs import EnvironmentConfig @@ -14,11 +15,38 @@ logger.setLevel(logging.INFO) -def capture_input_sizes(): +def add_cli_args(parser): + parser.add_argument( + "--include_images", + action="store_true", + help="Include randomly generated images with prompts", + ) + parser.add_argument( + "--image_width", + type=int, + default=256, + help="Width of generated images", + ) + parser.add_argument( + "--image_height", + type=int, + default=256, + help="Height of generated images", + ) + return parser + + +def capture_input_sizes(arg): env_config = EnvironmentConfig() prompt_client = PromptClient(env_config) - prompt_client.capture_traces() + image_resolutions = [] + if args.include_images: + image_resolutions = [(args.image_width, args.image_height)] + prompt_client.capture_traces(image_resolutions=image_resolutions) if __name__ == "__main__": - capture_input_sizes() + parser = argparse.ArgumentParser() + parser = add_cli_args(parser) + args = parser.parse_args() + capture_input_sizes(args) diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 53d70d0..4b7d2ee 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -98,13 +98,18 @@ def wait_for_healthy(self, timeout: int = 300, interval: int = 10) -> bool: def capture_traces( self, context_lens: List[Tuple[int, int]] = None, - prompts_per_size: int = 1, + image_resolutions: List[Tuple[int, int]] = None, ) -> None: - logger.info("Capturing input sizes ...") + """Capture traces for text and/or image inputs at different sizes. 
- # Default input sizes based on get_padded_prefill_len() + Args: + context_lens: List of (input_seq_len, output_seq_len) tuples for text lengths + image_resolutions: List of (width, height) tuples for image resolutions + """ + logger.info("Capturing traces for input configurations...") + + # Default input sizes if none provided if context_lens is None: - # generate 4 osl tokens by default for each isl context_lens = [ (32, 4), (64, 4), @@ -121,49 +126,106 @@ def capture_traces( if not self.wait_for_healthy(): raise RuntimeError("vLLM did not start correctly!") + # Import image generation only if needed + if image_resolutions: + from utils.prompt_generation import generate_random_images + + # Process each text length configuration for isl, osl in context_lens: - logger.info(f"Capture trace: isl={isl}, osl={osl}") + logger.info( + f"Capturing traces for input_seq_len={isl}, output_seq_len={osl}" + ) - # Create prompt config for current size + # Create prompt config prompt_config = PromptConfig( input_seq_len=isl, max_prompt_length=isl, - num_prompts=prompts_per_size, + num_prompts=1, distribution="fixed", dataset="random", tokenizer_model=self.env_config.vllm_model, template=None, save_path=None, print_prompts=False, + use_chat_api=bool(image_resolutions), # Use chat API if we have images ) # Generate prompts for current size prompts, prompt_lengths = generate_prompts(prompt_config) - # Process each prompt - for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)): - try: - logger.info( - f"Starting trace capture for: input_seq_len:={prompt_len}, output_seq_len:={osl}" - ) - response_data = self.call_inference( - prompt=prompt, - images=[], - response_idx=i, - prompt_len=prompt_len, - max_tokens=osl, - stream=True, - vll_model=self.env_config.vllm_model, - tokenizer=None, - force_max_tokens=True, - ) - logger.info( - f"tokens generated: {response_data['output_seq_len']}, " - f"TTFT: {response_data['ttft_ms']:.3f} ms, " - f"TPOT: {response_data['tpot_ms']:.3f} ms" - ) - except Exception as e: - logger.error(f"Error processing prompt: {e}") + # If no image resolutions specified, do text-only traces + if not image_resolutions: + for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)): + try: + logger.info( + f"Starting text trace capture: " + f"input_seq_len={prompt_len}, output_seq_len={osl}" + ) + response_data = self.call_inference( + prompt=prompt, + images=[], + response_idx=i, + prompt_len=prompt_len, + max_tokens=osl, + stream=True, + vll_model=self.env_config.vllm_model, + tokenizer=None, + force_max_tokens=True, + use_chat_api=False, + ) + logger.info( + f"Text trace completed: " + f"tokens_generated={response_data['output_seq_len']}, " + f"TTFT={response_data['ttft_ms']:.3f}ms, " + f"TPOT={response_data['tpot_ms']:.3f}ms\n" + ) + except Exception as e: + logger.error(f"Error processing text prompt: {e}") + continue + else: + # Process each image resolution with the current text length + for width, height in image_resolutions: + for i, (prompt, prompt_len) in enumerate( + zip(prompts, prompt_lengths) + ): + try: + # Generate random image at current resolution + image_data = generate_random_images( + width=width, + height=height, + base64_encoded=True, + ) + + logger.info( + f"Starting image + text trace capture: " + f"input_seq_len={prompt_len}, output_seq_len={osl}, " + f"image_size={width}x{height}" + ) + + response_data = self.call_inference( + prompt=prompt, + images=[image_data], + response_idx=i, + prompt_len=prompt_len, + max_tokens=osl, + 
stream=True, + vll_model=self.env_config.vllm_model, + tokenizer=None, + force_max_tokens=True, + use_chat_api=True, + ) + + logger.info( + f"Image + Text trace completed: " + f"tokens_generated={response_data['output_seq_len']}, " + f"TTFT={response_data['ttft_ms']:.3f}ms, " + f"TPOT={response_data['tpot_ms']:.3f}ms\n" + ) + except Exception as e: + logger.error( + f"Error processing prompt with image {width}x{height}: {e}" + ) + continue def call_inference( self, diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index 63b451a..f8ce8a9 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -214,8 +214,14 @@ def main(): if not args.skip_trace_precapture: # pre-capture traces to not include 1st run trace capture time + image_resolutions = [] + if images: + image_resolutions = [ + (prompt_config.image_width, prompt_config.image_height) + ] prompt_client.capture_traces( - context_lens=[(args.input_seq_len, args.output_seq_len)] + context_lens=[(args.input_seq_len, args.output_seq_len)], + image_resolutions=image_resolutions, ) # Process batches diff --git a/utils/prompt_configs.py b/utils/prompt_configs.py index f43cb89..f35a9c5 100644 --- a/utils/prompt_configs.py +++ b/utils/prompt_configs.py @@ -44,7 +44,7 @@ def get_mesh_device(): # need record of what MESH_DEVICE configuration is running raise ValueError( "environment variable MESH_DEVICE must be set.\n", - "Possible values: N150, N300, T3K_LINE", + "Possible values: N150, N300, T3K_LINE, T3K_RING", ) return mesh_device diff --git a/vllm-tt-metal-llama3/README.md b/vllm-tt-metal-llama3/README.md index 62f6079..6d5de44 100644 --- a/vllm-tt-metal-llama3/README.md +++ b/vllm-tt-metal-llama3/README.md @@ -24,18 +24,19 @@ Run the container from the project root at `tt-inference-server`: ```bash cd tt-inference-server # make sure if you already set up the model weights and cache you use the correct persistent volume -export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-Llama-3.3-70B-Instructv0.0.1/ +export MODEL_NAME=Llama-3.3-70B-Instruct +export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-${MODEL_NAME}-v0.0.1/ docker run \ --rm \ -it \ - --env-file persistent_volume/model_envs/Llama-3.3-70B-Instruct.env \ + --env-file persistent_volume/model_envs/${MODEL_NAME}.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ - --volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/user/cache_root:rw \ + --volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/container_app_user/cache_root:rw \ --shm-size 32G \ --publish 7000:7000 \ - ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-v0.54.0-rc2-953161188c50 + ghcr.io/tenstorrent/tt-inference-server/vllm-llama3-src-dev-ubuntu-20.04-amd64:v0.0.1-47fb1a2fb6e0-2f33504bad49 ``` By default the Docker container will start running the entrypoint command wrapped in `src/run_vllm_api_server.py`. @@ -65,7 +66,6 @@ python example_requests_client_alpaca_eval.py --stream True --n_samples 1 --num_ python example_requests_client_alpaca_eval.py --stream True --n_samples 805 --num_full_iterations 1 --batch_size 32 ``` - ## First run setup Tested starting condition is from a fresh installation of Ubuntu 20.04 with Tenstorrent system dependencies installed. @@ -78,11 +78,17 @@ Recommended to follow postinstall guide to allow $USER to run docker without sud ### 2. 
Ensure system dependencies installed +Follow the TT starting guide for software installation at: https://docs.tenstorrent.com/quickstart.html + +Ensure all of the following are set up: +- firmware: tt-firmware (https://github.com/tenstorrent/tt-firmware) +- drivers: tt-kmd (https://github.com/tenstorrent/tt-kmd) +- hugepages: see https://docs.tenstorrent.com/quickstart.html#step-4-setup-hugepages and https://github.com/tenstorrent/tt-system-tools - tt-smi: https://github.com/tenstorrent/tt-smi -- firmware: bundle 80.10.1.0 (https://github.com/tenstorrent/tt-firmware/blob/02b4b6ed49b6ea2fb9a8664e99d4fed25e443bd6/experiments/fw_pack-80.10.1.0.fwbundle) -- drivers: tt-kmd version 1.29 (https://github.com/tenstorrent/tt-kmd/tree/ttkmd-1.29) -- topology: ensure mesh topology https://github.com/tenstorrent/tt-topology -- hugepages: https://github.com/tenstorrent/tt-system-tools + +If running on a TT-LoudBox or TT-QuietBox, you will also need: +- topology: tt-topology https://github.com/tenstorrent/tt-topology + - set up mesh topology, see https://github.com/tenstorrent/tt-topology?tab=readme-ov-file#mesh ### 3. CPU performance setting @@ -105,7 +111,7 @@ Either download the Docker image from GitHub Container Registry (recommended for ```bash # pull image from GHCR -docker pull ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-v0.54.0-rc2-953161188c50 +docker pull ghcr.io/tenstorrent/tt-inference-server/vllm-llama3-src-dev-ubuntu-20.04-amd64:v0.0.1-47fb1a2fb6e0-2f33504bad49 ``` #### Option B: Build Docker Image @@ -124,7 +130,7 @@ The script `setup.sh` automates: ```bash cd tt-inference-server chmod +x setup.sh -./setup.sh llama-3.1-70b-instruct +./setup.sh Llama-3.3-70B-Instruct ``` # Additional Documentation diff --git a/vllm-tt-metal-llama3/docs/development.md b/vllm-tt-metal-llama3/docs/development.md index 3537af2..0756e8f 100644 --- a/vllm-tt-metal-llama3/docs/development.md +++ b/vllm-tt-metal-llama3/docs/development.md @@ -54,27 +54,29 @@ docker push ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base ## Step 2: Run container for LM evals development -note: this requires running `setup.sh` to set up the weights for a particular model, in this example `llama-3.1-70b-instruct`. +note: this requires running `setup.sh` to set up the weights for a particular model, in this example `Llama-3.3-70B-Instruct`.
```bash cd tt-inference-server -export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-Llama-3.3-70B-Instructv0.0.1/ +export MODEL_NAME=Llama-3.3-70B-Instruct +export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-${MODEL_NAME}-v0.0.1/ + docker run \ --rm \ -it \ - --env-file persistent_volume/model_envs/Llama-3.1-70B-Instruct.env \ + --env-file persistent_volume/model_envs/${MODEL_NAME}.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ - --volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/user/cache_root:rw \ + --volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/container_app_user/cache_root:rw \ --shm-size 32G \ - ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} bash + ghcr.io/tenstorrent/tt-inference-server/vllm-llama3-src-dev-ubuntu-20.04-amd64:v0.0.1-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} bash ``` additionally for development you can mount the volumes: ```bash - --volume $PWD/../vllm:/home/user/vllm \ - --volume $PWD/../lm-evaluation-harness:/home/user/lm-evaluation-harness \ + --volume $PWD/../vllm:/home/container_app_user/vllm \ + --volume $PWD/../lm-evaluation-harness:/home/container_app_user/lm-evaluation-harness \ ``` ## Step 3: Inside container setup and run vLLM @@ -93,7 +95,7 @@ Already built into Docker image, continue to run vLLM. ```bash # option 2: install from github -cd /home/user/vllm +cd /home/container_app_user/vllm git fetch git checkout git pull @@ -104,7 +106,7 @@ echo "done vllm install." ```bash # option 3: install edittable (for development) - mount from outside container -cd /home/user/vllm +cd /home/container_app_user/vllm pip install -e . echo "done vllm install." ``` @@ -113,7 +115,7 @@ echo "done vllm install." 
```bash # run vllm serving -cd /home/user/vllm +cd /home/container_app_user/vllm python examples/server_example_tt.py ``` diff --git a/vllm-tt-metal-llama3/src/run_vllm_api_server.py b/vllm-tt-metal-llama3/src/run_vllm_api_server.py index 98b95a4..527d482 100644 --- a/vllm-tt-metal-llama3/src/run_vllm_api_server.py +++ b/vllm-tt-metal-llama3/src/run_vllm_api_server.py @@ -32,6 +32,8 @@ def register_vllm_models(): hf_model_id = get_hf_model_id() if hf_model_id in legacy_impl_models: from models.demos.t3000.llama2_70b.tt.generator_vllm import TtLlamaForCausalLM + + ModelRegistry.register_model("TTLlamaForCausalLM", TtLlamaForCausalLM) else: from models.demos.llama3.tt.generator_vllm import ( TtMllamaForConditionalGeneration, @@ -95,10 +97,33 @@ def ensure_mesh_device(hf_model_id): print(f"using MESH_DEVICE:={os.environ.get('MESH_DEVICE')}") +def runtime_settings(hf_model_id): + # default runtime env vars + env_vars = { + "TT_METAL_ASYNC_DEVICE_QUEUE": 1, + "WH_ARCH_YAML": "wormhole_b0_80_arch_eth_dispatch.yaml", + } + env_var_map = { + "meta-llama/Llama-3.1-70B-Instruct": { + "LLAMA_VERSION": "llama3", + }, + "meta-llama/Llama-3.3-70B-Instruct": { + "LLAMA_VERSION": "llama3", + }, + } + env_vars.update(env_var_map.get(hf_model_id, {})) + # Set each environment variable + print("setting runtime environment variables:") + for key, value in env_vars.items(): + print(f"{key}={value}") + os.environ[key] = str(value) + + def model_setup(hf_model_id): # TODO: check HF repo access with HF_TOKEN supplied print(f"using model: {hf_model_id}") ensure_mesh_device(hf_model_id) + runtime_settings(hf_model_id) args = { "model": hf_model_id, "block_size": "64",
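As a usage note for the extended `PromptClient.capture_traces()` introduced above, a minimal sketch of pre-capturing text-only and text-plus-image traces. The sequence lengths and the 512x512 resolution are hypothetical, and it assumes the environment variables read by `EnvironmentConfig` (e.g. service port, model, MESH_DEVICE) are already set.

```python
from utils.prompt_configs import EnvironmentConfig
from utils.prompt_client import PromptClient

env_config = EnvironmentConfig()
client = PromptClient(env_config)

# Text-only traces for two (input_seq_len, output_seq_len) pairs.
client.capture_traces(context_lens=[(128, 128), (2048, 128)])

# Text + image traces: passing image_resolutions makes capture_traces()
# generate a random image per resolution and route requests via the chat API.
client.capture_traces(
    context_lens=[(128, 128)],
    image_resolutions=[(512, 512)],
)
```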