Release Candidate v0.0.1 (#88)
# change log

* Added default handling of MESH_DEVICE for Llama 3.x models
* Setup script improvements:
    * Improved environment variable handling and persistent storage integration
    * Added IMPL_ID field (set to "tt-metal" for all current models)
    * Introduced MODEL_VERSION and MODEL_ID variables for better versioning (a sketch of the resulting naming scheme follows this list)
* Added image input support for image-text-to-text models in client scripts and tools
    * Added support for image input in trace capturing
    * Added new parameters for image width and height
    * Implemented handling of both text-only and image+text trace captures
* Renamed the client-side scripts' batch_size option to max_concurrent to indicate the client-side concurrent request limit
* Fixed the vLLM model registration logic: added the missing ModelRegistry.register_model call for TTLlamaForCausalLM in legacy implementation models
* Updated benchmark path handling to use the $HOME environment variable instead of the hardcoded /home/user path
* Added benchmark summary handling for the vLLM benchmark script, with a documentation example
* Added support for a new model "DeepSeek-R1-Distill-Llama-70B" in the model setup configurations
* Used CACHE_ROOT and vllm_dir where possible; fixed mock.vllm.openai.dockerfile (#96)
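
A rough sketch of the resulting model ID and persistent volume naming, mirroring the setup.sh defaults in this change; the model name and volume root below are illustrative:

```python
from pathlib import Path

# Illustrative values; IMPL_ID is "tt-metal" for all current models and
# MODEL_VERSION defaults to "0.0.1" in setup.sh.
impl_id = "tt-metal"
model_name = "Llama-3.1-70B-Instruct"
model_version = "0.0.1"

model_id = f"id_{impl_id}-{model_name}-v{model_version}"

# Assumed volume root; setup.sh prompts for PERSISTENT_VOLUME_ROOT.
persistent_volume_root = Path("persistent_volume")
persistent_volume = persistent_volume_root / f"volume_{model_id}"
env_file = persistent_volume_root / "model_envs" / f"{model_name}.env"

print(model_id)           # id_tt-metal-Llama-3.1-70B-Instruct-v0.0.1
print(persistent_volume)  # persistent_volume/volume_id_tt-metal-Llama-3.1-70B-Instruct-v0.0.1
print(env_file)           # persistent_volume/model_envs/Llama-3.1-70B-Instruct.env
```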
tstescoTT authored Feb 5, 2025
1 parent 0409f4b commit 6061606
Showing 18 changed files with 307 additions and 152 deletions.
9 changes: 5 additions & 4 deletions benchmarking/benchmark_summary.py
@@ -45,13 +45,14 @@ def parse_args():

def extract_params_from_filename(filename: str) -> Dict[str, Any]:
pattern = r"""
benchmark_
.*?benchmark_ # Any prefix before benchmark_
(?P<timestamp>\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp
(_(?P<mesh_device>N150|N300|T3K_LINE|T3K_RING|TG))? # MESH_DEVICE
_isl-(?P<isl>\d+) # Input sequence length
_osl-(?P<osl>\d+) # Output sequence length
_bsz-(?P<bsz>\d+) # Batch size
_n-(?P<n>\d+) # Number of requests
_maxcon-(?P<maxcon>\d+) # Max concurrency
_n-(?P<n>\d+) # Number of requests
\.json$
"""
match = re.search(pattern, filename, re.VERBOSE)
if not match:
@@ -67,7 +68,7 @@ def extract_params_from_filename(filename: str) -> Dict[str, Any]:
"mesh_device": match.group("mesh_device"),
"input_sequence_length": int(match.group("isl")),
"output_sequence_length": int(match.group("osl")),
"batch_size": int(match.group("bsz")),
"batch_size": int(match.group("maxcon")),
"num_requests": int(match.group("n")),
}

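For reference, a hedged usage sketch of the updated parser: the import path and filename below are hypothetical, but the filename follows the maxcon-based naming produced by the vLLM online benchmark script in this change.

```python
# Hypothetical usage; assumes benchmark_summary.py is importable from the
# benchmarking directory and the file was named by vllm_online_benchmark.py.
from benchmark_summary import extract_params_from_filename

fname = (
    "vllm_online_benchmark_2025-02-05_12-00-00_N300"
    "_isl-128_osl-128_maxcon-32_n-256.json"
)
params = extract_params_from_filename(fname)
# Per the regex above, this should yield mesh_device="N300",
# input_sequence_length=128, output_sequence_length=128,
# batch_size=32 (taken from the maxcon field), and num_requests=256.
print(params)
```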
11 changes: 9 additions & 2 deletions benchmarking/prompt_client_online_benchmark.py
@@ -102,8 +102,15 @@ def run_sequence_length_test(
tokenizer = AutoTokenizer.from_pretrained(model)

# pre-capture traces so benchmark does not include 1st run trace capture time
# TODO: add support for image input to capture_traces
prompt_client.capture_traces(context_lens=[(input_len, output_len)])
image_resolutions = []
if images:
image_resolutions = [
(prompt_config.image_width, prompt_config.image_height)
]

prompt_client.capture_traces(
context_lens=[(input_len, output_len)], image_resolutions=image_resolutions
)
# Process batches
try:
responses = batch_processor.process_batch(
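A minimal sketch of the trace pre-capture logic added above; the helper name is made up, and capture_traces is assumed to take the image_resolutions keyword shown in the diff.

```python
from typing import List, Tuple

def precapture_traces(prompt_client, input_len: int, output_len: int,
                      image_width: int = 0, image_height: int = 0) -> None:
    # Capture traces before benchmarking so first-run trace-capture time is
    # excluded from results; pass image resolutions only for image+text runs.
    image_resolutions: List[Tuple[int, int]] = []
    if image_width and image_height:
        image_resolutions = [(image_width, image_height)]
    prompt_client.capture_traces(
        context_lens=[(input_len, output_len)],
        image_resolutions=image_resolutions,
    )
```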
4 changes: 3 additions & 1 deletion benchmarking/vllm_online_benchmark.py
@@ -126,8 +126,10 @@ def main():
/ f"vllm_online_benchmark_{run_timestamp}_{mesh_device}_isl-{isl}_osl-{osl}_maxcon-{max_concurrent}_n-{num_prompts}.json"
)
logger.info(f"\nRunning benchmark {i}/{len(combinations)}")
vllm_dir = os.environ.get("vllm_dir")
assert vllm_dir is not None, "vllm_dir must be set."
run_benchmark(
benchmark_script="/home/user/vllm/benchmarks/benchmark_serving.py",
benchmark_script=f"{vllm_dir}/benchmarks/benchmark_serving.py",
params=params,
model=env_config.vllm_model,
port=env_config.service_port,
21 changes: 3 additions & 18 deletions evals/README.md
@@ -13,24 +13,9 @@ For instructions on building the Docker image see: [vllm-tt-metal-llama3/docs/de

## Step 2: Run Docker container for LM evals development

note: this requires running `setup.sh` to set up the weights for a particular model, in this example `llama-3.1-70b-instruct`.
Follow run guide: [vllm-tt-metal-llama3/README.md](../vllm-tt-metal-llama3/README.md)

```bash
cd tt-inference-server
export MODEL_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b-instructv0.0.1/
docker run \
--rm \
-it \
--env-file persistent_volume/model_envs/llama-3.1-70b-instruct.env \
--cap-add ALL \
--device /dev/tenstorrent:/dev/tenstorrent \
--volume /dev/hugepages-1G:/dev/hugepages-1G:rw \
--volume ${MODEL_VOLUME?ERROR env var MODEL_VOLUME must be set}:/home/user/cache_root:rw \
--shm-size 32G \
ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:${IMAGE_VERSION}-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG}
```

The default Docker image command will start the vLLM server.
note: this requires running `setup.sh` to set up the weights for a particular model, in this example `llama-3.1-70b-instruct`.

## Step 3: Inside the container, set up the llama-recipes LM evaluation harness templates

@@ -44,7 +29,7 @@ To access Meta Llama 3.1 evals, you must:
#### Hugging Face authentication - option 1: HF_TOKEN (if not already passed into Docker container)
```bash
# set up HF Token if not already set up in .env, needed for datasets
echo "HF_TOKEN=hf_<your_token>" >> vllm-tt-metal-llama3/.env
echo "HF_TOKEN=hf_<your_token>"
```

#### Hugging Face authentication - option 2: huggingface_hub login
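A minimal sketch of option 2, assuming the standard huggingface_hub API is available inside the container (the shell equivalent is `huggingface-cli login`):

```python
# Interactive Hugging Face login; prompts for a token if one is not passed.
from huggingface_hub import login

login()  # or login(token="hf_<your_token>")
```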
4 changes: 2 additions & 2 deletions evals/run_evals.sh
@@ -45,7 +45,7 @@ lm_eval \
--gen_kwargs model=${HF_MODEL_REPO_ID},stop="<|eot_id|>",stream=False \
--tasks meta_gpqa \
--batch_size auto \
--output_path /home/user/cache_root/eval_output \
--output_path ${CACHE_ROOT}/eval_output \
--include_path ./work_dir \
--seed 42 \
--log_samples
@@ -57,7 +57,7 @@ lm_eval \
--gen_kwargs model=${HF_MODEL_REPO_ID},stop="<|eot_id|>",stream=False \
--tasks meta_ifeval \
--batch_size auto \
--output_path /home/user/cache_root/eval_output \
--output_path ${CACHE_ROOT}/eval_output \
--include_path ./work_dir \
--seed 42 \
--log_samples
2 changes: 1 addition & 1 deletion evals/run_evals_vision.sh
@@ -35,7 +35,7 @@ lm_eval \
--gen_kwargs model=${HF_MODEL_REPO_ID},stop="<|eot_id|>",stream=False \
--tasks mmmu_val \
--batch_size auto \
--output_path /home/user/cache_root/eval_output \
--output_path /home/container_app_user/cache_root/eval_output \
--seed 42 \
--log_samples

116 changes: 69 additions & 47 deletions setup.sh
@@ -9,18 +9,19 @@ set -euo pipefail # Exit on error, print commands, unset variables treated as e
usage() {
echo "Usage: $0 <model_type>"
echo "Available model types:"
echo " llama-3.3-70b-instruct"
echo " llama-3.2-11b-vision-instruct"
echo " llama-3.2-3b-instruct"
echo " llama-3.2-1b-instruct"
echo " llama-3.1-70b-instruct"
echo " llama-3.1-70b"
echo " llama-3.1-8b-instruct"
echo " llama-3.1-8b"
echo " llama-3-70b-instruct"
echo " llama-3-70b"
echo " llama-3-8b-instruct"
echo " llama-3-8b"
echo " DeepSeek-R1-Distill-Llama-70B"
echo " Llama-3.3-70B-Instruct"
echo " Llama-3.2-11B-Vision-Instruct"
echo " Llama-3.2-3B-Instruct"
echo " Llama-3.2-1B-Instruct"
echo " Llama-3.1-70B-Instruct"
echo " Llama-3.1-70B"
echo " Llama-3.1-8B-Instruct"
echo " Llama-3.1-8B"
echo " Llama-3-70B-Instruct"
echo " Llama-3-70B"
echo " Llama-3-8B-Instruct"
echo " Llama-3-8B"
echo
exit 1
}
@@ -74,6 +75,7 @@ get_hf_env_vars() {
echo "HF_TOKEN environment variable is not set. Please set it before running the script."
read -r -s -p "Enter your HF_TOKEN: " input_hf_token
echo
echo "entered HF_TOKEN contains: ${#input_hf_token} characters, expected 37."
if [ -z "${input_hf_token:-}" ]; then
echo "⛔ HF_TOKEN cannot be empty. Please try again."
exit 1
@@ -111,84 +113,104 @@ setup_model_environment() {
# Set environment variables based on the model selection
# note: MODEL_NAME is the directory name for the model weights
case "$1" in
"llama-3.3-70b-instruct")
"DeepSeek-R1-Distill-Llama-70B")
IMPL_ID="tt-metal"
MODEL_NAME="DeepSeek-R1-Distill-Llama-70B"
HF_MODEL_REPO_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
META_MODEL_NAME=""
META_DIR_FILTER=""
REPACKED=1
;;
"Llama-3.3-70B-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.3-70B-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3.3-70B-Instruct"
META_MODEL_NAME=""
META_DIR_FILTER=""
REPACKED=1
;;
"llama-3.2-11b-vision-instruct")
"Llama-3.2-11B-Vision-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.2-11B-Vision-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3.2-11B-Vision-Instruct"
META_MODEL_NAME=""
META_DIR_FILTER=""
REPACKED=0
;;
"llama-3.2-3b-instruct")
"Llama-3.2-3B-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.2-3B-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3.2-3B-Instruct"
META_MODEL_NAME=""
META_DIR_FILTER=""
REPACKED=0
;;
"llama-3.2-1b-instruct")
"Llama-3.2-1B-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.2-1B-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3.2-1B-Instruct"
META_MODEL_NAME=""
META_DIR_FILTER=""
REPACKED=0
;;
"llama-3.1-70b-instruct")
"Llama-3.1-70B-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.1-70B-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct"
META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct"
META_DIR_FILTER="llama3_1"
REPACKED=1
;;
"llama-3.1-70b")
"Llama-3.1-70B")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.1-70B"
HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B"
META_MODEL_NAME="Meta-Llama-3.1-70B"
META_DIR_FILTER="llama3_1"
REPACKED=1
;;
"llama-3.1-8b-instruct")
"Llama-3.1-8B-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.1-8B-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B-Instruct"
META_MODEL_NAME="Meta-Llama-3.1-8B-Instruct"
META_DIR_FILTER="llama3_1"
REPACKED=0
;;
"llama-3.1-8b")
"Llama-3.1-8B")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3.1-8B"
HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B"
META_MODEL_NAME="Meta-Llama-3.1-8B"
META_DIR_FILTER="llama3_1"
REPACKED=0
;;
"llama-3-70b-instruct")
"Llama-3-70B-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3-70B-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3-70B-Instruct"
META_MODEL_NAME="Meta-Llama-3-70B-Instruct"
META_DIR_FILTER="llama3"
REPACKED=1
;;
"llama-3-70b")
"Llama-3-70B")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3-70B"
HF_MODEL_REPO_ID="meta-llama/Llama-3-70B"
META_MODEL_NAME="Meta-Llama-3-70B"
META_DIR_FILTER="llama3"
REPACKED=1
;;
"llama-3-8b-instruct")
"Llama-3-8B-Instruct")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3-8B-Instruct"
HF_MODEL_REPO_ID="meta-llama/Llama-3-8B-Instruct"
META_MODEL_NAME="Meta-Llama-3-8B-Instruct"
META_DIR_FILTER="llama3"
REPACKED=0
;;
"llama-3-8b")
"Llama-3-8B")
IMPL_ID="tt-metal"
MODEL_NAME="Llama-3-8B"
HF_MODEL_REPO_ID="meta-llama/Llama-3-8B"
META_MODEL_NAME="Meta-Llama-3-8B"
@@ -201,32 +223,32 @@ setup_model_environment() {
exit 1
;;
esac
# Initialize OVERWRITE_ENV
OVERWRITE_ENV=false

# Set default values for environment variables
DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume
MODEL_ENV_DIR="${DEFAULT_PERSISTENT_VOLUME_ROOT}/model_envs"

# Safely handle potentially unset environment variables using default values
PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT}
# Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default
read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${DEFAULT_PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT
PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT}
echo # move to a new line after input
# Set environment variables with defaults if not already set
MODEL_VERSION="0.0.1"
MODEL_ID="id_${IMPL_ID}-${MODEL_NAME}-v${MODEL_VERSION}"
PERSISTENT_VOLUME="${PERSISTENT_VOLUME_ROOT}/volume_${MODEL_ID}"

# Initialize OVERWRITE_ENV
OVERWRITE_ENV=false
MODEL_ENV_DIR="${PERSISTENT_VOLUME_ROOT}/model_envs"
mkdir -p ${MODEL_ENV_DIR}
ENV_FILE="${MODEL_ENV_DIR}/${MODEL_NAME}.env"
export ENV_FILE
check_and_prompt_env_file


if [ "$OVERWRITE_ENV" = false ]; then
echo "✅ using existing .env file: ${ENV_FILE}."
return 0
fi
# Safely handle potentially unset environment variables using default values
PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT}
# Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default
read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${DEFAULT_PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT
PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT}
echo # move to a new line after input
# Set environment variables with defaults if not already set
PERSISTENT_VOLUME=${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1


read -p "Use 🤗 Hugging Face authorization token for downloading models? Alternative is direct authorization from Meta. (y/n) [default: y]: " input_use_hf_token
choice_use_hf_token=${input_use_hf_token:-"y"}
@@ -283,15 +305,15 @@ setup_model_environment() {
cat > ${ENV_FILE} <<EOF
# Environment variables for the model setup
USE_HF_DOWNLOAD=$choice_use_hf_token
HF_MODEL_REPO_ID=$HF_MODEL_REPO_ID
MODEL_NAME=$MODEL_NAME
MODEL_VERSION=${MODEL_VERSION}
IMPL_ID=${IMPL_ID}
MODEL_ID=${MODEL_ID}
META_MODEL_NAME=$META_MODEL_NAME
HF_MODEL_REPO_ID=$HF_MODEL_REPO_ID
REPACKED=${REPACKED}
REPACKED_STR=${REPACKED_STR}
# model runtime variables
LLAMA_VERSION=llama3
TT_METAL_ASYNC_DEVICE_QUEUE=1
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml
SERVICE_PORT=7000
# host paths
HOST_HF_HOME=${HF_HOME:-""}
@@ -467,17 +489,17 @@ setup_weights_huggingface() {
mv "${WEIGHTS_DIR}/consolidated.pth" "${WEIGHTS_DIR}/consolidated.00.pth"
fi

# Step 6: Process and copy weights
# Step 6: Cleanup HF setup venv
deactivate
rm -rf ${VENV_NAME}

# Step 7: Process and copy weights
if [ "${REPACKED}" -eq 1 ]; then
REPACKED_WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}"
mkdir -p "${REPACKED_WEIGHTS_DIR}"
repack_weights "${WEIGHTS_DIR}" "${REPACKED_WEIGHTS_DIR}"
fi

# Step 7: Cleanup
deactivate
rm -rf ${VENV_NAME}

echo "using weights directory: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}"
echo "✅ setup_weights_huggingface completed!"
}
4 changes: 2 additions & 2 deletions tests/README.md
@@ -18,15 +18,15 @@ export VLLM_COMMIT_SHA=<vllm-commit>
Add a volume mounting the `tests` directory into the container by including the following in the docker run command:

```bash
--volume $PWD/tests:/home/user/tests
--volume $PWD/tests:/home/container_app_user/tests
```

## 3. Run The Mock Model

Once in the docker container, run the mock script with:

```bash
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml python /home/user/tests/mock_vllm_offline_inference_tt.py
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml python /home/container_app_user/tests/mock_vllm_offline_inference_tt.py
```

# Build mock model container
2 changes: 1 addition & 1 deletion tests/benchmark_vllm_offline_inference.py
@@ -30,7 +30,7 @@ def parse_args():
parser.add_argument(
"--prompts_json",
type=str,
default="/home/user/vllm/tt_metal/prompts.json",
default="/home/container_app_user/vllm/tt_metal/prompts.json",
help="Path to JSON file containing prompts",
)
parser.add_argument(