
Patch VLLM when building Docker #92

Closed · wants to merge 1 commit
17 changes: 7 additions & 10 deletions benchmarking/README.md
@@ -51,20 +51,17 @@ python utils/prompt_client_cli.py \
```

### using vllm/benchmarking/benchmark_serving.py
-Within the Docker container, use the benchmark_serving.patch file:
+Within the Docker container, in one shell start the server:
 ```
-cd ~/app/src
-python run_vllm_api_server.py
+source $PYTHON_ENV_DIR/bin/activate  # activate python env
+python /home/$CONTAINER_APP_USERNAME/app/src/run_vllm_api_server.py
 ```
-This simply stops the benchmarking script from sending the `best_of` arg which is not supported and causes issues.
 
-To run the benchmarks, in another shell into the Docker container:
+Then in another shell run the benchmarks:
 
 ```
-cd ~/vllm
-git apply ~/app/benchmarking/benchmark_serving.patch
-cd ~/app
-export PYTHONPATH=$PYTHONPATH:$PWD
-python benchmarking/vllm_online_benchmark.py
+source $PYTHON_ENV_DIR/bin/activate  # activate python env as well
+python /home/$CONTAINER_APP_USERNAME/app/benchmarking/vllm_online_benchmark.py
 ```

The output will be available for each input/output sequence length defined and time stamped.
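The patch mentioned in the removed README text exists only to stop the benchmark script from sending the unsupported `best_of` argument. A minimal sketch of that idea, assuming an illustrative payload-building helper (the function and field names here are invented, not the actual vLLM benchmark code):

```shell
# Hypothetical sketch: only include `best_of` in the request payload
# when the server is known to support it. Names are illustrative.
build_payload() {
  local prompt=$1 max_tokens=$2 best_of=${3:-}
  local payload="{\"prompt\": \"$prompt\", \"max_tokens\": $max_tokens"
  # gate the unsupported option behind an explicit opt-in flag
  if [ -n "$best_of" ] && [ "${SUPPORTS_BEST_OF:-0}" = "1" ]; then
    payload="$payload, \"best_of\": $best_of"
  fi
  echo "$payload}"
}

build_payload "hello" 16 2   # -> {"prompt": "hello", "max_tokens": 16}
```

With `SUPPORTS_BEST_OF=1` the same call would emit the `best_of` field; the patch effectively hard-codes the "unsupported" branch.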
4 changes: 4 additions & 0 deletions vllm-tt-metal-llama3/vllm.llama3.src.dev.Dockerfile
@@ -115,6 +115,10 @@ COPY --chown=${CONTAINER_APP_USERNAME}:${CONTAINER_APP_USERNAME} "locust" "${APP
RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \
&& pip install --default-timeout=240 --no-cache-dir -r requirements.txt"

+# apply patch to remove best-of argument from vllm
+WORKDIR ${vllm_dir}
+RUN git apply ${APP_DIR}/benchmarking/benchmark_serving.patch
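The `git apply` step above will fail the Docker build if the patch no longer matches the checked-out vLLM revision. A hedged, self-contained illustration of validating a patch with `git apply --check` before applying it (every path, file, and patch here is invented for the demo, not taken from this repository):

```shell
# Demo: create a throwaway repo, generate a patch, verify it, then apply it.
set -e
tmpdir=$(mktemp -d)
cd "$tmpdir"
git init -q demo
cd demo
printf 'old line\n' > file.txt
git add file.txt
git -c user.name=demo -c user.email=demo@example.com commit -qm init
printf 'new line\n' > file.txt
git diff > ../demo.patch        # capture the change as a patch file
git checkout -q -- file.txt     # restore the original content
git apply --check ../demo.patch # exits non-zero if the patch would not apply
git apply ../demo.patch         # actually apply it
grep -q 'new line' file.txt && echo "patch applied cleanly"
```

Running `git apply --check` in a build stage (or CI) surfaces a stale patch as a clear error before it silently breaks the image.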
Contributor:

The patch file is meant to be temporary. If we're going to be using it longer, we should either add the patch to our vLLM fork or obviate it with proper support for the options it removes.

Collaborator (Author):
I added this intending to streamline the user experience, since running the benchmarks currently requires one manual step. Because the patch lives in this repository, I assumed its purpose was to work around a vLLM change.

All good! I'll reject this PR.
I created an issue as well: #93


WORKDIR "${APP_DIR}/src"

# Switch back to root for entrypoint