Bump TGI version to v3.0.0 #135
Merged · 6 commits · Jan 6, 2025
Changes from 5 commits
2 changes: 0 additions & 2 deletions .github/workflows/tpu-tgi-release.yml
@@ -76,7 +76,6 @@ jobs:
labels: ${{ steps.meta.outputs.labels }}
build-args: |
VERSION=${{ steps.version.outputs.version }}
TGI_VERSION=v2.4.1


- name: Generate artifact attestation for TGI
@@ -97,7 +96,6 @@ jobs:
labels: ${{ steps.meta-ie.outputs.labels }}
build-args: |
VERSION=${{ steps.version.outputs.version }}
TGI_VERSION=v2.4.1
target: inference-endpoint


2 changes: 1 addition & 1 deletion Makefile
@@ -19,7 +19,7 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL))

.PHONY: build_dist style style_check clean

TGI_VERSION ?= v2.4.1
TGI_VERSION ?= v3.0.0

rwildcard=$(wildcard $1) $(foreach d,$1,$(call rwildcard,$(addsuffix /$(notdir $d),$(wildcard $(dir $d)*))))

47 changes: 47 additions & 0 deletions text-generation-inference/Cargo.toml
@@ -0,0 +1,47 @@
[workspace]
members = [
"backends/v2",
"backends/grpc-metadata",
"launcher",
"router"
]
default-members = [
"backends/v2",
"backends/grpc-metadata",
"launcher",
"router"
]
resolver = "2"

[workspace.package]
version = "3.0.0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"

[workspace.dependencies]
base64 = "0.22.0"
tokenizers = { version = "0.20.0", features = ["http"] }
hf-hub = { version = "0.3.1", features = ["tokio"] }
metrics = { version = "0.23.0" }
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
minijinja = { version = "2.2.0", features = ["json"] }
minijinja-contrib = { version = "2.0.2", features = ["pycompat"] }
pyo3 = { version = "0.22.2", features = ["auto-initialize"] }

[profile.release]
incremental = true

[profile.release-binary]
inherits = "release"
debug = 1
incremental = true
panic = "abort"

[profile.release-opt]
inherits = "release"
debug = 0
incremental = false
lto = "fat"
opt-level = 3
codegen-units = 1
43 changes: 24 additions & 19 deletions text-generation-inference/docker/Dockerfile
@@ -1,25 +1,35 @@
# Fetch and extract the TGI sources
FROM alpine AS tgi
# TGI version 2.4.1 by default
ARG TGI_VERSION=v2.4.1
# TGI version 3.0.0 by default
ARG TGI_VERSION=v3.0.0
RUN test -n ${TGI_VERSION:?}
RUN mkdir -p /tgi
ADD https://github.com/huggingface/text-generation-inference/archive/${TGI_VERSION}.tar.gz /tgi/sources.tar.gz
RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1

# Build cargo components (adapted from TGI original Dockerfile)
# Note that the build image is aligned on the same Linux version as the base image (Debian bookworm/ Ubuntu 22.04)
FROM lukemathwalker/cargo-chef:latest-rust-1.80.1-bookworm AS chef
# Note: we cannot use the cargo-chef base image as it uses python 3.11
FROM ubuntu:22.04 AS chef

RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
curl ca-certificates build-essential \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.80.1 --profile minimal -y
ENV PATH="/root/.cargo/bin:${PATH}"
RUN cargo install cargo-chef --locked

WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY --from=tgi /tgi/Cargo.toml Cargo.toml
COPY text-generation-inference/Cargo.toml Cargo.toml
COPY --from=tgi /tgi/Cargo.lock Cargo.lock
COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
COPY --from=tgi /tgi/proto proto
COPY --from=tgi /tgi/benchmark benchmark
COPY --from=tgi /tgi/router router
COPY --from=tgi /tgi/backends backends
COPY --from=tgi /tgi/launcher launcher
@@ -29,22 +39,25 @@ FROM chef AS builder
ARG ENABLE_GOOGLE_FEATURE
RUN echo "Google Feature Status: ${ENABLE_GOOGLE_FEATURE}"

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
python3.11-dev
RUN apt-get update -y \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
unzip python3-dev libssl-dev pkg-config \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
rm -f $PROTOC_ZIP

COPY text-generation-inference/Cargo.toml Cargo.toml
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

COPY --from=tgi /tgi/Cargo.toml Cargo.toml
COPY --from=tgi /tgi/Cargo.lock Cargo.lock
COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
COPY --from=tgi /tgi/proto proto
COPY --from=tgi /tgi/benchmark benchmark
COPY --from=tgi /tgi/router router
COPY --from=tgi /tgi/backends backends
COPY --from=tgi /tgi/launcher launcher
@@ -114,8 +127,6 @@ ARG VERSION=${VERSION}
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
libpython3.10 \
libpython3.11 \
python3.11 \
git \
gnupg2 \
wget \
@@ -142,18 +153,14 @@ ENV PORT=${PORT:-80}
ENV HF_HOME=${ENABLE_GOOGLE_FEATURE:+/tmp}
ENV HF_HOME=${HF_HOME:-/data}

# Install requirements for TGI, that uses python3.11
RUN python3.11 -m pip install transformers==${TRANSFORMERS_VERSION}

# Install requirements for optimum-tpu, then for TGI then optimum-tpu
RUN python3 -m pip install hf_transfer safetensors==${SAFETENSORS_VERSION} typer
COPY --from=optimum-tpu-installer /opt/optimum-tpu /opt/optimum-tpu
RUN python3 /opt/optimum-tpu/optimum/tpu/cli.py install-jetstream-pytorch --yes
RUN python3 -m pip install -e /opt/optimum-tpu \
-f https://storage.googleapis.com/libtpu-releases/index.html

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark

# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router-v2 /usr/local/bin/text-generation-router
# Install launcher
@@ -196,5 +203,3 @@ ENTRYPOINT ["./entrypoint.sh"]
FROM tpu_base

ENTRYPOINT ["text-generation-launcher"]
# This is commented out in the original TGI Dockerfile
# CMD ["--json-output"]
13 changes: 13 additions & 0 deletions text-generation-inference/docker/entrypoint.sh
@@ -10,6 +10,19 @@ if [[ -z "${MAX_BATCH_SIZE}" ]]; then
fi
export MAX_BATCH_SIZE="${MAX_BATCH_SIZE}"

# At some point we used to have MAX_INPUT_LENGTH; now we should use MAX_INPUT_TOKENS.
# (The launcher would handle this automatically, but we need the value here to
# calculate MAX_BATCH_PREFILL_TOKENS when it is not set.)
if [[ -z "${MAX_INPUT_TOKENS}" && -n ${MAX_INPUT_LENGTH} ]]; then
MAX_INPUT_TOKENS=${MAX_INPUT_LENGTH}
unset MAX_INPUT_LENGTH
fi
Collaborator
Suggested change:

if [[ -z "${MAX_INPUT_TOKENS}" && -n ${MAX_INPUT_LENGTH} ]]; then
MAX_INPUT_TOKENS=${MAX_INPUT_LENGTH}
fi
unset MAX_INPUT_LENGTH

Maybe we should unset MAX_INPUT_LENGTH anyway, to prevent any downstream use of it, as it is deprecated.

if [[ -z "${MAX_BATCH_PREFILL_TOKENS}" ]]; then
MAX_BATCH_PREFILL_TOKENS=$(( ${MAX_BATCH_SIZE} * ${MAX_INPUT_TOKENS} ))
fi
export MAX_BATCH_PREFILL_TOKENS="${MAX_BATCH_PREFILL_TOKENS}"

if [[ -z "${JSON_OUTPUT_DISABLE}" ]]; then
JSON_OUTPUT_DISABLE=--json-output
else
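As an aside on the entrypoint logic above, here is a minimal Python sketch of the same fallback and prefill-budget arithmetic. The authoritative implementation is the shell in entrypoint.sh; resolve_batch_env is a hypothetical helper used only for illustration.

```python
def resolve_batch_env(env: dict) -> dict:
    # Deprecated MAX_INPUT_LENGTH is accepted as a fallback for MAX_INPUT_TOKENS.
    if "MAX_INPUT_TOKENS" not in env and "MAX_INPUT_LENGTH" in env:
        env["MAX_INPUT_TOKENS"] = env.pop("MAX_INPUT_LENGTH")
    # When not set explicitly, the prefill budget defaults to batch_size * input_tokens.
    if "MAX_BATCH_PREFILL_TOKENS" not in env:
        env["MAX_BATCH_PREFILL_TOKENS"] = str(
            int(env["MAX_BATCH_SIZE"]) * int(env["MAX_INPUT_TOKENS"])
        )
    return env

# Example: MAX_BATCH_SIZE=4 with only the legacy MAX_INPUT_LENGTH=1024 set yields
# MAX_INPUT_TOKENS=1024 and MAX_BATCH_PREFILL_TOKENS=4096.
print(resolve_batch_env({"MAX_BATCH_SIZE": "4", "MAX_INPUT_LENGTH": "1024"}))
```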
2 changes: 1 addition & 1 deletion text-generation-inference/server/Makefile
@@ -2,7 +2,7 @@
pkg_name := text_generation_server
BUILDDIR ?= $(CURDIR)/build
VERSION ?= 0.0.1
TGI_VERSION ?= "v2.4.1"
TGI_VERSION ?= "v3.0.0"
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
pkg_dir := $(BUILDDIR)/$(pkg_name)
4 changes: 2 additions & 2 deletions text-generation-inference/server/build-requirements.txt
@@ -1,3 +1,3 @@
build
grpcio-tools==1.62.1
mypy-protobuf==3.2.0
grpcio-tools==1.53.0
mypy-protobuf
@@ -99,6 +99,7 @@ def download_weights(
auto_convert: Optional[bool] = None,
extension: Optional[str] = None,
trust_remote_code: Optional[bool] = None,
merge_lora: Optional[bool] = None,
):
"""Download the model weights.

@@ -122,6 +123,8 @@ def download_weights(
logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
if auto_convert is not None:
logger.warning("'auto_convert' argument is not supported and will be ignored.")
if merge_lora is not None:
logger.warning("'merge_lora' argument is not supported and will be ignored.")

# Import here after the logger is added to log potential import exceptions
from optimum.tpu.model import fetch_model
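The download_weights change above follows the file's existing pattern of accepting launcher-forwarded flags that this backend does not support and warning that they are ignored. A minimal, self-contained sketch of that pattern (download_weights_sketch is a hypothetical function, not the actual CLI entry point):

```python
from typing import Optional
import logging

logger = logging.getLogger("cli-sketch")

def download_weights_sketch(model_id: str,
                            merge_lora: Optional[bool] = None,
                            trust_remote_code: Optional[bool] = None) -> None:
    # Flags the launcher may pass but this backend does not support are logged and
    # dropped, so newer launchers keep working against this server.
    unsupported = {"merge_lora": merge_lora, "trust_remote_code": trust_remote_code}
    for name, value in unsupported.items():
        if value is not None:
            logger.warning(f"'{name}' argument is not supported and will be ignored.")
    # ...the actual weight fetching would happen here...
```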
@@ -298,6 +298,12 @@ def attention_mask(self) -> torch.LongTensor:
def max_token(self) -> int:
return self._generation_config.max_length

@property
def max_new_tokens(self) -> int:
# The current value of max_new_tokens: it might differ from the target max_new_tokens
# if the slot has been paused and resumed.
return self._generation_config.max_new_tokens


class TpuGeneratorSingleThread(Generator):
"""A Generator for models running on TPU, single threaded."""
@@ -474,6 +480,9 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
slot.assign(self.batch_id, request, self.model.generation_config)
self.slots.append(slot)
logger.debug(f"Request {slot.request_id} assigned to slot {slot.id}")
logger.debug(
f"Request {slot.request_id} assigned to slot {slot.id} with and max_new_tokens {slot.max_new_tokens}"
)
# Reconstruct the full inputs (without padding) as seen by the model.
# This comprises:
# - the inputs for new requests,
@@ -576,6 +585,8 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
slot.append(next_token)
slot.resume()
logger.debug("Model ready for decoding")
if next_batch is not None:
logger.debug(f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}")
return generation, next_batch

@torch.no_grad
@@ -704,14 +715,16 @@ def _post_generate(
if next_token == self.tokenizer.eos_token_id:
finish_reason = FinishReason.FINISH_REASON_EOS_TOKEN
elif slot.stopped:
# For now we only support the length stopping criteria
finish_reason = FinishReason.FINISH_REASON_LENGTH
if slot.generated_tokens == slot.max_new_tokens:
finish_reason = FinishReason.FINISH_REASON_LENGTH
else:
finish_reason = FinishReason.FINISH_REASON_STOP_SEQUENCE
if finish_reason is not None:
# We must include the generated text for each finished sequence in the response
generated_text = GeneratedText(
text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
)
logger.debug(f"Finished generating tokens for request {request_id}")
logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
# This slot is now empty, it will be removed from the list of
# active slots once a new prefill is requested
slot.clear()
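The finish-reason change above replaces the old length-only assumption with an explicit token-budget check. A hedged, self-contained sketch of that resolution logic, using simplified names that mirror the slot attributes shown in the diff rather than the actual optimum-tpu classes:

```python
from enum import Enum
from typing import Optional

class FinishReason(Enum):
    EOS_TOKEN = "eos_token"
    LENGTH = "length"
    STOP_SEQUENCE = "stop_sequence"

def resolve_finish_reason(next_token: int, eos_token_id: int, stopped: bool,
                          generated_tokens: int, max_new_tokens: int) -> Optional[FinishReason]:
    # EOS always wins.
    if next_token == eos_token_id:
        return FinishReason.EOS_TOKEN
    if stopped:
        # Before this PR every stop was reported as LENGTH; the new check only reports
        # LENGTH when the token budget is exhausted, otherwise a stop sequence fired.
        if generated_tokens == max_new_tokens:
            return FinishReason.LENGTH
        return FinishReason.STOP_SEQUENCE
    return None  # the slot is still generating

# Example: stopped after 12 of 20 allowed tokens -> a stop sequence ended generation.
assert resolve_finish_reason(7, 2, True, 12, 20) is FinishReason.STOP_SEQUENCE
```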
@@ -228,6 +228,11 @@ def empty(self) -> bool:
def seed(self) -> int:
return self._seed

@property
def max_new_tokens(self) -> int:
# The current value of max_new_tokens: it might differ from the target max_new_tokens
# if the slot has been paused and resumed.
return self._generation_config.max_new_tokens

class PrefillSlot:
def __init__(self):
@@ -443,7 +448,9 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
self.prefill_slot.set(slot)
self.slot_index += 1
slot.assign(self.batch_id, request, self.model.generation_config)
logger.debug(f"Request {slot.request_id} assigned to slot {slot.id}")
logger.debug(
f"Request {slot.request_id} assigned to slot {slot.id} with and max_new_tokens {slot.max_new_tokens}"
)

# Tokenize the inputs
input_ids, true_lengths = self._token_encode(request.inputs, slot.truncate)
@@ -475,6 +482,8 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
cached_batch = self._cached_batch(self.batch_id, prefilled_active_slots)
self.batch_id += 1
logger.debug("Model ready for decoding")
if cached_batch is not None:
logger.debug(f"Next batch is {cached_batch.id} with requests: {cached_batch.request_ids}")
return generations, cached_batch

def _select_from_slots(self, logits: jnp.ndarray, batch_size: int=0) -> jnp.ndarray:
@@ -566,15 +575,17 @@ def _post_generate(self, slot: Slot, next_token: int, generations: List[Generati
if next_token == self.tokenizer.eos_token_id:
finish_reason = FinishReason.FINISH_REASON_EOS_TOKEN
elif slot.stopped:
# For now we only support the length stopping criteria
finish_reason = FinishReason.FINISH_REASON_LENGTH
if slot.generated_tokens == slot.max_new_tokens:
finish_reason = FinishReason.FINISH_REASON_LENGTH
else:
finish_reason = FinishReason.FINISH_REASON_STOP_SEQUENCE
request_id = slot.request_id
if finish_reason is not None:
# We must include the generated text for each finished sequence in the response
generated_text = GeneratedText(
text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
)
logger.debug(f"Finished generating tokens for request {request_id}")
logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
# This slot is now empty, it will be removed from the list of
# active slots.
slot.clear()