Bump TGI version to v3.0.0 #135
Merged · 6 commits · Jan 6, 2025
Changes from 5 commits
2 changes: 0 additions & 2 deletions .github/workflows/tpu-tgi-release.yml
@@ -76,7 +76,6 @@ jobs:
labels: ${{ steps.meta.outputs.labels }}
build-args: |
VERSION=${{ steps.version.outputs.version }}
TGI_VERSION=v2.4.1


- name: Generate artifact attestation for TGI
@@ -97,7 +96,6 @@ jobs:
labels: ${{ steps.meta-ie.outputs.labels }}
build-args: |
VERSION=${{ steps.version.outputs.version }}
TGI_VERSION=v2.4.1
target: inference-endpoint


2 changes: 1 addition & 1 deletion Makefile
@@ -19,7 +19,7 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL))

.PHONY: build_dist style style_check clean

TGI_VERSION ?= v2.4.1
TGI_VERSION ?= v3.0.0

rwildcard=$(wildcard $1) $(foreach d,$1,$(call rwildcard,$(addsuffix /$(notdir $d),$(wildcard $(dir $d)*))))

47 changes: 47 additions & 0 deletions text-generation-inference/Cargo.toml
@@ -0,0 +1,47 @@
[workspace]
members = [
"backends/v2",
"backends/grpc-metadata",
"launcher",
"router"
]
default-members = [
"backends/v2",
"backends/grpc-metadata",
"launcher",
"router"
]
resolver = "2"

[workspace.package]
version = "3.0.0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"

[workspace.dependencies]
base64 = "0.22.0"
tokenizers = { version = "0.20.0", features = ["http"] }
hf-hub = { version = "0.3.1", features = ["tokio"] }
metrics = { version = "0.23.0" }
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
minijinja = { version = "2.2.0", features = ["json"] }
minijinja-contrib = { version = "2.0.2", features = ["pycompat"] }
pyo3 = { version = "0.22.2", features = ["auto-initialize"] }

[profile.release]
incremental = true

[profile.release-binary]
inherits = "release"
debug = 1
incremental = true
panic = "abort"

[profile.release-opt]
inherits = "release"
debug = 0
incremental = false
lto = "fat"
opt-level = 3
codegen-units = 1
43 changes: 24 additions & 19 deletions text-generation-inference/docker/Dockerfile
@@ -1,25 +1,35 @@
# Fetch and extract the TGI sources
FROM alpine AS tgi
# TGI version 2.4.1 by default
ARG TGI_VERSION=v2.4.1
# TGI version 3.0.0 by default
ARG TGI_VERSION=v3.0.0
RUN test -n ${TGI_VERSION:?}
RUN mkdir -p /tgi
ADD https://github.com/huggingface/text-generation-inference/archive/${TGI_VERSION}.tar.gz /tgi/sources.tar.gz
RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1

# Build cargo components (adapted from TGI original Dockerfile)
# Note that the build image is aligned on the same Linux version as the base image (Debian bookworm/ Ubuntu 22.04)
FROM lukemathwalker/cargo-chef:latest-rust-1.80.1-bookworm AS chef
# Note: we cannot use the cargo-chef base image as it uses python 3.11
FROM ubuntu:22.04 AS chef

RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
curl ca-certificates build-essential \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.80.1 --profile minimal -y
ENV PATH="/root/.cargo/bin:${PATH}"
RUN cargo install cargo-chef --locked

WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY --from=tgi /tgi/Cargo.toml Cargo.toml
COPY text-generation-inference/Cargo.toml Cargo.toml
COPY --from=tgi /tgi/Cargo.lock Cargo.lock
COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
COPY --from=tgi /tgi/proto proto
COPY --from=tgi /tgi/benchmark benchmark
COPY --from=tgi /tgi/router router
COPY --from=tgi /tgi/backends backends
COPY --from=tgi /tgi/launcher launcher
@@ -29,22 +39,25 @@ FROM chef AS builder
ARG ENABLE_GOOGLE_FEATURE
RUN echo "Google Feature Status: ${ENABLE_GOOGLE_FEATURE}"

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
python3.11-dev
RUN apt-get update -y \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
unzip python3-dev libssl-dev pkg-config \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
rm -f $PROTOC_ZIP

COPY text-generation-inference/Cargo.toml Cargo.toml
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

COPY --from=tgi /tgi/Cargo.toml Cargo.toml
COPY --from=tgi /tgi/Cargo.lock Cargo.lock
COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
COPY --from=tgi /tgi/proto proto
COPY --from=tgi /tgi/benchmark benchmark
COPY --from=tgi /tgi/router router
COPY --from=tgi /tgi/backends backends
COPY --from=tgi /tgi/launcher launcher
@@ -114,8 +127,6 @@ ARG VERSION=${VERSION}
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
libpython3.10 \
libpython3.11 \
python3.11 \
git \
gnupg2 \
wget \
@@ -142,18 +153,14 @@ ENV PORT=${PORT:-80}
ENV HF_HOME=${ENABLE_GOOGLE_FEATURE:+/tmp}
ENV HF_HOME=${HF_HOME:-/data}

# Install requirements for TGI, that uses python3.11
RUN python3.11 -m pip install transformers==${TRANSFORMERS_VERSION}

# Install requirements for optimum-tpu, then for TGI then optimum-tpu
RUN python3 -m pip install hf_transfer safetensors==${SAFETENSORS_VERSION} typer
COPY --from=optimum-tpu-installer /opt/optimum-tpu /opt/optimum-tpu
RUN python3 /opt/optimum-tpu/optimum/tpu/cli.py install-jetstream-pytorch --yes
RUN python3 -m pip install -e /opt/optimum-tpu \
-f https://storage.googleapis.com/libtpu-releases/index.html

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark

# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router-v2 /usr/local/bin/text-generation-router
# Install launcher
@@ -196,5 +203,3 @@ ENTRYPOINT ["./entrypoint.sh"]
FROM tpu_base

ENTRYPOINT ["text-generation-launcher"]
# This is commented out in the original TGI Dockerfile
# CMD ["--json-output"]
13 changes: 13 additions & 0 deletions text-generation-inference/docker/entrypoint.sh
@@ -10,6 +10,19 @@ if [[ -z "${MAX_BATCH_SIZE}" ]]; then
fi
export MAX_BATCH_SIZE="${MAX_BATCH_SIZE}"

# At some point we used to have MAX_INPUT_LENGTH; now we should use MAX_INPUT_TOKENS.
# (The launcher would handle this automatically, but we need the value here to
# calculate MAX_BATCH_PREFILL_TOKENS when it is not set.)
if [[ -z "${MAX_INPUT_TOKENS}" && -n ${MAX_INPUT_LENGTH} ]]; then
MAX_INPUT_TOKENS=${MAX_INPUT_LENGTH}
unset MAX_INPUT_LENGTH
fi
Collaborator
Suggested change:

if [[ -z "${MAX_INPUT_TOKENS}" && -n ${MAX_INPUT_LENGTH} ]]; then
MAX_INPUT_TOKENS=${MAX_INPUT_LENGTH}
fi
unset MAX_INPUT_LENGTH

Maybe we should unset MAX_INPUT_LENGTH anyway, to prevent any downstream use of it, as it is deprecated.

if [[ -z "${MAX_BATCH_PREFILL_TOKENS}" ]]; then
MAX_BATCH_PREFILL_TOKENS=$(( ${MAX_BATCH_SIZE} * ${MAX_INPUT_TOKENS} ))
fi
export MAX_BATCH_PREFILL_TOKENS="${MAX_BATCH_PREFILL_TOKENS}"

if [[ -z "${JSON_OUTPUT_DISABLE}" ]]; then
JSON_OUTPUT_DISABLE=--json-output
else
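As an aside on the entrypoint logic above, here is a minimal Python sketch of the same fallback and prefill-budget arithmetic. The authoritative implementation is the shell in entrypoint.sh; resolve_batch_env is a hypothetical helper used only for illustration.

```python
def resolve_batch_env(env: dict) -> dict:
    # Deprecated MAX_INPUT_LENGTH is accepted as a fallback for MAX_INPUT_TOKENS.
    if "MAX_INPUT_TOKENS" not in env and "MAX_INPUT_LENGTH" in env:
        env["MAX_INPUT_TOKENS"] = env.pop("MAX_INPUT_LENGTH")
    # When not set explicitly, the prefill budget defaults to batch_size * input_tokens.
    if "MAX_BATCH_PREFILL_TOKENS" not in env:
        env["MAX_BATCH_PREFILL_TOKENS"] = str(
            int(env["MAX_BATCH_SIZE"]) * int(env["MAX_INPUT_TOKENS"])
        )
    return env

# Example: MAX_BATCH_SIZE=4 with only the legacy MAX_INPUT_LENGTH=1024 set yields
# MAX_INPUT_TOKENS=1024 and MAX_BATCH_PREFILL_TOKENS=4096.
print(resolve_batch_env({"MAX_BATCH_SIZE": "4", "MAX_INPUT_LENGTH": "1024"}))
```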
2 changes: 1 addition & 1 deletion text-generation-inference/server/Makefile
@@ -2,7 +2,7 @@
pkg_name := text_generation_server
BUILDDIR ?= $(CURDIR)/build
VERSION ?= 0.0.1
TGI_VERSION ?= "v2.4.1"
TGI_VERSION ?= "v3.0.0"
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
pkg_dir := $(BUILDDIR)/$(pkg_name)
4 changes: 2 additions & 2 deletions text-generation-inference/server/build-requirements.txt
@@ -1,3 +1,3 @@
build
grpcio-tools==1.62.1
mypy-protobuf==3.2.0
grpcio-tools==1.53.0
mypy-protobuf
@@ -99,6 +99,7 @@ def download_weights(
auto_convert: Optional[bool] = None,
extension: Optional[str] = None,
trust_remote_code: Optional[bool] = None,
merge_lora: Optional[bool] = None,
):
"""Download the model weights.

@@ -122,6 +123,8 @@ def download_weights(
logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
if auto_convert is not None:
logger.warning("'auto_convert' argument is not supported and will be ignored.")
if merge_lora is not None:
logger.warning("'merge_lora' argument is not supported and will be ignored.")

# Import here after the logger is added to log potential import exceptions
from optimum.tpu.model import fetch_model
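The download_weights change above follows the file's existing pattern of accepting launcher-forwarded flags that this backend does not support and warning that they are ignored. A minimal, self-contained sketch of that pattern (download_weights_sketch is a hypothetical function, not the actual CLI entry point):

```python
from typing import Optional
import logging

logger = logging.getLogger("cli-sketch")

def download_weights_sketch(model_id: str,
                            merge_lora: Optional[bool] = None,
                            trust_remote_code: Optional[bool] = None) -> None:
    # Flags the launcher may pass but this backend does not support are logged and
    # dropped, so newer launchers keep working against this server.
    unsupported = {"merge_lora": merge_lora, "trust_remote_code": trust_remote_code}
    for name, value in unsupported.items():
        if value is not None:
            logger.warning(f"'{name}' argument is not supported and will be ignored.")
    # ...the actual weight fetching would happen here...
```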
@@ -298,6 +298,12 @@ def attention_mask(self) -> torch.LongTensor:
def max_token(self) -> int:
return self._generation_config.max_length

@property
def max_new_tokens(self) -> int:
# The current value of max_new_tokens: it might differ from the target max_new_tokens
# if the slot has been paused and resumed.
return self._generation_config.max_new_tokens


class TpuGeneratorSingleThread(Generator):
"""A Generator for models running on TPU, single threaded."""
@@ -474,6 +480,9 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
slot.assign(self.batch_id, request, self.model.generation_config)
self.slots.append(slot)
logger.debug(f"Request {slot.request_id} assigned to slot {slot.id}")
logger.debug(
f"Request {slot.request_id} assigned to slot {slot.id} with and max_new_tokens {slot.max_new_tokens}"
)
# Reconstruct the full inputs (without padding) as seen by the model.
# This comprises:
# - the inputs for new requests,
@@ -576,6 +585,8 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
slot.append(next_token)
slot.resume()
logger.debug("Model ready for decoding")
if next_batch is not None:
logger.debug(f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}")
return generation, next_batch

@torch.no_grad
@@ -704,14 +715,16 @@ def _post_generate(
if next_token == self.tokenizer.eos_token_id:
finish_reason = FinishReason.FINISH_REASON_EOS_TOKEN
elif slot.stopped:
# For now we only support the length stopping criteria
finish_reason = FinishReason.FINISH_REASON_LENGTH
if slot.generated_tokens == slot.max_new_tokens:
finish_reason = FinishReason.FINISH_REASON_LENGTH
else:
finish_reason = FinishReason.FINISH_REASON_STOP_SEQUENCE
if finish_reason is not None:
# We must include the generated text for each finished sequence in the response
generated_text = GeneratedText(
text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
)
logger.debug(f"Finished generating tokens for request {request_id}")
logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
# This slot is now empty, it will be removed from the list of
# active slots once a new prefill is requested
slot.clear()
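The finish-reason change above replaces the old length-only assumption with an explicit token-budget check. A hedged, self-contained sketch of that resolution logic, using simplified names that mirror the slot attributes shown in the diff rather than the actual optimum-tpu classes:

```python
from enum import Enum
from typing import Optional

class FinishReason(Enum):
    EOS_TOKEN = "eos_token"
    LENGTH = "length"
    STOP_SEQUENCE = "stop_sequence"

def resolve_finish_reason(next_token: int, eos_token_id: int, stopped: bool,
                          generated_tokens: int, max_new_tokens: int) -> Optional[FinishReason]:
    # EOS always wins.
    if next_token == eos_token_id:
        return FinishReason.EOS_TOKEN
    if stopped:
        # Before this PR every stop was reported as LENGTH; the new check only reports
        # LENGTH when the token budget is exhausted, otherwise a stop sequence fired.
        if generated_tokens == max_new_tokens:
            return FinishReason.LENGTH
        return FinishReason.STOP_SEQUENCE
    return None  # the slot is still generating

# Example: stopped after 12 of 20 allowed tokens -> a stop sequence ended generation.
assert resolve_finish_reason(7, 2, True, 12, 20) is FinishReason.STOP_SEQUENCE
```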
@@ -228,6 +228,11 @@ def empty(self) -> bool:
def seed(self) -> int:
return self._seed

@property
def max_new_tokens(self) -> int:
# The current value of max_new_tokens: it might differ from the target max_new_tokens
# if the slot has been paused and resumed.
return self._generation_config.max_new_tokens

class PrefillSlot:
def __init__(self):
@@ -443,7 +448,9 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
self.prefill_slot.set(slot)
self.slot_index += 1
slot.assign(self.batch_id, request, self.model.generation_config)
logger.debug(f"Request {slot.request_id} assigned to slot {slot.id}")
logger.debug(
f"Request {slot.request_id} assigned to slot {slot.id} with and max_new_tokens {slot.max_new_tokens}"
)

# Tokenize the inputs
input_ids, true_lengths = self._token_encode(request.inputs, slot.truncate)
@@ -475,6 +482,8 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
cached_batch = self._cached_batch(self.batch_id, prefilled_active_slots)
self.batch_id += 1
logger.debug("Model ready for decoding")
if cached_batch is not None:
logger.debug(f"Next batch is {cached_batch.id} with requests: {cached_batch.request_ids}")
return generations, cached_batch

def _select_from_slots(self, logits: jnp.ndarray, batch_size: int=0) -> jnp.ndarray:
@@ -566,15 +575,17 @@ def _post_generate(self, slot: Slot, next_token: int, generations: List[Generati
if next_token == self.tokenizer.eos_token_id:
finish_reason = FinishReason.FINISH_REASON_EOS_TOKEN
elif slot.stopped:
# For now we only support the length stopping criteria
finish_reason = FinishReason.FINISH_REASON_LENGTH
if slot.generated_tokens == slot.max_new_tokens:
finish_reason = FinishReason.FINISH_REASON_LENGTH
else:
finish_reason = FinishReason.FINISH_REASON_STOP_SEQUENCE
request_id = slot.request_id
if finish_reason is not None:
# We must include the generated text for each finished sequence in the response
generated_text = GeneratedText(
text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
)
logger.debug(f"Finished generating tokens for request {request_id}")
logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
# This slot is now empty, it will be removed from the list of
# active slots.
slot.clear()