From 39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80 Mon Sep 17 00:00:00 2001 From: Shantanu Date: Tue, 12 Sep 2023 17:39:23 -0700 Subject: [PATCH] Sync codebase --- CHANGELOG.md | 3 +++ Cargo.toml | 2 +- pyproject.toml | 2 +- tiktoken/__init__.py | 2 ++ tiktoken/model.py | 29 ++++++++++++++++++++--------- 5 files changed, 27 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 26187242..8f37d7b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ This is the changelog for the open source version of tiktoken. +## [v0.5.1] +- Add `encoding_name_for_model`, undo some renames to variables that are implementation details + ## [v0.5.0] - Add `tiktoken._educational` submodule to better document how byte pair encoding works - Ensure `encoding_for_model` knows about several new models diff --git a/Cargo.toml b/Cargo.toml index d789da9e..0486639a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tiktoken" -version = "0.5.0" +version = "0.5.1" edition = "2021" rust-version = "1.57.0" diff --git a/pyproject.toml b/pyproject.toml index 25849475..e3df78a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "tiktoken" -version = "0.5.0" +version = "0.5.1" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" readme = "README.md" license = {file = "LICENSE"} diff --git a/tiktoken/__init__.py b/tiktoken/__init__.py index 9ad09a35..3a531b18 100644 --- a/tiktoken/__init__.py +++ b/tiktoken/__init__.py @@ -1,4 +1,6 @@ +# This is the public API of tiktoken from .core import Encoding as Encoding from .model import encoding_for_model as encoding_for_model +from .model import encoding_name_for_model as encoding_name_for_model from .registry import get_encoding as get_encoding from .registry import list_encoding_names as list_encoding_names diff --git a/tiktoken/model.py b/tiktoken/model.py index a67830cb..3f366937 100644 --- a/tiktoken/model.py +++ b/tiktoken/model.py @@ -4,7 +4,7 @@ from .registry import get_encoding # TODO: these will likely be replaced by an API endpoint -_MODEL_PREFIX_TO_ENCODING: dict[str, str] = { +MODEL_PREFIX_TO_ENCODING: dict[str, str] = { # chat "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc. @@ -16,7 +16,7 @@ "ft:babbage-002": "cl100k_base", } -_MODEL_TO_ENCODING: dict[str, str] = { +MODEL_TO_ENCODING: dict[str, str] = { # chat "gpt-4": "cl100k_base", "gpt-3.5-turbo": "cl100k_base", @@ -64,18 +64,21 @@ } -def encoding_for_model(model_name: str) -> Encoding: - """Returns the encoding used by a model.""" +def encoding_name_for_model(model_name: str) -> str: + """Returns the name of the encoding used by a model. + + Raises a KeyError if the model name is not recognised. + """ encoding_name = None - if model_name in _MODEL_TO_ENCODING: - encoding_name = _MODEL_TO_ENCODING[model_name] + if model_name in MODEL_TO_ENCODING: + encoding_name = MODEL_TO_ENCODING[model_name] else: # Check if the model matches a known prefix # Prefix matching avoids needing library updates for every model version release # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE) - for model_prefix, model_encoding_name in _MODEL_PREFIX_TO_ENCODING.items(): + for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items(): if model_name.startswith(model_prefix): - return get_encoding(model_encoding_name) + return model_encoding_name if encoding_name is None: raise KeyError( @@ -83,4 +86,12 @@ def encoding_for_model(model_name: str) -> Encoding: "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect." ) from None - return get_encoding(encoding_name) + return encoding_name + + +def encoding_for_model(model_name: str) -> Encoding: + """Returns the encoding used by a model. + + Raises a KeyError if the model name is not recognised. + """ + return get_encoding(encoding_name_for_model(model_name))