From 39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80 Mon Sep 17 00:00:00 2001
From: Shantanu <shantanu@openai.com>
Date: Tue, 12 Sep 2023 17:39:23 -0700
Subject: [PATCH] Sync codebase

---
 CHANGELOG.md         |  3 +++
 Cargo.toml           |  2 +-
 pyproject.toml       |  2 +-
 tiktoken/__init__.py |  2 ++
 tiktoken/model.py    | 29 ++++++++++++++++++++---------
 5 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 26187242..8f37d7b4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.5.1]
+- Add `encoding_name_for_model`; undo some renames of variables that are implementation details
+
 ## [v0.5.0]
 - Add `tiktoken._educational` submodule to better document how byte pair encoding works
 - Ensure `encoding_for_model` knows about several new models
diff --git a/Cargo.toml b/Cargo.toml
index d789da9e..0486639a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.5.0"
+version = "0.5.1"
 edition = "2021"
 rust-version = "1.57.0"
 
diff --git a/pyproject.toml b/pyproject.toml
index 25849475..e3df78a8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "tiktoken"
-version = "0.5.0"
+version = "0.5.1"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
 license = {file = "LICENSE"}
diff --git a/tiktoken/__init__.py b/tiktoken/__init__.py
index 9ad09a35..3a531b18 100644
--- a/tiktoken/__init__.py
+++ b/tiktoken/__init__.py
@@ -1,4 +1,6 @@
+# This is the public API of tiktoken
 from .core import Encoding as Encoding
 from .model import encoding_for_model as encoding_for_model
+from .model import encoding_name_for_model as encoding_name_for_model
 from .registry import get_encoding as get_encoding
 from .registry import list_encoding_names as list_encoding_names
diff --git a/tiktoken/model.py b/tiktoken/model.py
index a67830cb..3f366937 100644
--- a/tiktoken/model.py
+++ b/tiktoken/model.py
@@ -4,7 +4,7 @@
 from .registry import get_encoding
 
 # TODO: these will likely be replaced by an API endpoint
-_MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
+MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
     # chat
     "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
     "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
@@ -16,7 +16,7 @@
     "ft:babbage-002": "cl100k_base",
 }
 
-_MODEL_TO_ENCODING: dict[str, str] = {
+MODEL_TO_ENCODING: dict[str, str] = {
     # chat
     "gpt-4": "cl100k_base",
     "gpt-3.5-turbo": "cl100k_base",
@@ -64,18 +64,21 @@
 }
 
 
-def encoding_for_model(model_name: str) -> Encoding:
-    """Returns the encoding used by a model."""
+def encoding_name_for_model(model_name: str) -> str:
+    """Returns the name of the encoding used by a model.
+
+    Raises a KeyError if the model name is not recognised.
+    """
     encoding_name = None
-    if model_name in _MODEL_TO_ENCODING:
-        encoding_name = _MODEL_TO_ENCODING[model_name]
+    if model_name in MODEL_TO_ENCODING:
+        encoding_name = MODEL_TO_ENCODING[model_name]
     else:
         # Check if the model matches a known prefix
         # Prefix matching avoids needing library updates for every model version release
         # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
-        for model_prefix, model_encoding_name in _MODEL_PREFIX_TO_ENCODING.items():
+        for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
             if model_name.startswith(model_prefix):
-                return get_encoding(model_encoding_name)
+                return model_encoding_name
 
     if encoding_name is None:
         raise KeyError(
@@ -83,4 +86,12 @@ def encoding_for_model(model_name: str) -> Encoding:
             "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
         ) from None
 
-    return get_encoding(encoding_name)
+    return encoding_name
+
+
+def encoding_for_model(model_name: str) -> Encoding:
+    """Returns the `Encoding` used by a model, resolved via `encoding_name_for_model`.
+
+    Raises a KeyError if the model name is not recognised.
+    """
+    return get_encoding(encoding_name_for_model(model_name))