diff --git a/CHANGELOG.md b/CHANGELOG.md index d0365b8a..2d666cda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ This is the changelog for the open source version of tiktoken. +## [v0.3.3] +- `tiktoken` will now make a best effort attempt to replace surrogate pairs with the corresponding + Unicode character and will replace lone surrogates with the Unicode replacement character. + ## [v0.3.2] - Add encoding for GPT-4 diff --git a/Cargo.toml b/Cargo.toml index 07182cd4..1b540a05 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tiktoken" -version = "0.3.2" +version = "0.3.3" edition = "2021" rust-version = "1.57.0" diff --git a/pyproject.toml b/pyproject.toml index 739d2958..23a46242 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "tiktoken" -version = "0.3.2" +version = "0.3.3" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" readme = "README.md" license = {file = "LICENSE"} diff --git a/tiktoken/core.py b/tiktoken/core.py index 05613aab..27922c72 100644 --- a/tiktoken/core.py +++ b/tiktoken/core.py @@ -65,7 +65,12 @@ def encode_ordinary(self, text: str) -> list[int]: >>> enc.encode_ordinary("hello world") [31373, 995] """ - return self._core_bpe.encode_ordinary(text) + try: + return self._core_bpe.encode_ordinary(text) + except UnicodeEncodeError: + # See comment in encode + text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace") + return self._core_bpe.encode_ordinary(text) def encode( self, @@ -111,7 +116,17 @@ def encode( if match := _special_token_regex(disallowed_special).search(text): raise_disallowed_special_token(match.group()) - return self._core_bpe.encode(text, allowed_special) + try: + return self._core_bpe.encode(text, allowed_special) + except UnicodeEncodeError: + # BPE operates on bytes, but the regex operates on unicode. If we pass a str that is + # invalid UTF-8 to Rust, it will rightfully complain. 
Here we do a quick-and-dirty + # fixup for any surrogate pairs that may have sneaked their way into the text. + # Technically, this introduces a place where encode + decode doesn't roundtrip a Python + # string, but given that this is input we want to support, we accept that trade-off. + # We also use errors="replace" so that lone surrogates become U+FFFD instead of raising. + text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace") + return self._core_bpe.encode(text, allowed_special) def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]: """Encodes a list of strings into tokens, in parallel, ignoring special tokens.