Skip to content

Commit

Permalink
updated Readme
Browse files Browse the repository at this point in the history
  • Loading branch information
RahulDey12 committed May 27, 2024
1 parent d135371 commit 7aba306
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 20 deletions.
12 changes: 10 additions & 2 deletions Readme.md → README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,13 @@ $encoder->encode('<|endoftext|>', allowedSpecial: 'all');

Tiktoken always caches the server's responses when downloading them.

By default it uses the system's default directory to cache a response but you
By default, it uses the system's default temporary directory to cache responses, but you
can still override the cache location by setting the `TIKTOKEN_CACHE_DIR` environment variable.

### Registering Custom Encoding

```php
use Rahul900day\Tiktoken\Encodings\OpenAiPublic\Cl100KBaseEncoding;
use Rahul900day\Tiktoken\Loaders\TiktokenLoader;

class Cl100KIm extends Cl100KBaseEncoding
{
Expand Down Expand Up @@ -73,3 +72,12 @@ $encoding = Tiktoken::getEncoding('cl100k_im');
$encoding->encode("<|im_start|>", allowedSpecial: 'all');

```

## Credits

- [Rahul Dey](https://github.com/RahulDey12)
- [All Contributors](https://github.com/RahulDey12/tiktoken-php/graphs/contributors)

## License

This package is released under the [MIT License](https://github.com/RahulDey12/tiktoken-php/blob/main/LICENSE.md).
41 changes: 23 additions & 18 deletions test.php
Original file line number Diff line number Diff line change
@@ -1,26 +1,31 @@
<?php

use Rahul900day\Tiktoken\Tiktoken;
require_once __DIR__.'/vendor/autoload.php';

require __DIR__.'/vendor/autoload.php';
use Rahul900day\Tiktoken\Encodings\OpenAiPublic\Cl100KBaseEncoding;

//$l = new Loader();
//$data = $l->loadDataGymRanks(
// "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
// "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
// "1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
// "196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
//);
/**
 * Custom encoding built on top of Cl100KBaseEncoding.
 *
 * Overrides the encoding name and adds two special tokens
 * (`<|im_start|>` and `<|im_end|>`) to whatever the parent defines.
 */
class Cl100KIm extends Cl100KBaseEncoding
{
/**
 * Returns the name of this encoding: 'cl100k_im'.
 */
protected function getName(): string
{
return 'cl100k_im';
}

//$data = $l->load_tiktoken_bpe(
// "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
// "306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
//);
/**
 * Returns the parent's special tokens plus the two extra entries
 * `<|im_start|>` => 100264 and `<|im_end|>` => 100265.
 */
protected function getSpecialTokens(): array
{
return [
// Start from the parent's token map, then append the additional tokens.
...parent::getSpecialTokens(),
"<|im_start|>" => 100264,
"<|im_end|>" => 100265,
];
}
}

//var_dump($data);
use Rahul900day\Tiktoken\Registry;
use Rahul900day\Tiktoken\Tiktoken;

// Fetch the encoder for the 'gpt-4' model and encode two sample strings;
// the first call passes allowedSpecial: 'all' for the '<|endoftext|>' input.
$encoder = Tiktoken::getEncodingForModel('gpt-4');
$result = $encoder->encode('<|endoftext|>', allowedSpecial: 'all');
$r2 = $encoder->encode('Hello World');
// Register a Cl100KIm instance under the name 'cl100k_im', then look it up by that name.
Registry::registerCustomEncoding('cl100k_im', new Cl100KIm);
$encoding = Tiktoken::getEncoding('cl100k_im');

// NOTE(review): dd() is not defined in this file; presumably a dump-and-die
// helper provided through the autoloaded dependencies — confirm.
dd($result, $r2);
// Expect: 100264
dd($encoding->encode("<|im_start|>", allowedSpecial: 'all'));

0 comments on commit 7aba306

Please sign in to comment.