Sync: devchat[main](f6590bfd) Merge pull request #389 from devchat-ai/log_llm_calls

Sync-Packages Action 2024-05-18 07:06:58 +00:00
parent 501b7ff4ae
commit f256ec7f4d
60 changed files with 678 additions and 580 deletions

View File

@ -31,7 +31,7 @@ Requires-Dist: pathspec (>=0.12.1,<0.13.0)
Requires-Dist: pydantic (==1.10.14)
Requires-Dist: rich_click (>=1.6.1,<2.0.0)
Requires-Dist: tenacity (>=8.2.3,<9.0.0)
Requires-Dist: tiktoken (>=0.4.0,<0.5.0)
Requires-Dist: tiktoken (>0.4.0)
Requires-Dist: tinydb (>=4.7.1,<5.0.0)
Requires-Dist: urllib3 (<2.0)
Description-Content-Type: text/markdown

View File

@ -1,7 +1,7 @@
../../../bin/devchat,sha256=a8KMZYH-GZd6OA7nXki105OsOlnCcZkv9SCnT1Fa3UU,260
devchat-0.2.10.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
devchat-0.2.10.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
devchat-0.2.10.dist-info/METADATA,sha256=rG2jPljWa__TRpp63OJYF6TE7hh3osGAQgYvEJi4Bn0,7314
devchat-0.2.10.dist-info/METADATA,sha256=Q5u0xLzfHCNzM0-vKtsoHE3DDbhXi9iRjUxw9KvYApI,7306
devchat-0.2.10.dist-info/RECORD,,
devchat-0.2.10.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
devchat-0.2.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
@ -94,8 +94,8 @@ devchat/llm/__pycache__/pipeline.cpython-38.pyc,,
devchat/llm/__pycache__/text_confirm.cpython-38.pyc,,
devchat/llm/__pycache__/tools_call.cpython-38.pyc,,
devchat/llm/chat.py,sha256=XWXUIpbWCMiuMCtBchrQpBpuyLwjga5KcCFzDoapbcc,3377
devchat/llm/openai.py,sha256=8hK2OByDRq8sPgROf-UvVPA8Oz0lSDfMaAFSKh0D644,6208
devchat/llm/pipeline.py,sha256=D214HASOUA7DsUm63_QDVFTYsHShPrrBwTbd0hM3tRI,1920
devchat/llm/openai.py,sha256=VnYIl2XB7qNDuRWJxAcwMGQk8v9JwL8aZ-J-SXduN4Y,6492
devchat/llm/pipeline.py,sha256=qxOCMYJi-TlA_gBN2r6ImG_U5qzcAWnbZ0oThJ1RbTc,2267
devchat/llm/text_confirm.py,sha256=sdt7AUFDcsOZ0fLfS0vtjdS2_8xhkTF6aF8Sn05OlI0,1462
devchat/llm/tools_call.py,sha256=OBObtFAzuqEJPq7Ro9hR4oirrcMtxGchlMQl8vL1CBc,8038
devchat/memory/__init__.py,sha256=aPR0Dt8dcf4oWXu2HME2fFSpDJDeoBayPWMFOpO8v5k,133

View File

@ -7,6 +7,8 @@ from typing import Dict, List
import httpx
import openai
from devchat.ide import IDEService
from .pipeline import (
RetryException,
exception_handle,
@ -83,6 +85,7 @@ def retry_timeout(chunks):
for chunk in chunks:
yield chunk
except (openai.APIConnectionError, openai.APITimeoutError) as err:
IDEService().ide_logging("info", f"in retry_timeout: err: {err}")
raise RetryException(err) from err
@ -127,8 +130,10 @@ def content_to_json(content):
response_obj = json.loads(content_no_block)
return response_obj
except json.JSONDecodeError as err:
IDEService().ide_logging("info", f"in content_to_json: json decode error: {err}")
raise RetryException(err) from err
except Exception as err:
IDEService().ide_logging("info", f"in content_to_json: other error: {err}")
raise err
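
The hunks above only show the changed lines. Reassembled from the surrounding context, the wrapped streaming generator reads roughly as follows (a sketch of the pattern, not the verbatim file):

```python
import openai

from devchat.ide import IDEService

from .pipeline import RetryException


def retry_timeout(chunks):
    """Relay streamed chunks, logging transient OpenAI errors before asking for a retry."""
    try:
        for chunk in chunks:
            yield chunk
    except (openai.APIConnectionError, openai.APITimeoutError) as err:
        # New in this commit: record the failure in the IDE log, then signal a retry.
        IDEService().ide_logging("info", f"in retry_timeout: err: {err}")
        raise RetryException(err) from err
```

content_to_json gets the same treatment: a json.JSONDecodeError is logged and converted into a RetryException, while any other exception is logged and re-raised.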

View File

@ -1,8 +1,11 @@
import sys
import time
from typing import Dict
import openai
from devchat.ide import IDEService
class RetryException(Exception):
def __init__(self, err):
@ -17,8 +20,10 @@ def retry(func, times):
except RetryException as err:
if index + 1 == times:
raise err.error
IDEService().ide_logging("debug", f"has retries: {index + 1}")
continue
except Exception as err:
IDEService().ide_logging("info", f"exception: {err}")
raise err.error
return wrapper
@ -59,6 +64,7 @@ def exception_handle(func, handler):
def pipeline(*funcs):
def wrapper(*args, **kwargs):
start_time = time.time()
for index, func in enumerate(funcs):
if index > 0:
if isinstance(args, Dict) and args.get("__type__", None) == "parallel":
@ -67,6 +73,8 @@ def pipeline(*funcs):
args = func(args)
else:
args = func(*args, **kwargs)
end_time = time.time()
IDEService().ide_logging("debug", f"time on pipeline: {end_time-start_time}")
return args
return wrapper
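
With this change every composed pipeline also logs its wall-clock time. A hypothetical composition for illustration only (prepare_prompt and call_model are placeholder stages, not real devchat functions; the actual pipelines are composed elsewhere in devchat/llm):

```python
from devchat.llm.pipeline import pipeline

# Placeholder stages, defined here only so the sketch runs.
def prepare_prompt(messages):
    return messages

def call_model(messages):
    return {"answer": "..."}

run = pipeline(prepare_prompt, call_model)
result = run([{"role": "user", "content": "hello"}])
# Each invocation now also emits a debug entry like "time on pipeline: 0.12" via IDEService.
```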

View File

@ -1,6 +1,6 @@
Metadata-Version: 2.3
Name: openai
Version: 1.29.0
Version: 1.30.1
Summary: The official Python library for the openai API
Project-URL: Homepage, https://github.com/openai/openai-python
Project-URL: Repository, https://github.com/openai/openai-python

View File

@ -1,10 +1,10 @@
../../../bin/openai,sha256=OM6FORuLrwfh02Zj_-DY6nOIKjU9ftrONpb5slPZlhM,253
openai-1.29.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
openai-1.29.0.dist-info/METADATA,sha256=cZxRLlTiimxDXv2dE0M5Se6E_fMw0axdYGJzNsnFTOU,21941
openai-1.29.0.dist-info/RECORD,,
openai-1.29.0.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
openai-1.29.0.dist-info/entry_points.txt,sha256=kAYhQEmziJwsKs5raYAIOvJ2LWmbz5dulEXOzsY71ro,43
openai-1.29.0.dist-info/licenses/LICENSE,sha256=d0M6HDjQ76tf255XPlAGkIoECMe688MXcGEYsOFySfI,11336
openai-1.30.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
openai-1.30.1.dist-info/METADATA,sha256=DzzOq2T6f1fkTmkYH-M9wfjpZaIaWqkvBjG6rnWBqDw,21941
openai-1.30.1.dist-info/RECORD,,
openai-1.30.1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
openai-1.30.1.dist-info/entry_points.txt,sha256=kAYhQEmziJwsKs5raYAIOvJ2LWmbz5dulEXOzsY71ro,43
openai-1.30.1.dist-info/licenses/LICENSE,sha256=d0M6HDjQ76tf255XPlAGkIoECMe688MXcGEYsOFySfI,11336
openai/__init__.py,sha256=hTM-EsfeafKBLu-n5AVSQVDB2MMBGnZoLtATFeW-OL0,10007
openai/__main__.py,sha256=bYt9eEaoRQWdejEHFD8REx9jxVEdZptECFsV7F49Ink,30
openai/__pycache__/__init__.cpython-38.pyc,,
@ -64,7 +64,7 @@ openai/_utils/_sync.py,sha256=8zEEYfir8iCUcAMFtWd8cDi8NVEaZonc4sfLAYr16io,2269
openai/_utils/_transform.py,sha256=NCz3q9_O-vuj60xVe-qzhEQ8uJWlZWJTsM-GwHDccf8,12958
openai/_utils/_typing.py,sha256=tFbktdpdHCQliwzGsWysgn0P5H0JRdagkZdb_LegGkY,3838
openai/_utils/_utils.py,sha256=1_mm0IcPWDckpwQrb5chWTqeG7JWst_ycXaoFUTXbzE,11497
openai/_version.py,sha256=7kEu6q_6Mk9wI8Ot8PAoWdAXrIe5hSH3VkSKRvWdNSc,159
openai/_version.py,sha256=K2r2kM0eNbvNjtxPpgtRbJoHYQWganeQIMbwtYW_BDA,159
openai/cli/__init__.py,sha256=soGgtqyomgddl92H0KJRqHqGuaXIaghq86qkzLuVp7U,31
openai/cli/__pycache__/__init__.cpython-38.pyc,,
openai/cli/__pycache__/_cli.cpython-38.pyc,,
@ -135,12 +135,12 @@ openai/resources/audio/audio.py,sha256=1HHcDRWT58KshYelRdSnJs-0bvMBRS1vOhnU-h_oP
openai/resources/audio/speech.py,sha256=A4_SwpCesEfHg89cxazNdrHz8JxNvUp5LlLNoMqo-0w,7876
openai/resources/audio/transcriptions.py,sha256=bBdQZXzjamZIbe5R_Ji9JJ6W9nJCNN7EwQVinu572Pk,11128
openai/resources/audio/translations.py,sha256=_NoBAOXYqMEtjeUhdoHF3DNb-UqnhqVrmfqgITvhajI,9070
openai/resources/batches.py,sha256=HpMvKfSgC3F5ea8ZlmvvnJ5A0tkpzjMJkAioo4vk0Cs,17614
openai/resources/batches.py,sha256=QsK-LsjUuW9rzRyLfgmAj-e9Idve1GAUuG4JxJ4vPWA,18188
openai/resources/beta/__init__.py,sha256=nXoV4P8WCrbEZuNMtptbIuy_LqlVafY9lJ2qfW35GFc,1636
openai/resources/beta/__pycache__/__init__.cpython-38.pyc,,
openai/resources/beta/__pycache__/assistants.cpython-38.pyc,,
openai/resources/beta/__pycache__/beta.cpython-38.pyc,,
openai/resources/beta/assistants.py,sha256=dGJLZIqkpeS_6DTYTVmL7Gb8lRXm08_miXSGwUJI4Yo,39476
openai/resources/beta/assistants.py,sha256=jE9tf1oWbDEf28WRRD2_lgg_pkz52aHi0xM0-B7cuwI,39768
openai/resources/beta/beta.py,sha256=xw_dfi9ZpyRG4ChwweQtirWwsWxhAA4mXSV46D7pS5M,4485
openai/resources/beta/threads/__init__.py,sha256=fQ_qdUVSfouVS5h47DlTb5mamChT4K-v-siPuuAB6do,1177
openai/resources/beta/threads/__pycache__/__init__.cpython-38.pyc,,
@ -151,9 +151,9 @@ openai/resources/beta/threads/runs/__init__.py,sha256=2FfDaqwmJJCd-IVpY_CrzWcFvw
openai/resources/beta/threads/runs/__pycache__/__init__.cpython-38.pyc,,
openai/resources/beta/threads/runs/__pycache__/runs.cpython-38.pyc,,
openai/resources/beta/threads/runs/__pycache__/steps.cpython-38.pyc,,
openai/resources/beta/threads/runs/runs.py,sha256=trveAGqtbYxNTdLct6xjcFJOiVJyYcTks1rsDDFqYOI,148671
openai/resources/beta/threads/runs/runs.py,sha256=06N5t4J-bfTF6iFeuiJAUDeGi8hgk91rTOVLqChxlxM,149137
openai/resources/beta/threads/runs/steps.py,sha256=uRykb4JapSNZCF8OD54f5qOWtrp2GoU1k5uAZgA4kAk,12223
openai/resources/beta/threads/threads.py,sha256=-X8O2UODf3TIvXw6iiRTl7wcw50rJVLoLmj94sjrSwE,100560
openai/resources/beta/threads/threads.py,sha256=IMyZG0pD7a_ZT2UJ83MxgFh8ShXna_HFdIJqGqLH1rs,100998
openai/resources/beta/vector_stores/__init__.py,sha256=11Xn1vhgndWiI0defJHv31vmbtbDgh2GwZT3gX8GgHk,1296
openai/resources/beta/vector_stores/__pycache__/__init__.cpython-38.pyc,,
openai/resources/beta/vector_stores/__pycache__/file_batches.cpython-38.pyc,,
@ -170,7 +170,7 @@ openai/resources/chat/chat.py,sha256=Edexhbq1anfSS_I0wNRQb7rx1OV6-rq4sxgVlYDGb6Y
openai/resources/chat/completions.py,sha256=uMtKJiYRRIZ8o2MFwNTB2Kq4Tgt0KBDP2LP2B6uyyTQ,68761
openai/resources/completions.py,sha256=4Rfv9o3XwI5GRfhN1RD4tEgNn0I2jb6TRW6j0b6bpZc,58712
openai/resources/embeddings.py,sha256=cMSXtMc_7mBqlSiQ99B7qXYoRLGyoeIFazyYQ0jJ1O4,10755
openai/resources/files.py,sha256=VYmoTHNjENqDRiyQGl0ZwisIy7ysP5NTGR2B8uFJDXk,26238
openai/resources/files.py,sha256=Hdu7an1HsoYIVTp7OJiaDF2m9YmYyHwpr9_Nz8Q6DqU,26392
openai/resources/fine_tuning/__init__.py,sha256=s6uoq7gM4gwoywdOOZQkPeYiSbUl-OwpeuMhwJJk0lc,837
openai/resources/fine_tuning/__pycache__/__init__.cpython-38.pyc,,
openai/resources/fine_tuning/__pycache__/fine_tuning.cpython-38.pyc,,
@ -227,7 +227,7 @@ openai/types/audio/transcription_create_params.py,sha256=H7LOzb4VHwhF_cm0MXMIDgf
openai/types/audio/translation.py,sha256=_PhTtQ-s1yc-4kAKlgc88FTqUpXnNYfM2ld5IuRRGkA,195
openai/types/audio/translation_create_params.py,sha256=pynqbAozfcVwu1U6C6xvauZSFlQxIz1cswSXJLfRI30,1506
openai/types/batch.py,sha256=eIOIaJnDuv93fdefTI0WRfTm7MZH8gLBdF0B12JCiZw,2787
openai/types/batch_create_params.py,sha256=Kh4ZGVNBFpO3mHakKNSktaUPc-cLpBrlh9RqyLjsnqk,1183
openai/types/batch_create_params.py,sha256=vNgtioC1ADnTCdEQ6vyOlAvtq1PBioRvnBPJduz4Xoo,1440
openai/types/batch_error.py,sha256=Xxl-gYm0jerpYyI-mKSSVxRMQRubkoLUiOP9U3v72EM,622
openai/types/batch_list_params.py,sha256=X1_sfRspuIMSDyXWVh0YnJ9vJLeOOH66TrvgEHueC84,705
openai/types/batch_request_counts.py,sha256=nOzdL84OlZRycVNW99EDkdjCFqqKh68emaWT4Lx7dBE,410
@ -267,8 +267,8 @@ openai/types/beta/__pycache__/vector_store_create_params.cpython-38.pyc,,
openai/types/beta/__pycache__/vector_store_deleted.cpython-38.pyc,,
openai/types/beta/__pycache__/vector_store_list_params.cpython-38.pyc,,
openai/types/beta/__pycache__/vector_store_update_params.cpython-38.pyc,,
openai/types/beta/assistant.py,sha256=9lrwz2SdGMf553qzYltklaVSKtdQIfR7WKBFJgUr_cg,4615
openai/types/beta/assistant_create_params.py,sha256=5vqnBevWOOfO5DvG4EWpQ7B_heMCc9rT1eEgYm068RQ,6122
openai/types/beta/assistant.py,sha256=m5bgNTyelK6MA1RUrdyLg2yTalyR0Xm67K6iBOqlwSk,4674
openai/types/beta/assistant_create_params.py,sha256=AntnxPRSPdSSOYrX7anCN54aeTYry3YddIFbEGta_z0,6181
openai/types/beta/assistant_deleted.py,sha256=bTTUl5FPHTBI5nRm7d0sGuR9VCSBDZ-IbOn9G_IpmJQ,301
openai/types/beta/assistant_list_params.py,sha256=1-osjSX8tKieHSP0xaKBBU8j-J01fKrrxIJRHDudFHk,1220
openai/types/beta/assistant_response_format.py,sha256=-JYxEihoHEHMak9E7KiyD5Zh_f3c-155j110mBDTFNE,378
@ -284,7 +284,7 @@ openai/types/beta/assistant_tool_choice_option.py,sha256=WaLj1FSgQyLrss5hoKbmb19
openai/types/beta/assistant_tool_choice_option_param.py,sha256=ODCix7ElFxtyABiL09OhaYbQy9RjICCSmILeqBFWeLE,402
openai/types/beta/assistant_tool_choice_param.py,sha256=NOWx9SzZEwYaHeAyFZTQlG3pmogMNXzjPJDGQUlbv7Q,572
openai/types/beta/assistant_tool_param.py,sha256=xsB-Vq93uyS69m5zMoAc7keLXB_OSwEUH6XgB2g3ex4,450
openai/types/beta/assistant_update_params.py,sha256=8YGYglHCQhoBCleaaKsDmR13LijeDgrhIhQ5Lo8B1L0,4363
openai/types/beta/assistant_update_params.py,sha256=Z4MA4GtxZzV3a6PlUShoDmDHAIwo7AyVk9O5wUnFhe8,4422
openai/types/beta/chat/__init__.py,sha256=OKfJYcKb4NObdiRObqJV_dOyDQ8feXekDUge2o_4pXQ,122
openai/types/beta/chat/__pycache__/__init__.cpython-38.pyc,,
openai/types/beta/code_interpreter_tool.py,sha256=7mgQc9OtD_ZUnZeNhoobMFcmmvtZPFCNYGB-PEnNnfs,333
@ -294,7 +294,7 @@ openai/types/beta/file_search_tool_param.py,sha256=nAON5EUoano9jVPYZMzMYMLCxde_4
openai/types/beta/function_tool.py,sha256=oYGJfcfPpUohKw2ikgshDjOI1HXCK-5pAWyegYNezeU,397
openai/types/beta/function_tool_param.py,sha256=T_k2OX1OULgkrHHXw0rY_J-O0y5qA0lM-B58C64YyfM,453
openai/types/beta/thread.py,sha256=wd00j3ogUpOa_O0Sf1m6H4f8t1Nf05DKWiK_4m33O6s,2013
openai/types/beta/thread_create_and_run_params.py,sha256=RXTfHQiS8dktu0bkomzqrVKHopBJoqAMSAxHpgYDTs8,12692
openai/types/beta/thread_create_and_run_params.py,sha256=fd4N3XYkRhBkBJlRePjH2ZXvJ2oAgDyMoR103j4kzXw,12751
openai/types/beta/thread_create_params.py,sha256=yu1ChXFvm6FQV4486PWxes88_jg3-yspp2jDwGZOBlw,4509
openai/types/beta/thread_deleted.py,sha256=MaYG_jZIjSiB9h_ZBiTtpMsRSwFKkCY83ziM5GO_oUk,292
openai/types/beta/thread_update_params.py,sha256=RYsR88YHwReKLiLqnLlnWiReiVIGlEGvVV9-g_wptgM,1750
@ -369,10 +369,10 @@ openai/types/beta/threads/message_delta_event.py,sha256=7SpE4Dd3Lrc_cm97SzBwZzGG
openai/types/beta/threads/message_list_params.py,sha256=LXqc3deSkKO6VN337OlQ4fzG7dfgBE7Iv_CLzZHhbhw,1294
openai/types/beta/threads/message_update_params.py,sha256=bw6_U-vZA4c9_CDmeGOh7IEPIm8BU3BBOKtxnii0LKA,629
openai/types/beta/threads/required_action_function_tool_call.py,sha256=XsR4OBbxI-RWteLvhcLEDBan6eUUGvhLORFRKjPbsLg,888
openai/types/beta/threads/run.py,sha256=D6TDDeIGMS39jc2TVY4HrVw0mpBDXhro9VIzeH2ejdg,7656
openai/types/beta/threads/run_create_params.py,sha256=cqbzHcQIOEIustQs1YGv8bMXuVof9NevPnyn2N9Ok7A,9035
openai/types/beta/threads/run.py,sha256=DRc46FFjOudl1VAYKM2ni63ngCg_8TTRoRoTom9KWjU,7729
openai/types/beta/threads/run_create_params.py,sha256=J2Une2MRcK0LgPOpGuw5ngg_kZykivtNbR1-TpHmzmw,9094
openai/types/beta/threads/run_list_params.py,sha256=73poqeRcb5TEsIVn7OzJ_g9OajNokEzpCVLzVNKZmPk,1208
openai/types/beta/threads/run_status.py,sha256=6KPJB7l0YfGSKzx4wuIP8SDiZSiaD2nb0KOf0uRPDP4,282
openai/types/beta/threads/run_status.py,sha256=ky3dh-uD5OhuQB7e4BMQjRXvIDOUJnecTKGXr_PNcFY,329
openai/types/beta/threads/run_submit_tool_outputs_params.py,sha256=aDrg0FZZoJKaPVQzcFjUg4ZKaeW8KF6UJBxhJEIjC2I,1630
openai/types/beta/threads/run_update_params.py,sha256=76dWMNa3zCUliemCdwWv6p07GNeMYCdZoJs9KNbdZSE,621
openai/types/beta/threads/runs/__init__.py,sha256=uhxk5F1_5c5wg2_p70AjlOy9cE3Ga8-ILn4Ep-gcls4,1515

View File

@ -1,4 +1,4 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
__title__ = "openai"
__version__ = "1.29.0" # x-release-please-version
__version__ = "1.30.1" # x-release-please-version

View File

@ -40,7 +40,7 @@ class Batches(SyncAPIResource):
self,
*,
completion_window: Literal["24h"],
endpoint: Literal["/v1/chat/completions", "/v1/embeddings"],
endpoint: Literal["/v1/chat/completions", "/v1/embeddings", "/v1/completions"],
input_file_id: str,
metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
@ -58,7 +58,9 @@ class Batches(SyncAPIResource):
is supported.
endpoint: The endpoint to be used for all requests in the batch. Currently
`/v1/chat/completions` and `/v1/embeddings` are supported.
`/v1/chat/completions`, `/v1/embeddings`, and `/v1/completions` are supported.
Note that `/v1/embeddings` batches are also restricted to a maximum of 50,000
embedding inputs across all requests in the batch.
input_file_id: The ID of an uploaded file that contains requests for the new batch.
@ -67,7 +69,8 @@ class Batches(SyncAPIResource):
Your input file must be formatted as a
[JSONL file](https://platform.openai.com/docs/api-reference/batch/requestInput),
and must be uploaded with the purpose `batch`.
and must be uploaded with the purpose `batch`. The file can contain up to 50,000
requests, and can be up to 100 MB in size.
metadata: Optional custom metadata for the batch.
@ -228,7 +231,7 @@ class AsyncBatches(AsyncAPIResource):
self,
*,
completion_window: Literal["24h"],
endpoint: Literal["/v1/chat/completions", "/v1/embeddings"],
endpoint: Literal["/v1/chat/completions", "/v1/embeddings", "/v1/completions"],
input_file_id: str,
metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
@ -246,7 +249,9 @@ class AsyncBatches(AsyncAPIResource):
is supported.
endpoint: The endpoint to be used for all requests in the batch. Currently
`/v1/chat/completions` and `/v1/embeddings` are supported.
`/v1/chat/completions`, `/v1/embeddings`, and `/v1/completions` are supported.
Note that `/v1/embeddings` batches are also restricted to a maximum of 50,000
embedding inputs across all requests in the batch.
input_file_id: The ID of an uploaded file that contains requests for the new batch.
@ -255,7 +260,8 @@ class AsyncBatches(AsyncAPIResource):
Your input file must be formatted as a
[JSONL file](https://platform.openai.com/docs/api-reference/batch/requestInput),
and must be uploaded with the purpose `batch`.
and must be uploaded with the purpose `batch`. The file can contain up to 50,000
requests, and can be up to 100 MB in size.
metadata: Optional custom metadata for the batch.
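
For reference, a batch against the newly allowed endpoint would be created like this with the synchronous client (the file ID is a placeholder for an uploaded `.jsonl` file with purpose `batch`):

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

batch = client.batches.create(
    input_file_id="file-abc123",   # placeholder ID
    endpoint="/v1/completions",    # newly supported alongside /v1/chat/completions and /v1/embeddings
    completion_window="24h",
)
print(batch.status)
```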

View File

@ -110,8 +110,9 @@ class Assistants(SyncAPIResource):
name: The name of the assistant. The maximum length is 256 characters.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -254,8 +255,9 @@ class Assistants(SyncAPIResource):
name: The name of the assistant. The maximum length is 256 characters.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -497,8 +499,9 @@ class AsyncAssistants(AsyncAPIResource):
name: The name of the assistant. The maximum length is 256 characters.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -641,8 +644,9 @@ class AsyncAssistants(AsyncAPIResource):
name: The name of the assistant. The maximum length is 256 characters.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
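
A minimal sketch of the documented parameter in use, assuming one of the supported models (the name and instructions are illustrative):

```python
from openai import OpenAI

client = OpenAI()

assistant = client.beta.assistants.create(
    model="gpt-4o",
    name="json-extractor",
    instructions="Reply only with a JSON object.",
    response_format={"type": "json_object"},  # enables JSON mode
)
```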

View File

@ -164,8 +164,9 @@ class Runs(SyncAPIResource):
assistant will be used.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -314,8 +315,9 @@ class Runs(SyncAPIResource):
assistant will be used.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -460,8 +462,9 @@ class Runs(SyncAPIResource):
assistant will be used.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -1097,7 +1100,7 @@ class Runs(SyncAPIResource):
if is_given(poll_interval_ms):
extra_headers["X-Stainless-Custom-Poll-Interval"] = str(poll_interval_ms)
terminal_states = {"requires_action", "cancelled", "completed", "failed", "expired"}
terminal_states = {"requires_action", "cancelled", "completed", "failed", "expired", "incomplete"}
while True:
response = self.with_raw_response.retrieve(
thread_id=thread_id,
@ -1718,8 +1721,9 @@ class AsyncRuns(AsyncAPIResource):
assistant will be used.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -1868,8 +1872,9 @@ class AsyncRuns(AsyncAPIResource):
assistant will be used.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -2014,8 +2019,9 @@ class AsyncRuns(AsyncAPIResource):
assistant will be used.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -2653,7 +2659,7 @@ class AsyncRuns(AsyncAPIResource):
if is_given(poll_interval_ms):
extra_headers["X-Stainless-Custom-Poll-Interval"] = str(poll_interval_ms)
terminal_states = {"requires_action", "cancelled", "completed", "failed", "expired"}
terminal_states = {"requires_action", "cancelled", "completed", "failed", "expired", "incomplete"}
while True:
response = await self.with_raw_response.retrieve(
thread_id=thread_id,

View File

@ -341,8 +341,9 @@ class Threads(SyncAPIResource):
assistant will be used.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -490,8 +491,9 @@ class Threads(SyncAPIResource):
assistant will be used.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -635,8 +637,9 @@ class Threads(SyncAPIResource):
assistant will be used.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -1331,8 +1334,9 @@ class AsyncThreads(AsyncAPIResource):
assistant will be used.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -1480,8 +1484,9 @@ class AsyncThreads(AsyncAPIResource):
assistant will be used.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -1625,8 +1630,9 @@ class AsyncThreads(AsyncAPIResource):
assistant will be used.
response_format: Specifies the format that the model must output. Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.

View File

@ -62,14 +62,18 @@ class Files(SyncAPIResource):
) -> FileObject:
"""Upload a file that can be used across various endpoints.
The size of all the
files uploaded by one organization can be up to 100 GB.
Individual files can be
up to 512 MB, and the size of all files uploaded by one organization can be up
to 100 GB.
The size of individual files can be a maximum of 512 MB or 2 million tokens for
Assistants. See the
[Assistants Tools guide](https://platform.openai.com/docs/assistants/tools) to
learn more about the types of files supported. The Fine-tuning API only supports
`.jsonl` files.
The Assistants API supports files up to 2 million tokens and of specific file
types. See the
[Assistants Tools guide](https://platform.openai.com/docs/assistants/tools) for
details.
The Fine-tuning API only supports `.jsonl` files.
The Batch API only supports `.jsonl` files up to 100 MB in size.
Please [contact us](https://help.openai.com/) if you need to increase these
storage limits.
@ -335,14 +339,18 @@ class AsyncFiles(AsyncAPIResource):
) -> FileObject:
"""Upload a file that can be used across various endpoints.
The size of all the
files uploaded by one organization can be up to 100 GB.
Individual files can be
up to 512 MB, and the size of all files uploaded by one organization can be up
to 100 GB.
The size of individual files can be a maximum of 512 MB or 2 million tokens for
Assistants. See the
[Assistants Tools guide](https://platform.openai.com/docs/assistants/tools) to
learn more about the types of files supported. The Fine-tuning API only supports
`.jsonl` files.
The Assistants API supports files up to 2 million tokens and of specific file
types. See the
[Assistants Tools guide](https://platform.openai.com/docs/assistants/tools) for
details.
The Fine-tuning API only supports `.jsonl` files.
The Batch API only supports `.jsonl` files up to 100 MB in size.
Please [contact us](https://help.openai.com/) if you need to increase these
storage limits.
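
A minimal upload sketch matching the updated limits (the path is a placeholder; Batch API inputs must be `.jsonl`, at most 100 MB and 50,000 requests):

```python
from openai import OpenAI

client = OpenAI()

batch_input = client.files.create(
    file=open("requests.jsonl", "rb"),  # placeholder path
    purpose="batch",
)
print(batch_input.id)
```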

View File

@ -15,10 +15,12 @@ class BatchCreateParams(TypedDict, total=False):
Currently only `24h` is supported.
"""
endpoint: Required[Literal["/v1/chat/completions", "/v1/embeddings"]]
endpoint: Required[Literal["/v1/chat/completions", "/v1/embeddings", "/v1/completions"]]
"""The endpoint to be used for all requests in the batch.
Currently `/v1/chat/completions` and `/v1/embeddings` are supported.
Currently `/v1/chat/completions`, `/v1/embeddings`, and `/v1/completions` are
supported. Note that `/v1/embeddings` batches are also restricted to a maximum
of 50,000 embedding inputs across all requests in the batch.
"""
input_file_id: Required[str]
@ -29,7 +31,8 @@ class BatchCreateParams(TypedDict, total=False):
Your input file must be formatted as a
[JSONL file](https://platform.openai.com/docs/api-reference/batch/requestInput),
and must be uploaded with the purpose `batch`.
and must be uploaded with the purpose `batch`. The file can contain up to 50,000
requests, and can be up to 100 MB in size.
"""
metadata: Optional[Dict[str, str]]

View File

@ -85,9 +85,9 @@ class Assistant(BaseModel):
response_format: Optional[AssistantResponseFormatOption] = None
"""Specifies the format that the model must output.
Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Compatible with [GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.

View File

@ -77,9 +77,9 @@ class AssistantCreateParams(TypedDict, total=False):
response_format: Optional[AssistantResponseFormatOptionParam]
"""Specifies the format that the model must output.
Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Compatible with [GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.

View File

@ -45,9 +45,9 @@ class AssistantUpdateParams(TypedDict, total=False):
response_format: Optional[AssistantResponseFormatOptionParam]
"""Specifies the format that the model must output.
Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Compatible with [GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.

View File

@ -108,9 +108,9 @@ class ThreadCreateAndRunParamsBase(TypedDict, total=False):
response_format: Optional[AssistantResponseFormatOptionParam]
"""Specifies the format that the model must output.
Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Compatible with [GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.

View File

@ -160,9 +160,9 @@ class Run(BaseModel):
response_format: Optional[AssistantResponseFormatOption] = None
"""Specifies the format that the model must output.
Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Compatible with [GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.
@ -182,8 +182,8 @@ class Run(BaseModel):
status: RunStatus
"""
The status of the run, which can be either `queued`, `in_progress`,
`requires_action`, `cancelling`, `cancelled`, `failed`, `completed`, or
`expired`.
`requires_action`, `cancelling`, `cancelled`, `failed`, `completed`,
`incomplete`, or `expired`.
"""
thread_id: str

View File

@ -110,9 +110,9 @@ class RunCreateParamsBase(TypedDict, total=False):
response_format: Optional[AssistantResponseFormatOptionParam]
"""Specifies the format that the model must output.
Compatible with
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Compatible with [GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
message the model generates is valid JSON.

View File

@ -5,5 +5,13 @@ from typing_extensions import Literal
__all__ = ["RunStatus"]
RunStatus = Literal[
"queued", "in_progress", "requires_action", "cancelling", "cancelled", "failed", "completed", "expired"
"queued",
"in_progress",
"requires_action",
"cancelling",
"cancelled",
"failed",
"completed",
"incomplete",
"expired",
]
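
Callers that poll runs now have to treat `incomplete` as terminal as well, matching the terminal_states change in runs.py above. A sketch with the built-in polling helper (IDs are placeholders):

```python
from openai import OpenAI

client = OpenAI()

run = client.beta.threads.runs.create_and_poll(
    thread_id="thread_abc123",   # placeholder
    assistant_id="asst_abc123",  # placeholder
)
# Polling stops on any terminal state, now including "incomplete".
print(run.status)
```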

View File

@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: rich-click
Version: 1.8.1
Version: 1.8.2
Summary: Format click help output nicely with rich
Author-email: Phil Ewels <phil@ewels.co.uk>
Maintainer-email: Phil Ewels <phil@ewels.co.uk>, Daniel Reeves <xdanielreeves@gmail.com>

View File

@ -1,12 +1,12 @@
../../../bin/rich-click,sha256=ueTpBQA5XZGwZsxmrQ8SOCO4y6uUvAchzTiBaaXEWmU,257
rich_click-1.8.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
rich_click-1.8.1.dist-info/LICENSE,sha256=1GDP5mZhei-Gy3xm-QQfCodhHIsMHy8_Z0ogIq3B8q8,1067
rich_click-1.8.1.dist-info/METADATA,sha256=B-td1s31-T6sp-ju-AI3LJSG4JWhYeBpEXBxBmxiPVM,7880
rich_click-1.8.1.dist-info/RECORD,,
rich_click-1.8.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
rich_click-1.8.1.dist-info/entry_points.txt,sha256=q-JckrJEfhmzklT5lIpe1oTf68OaWJpAy1Mik7lGeXs,51
rich_click-1.8.1.dist-info/top_level.txt,sha256=tKHPQk1z9Wd8Lu2HqxHQyF7oqOeQE5__SUDHezQZ4WE,11
rich_click/__init__.py,sha256=_TOw4JYtBxYm4pD4_GVAk1e2ivCX0nCcnN_l1DhSq8g,4575
rich_click-1.8.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
rich_click-1.8.2.dist-info/LICENSE,sha256=1GDP5mZhei-Gy3xm-QQfCodhHIsMHy8_Z0ogIq3B8q8,1067
rich_click-1.8.2.dist-info/METADATA,sha256=2qjyywq8tVOwerVIev_5AWGSDArVIpTIphJDGwU0aQM,7880
rich_click-1.8.2.dist-info/RECORD,,
rich_click-1.8.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
rich_click-1.8.2.dist-info/entry_points.txt,sha256=q-JckrJEfhmzklT5lIpe1oTf68OaWJpAy1Mik7lGeXs,51
rich_click-1.8.2.dist-info/top_level.txt,sha256=tKHPQk1z9Wd8Lu2HqxHQyF7oqOeQE5__SUDHezQZ4WE,11
rich_click/__init__.py,sha256=a0dCmP1_hnPToMQJLCEzxKCLuQFDkPlhbegtVIAMfOc,4575
rich_click/__main__.py,sha256=FvI_e9IrNyHmo39uQtXeMdv2HbkuaIUf4uKkir3LdzY,417
rich_click/__pycache__/__init__.cpython-38.pyc,,
rich_click/__pycache__/__main__.cpython-38.pyc,,
@ -31,7 +31,7 @@ rich_click/rich_click.py,sha256=Qob8c8vE0BeJgZnPyJoVfTI12_NtClmM5t7LQM6iTFM,6963
rich_click/rich_command.py,sha256=EbRaCMkg9EqfWQWnArqWyYlZ-znjq3V-EwiTKoDWREQ,16148
rich_click/rich_context.py,sha256=HuVUWlEunWTN-CqTrkSEgdxSncs_1GYBvKWmXOgtFaY,3721
rich_click/rich_group.py,sha256=76PCL148Jb9rd9c6KGiELBtoPXTcqlGa-xPRjPRQNAA,254
rich_click/rich_help_configuration.py,sha256=OC3KUP8xMQNWWkzd8mCPMjVyOeSQR_JGiCVwTkBkMSU,11499
rich_click/rich_help_configuration.py,sha256=JnjIpfRFqM1Mc3DT1GE-iMdmf76ezSlCyb6oyqcbp_I,11602
rich_click/rich_help_formatter.py,sha256=eA1Ri-W2Oq4KsIlCkKQzr4CMlq0SnXVn5kFttdtBk7A,4409
rich_click/rich_help_rendering.py,sha256=OTIlltVfCW_Z-xHqhwtVwGPTGWjWSQiCT_zCXYUf8u4,31289
rich_click/utils.py,sha256=B7PEW-S9hNnPBXehkiPbgrZv8EgzhmNR7psVvbh8eC8,1463

View File

@ -6,7 +6,7 @@ The intention is to provide attractive help output from Click, formatted with Ri
customisation required.
"""
__version__ = "1.8.1"
__version__ = "1.8.2"
# Import the entire click API here.
# We need to manually import these instead of `from click import *` to force

View File

@ -172,17 +172,19 @@ class RichHelpConfiguration:
legacy_windows: Optional[bool] = field(default=None)
def __post_init__(self) -> None: # noqa: D105
if self.highlighter is not None:
import warnings
# Todo: Fix this so that the deprecation warning works properly.
warnings.warn(
"`highlighter` kwarg is deprecated in RichHelpConfiguration."
" Please do one of the following instead: either set highlighter_patterns=[...] if you want"
" to use regex; or for more advanced use cases where you'd like to use a different type"
" of rich.highlighter.Highlighter, subclass the `RichHelpFormatter` and update its `highlighter`.",
DeprecationWarning,
stacklevel=2,
)
# if self.highlighter is not None:
# import warnings
#
# warnings.warn(
# "`highlighter` kwarg is deprecated in RichHelpConfiguration."
# " Please do one of the following instead: either set highlighter_patterns=[...] if you want"
# " to use regex; or for more advanced use cases where you'd like to use a different type"
# " of rich.highlighter.Highlighter, subclass the `RichHelpFormatter` and update its `highlighter`.",
# DeprecationWarning,
# stacklevel=2,
# )
self.__dataclass_fields__.pop("highlighter", None)

View File

@ -1,112 +0,0 @@
Metadata-Version: 2.1
Name: tiktoken
Version: 0.4.0
Summary: tiktoken is a fast BPE tokeniser for use with OpenAI's models
Author: Shantanu Jain
Author-email: shantanu@openai.com
License: MIT License
Copyright (c) 2022 OpenAI, Shantanu Jain
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Project-URL: homepage, https://github.com/openai/tiktoken
Project-URL: repository, https://github.com/openai/tiktoken
Project-URL: changelog, https://github.com/openai/tiktoken/blob/main/CHANGELOG.md
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: requests >=2.26.0
Provides-Extra: blobfile
Requires-Dist: blobfile >=2 ; extra == 'blobfile'
# Clone with a pure Python implementation added
This is a fork of https://github.com/openai/tiktoken with the tokenizer also available as a pure Python implementation.
You can use it locally like this (after `pip install -e .`):
```
import tiktoken.registry as registry
from tiktoken.registry import _find_constructors
from tiktoken.core import Encoding
_find_constructors()
constructor = registry.ENCODING_CONSTRUCTORS['cl100k_base']
params = constructor()
enc = Encoding(**params, use_pure_python=True)
enc.encode("hello world")
```
The port from Rust to Python is about 99% complete and was done with the help of GPT-4. These are the tests I ran,
so use it at your own risk; it should be trivial to compare both implementations across a much more diverse
character set and set of phrases.
```
Encoding for 'a': [64]
Encoding for '!': [0]
Encoding for '1': [16]
Encoding for '&': [5]
Encoding for 'hello': [15339]
Encoding for 'world': [14957]
Encoding for 'python': [12958]
Encoding for 'rust': [36888]
Encoding for 'hello world': [15339, 1917]
Encoding for 'rust is fast': [36888, 374, 5043]
Encoding for '.': [13]
Encoding for ',': [11]
Encoding for '?': [30]
Encoding for '!': [0]
Encoding for 'Hello, world!': [9906, 11, 1917, 0]
Encoding for 'How's it going?': [4438, 596, 433, 2133, 30]
Encoding for '
': [198]
Encoding for ' ': [197]
Encoding for '0': [15]
Encoding for '1': [16]
Encoding for '9': [24]
Encoding for '10': [605]
Encoding for '100': [1041]
Encoding for '12345': [4513, 1774]
Encoding for '0.1': [15, 13, 16]
Encoding for '3.14': [18, 13, 975]
Encoding for '10.001': [605, 13, 4119]
Encoding for 'abc123': [13997, 4513]
Encoding for '42rocks': [2983, 299, 14895]
Encoding for 'HELLO': [51812, 1623]
Encoding for 'World': [10343]
Encoding for 'Python': [31380]
Encoding for 'helloWorld': [15339, 10343]
Encoding for 'rust_rocks': [36888, 27706, 14895]
Encoding for '✓': [38798, 241]
Encoding for '❤️': [49633, 97, 31643]
Encoding for '©': [20644]
Encoding for 'hola': [71, 8083]
Encoding for 'こんにちは': [90115]
Encoding for 'Привет': [54745, 28089, 8341]
Encoding for 'The quick brown fox jumps over the lazy dog.': [791, 4062, 14198, 39935, 35308, 927, 279, 16053, 5679, 13]
Encoding for '': []
Encoding for ' ': [220]
Encoding for ' ': [197]
Encoding for '
': [198]
Encoding for '@@@': [19741, 31]
Encoding for '###': [14711]
```

View File

@ -1,25 +0,0 @@
tiktoken-0.4.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
tiktoken-0.4.0.dist-info/LICENSE,sha256=QYy0mbQ2Eo1lPXmUEzOlQ3t74uqSE9zC8E0V1dLFHYY,1078
tiktoken-0.4.0.dist-info/METADATA,sha256=182HHps9h3oj1XnrbG_manrVy8FjK1BYBkiS8Mlvs3E,4154
tiktoken-0.4.0.dist-info/RECORD,,
tiktoken-0.4.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tiktoken-0.4.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
tiktoken-0.4.0.dist-info/direct_url.json,sha256=PfrcId0UxJKjhG_jK4kY5QCAFNuHX1M31KoLYQw6LPU,140
tiktoken-0.4.0.dist-info/top_level.txt,sha256=54G5MceQnuD7EXvp7jzGxDDapA1iOwsh77jhCN9WKkc,22
tiktoken/__init__.py,sha256=bDWpVFXWe5PRN-K6IRad2UeVs24x3hVlSPELD7MdQCo,215
tiktoken/__pycache__/__init__.cpython-38.pyc,,
tiktoken/__pycache__/_educational.cpython-38.pyc,,
tiktoken/__pycache__/core.cpython-38.pyc,,
tiktoken/__pycache__/load.cpython-38.pyc,,
tiktoken/__pycache__/model.cpython-38.pyc,,
tiktoken/__pycache__/python_tiktoken.cpython-38.pyc,,
tiktoken/__pycache__/registry.cpython-38.pyc,,
tiktoken/_educational.py,sha256=i7eHAkrpsb0yulANGwTrMZB64nU3xD28PjFMUa-iC3Q,7761
tiktoken/core.py,sha256=_83ZiibR-9Iig7nBtY2Egn-MVT-6hR_eGBAAl9Y_fHA,15671
tiktoken/load.py,sha256=j5kjYrCM_Lbic71WxZ6CmdwZXYaokhLfTtBKt81-_ek,4180
tiktoken/model.py,sha256=Qv2lU2CJ-_vtXlUq4m1asZpRupDps_uGmBQQM2Bc8hg,2771
tiktoken/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tiktoken/python_tiktoken.py,sha256=ukYyPFHZ-6NZ0BBtsgbgV6ORfwWC1rRUtWOdZYo1jYw,8504
tiktoken/registry.py,sha256=urGm3u0ZQlv63daO3GKt45wPpZzzMQpNZXhRgun3wU4,2549
tiktoken_ext/__pycache__/openai_public.cpython-38.pyc,,
tiktoken_ext/openai_public.py,sha256=FrwXijob7DBruofS9xn5GC7aG9a1f5VcKp1xTviZuc4,2798

View File

@ -1 +0,0 @@
{"url": "https://github.com/yangbobo2021/tiktoken.git", "vcs_info": {"commit_id": "87539844cfeb6289e430804311a81cb3548636a3", "vcs": "git"}}

View File

@ -0,0 +1,168 @@
Metadata-Version: 2.1
Name: tiktoken
Version: 0.7.0
Summary: tiktoken is a fast BPE tokeniser for use with OpenAI's models
Author: Shantanu Jain
Author-email: shantanu@openai.com
License: MIT License
Copyright (c) 2022 OpenAI, Shantanu Jain
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Project-URL: homepage, https://github.com/openai/tiktoken
Project-URL: repository, https://github.com/openai/tiktoken
Project-URL: changelog, https://github.com/openai/tiktoken/blob/main/CHANGELOG.md
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: requests >=2.26.0
Provides-Extra: blobfile
Requires-Dist: blobfile >=2 ; extra == 'blobfile'
# ⏳ tiktoken
tiktoken is a fast [BPE](https://en.wikipedia.org/wiki/Byte_pair_encoding) tokeniser for use with
OpenAI's models.
```python
import tiktoken
enc = tiktoken.get_encoding("o200k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"
# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-4o")
```
The open source version of `tiktoken` can be installed from PyPI:
```
pip install tiktoken
```
The tokeniser API is documented in `tiktoken/core.py`.
Example code using `tiktoken` can be found in the
[OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
## Performance
`tiktoken` is between 3-6x faster than a comparable open source tokeniser:
![image](https://raw.githubusercontent.com/openai/tiktoken/main/perf.svg)
Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2TokenizerFast` from
`tokenizers==0.13.2`, `transformers==4.24.0` and `tiktoken==0.2.0`.
## Getting help
Please post questions in the [issue tracker](https://github.com/openai/tiktoken/issues).
If you work at OpenAI, make sure to check the internal documentation or feel free to contact
@shantanu.
## What is BPE anyway?
Language models don't see text like you and I, instead they see a sequence of numbers (known as tokens).
Byte pair encoding (BPE) is a way of converting text into tokens. It has a couple desirable
properties:
1) It's reversible and lossless, so you can convert tokens back into the original text
2) It works on arbitrary text, even text that is not in the tokeniser's training data
3) It compresses the text: the token sequence is shorter than the bytes corresponding to the
original text. On average, in practice, each token corresponds to about 4 bytes.
4) It attempts to let the model see common subwords. For instance, "ing" is a common subword in
English, so BPE encodings will often split "encoding" into tokens like "encod" and "ing"
(instead of e.g. "enc" and "oding"). Because the model will then see the "ing" token again and
again in different contexts, it helps models generalise and better understand grammar.
`tiktoken` contains an educational submodule that is friendlier if you want to learn more about
the details of BPE, including code that helps visualise the BPE procedure:
```python
from tiktoken._educational import *
# Train a BPE tokeniser on a small amount of text
enc = train_simple_encoding()
# Visualise how the GPT-4 encoder encodes text
enc = SimpleBytePairEncoding.from_tiktoken("cl100k_base")
enc.encode("hello world aaaaaaaaaaaa")
```
## Extending tiktoken
You may wish to extend `tiktoken` to support new encodings. There are two ways to do this.
**Create your `Encoding` object exactly the way you want and simply pass it around.**
```python
cl100k_base = tiktoken.get_encoding("cl100k_base")
# In production, load the arguments directly instead of accessing private attributes
# See openai_public.py for examples of arguments for specific encodings
enc = tiktoken.Encoding(
# If you're changing the set of special tokens, make sure to use a different name
# It should be clear from the name what behaviour to expect.
name="cl100k_im",
pat_str=cl100k_base._pat_str,
mergeable_ranks=cl100k_base._mergeable_ranks,
special_tokens={
**cl100k_base._special_tokens,
"<|im_start|>": 100264,
"<|im_end|>": 100265,
}
)
```
**Use the `tiktoken_ext` plugin mechanism to register your `Encoding` objects with `tiktoken`.**
This is only useful if you need `tiktoken.get_encoding` to find your encoding, otherwise prefer
option 1.
To do this, you'll need to create a namespace package under `tiktoken_ext`.
Layout your project like this, making sure to omit the `tiktoken_ext/__init__.py` file:
```
my_tiktoken_extension
├── tiktoken_ext
│   └── my_encodings.py
└── setup.py
```
`my_encodings.py` should be a module that contains a variable named `ENCODING_CONSTRUCTORS`.
This is a dictionary from an encoding name to a function that takes no arguments and returns
arguments that can be passed to `tiktoken.Encoding` to construct that encoding. For an example, see
`tiktoken_ext/openai_public.py`. For precise details, see `tiktoken/registry.py`.
Your `setup.py` should look something like this:
```python
from setuptools import setup, find_namespace_packages
setup(
name="my_tiktoken_extension",
packages=find_namespace_packages(include=['tiktoken_ext*']),
install_requires=["tiktoken"],
...
)
```
Then simply `pip install ./my_tiktoken_extension` and you should be able to use your
custom encodings! Make sure **not** to use an editable install.

View File

@ -0,0 +1,25 @@
tiktoken-0.7.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
tiktoken-0.7.0.dist-info/LICENSE,sha256=QYy0mbQ2Eo1lPXmUEzOlQ3t74uqSE9zC8E0V1dLFHYY,1078
tiktoken-0.7.0.dist-info/METADATA,sha256=Uh4FPcMTr6wxg79NeyNXwWkImgSGn8uTqHPEcNYUwn4,6598
tiktoken-0.7.0.dist-info/RECORD,,
tiktoken-0.7.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tiktoken-0.7.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
tiktoken-0.7.0.dist-info/direct_url.json,sha256=N3P589ng7z0KBpW-bWO-8Sst21MaC5OT1LJE1SYeuGE,138
tiktoken-0.7.0.dist-info/top_level.txt,sha256=54G5MceQnuD7EXvp7jzGxDDapA1iOwsh77jhCN9WKkc,22
tiktoken/__init__.py,sha256=FNmz8KgZfaG62vRgMMkTL9jj0a2AI7JGV1b-RZ29_tY,322
tiktoken/__pycache__/__init__.cpython-38.pyc,,
tiktoken/__pycache__/_educational.cpython-38.pyc,,
tiktoken/__pycache__/_tiktoken.cpython-38.pyc,,
tiktoken/__pycache__/core.cpython-38.pyc,,
tiktoken/__pycache__/load.cpython-38.pyc,,
tiktoken/__pycache__/model.cpython-38.pyc,,
tiktoken/__pycache__/registry.cpython-38.pyc,,
tiktoken/_educational.py,sha256=l_bTeohxYJ2RHrXDFT2QfRF7aD89S38VFZndzZTI_cM,8234
tiktoken/_tiktoken.py,sha256=uSdqUIlUBtyyCwSPnVGA5eBWI74noFq-uEPdUxWGXgU,3541
tiktoken/core.py,sha256=l9ozzJP6zQ_rlPvV1ZAF1ENZBC3jyiin_rwlkmenxTQ,16129
tiktoken/load.py,sha256=YDbOfHhKn1MEWn9cWc1cVqDxZNwpGifWnuvfEcKeJ4w,5351
tiktoken/model.py,sha256=fCcuegWlKwFFmD1crVXHxFQBlBV6BGWCfwYTIhUcADs,3647
tiktoken/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tiktoken/registry.py,sha256=ksP_k8jlqLyefL1sr5OAc-yOK0McOFaHZM4oF8KQdYg,2811
tiktoken_ext/__pycache__/openai_public.cpython-38.pyc,,
tiktoken_ext/openai_public.py,sha256=pVz8DaOyPbbPzJc2xhS61vL_Ubim8n0lgTD1v4TaZBc,4515

View File

@ -0,0 +1 @@
{"url": "https://github.com/devchat-ai/tiktoken.git", "vcs_info": {"commit_id": "01de9f4ed2b290ecd33805406a3954488018a783", "vcs": "git"}}

View File

@ -1,4 +1,6 @@
# This is the public API of tiktoken
from .core import Encoding as Encoding
from .model import encoding_for_model as encoding_for_model
from .model import encoding_name_for_model as encoding_name_for_model
from .registry import get_encoding as get_encoding
from .registry import list_encoding_names as list_encoding_names
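
The two newly re-exported helpers can then be used straight from the package namespace, e.g.:

```python
import tiktoken

# Both helpers are now available at the top level of the package.
print(tiktoken.encoding_name_for_model("gpt-4o"))  # e.g. "o200k_base"
print(tiktoken.list_encoding_names())
```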

View File

@ -1,11 +1,8 @@
"""This is an educational implementation of the byte pair encoding algorithm."""
from __future__ import annotations
import collections
import itertools
from typing import Optional
import re as regex
import regex
import tiktoken
@ -187,11 +184,23 @@ def bpe_train(
def visualise_tokens(token_values: list[bytes]) -> None:
backgrounds = itertools.cycle(
[f"\u001b[48;5;{i}m".encode() for i in [167, 179, 185, 77, 80, 68, 134]]
)
interleaved = itertools.chain.from_iterable(zip(backgrounds, token_values))
print((b"".join(interleaved) + "\u001b[0m".encode()).decode("utf-8"))
background = [f"\u001b[48;5;{i}m" for i in [167, 179, 185, 77, 80, 68, 134]]
# If token boundaries do not occur at unicode character boundaries, it's unclear how best to
# visualise the token. Here, we'll just use the unicode replacement character to represent some
# fraction of a character.
unicode_token_values = [x.decode("utf-8", errors="replace") for x in token_values]
running_length = 0
last_color = None
for token in unicode_token_values:
color = background[running_length % len(background)]
if color == last_color:
color = background[(running_length + 1) % len(background)]
assert color != last_color
last_color = color
running_length += len(token)
print(color + token, end="")
print("\u001b[0m")
def train_simple_encoding():

View File

@ -0,0 +1,89 @@
import re
import hashlib
from typing import Dict, List, Tuple, Union
Rank = int
def _byte_pair_merge(ranks: Dict[bytes, Rank], piece: bytes) -> List[Tuple[bytes, Rank]]:
parts = []
min_rank = (float('inf'), float('inf'))
for i in range(len(piece) - 1):
rank = ranks.get(piece[i:i + 2], float('inf'))
if rank < min_rank[0]:
min_rank = (rank, i)
parts.append((piece[i:i + 2], rank))
parts.append((piece[len(piece) - 1:], float('inf')))
parts.append((piece[len(piece):], float('inf')))
while min_rank[0] != float('inf'):
i = min_rank[1]
if i > 0:
parts[i - 1] = (parts[i - 1][0], get_rank_with_ranks(piece, parts, i - 1, ranks))
parts[i] = (parts[i][0], get_rank_with_ranks(piece, parts, i, ranks))
del parts[i + 1]
min_rank = (float('inf'), float('inf'))
for j, (_, rank) in enumerate(parts[:-1]):
if rank < min_rank[0]:
min_rank = (rank, j)
return parts
def get_rank_with_ranks(piece: bytes, parts: List[Tuple[bytes, Rank]], i: int, ranks: Dict[bytes, Rank]) -> Rank:
if (i + 3) < len(parts):
key = piece[parts[i][0].start:parts[i + 3][0].start]
return ranks.get(key, float('inf'))
else:
return float('inf')
def byte_pair_encode(piece: bytes, ranks: Dict[bytes, Rank]) -> List[Rank]:
assert len(piece) > 1
parts = _byte_pair_merge(ranks, piece)
tokens = []
current_token = []
for part in parts[:-1]:
if len(current_token) == 0:
current_token.append(part[0])
elif ranks.get(b''.join(current_token + [part[0]])) is not None:
current_token.append(part[0])
else:
tokens.append(ranks[b''.join(current_token)])
current_token = [part[0]]
tokens.append(ranks[b''.join(current_token)])
return tokens
def byte_pair_split(piece: bytes, ranks: Dict[bytes, Rank]) -> List[bytes]:
assert len(piece) > 1
parts = _byte_pair_merge(ranks, piece)
return [part[0] for part in parts[:-1]]
class CoreBPE:
def __init__(self, encoder: Dict[bytes, Rank], special_tokens_encoder: Dict[str, Rank], pattern: str):
self.encoder = encoder
self.special_tokens_encoder = special_tokens_encoder
self.decoder = {v: k for k, v in encoder.items()}
self.special_tokens_decoder = {v: k.encode('utf-8') for k, v in special_tokens_encoder.items()}
self.regex = re.compile(pattern)
self.special_regex = re.compile('|'.join(map(re.escape, special_tokens_encoder.keys())))
def encode_ordinary(self, text: str) -> List[Rank]:
return [self.encoder[piece.encode("utf-8")] for piece in self.regex.findall(text)]
def encode(self, text: str, allowed_special: set) -> List[Rank]:
tokens = []
start = 0
for match in self.special_regex.finditer(text):
if match.start() > start:
tokens.extend(self.encode_ordinary(text[start:match.start()]))
if match.group(0) in allowed_special:
tokens.append(self.special_tokens_encoder[match.group(0)])
start = match.end()
if start < len(text):
tokens.extend(self.encode_ordinary(text[start:]))
return tokens
def decode_bytes(self, tokens: List[Rank]) -> bytes:
return b''.join(self.decoder.get(token, self.special_tokens_decoder.get(token)) for token in tokens)
def token_byte_values(self) -> List[bytes]:
return self.sorted_token_bytes
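The new pure-Python CoreBPE above follows the same greedy lowest-rank-first merge idea as the Rust extension. A self-contained sketch of that idea (not the vendored code; the function and the toy ranks below are invented for illustration):

from typing import Dict, List

def naive_bpe_encode(piece: bytes, ranks: Dict[bytes, int]) -> List[int]:
    # Start from single bytes and repeatedly merge the adjacent pair with the lowest rank.
    parts = [piece[i:i + 1] for i in range(len(piece))]
    while True:
        best = None
        for i in range(len(parts) - 1):
            rank = ranks.get(parts[i] + parts[i + 1])
            if rank is not None and (best is None or rank < best[0]):
                best = (rank, i)
        if best is None:
            break
        i = best[1]
        parts[i:i + 2] = [parts[i] + parts[i + 1]]
    return [ranks[p] for p in parts]

toy_ranks = {b"h": 0, b"e": 1, b"l": 2, b"o": 3, b"he": 4, b"ll": 5, b"llo": 6, b"hello": 7}
print(naive_bpe_encode(b"hello", toy_ranks))   # [7]: "hello" is merged all the way up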

View File

@ -5,12 +5,8 @@ from concurrent.futures import ThreadPoolExecutor
from typing import AbstractSet, Collection, Literal, NoReturn, Optional, Union
import re as regex
try:
from tiktoken import _tiktoken
from .python_tiktoken import CoreBPE
except ImportError:
# print("Unable to import rust py binding for _tiktoken, must use pure python implementation")
from .python_tiktoken import CoreBPE
from tiktoken import _tiktoken
class Encoding:
@ -22,7 +18,6 @@ class Encoding:
mergeable_ranks: dict[bytes, int],
special_tokens: dict[str, int],
explicit_n_vocab: Optional[int] = None,
use_pure_python: bool = False
):
"""Creates an Encoding object.
@ -52,11 +47,7 @@ class Encoding:
assert len(mergeable_ranks) + len(special_tokens) == explicit_n_vocab
assert self.max_token_value == explicit_n_vocab - 1
if use_pure_python:
self._core_bpe = CoreBPE(mergeable_ranks, special_tokens, pat_str)
else:
self._core_bpe = _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str)
self._core_bpe = _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str)
def __repr__(self) -> str:
return f"<Encoding {self.name!r}>"
@ -125,6 +116,10 @@ class Encoding:
if match := _special_token_regex(disallowed_special).search(text):
raise_disallowed_special_token(match.group())
# https://github.com/PyO3/pyo3/pull/3632
if isinstance(allowed_special, frozenset):
allowed_special = set(allowed_special)
try:
return self._core_bpe.encode(text, allowed_special)
except UnicodeEncodeError:
@ -373,6 +368,26 @@ class Encoding:
def _encode_bytes(self, text: bytes) -> list[int]:
return self._core_bpe._encode_bytes(text)
def __getstate__(self) -> object:
import tiktoken.registry
# As an optimisation, pickle registered encodings by reference
if self is tiktoken.registry.ENCODINGS.get(self.name):
return self.name
return {
"name": self.name,
"pat_str": self._pat_str,
"mergeable_ranks": self._mergeable_ranks,
"special_tokens": self._special_tokens,
}
def __setstate__(self, value: object) -> None:
import tiktoken.registry
if isinstance(value, str):
self.__dict__ = tiktoken.registry.get_encoding(value).__dict__
return
self.__init__(**value)
@functools.lru_cache(maxsize=128)
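Two behavioural additions in this hunk: allowed_special is copied from frozenset to set before crossing into the Rust extension (PyO3 #3632), and registered encodings now pickle by name rather than by serialising their full rank tables. A small sketch of the pickling round-trip, assuming a normal tiktoken install:

import pickle
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
payload = pickle.dumps(enc)          # __getstate__ returns just the name for registered encodings
restored = pickle.loads(payload)     # __setstate__ resolves it again via tiktoken.registry
assert restored.encode("hello") == enc.encode("hello")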

View File

@ -6,6 +6,7 @@ import json
import os
import tempfile
import uuid
from typing import Optional
import requests
@ -26,13 +27,20 @@ def read_file(blobpath: str) -> bytes:
return resp.content
def read_file_cached(blobpath: str) -> bytes:
def check_hash(data: bytes, expected_hash: str) -> bool:
actual_hash = hashlib.sha256(data).hexdigest()
return actual_hash == expected_hash
def read_file_cached(blobpath: str, expected_hash: Optional[str] = None) -> bytes:
user_specified_cache = True
if "TIKTOKEN_CACHE_DIR" in os.environ:
cache_dir = os.environ["TIKTOKEN_CACHE_DIR"]
elif "DATA_GYM_CACHE_DIR" in os.environ:
cache_dir = os.environ["DATA_GYM_CACHE_DIR"]
else:
cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
user_specified_cache = False
if cache_dir == "":
# disable caching
@ -43,21 +51,42 @@ def read_file_cached(blobpath: str) -> bytes:
cache_path = os.path.join(cache_dir, cache_key)
if os.path.exists(cache_path):
with open(cache_path, "rb") as f:
return f.read()
data = f.read()
if expected_hash is None or check_hash(data, expected_hash):
return data
# the cached file does not match the hash, remove it and re-fetch
try:
os.remove(cache_path)
except OSError:
pass
contents = read_file(blobpath)
if expected_hash and not check_hash(contents, expected_hash):
raise ValueError(
f"Hash mismatch for data downloaded from {blobpath} (expected {expected_hash}). "
f"This may indicate a corrupted download. Please try again."
)
os.makedirs(cache_dir, exist_ok=True)
tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp"
with open(tmp_filename, "wb") as f:
f.write(contents)
os.rename(tmp_filename, cache_path)
try:
os.makedirs(cache_dir, exist_ok=True)
tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp"
with open(tmp_filename, "wb") as f:
f.write(contents)
os.rename(tmp_filename, cache_path)
except OSError:
# don't raise if we can't write to the default cache, e.g. issue #75
if user_specified_cache:
raise
return contents
def data_gym_to_mergeable_bpe_ranks(
vocab_bpe_file: str, encoder_json_file: str
vocab_bpe_file: str,
encoder_json_file: str,
vocab_bpe_hash: Optional[str] = None,
encoder_json_hash: Optional[str] = None,
) -> dict[bytes, int]:
# NB: do not add caching to this function
rank_to_intbyte = [b for b in range(2**8) if chr(b).isprintable() and chr(b) != " "]
@ -72,7 +101,7 @@ def data_gym_to_mergeable_bpe_ranks(
assert len(rank_to_intbyte) == 2**8
# vocab_bpe contains the merges along with associated ranks
vocab_bpe_contents = read_file_cached(vocab_bpe_file).decode()
vocab_bpe_contents = read_file_cached(vocab_bpe_file, vocab_bpe_hash).decode()
bpe_merges = [tuple(merge_str.split()) for merge_str in vocab_bpe_contents.split("\n")[1:-1]]
def decode_data_gym(value: str) -> bytes:
@ -89,7 +118,7 @@ def data_gym_to_mergeable_bpe_ranks(
# check that the encoder file matches the merges file
# this sanity check is important since tiktoken assumes that ranks are ordered the same
# as merge priority
encoder_json = json.loads(read_file_cached(encoder_json_file))
encoder_json = json.loads(read_file_cached(encoder_json_file, encoder_json_hash))
encoder_json_loaded = {decode_data_gym(k): v for k, v in encoder_json.items()}
# drop these two special tokens if present, since they're not mergeable bpe tokens
encoder_json_loaded.pop(b"<|endoftext|>", None)
@ -111,9 +140,11 @@ def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> No
f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")
def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict[bytes, int]:
def load_tiktoken_bpe(
tiktoken_bpe_file: str, expected_hash: Optional[str] = None
) -> dict[bytes, int]:
# NB: do not add caching to this function
contents = read_file_cached(tiktoken_bpe_file)
contents = read_file_cached(tiktoken_bpe_file, expected_hash)
return {
base64.b64decode(token): int(rank)
for token, rank in (line.split() for line in contents.splitlines() if line)
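read_file_cached and load_tiktoken_bpe now accept an expected SHA-256 digest and re-download (or raise) on mismatch. The check itself is a plain hexdigest comparison; a trivial sketch with made-up bytes:

import hashlib

data = b"example vocabulary bytes"
expected = hashlib.sha256(data).hexdigest()
assert hashlib.sha256(data).hexdigest() == expected            # match: the cached file is reused
assert hashlib.sha256(data + b"!").hexdigest() != expected     # mismatch: cache entry removed, file re-fetched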

View File

@ -6,17 +6,33 @@ from .registry import get_encoding
# TODO: these will likely be replaced by an API endpoint
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
# chat
"gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
"gpt-35-turbo": "cl100k_base", # Azure deployment name
"gpt-35-turbo-": "cl100k_base", # Azure deployment name
# fine-tuned
"ft:gpt-4": "cl100k_base",
"ft:gpt-3.5-turbo": "cl100k_base",
"ft:davinci-002": "cl100k_base",
"ft:babbage-002": "cl100k_base",
}
MODEL_TO_ENCODING: dict[str, str] = {
# chat
"gpt-4o": "o200k_base",
"gpt-4": "cl100k_base",
"gpt-3.5-turbo": "cl100k_base",
"gpt-3.5": "cl100k_base", # Common shorthand
"gpt-35-turbo": "cl100k_base", # Azure deployment name
# text
# base
"davinci-002": "cl100k_base",
"babbage-002": "cl100k_base",
# embeddings
"text-embedding-ada-002": "cl100k_base",
"text-embedding-3-small": "cl100k_base",
"text-embedding-3-large": "cl100k_base",
# DEPRECATED MODELS
# text (DEPRECATED)
"text-davinci-003": "p50k_base",
"text-davinci-002": "p50k_base",
"text-davinci-001": "r50k_base",
@ -27,19 +43,17 @@ MODEL_TO_ENCODING: dict[str, str] = {
"curie": "r50k_base",
"babbage": "r50k_base",
"ada": "r50k_base",
# code
# code (DEPRECATED)
"code-davinci-002": "p50k_base",
"code-davinci-001": "p50k_base",
"code-cushman-002": "p50k_base",
"code-cushman-001": "p50k_base",
"davinci-codex": "p50k_base",
"cushman-codex": "p50k_base",
# edit
# edit (DEPRECATED)
"text-davinci-edit-001": "p50k_edit",
"code-davinci-edit-001": "p50k_edit",
# embeddings
"text-embedding-ada-002": "cl100k_base",
# old embeddings
# old embeddings (DEPRECATED)
"text-similarity-davinci-001": "r50k_base",
"text-similarity-curie-001": "r50k_base",
"text-similarity-babbage-001": "r50k_base",
@ -52,11 +66,15 @@ MODEL_TO_ENCODING: dict[str, str] = {
"code-search-ada-code-001": "r50k_base",
# open source
"gpt2": "gpt2",
"gpt-2": "gpt2", # Maintains consistency with gpt-4
}
def encoding_for_model(model_name: str) -> Encoding:
"""Returns the encoding used by a model."""
def encoding_name_for_model(model_name: str) -> str:
"""Returns the name of the encoding used by a model.
Raises a KeyError if the model name is not recognised.
"""
encoding_name = None
if model_name in MODEL_TO_ENCODING:
encoding_name = MODEL_TO_ENCODING[model_name]
@ -66,7 +84,7 @@ def encoding_for_model(model_name: str) -> Encoding:
# Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
if model_name.startswith(model_prefix):
return get_encoding(model_encoding_name)
return model_encoding_name
if encoding_name is None:
raise KeyError(
@ -74,4 +92,12 @@ def encoding_for_model(model_name: str) -> Encoding:
"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
) from None
return get_encoding(encoding_name)
return encoding_name
def encoding_for_model(model_name: str) -> Encoding:
"""Returns the encoding used by a model.
Raises a KeyError if the model name is not recognised.
"""
return get_encoding(encoding_name_for_model(model_name))
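encoding_for_model is now a thin wrapper over the new encoding_name_for_model, which resolves the encoding name (including prefix matches) without constructing the encoding. A usage sketch based on the tables above:

import tiktoken

assert tiktoken.encoding_name_for_model("gpt-4o") == "o200k_base"
assert tiktoken.encoding_name_for_model("gpt-4-0314") == "cl100k_base"   # matched via the "gpt-4-" prefix
enc = tiktoken.encoding_for_model("gpt-4o")                              # equivalent to get_encoding("o200k_base")
print(enc.name)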

View File

@ -1,238 +0,0 @@
from typing import Dict, List, Callable, Union, Tuple
from collections import defaultdict
import threading
import re
import copy
MAX_NUM_THREADS = 128
def hash_current_thread():
return hash(threading.current_thread().ident)
def byte_pair_merge(piece: bytes, ranks: Dict[bytes, int], f: Callable[[slice], Union[int, str]]) -> List[Union[int, str]]:
"""
Translates the _byte_pair_merge function from Rust to Python.
"""
# This is a list of (start, rank).
# The rank is of the byte pair starting at position start.
# The rank of the last item in the list is not a valid value.
parts = [(i, float('inf')) for i in range(len(piece) + 1)]
def get_rank(parts: List[Tuple[int, int]], start_idx: int, skip: int) -> Union[int, None]:
"""
Inner function to get the rank of a byte pair or sequence.
"""
if start_idx + skip + 2 < len(parts):
return ranks.get(piece[parts[start_idx][0]:parts[start_idx + skip + 2][0]])
else:
return None
# We look up the ranks once in the beginning and iteratively update
# them during each merge, which reduces the number of rank lookups.
for i in range(len(parts) - 2):
rank = get_rank(parts, i, 0)
if rank is not None:
assert rank != float('inf') # Check if rank is not the sentinel value
parts[i] = (parts[i][0], rank)
# Main merging loop
while len(parts) > 1:
# float('inf') is a sentinel rank value allowing us to take the min more quickly
min_rank = (float('inf'), 0)
for i, (_, rank) in enumerate(parts[:-1]):
if rank < min_rank[0]:
min_rank = (rank, i)
if min_rank[0] != float('inf'):
i = min_rank[1]
# Update ranks considering the skip
parts[i] = (parts[i][0], get_rank(parts, i, 1) or float('inf'))
if i > 0:
parts[i - 1] = (parts[i - 1][0], get_rank(parts, i - 1, 1) or float('inf'))
# Remove the part
parts.pop(i + 1)
else:
break
# Construct the output
out = [f(slice(parts[i][0], parts[i + 1][0])) for i in range(len(parts) - 1)]
return out
def byte_pair_encode(piece, ranks):
if len(piece) == 1:
# return [ranks[tuple(piece)]]
return [ranks[piece]]
# return byte_pair_merge(piece, ranks, lambda p: ranks[tuple(piece[p.start:p.stop])])
return byte_pair_merge(piece, ranks, lambda p: ranks[piece[p.start:p.stop]])
def byte_pair_split(piece, ranks):
if len(piece) == 1:
return [piece]
return byte_pair_merge(piece, ranks, lambda p: piece[p.start:p.stop])
class CoreBaseBPE:
def __init__(self):
self.encoder = {}
self.special_tokens_encoder = {}
self.decoder = {}
self.special_tokens_decoder = {}
self.regex_tls = []
self.special_regex_tls = []
self.sorted_token_bytes = []
def _get_tl_regex(self):
return self.regex_tls[hash_current_thread() % MAX_NUM_THREADS]
def _get_tl_special_regex(self):
return self.special_regex_tls[hash_current_thread() % MAX_NUM_THREADS]
def _decode_native(self, tokens):
ret = bytearray()
for token in tokens:
token_bytes = self.decoder.get(token, self.special_tokens_decoder.get(token))
if token_bytes:
ret.extend(token_bytes)
return ret
def _encode_ordinary_native(self, text):
regex = self._get_tl_regex()
ret = []
for mat in re.finditer(regex, text):
piece = mat.group().encode('utf-8')
token = self.encoder.get(piece)
if token:
ret.append(token)
continue
tokens = byte_pair_encode(piece, self.encoder)
ret.extend(tokens)
return ret
def _encode_native(self, text, allowed_special):
special_regex = self._get_tl_special_regex()
regex = self._get_tl_regex()
ret = []
start = 0
last_piece_token_len = 0
while start < len(text):
next_special = None
for mat in re.finditer(special_regex, text[start:]):
if mat.group() in allowed_special:
next_special = mat
break
for mat in re.finditer(regex, text[start:next_special.start() if next_special else None]):
piece = mat.group().encode('utf-8')
token = self.encoder.get(piece)
if token:
ret.append(token)
continue
tokens = byte_pair_encode(piece, self.encoder)
last_piece_token_len = len(tokens)
ret.extend(tokens)
if next_special:
piece = next_special.group().encode('utf-8')
token = self.special_tokens_encoder[piece]
ret.append(token)
start = next_special.end()
last_piece_token_len = 0
else:
break
return ret, last_piece_token_len
class CoreBPE(CoreBaseBPE):
# _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str)
def __init__(self, encoder, special_tokens_encoder, pattern):
self.encoder = encoder
self.special_tokens_encoder = special_tokens_encoder
self.regex = re.compile(pattern)
special_parts = [re.escape(key) for key in special_tokens_encoder.keys()]
self.special_regex = re.compile("|".join(special_parts))
self.decoder = {v: k for k, v in encoder.items()}
assert len(encoder) == len(self.decoder), "Encoder and decoder must be of equal length; maybe you had duplicate token indices in your encoder?"
self.special_tokens_decoder = {v: bytes(k, 'utf-8') for k, v in special_tokens_encoder.items()}
self.sorted_token_bytes = sorted(list(encoder.keys()))
self.regex_tls = [copy.deepcopy(self.regex) for _ in range(MAX_NUM_THREADS)]
self.special_regex_tls = [copy.deepcopy(self.special_regex) for _ in range(MAX_NUM_THREADS)]
def encode_ordinary(self, text):
return self._encode_ordinary_native(text)
def encode(self, text, allowed_special):
return self._encode_native(text, allowed_special)[0]
def _encode_bytes(self, bytes):
try:
text = bytes.decode('utf-8')
return self._encode_ordinary_native(text)
except UnicodeDecodeError as e:
text = bytes[:e.start].decode('utf-8', 'ignore')
tokens, last_piece_token_len = self._encode_native(text, set())
tokens, last_piece_token_len = self._increase_last_piece_token_len(tokens, last_piece_token_len)
if tokens and last_piece_token_len > 0:
unstable_bytes = self._decode_native(tokens[-last_piece_token_len:])
unstable_bytes.extend(bytes[e.start:])
tokens = tokens[:-last_piece_token_len]
tokens.extend(byte_pair_encode(unstable_bytes, self.encoder)) # Assuming byte_pair_encode is defined elsewhere
return tokens
def encode_with_unstable(self, text, allowed_special):
tokens, completions = self._encode_unstable_native(text, allowed_special)
py_completions = [list(seq) for seq in completions]
return tokens, py_completions
def encode_single_token(self, piece):
token = self.encoder.get(piece)
if token:
return token
piece_str = piece.decode('utf-8', 'ignore')
token = self.special_tokens_encoder.get(piece_str)
if token:
return token
raise KeyError(piece)
def encode_single_piece(self, piece):
token = self.encoder.get(piece)
if token:
return [token]
return byte_pair_encode(piece, self.encoder) # Assuming byte_pair_encode is defined elsewhere
def decode_bytes(self, tokens):
return self._decode_native(tokens)
def decode_single_token_bytes(self, token):
bytes_val = self.decoder.get(token) or self.special_tokens_decoder.get(token)
if bytes_val:
return bytes_val
raise KeyError(str(token))
def token_byte_values(self):
return [bytes(x) for x in self.sorted_token_bytes]

View File

@ -1,9 +1,10 @@
from __future__ import annotations
import functools
import importlib
import pkgutil
import threading
from typing import Any, Callable, Optional
from typing import Any, Callable, Optional, Sequence
import tiktoken_ext
@ -14,6 +15,20 @@ ENCODINGS: dict[str, Encoding] = {}
ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None
@functools.lru_cache()
def _available_plugin_modules() -> Sequence[str]:
# tiktoken_ext is a namespace package
# submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes
# - we use namespace package pattern so `pkgutil.iter_modules` is fast
# - it's a separate top-level package because namespace subpackages of non-namespace
# packages don't quite do what you want with editable installs
mods = []
plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".")
for _, mod_name, _ in plugin_mods:
mods.append(mod_name)
return mods
def _find_constructors() -> None:
global ENCODING_CONSTRUCTORS
with _lock:
@ -21,14 +36,7 @@ def _find_constructors() -> None:
return
ENCODING_CONSTRUCTORS = {}
# tiktoken_ext is a namespace package
# submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes
# - we use namespace package pattern so `pkgutil.iter_modules` is fast
# - it's a separate top-level package because namespace subpackages of non-namespace
# packages don't quite do what you want with editable installs
plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".")
for _, mod_name, _ in plugin_mods:
for mod_name in _available_plugin_modules():
mod = importlib.import_module(mod_name)
try:
constructors = mod.ENCODING_CONSTRUCTORS
@ -57,7 +65,9 @@ def get_encoding(encoding_name: str) -> Encoding:
assert ENCODING_CONSTRUCTORS is not None
if encoding_name not in ENCODING_CONSTRUCTORS:
raise ValueError(f"Unknown encoding {encoding_name}")
raise ValueError(
f"Unknown encoding {encoding_name}. Plugins found: {_available_plugin_modules()}"
)
constructor = ENCODING_CONSTRUCTORS[encoding_name]
enc = Encoding(**constructor())
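Plugin discovery is factored into the cached _available_plugin_modules helper, and the unknown-encoding error now reports which plugin modules were actually found. A sketch of triggering it:

import tiktoken

try:
    tiktoken.get_encoding("no_such_encoding")
except ValueError as err:
    # e.g. "Unknown encoding no_such_encoding. Plugins found: ['tiktoken_ext.openai_public']"
    print(err)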

View File

@ -11,24 +11,30 @@ def gpt2():
mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
)
return {
"name": "gpt2",
"explicit_n_vocab": 50257,
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
# The pattern in the original GPT-2 release is:
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\w]+| ?[\d]+| ?[^\s\w]+|\s+(?!\S)|\s+"""
# This is equivalent, but executes faster:
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": {"<|endoftext|>": 50256},
"special_tokens": {ENDOFTEXT: 50256},
}
def r50k_base():
mergeable_ranks = load_tiktoken_bpe(
"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
)
return {
"name": "r50k_base",
"explicit_n_vocab": 50257,
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": {ENDOFTEXT: 50256},
}
@ -36,12 +42,13 @@ def r50k_base():
def p50k_base():
mergeable_ranks = load_tiktoken_bpe(
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
)
return {
"name": "p50k_base",
"explicit_n_vocab": 50281,
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": {ENDOFTEXT: 50256},
}
@ -49,12 +56,13 @@ def p50k_base():
def p50k_edit():
mergeable_ranks = load_tiktoken_bpe(
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
)
special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
return {
"name": "p50k_edit",
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": special_tokens,
}
@ -62,7 +70,8 @@ def p50k_edit():
def cl100k_base():
mergeable_ranks = load_tiktoken_bpe(
"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
)
special_tokens = {
ENDOFTEXT: 100257,
@ -73,7 +82,36 @@ def cl100k_base():
}
return {
"name": "cl100k_base",
"pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\w]?\w+|\d{1,3}| ?[^\s\w]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
"pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\w]?+\w+|\d{1,3}| ?[^\s\w]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": special_tokens,
}
def o200k_base():
mergeable_ranks = load_tiktoken_bpe(
"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
)
special_tokens = {
ENDOFTEXT: 199999,
ENDOFPROMPT: 200018,
}
# This regex could be made more efficient
pat_str = "|".join(
[
r"""[^\r\n\w]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
r"""[^\r\n\w]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
r"""\d{1,3}""",
r""" ?[^\s\w]+[\r\n/]*""",
r"""\s*[\r\n]+""",
r"""\s+(?!\S)""",
r"""\s+""",
]
)
return {
"name": "o200k_base",
"pat_str": pat_str,
"mergeable_ranks": mergeable_ranks,
"special_tokens": special_tokens,
}
@ -85,4 +123,5 @@ ENCODING_CONSTRUCTORS = {
"p50k_base": p50k_base,
"p50k_edit": p50k_edit,
"cl100k_base": cl100k_base,
"o200k_base": o200k_base,
}
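Besides pinning expected hashes and registering o200k_base, this file swaps the GPT-2-style pattern for a shorter one that the inline comment claims is equivalent but faster. A quick, self-contained check of that equivalence on one illustrative sample string:

import re

old_pat = r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\w]+| ?[\d]+| ?[^\s\w]+|\s+(?!\S)|\s+"""
new_pat = r"""'(?:[sdmt]|ll|ve|re)| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+"""
sample = "I'll say we've got 12 apples!"
assert re.findall(old_pat, sample) == re.findall(new_pat, sample)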

View File

@ -1,16 +1,16 @@
Metadata-Version: 2.1
Name: zipp
Version: 3.18.1
Version: 3.18.2
Summary: Backport of pathlib-compatible object wrapper for zip files
Home-page: https://github.com/jaraco/zipp
Author: Jason R. Coombs
Author-email: jaraco@jaraco.com
Author-email: "Jason R. Coombs" <jaraco@jaraco.com>
Project-URL: Homepage, https://github.com/jaraco/zipp
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Requires-Python: >=3.8
Description-Content-Type: text/x-rst
License-File: LICENSE
Provides-Extra: docs
Requires-Dist: sphinx >=3.5 ; extra == 'docs'
@ -20,9 +20,10 @@ Requires-Dist: furo ; extra == 'docs'
Requires-Dist: sphinx-lint ; extra == 'docs'
Requires-Dist: jaraco.tidelift >=1.4 ; extra == 'docs'
Provides-Extra: testing
Requires-Dist: pytest >=6 ; extra == 'testing'
Requires-Dist: pytest !=8.1.*,>=6 ; extra == 'testing'
Requires-Dist: pytest-checkdocs >=2.4 ; extra == 'testing'
Requires-Dist: pytest-cov ; extra == 'testing'
Requires-Dist: pytest-mypy ; extra == 'testing'
Requires-Dist: pytest-enabler >=2.2 ; extra == 'testing'
Requires-Dist: pytest-ruff >=0.2.1 ; extra == 'testing'
Requires-Dist: jaraco.itertools ; extra == 'testing'
@ -30,7 +31,7 @@ Requires-Dist: jaraco.functools ; extra == 'testing'
Requires-Dist: more-itertools ; extra == 'testing'
Requires-Dist: big-O ; extra == 'testing'
Requires-Dist: pytest-ignore-flaky ; extra == 'testing'
Requires-Dist: pytest-mypy ; (platform_python_implementation != "PyPy") and extra == 'testing'
Requires-Dist: jaraco.test ; extra == 'testing'
.. image:: https://img.shields.io/pypi/v/zipp.svg
:target: https://pypi.org/project/zipp
@ -71,7 +72,9 @@ were contributed to different versions in the standard library:
* - zipp
- stdlib
* - 3.15
* - 3.18
- 3.13
* - 3.16
- 3.12
* - 3.5
- 3.11

View File

@ -1,10 +1,10 @@
zipp-3.18.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
zipp-3.18.1.dist-info/LICENSE,sha256=htoPAa6uRjSKPD1GUZXcHOzN55956HdppkuNoEsqR0E,1023
zipp-3.18.1.dist-info/METADATA,sha256=dxGXpoBobQO4X9colqFec8eIGSng-ohxNzeKIM0Wh6U,3461
zipp-3.18.1.dist-info/RECORD,,
zipp-3.18.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
zipp-3.18.1.dist-info/top_level.txt,sha256=iAbdoSHfaGqBfVb2XuR9JqSQHCoOsOtG6y9C_LSpqFw,5
zipp/__init__.py,sha256=IB08yJFuj9F0DkmfBLKQU4Cq75n-UPFeDu1qPKZTPKk,11358
zipp-3.18.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
zipp-3.18.2.dist-info/LICENSE,sha256=htoPAa6uRjSKPD1GUZXcHOzN55956HdppkuNoEsqR0E,1023
zipp-3.18.2.dist-info/METADATA,sha256=v_qTHO-7CH99XLvAV0kA0RtRNMuw-p_WJrzJxUuafEU,3539
zipp-3.18.2.dist-info/RECORD,,
zipp-3.18.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
zipp-3.18.2.dist-info/top_level.txt,sha256=iAbdoSHfaGqBfVb2XuR9JqSQHCoOsOtG6y9C_LSpqFw,5
zipp/__init__.py,sha256=s5hbthFh66EOlVTMKyZ5azMn8y2BrJbTNQx0KsIpcBI,11361
zipp/__pycache__/__init__.cpython-38.pyc,,
zipp/__pycache__/glob.cpython-38.pyc,,
zipp/compat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

View File

@ -263,7 +263,7 @@ class Path:
>>> str(path.parent)
'mem'
If the zipfile has no filename, such attribtues are not
If the zipfile has no filename, such attributes are not
valid and accessing them will raise an Exception.
>>> zf.filename = None