Sync: devchat[main](f6590bfd) Merge pull request #389 from devchat-ai/log_llm_calls
This commit is contained in: parent 501b7ff4ae, commit f256ec7f4d
@@ -31,7 +31,7 @@ Requires-Dist: pathspec (>=0.12.1,<0.13.0)
 Requires-Dist: pydantic (==1.10.14)
 Requires-Dist: rich_click (>=1.6.1,<2.0.0)
 Requires-Dist: tenacity (>=8.2.3,<9.0.0)
-Requires-Dist: tiktoken (>=0.4.0,<0.5.0)
+Requires-Dist: tiktoken (>0.4.0)
 Requires-Dist: tinydb (>=4.7.1,<5.0.0)
 Requires-Dist: urllib3 (<2.0)
 Description-Content-Type: text/markdown
@@ -1,7 +1,7 @@
 ../../../bin/devchat,sha256=a8KMZYH-GZd6OA7nXki105OsOlnCcZkv9SCnT1Fa3UU,260
 devchat-0.2.10.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
 devchat-0.2.10.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-devchat-0.2.10.dist-info/METADATA,sha256=rG2jPljWa__TRpp63OJYF6TE7hh3osGAQgYvEJi4Bn0,7314
+devchat-0.2.10.dist-info/METADATA,sha256=Q5u0xLzfHCNzM0-vKtsoHE3DDbhXi9iRjUxw9KvYApI,7306
 devchat-0.2.10.dist-info/RECORD,,
 devchat-0.2.10.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 devchat-0.2.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
@@ -94,8 +94,8 @@ devchat/llm/__pycache__/pipeline.cpython-38.pyc,,
 devchat/llm/__pycache__/text_confirm.cpython-38.pyc,,
 devchat/llm/__pycache__/tools_call.cpython-38.pyc,,
 devchat/llm/chat.py,sha256=XWXUIpbWCMiuMCtBchrQpBpuyLwjga5KcCFzDoapbcc,3377
-devchat/llm/openai.py,sha256=8hK2OByDRq8sPgROf-UvVPA8Oz0lSDfMaAFSKh0D644,6208
-devchat/llm/pipeline.py,sha256=D214HASOUA7DsUm63_QDVFTYsHShPrrBwTbd0hM3tRI,1920
+devchat/llm/openai.py,sha256=VnYIl2XB7qNDuRWJxAcwMGQk8v9JwL8aZ-J-SXduN4Y,6492
+devchat/llm/pipeline.py,sha256=qxOCMYJi-TlA_gBN2r6ImG_U5qzcAWnbZ0oThJ1RbTc,2267
 devchat/llm/text_confirm.py,sha256=sdt7AUFDcsOZ0fLfS0vtjdS2_8xhkTF6aF8Sn05OlI0,1462
 devchat/llm/tools_call.py,sha256=OBObtFAzuqEJPq7Ro9hR4oirrcMtxGchlMQl8vL1CBc,8038
 devchat/memory/__init__.py,sha256=aPR0Dt8dcf4oWXu2HME2fFSpDJDeoBayPWMFOpO8v5k,133
@@ -7,6 +7,8 @@ from typing import Dict, List
 import httpx
 import openai
 
+from devchat.ide import IDEService
+
 from .pipeline import (
     RetryException,
     exception_handle,
@@ -83,6 +85,7 @@ def retry_timeout(chunks):
         for chunk in chunks:
            yield chunk
     except (openai.APIConnectionError, openai.APITimeoutError) as err:
+        IDEService().ide_logging("info", f"in retry_timeout: err: {err}")
         raise RetryException(err) from err
 
 
@@ -127,8 +130,10 @@ def content_to_json(content):
         response_obj = json.loads(content_no_block)
         return response_obj
     except json.JSONDecodeError as err:
+        IDEService().ide_logging("info", f"in content_to_json: json decode error: {err}")
         raise RetryException(err) from err
     except Exception as err:
+        IDEService().ide_logging("info", f"in content_to_json: other error: {err}")
         raise err
 
 
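For illustration, a minimal self-contained sketch of the JSON-repair path these hunks instrument: strip a Markdown code fence, try `json.loads`, and turn decode failures into a retryable error the caller can log and retry. `RetryException` and `content_to_json` are re-implemented here rather than imported from devchat, logging is replaced by `print`, and the fence-stripping step is an assumption (the diff only shows `content_no_block`).

```python
import json


class RetryException(Exception):
    # Wraps the original error so a retry loop can re-raise it later.
    def __init__(self, err):
        super().__init__(str(err))
        self.error = err


def content_to_json(content: str) -> dict:
    # Assumed pre-processing: drop a ```json ... ``` fence around the model output.
    content_no_block = content.strip()
    if content_no_block.startswith("```json"):
        content_no_block = content_no_block[len("```json"):]
    if content_no_block.endswith("```"):
        content_no_block = content_no_block[: -len("```")]
    try:
        return json.loads(content_no_block)
    except json.JSONDecodeError as err:
        print(f"in content_to_json: json decode error: {err}")  # devchat logs this via IDEService
        raise RetryException(err) from err


print(content_to_json('```json\n{"answer": 42}\n```'))  # -> {'answer': 42}
```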
@@ -1,8 +1,11 @@
 import sys
+import time
 from typing import Dict
 
 import openai
 
+from devchat.ide import IDEService
+
 
 class RetryException(Exception):
     def __init__(self, err):
@@ -17,8 +20,10 @@ def retry(func, times):
             except RetryException as err:
                 if index + 1 == times:
                     raise err.error
+                IDEService().ide_logging("debug", f"has retries: {index + 1}")
                 continue
             except Exception as err:
+                IDEService().ide_logging("info", f"exception: {err}")
                 raise err.error
 
     return wrapper
@@ -59,6 +64,7 @@ def exception_handle(func, handler):
 
 def pipeline(*funcs):
     def wrapper(*args, **kwargs):
+        start_time = time.time()
         for index, func in enumerate(funcs):
             if index > 0:
                 if isinstance(args, Dict) and args.get("__type__", None) == "parallel":
@@ -67,6 +73,8 @@ def pipeline(*funcs):
                     args = func(args)
             else:
                 args = func(*args, **kwargs)
+        end_time = time.time()
+        IDEService().ide_logging("debug", f"time on pipeline: {end_time-start_time}")
         return args
 
     return wrapper
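For illustration, a self-contained sketch of the retry loop that these pipeline.py hunks instrument. `RetryException` and `retry` follow the shapes shown in the diff but are re-implemented here, with `print` standing in for `IDEService().ide_logging` (only available inside the IDE); `flaky_llm_call` is a made-up stand-in for an LLM request.

```python
class RetryException(Exception):
    def __init__(self, err):
        super().__init__(str(err))
        self.error = err


def retry(func, times):
    # Re-run the wrapped callable on RetryException; give up after `times` attempts.
    def wrapper(*args, **kwargs):
        for index in range(times):
            try:
                return func(*args, **kwargs)
            except RetryException as err:
                if index + 1 == times:
                    raise err.error
                print(f"has retries: {index + 1}")  # devchat logs this line via IDEService
                continue

    return wrapper


attempts = {"count": 0}


def flaky_llm_call(prompt):
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise RetryException(TimeoutError("transient timeout"))
    return f"response to: {prompt}"


print(retry(flaky_llm_call, times=5)("hello"))  # succeeds on the third attempt
```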
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: openai
-Version: 1.29.0
+Version: 1.30.1
 Summary: The official Python library for the openai API
 Project-URL: Homepage, https://github.com/openai/openai-python
 Project-URL: Repository, https://github.com/openai/openai-python
@@ -1,10 +1,10 @@
 ../../../bin/openai,sha256=OM6FORuLrwfh02Zj_-DY6nOIKjU9ftrONpb5slPZlhM,253
-openai-1.29.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
-openai-1.29.0.dist-info/METADATA,sha256=cZxRLlTiimxDXv2dE0M5Se6E_fMw0axdYGJzNsnFTOU,21941
-openai-1.29.0.dist-info/RECORD,,
-openai-1.29.0.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
-openai-1.29.0.dist-info/entry_points.txt,sha256=kAYhQEmziJwsKs5raYAIOvJ2LWmbz5dulEXOzsY71ro,43
-openai-1.29.0.dist-info/licenses/LICENSE,sha256=d0M6HDjQ76tf255XPlAGkIoECMe688MXcGEYsOFySfI,11336
+openai-1.30.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+openai-1.30.1.dist-info/METADATA,sha256=DzzOq2T6f1fkTmkYH-M9wfjpZaIaWqkvBjG6rnWBqDw,21941
+openai-1.30.1.dist-info/RECORD,,
+openai-1.30.1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+openai-1.30.1.dist-info/entry_points.txt,sha256=kAYhQEmziJwsKs5raYAIOvJ2LWmbz5dulEXOzsY71ro,43
+openai-1.30.1.dist-info/licenses/LICENSE,sha256=d0M6HDjQ76tf255XPlAGkIoECMe688MXcGEYsOFySfI,11336
 openai/__init__.py,sha256=hTM-EsfeafKBLu-n5AVSQVDB2MMBGnZoLtATFeW-OL0,10007
 openai/__main__.py,sha256=bYt9eEaoRQWdejEHFD8REx9jxVEdZptECFsV7F49Ink,30
 openai/__pycache__/__init__.cpython-38.pyc,,
@ -64,7 +64,7 @@ openai/_utils/_sync.py,sha256=8zEEYfir8iCUcAMFtWd8cDi8NVEaZonc4sfLAYr16io,2269
|
||||
openai/_utils/_transform.py,sha256=NCz3q9_O-vuj60xVe-qzhEQ8uJWlZWJTsM-GwHDccf8,12958
|
||||
openai/_utils/_typing.py,sha256=tFbktdpdHCQliwzGsWysgn0P5H0JRdagkZdb_LegGkY,3838
|
||||
openai/_utils/_utils.py,sha256=1_mm0IcPWDckpwQrb5chWTqeG7JWst_ycXaoFUTXbzE,11497
|
||||
openai/_version.py,sha256=7kEu6q_6Mk9wI8Ot8PAoWdAXrIe5hSH3VkSKRvWdNSc,159
|
||||
openai/_version.py,sha256=K2r2kM0eNbvNjtxPpgtRbJoHYQWganeQIMbwtYW_BDA,159
|
||||
openai/cli/__init__.py,sha256=soGgtqyomgddl92H0KJRqHqGuaXIaghq86qkzLuVp7U,31
|
||||
openai/cli/__pycache__/__init__.cpython-38.pyc,,
|
||||
openai/cli/__pycache__/_cli.cpython-38.pyc,,
|
||||
@ -135,12 +135,12 @@ openai/resources/audio/audio.py,sha256=1HHcDRWT58KshYelRdSnJs-0bvMBRS1vOhnU-h_oP
|
||||
openai/resources/audio/speech.py,sha256=A4_SwpCesEfHg89cxazNdrHz8JxNvUp5LlLNoMqo-0w,7876
|
||||
openai/resources/audio/transcriptions.py,sha256=bBdQZXzjamZIbe5R_Ji9JJ6W9nJCNN7EwQVinu572Pk,11128
|
||||
openai/resources/audio/translations.py,sha256=_NoBAOXYqMEtjeUhdoHF3DNb-UqnhqVrmfqgITvhajI,9070
|
||||
openai/resources/batches.py,sha256=HpMvKfSgC3F5ea8ZlmvvnJ5A0tkpzjMJkAioo4vk0Cs,17614
|
||||
openai/resources/batches.py,sha256=QsK-LsjUuW9rzRyLfgmAj-e9Idve1GAUuG4JxJ4vPWA,18188
|
||||
openai/resources/beta/__init__.py,sha256=nXoV4P8WCrbEZuNMtptbIuy_LqlVafY9lJ2qfW35GFc,1636
|
||||
openai/resources/beta/__pycache__/__init__.cpython-38.pyc,,
|
||||
openai/resources/beta/__pycache__/assistants.cpython-38.pyc,,
|
||||
openai/resources/beta/__pycache__/beta.cpython-38.pyc,,
|
||||
openai/resources/beta/assistants.py,sha256=dGJLZIqkpeS_6DTYTVmL7Gb8lRXm08_miXSGwUJI4Yo,39476
|
||||
openai/resources/beta/assistants.py,sha256=jE9tf1oWbDEf28WRRD2_lgg_pkz52aHi0xM0-B7cuwI,39768
|
||||
openai/resources/beta/beta.py,sha256=xw_dfi9ZpyRG4ChwweQtirWwsWxhAA4mXSV46D7pS5M,4485
|
||||
openai/resources/beta/threads/__init__.py,sha256=fQ_qdUVSfouVS5h47DlTb5mamChT4K-v-siPuuAB6do,1177
|
||||
openai/resources/beta/threads/__pycache__/__init__.cpython-38.pyc,,
|
||||
@ -151,9 +151,9 @@ openai/resources/beta/threads/runs/__init__.py,sha256=2FfDaqwmJJCd-IVpY_CrzWcFvw
|
||||
openai/resources/beta/threads/runs/__pycache__/__init__.cpython-38.pyc,,
|
||||
openai/resources/beta/threads/runs/__pycache__/runs.cpython-38.pyc,,
|
||||
openai/resources/beta/threads/runs/__pycache__/steps.cpython-38.pyc,,
|
||||
openai/resources/beta/threads/runs/runs.py,sha256=trveAGqtbYxNTdLct6xjcFJOiVJyYcTks1rsDDFqYOI,148671
|
||||
openai/resources/beta/threads/runs/runs.py,sha256=06N5t4J-bfTF6iFeuiJAUDeGi8hgk91rTOVLqChxlxM,149137
|
||||
openai/resources/beta/threads/runs/steps.py,sha256=uRykb4JapSNZCF8OD54f5qOWtrp2GoU1k5uAZgA4kAk,12223
|
||||
openai/resources/beta/threads/threads.py,sha256=-X8O2UODf3TIvXw6iiRTl7wcw50rJVLoLmj94sjrSwE,100560
|
||||
openai/resources/beta/threads/threads.py,sha256=IMyZG0pD7a_ZT2UJ83MxgFh8ShXna_HFdIJqGqLH1rs,100998
|
||||
openai/resources/beta/vector_stores/__init__.py,sha256=11Xn1vhgndWiI0defJHv31vmbtbDgh2GwZT3gX8GgHk,1296
|
||||
openai/resources/beta/vector_stores/__pycache__/__init__.cpython-38.pyc,,
|
||||
openai/resources/beta/vector_stores/__pycache__/file_batches.cpython-38.pyc,,
|
||||
@ -170,7 +170,7 @@ openai/resources/chat/chat.py,sha256=Edexhbq1anfSS_I0wNRQb7rx1OV6-rq4sxgVlYDGb6Y
|
||||
openai/resources/chat/completions.py,sha256=uMtKJiYRRIZ8o2MFwNTB2Kq4Tgt0KBDP2LP2B6uyyTQ,68761
|
||||
openai/resources/completions.py,sha256=4Rfv9o3XwI5GRfhN1RD4tEgNn0I2jb6TRW6j0b6bpZc,58712
|
||||
openai/resources/embeddings.py,sha256=cMSXtMc_7mBqlSiQ99B7qXYoRLGyoeIFazyYQ0jJ1O4,10755
|
||||
openai/resources/files.py,sha256=VYmoTHNjENqDRiyQGl0ZwisIy7ysP5NTGR2B8uFJDXk,26238
|
||||
openai/resources/files.py,sha256=Hdu7an1HsoYIVTp7OJiaDF2m9YmYyHwpr9_Nz8Q6DqU,26392
|
||||
openai/resources/fine_tuning/__init__.py,sha256=s6uoq7gM4gwoywdOOZQkPeYiSbUl-OwpeuMhwJJk0lc,837
|
||||
openai/resources/fine_tuning/__pycache__/__init__.cpython-38.pyc,,
|
||||
openai/resources/fine_tuning/__pycache__/fine_tuning.cpython-38.pyc,,
|
||||
@ -227,7 +227,7 @@ openai/types/audio/transcription_create_params.py,sha256=H7LOzb4VHwhF_cm0MXMIDgf
|
||||
openai/types/audio/translation.py,sha256=_PhTtQ-s1yc-4kAKlgc88FTqUpXnNYfM2ld5IuRRGkA,195
|
||||
openai/types/audio/translation_create_params.py,sha256=pynqbAozfcVwu1U6C6xvauZSFlQxIz1cswSXJLfRI30,1506
|
||||
openai/types/batch.py,sha256=eIOIaJnDuv93fdefTI0WRfTm7MZH8gLBdF0B12JCiZw,2787
|
||||
openai/types/batch_create_params.py,sha256=Kh4ZGVNBFpO3mHakKNSktaUPc-cLpBrlh9RqyLjsnqk,1183
|
||||
openai/types/batch_create_params.py,sha256=vNgtioC1ADnTCdEQ6vyOlAvtq1PBioRvnBPJduz4Xoo,1440
|
||||
openai/types/batch_error.py,sha256=Xxl-gYm0jerpYyI-mKSSVxRMQRubkoLUiOP9U3v72EM,622
|
||||
openai/types/batch_list_params.py,sha256=X1_sfRspuIMSDyXWVh0YnJ9vJLeOOH66TrvgEHueC84,705
|
||||
openai/types/batch_request_counts.py,sha256=nOzdL84OlZRycVNW99EDkdjCFqqKh68emaWT4Lx7dBE,410
|
||||
@ -267,8 +267,8 @@ openai/types/beta/__pycache__/vector_store_create_params.cpython-38.pyc,,
|
||||
openai/types/beta/__pycache__/vector_store_deleted.cpython-38.pyc,,
|
||||
openai/types/beta/__pycache__/vector_store_list_params.cpython-38.pyc,,
|
||||
openai/types/beta/__pycache__/vector_store_update_params.cpython-38.pyc,,
|
||||
openai/types/beta/assistant.py,sha256=9lrwz2SdGMf553qzYltklaVSKtdQIfR7WKBFJgUr_cg,4615
|
||||
openai/types/beta/assistant_create_params.py,sha256=5vqnBevWOOfO5DvG4EWpQ7B_heMCc9rT1eEgYm068RQ,6122
|
||||
openai/types/beta/assistant.py,sha256=m5bgNTyelK6MA1RUrdyLg2yTalyR0Xm67K6iBOqlwSk,4674
|
||||
openai/types/beta/assistant_create_params.py,sha256=AntnxPRSPdSSOYrX7anCN54aeTYry3YddIFbEGta_z0,6181
|
||||
openai/types/beta/assistant_deleted.py,sha256=bTTUl5FPHTBI5nRm7d0sGuR9VCSBDZ-IbOn9G_IpmJQ,301
|
||||
openai/types/beta/assistant_list_params.py,sha256=1-osjSX8tKieHSP0xaKBBU8j-J01fKrrxIJRHDudFHk,1220
|
||||
openai/types/beta/assistant_response_format.py,sha256=-JYxEihoHEHMak9E7KiyD5Zh_f3c-155j110mBDTFNE,378
|
||||
@ -284,7 +284,7 @@ openai/types/beta/assistant_tool_choice_option.py,sha256=WaLj1FSgQyLrss5hoKbmb19
|
||||
openai/types/beta/assistant_tool_choice_option_param.py,sha256=ODCix7ElFxtyABiL09OhaYbQy9RjICCSmILeqBFWeLE,402
|
||||
openai/types/beta/assistant_tool_choice_param.py,sha256=NOWx9SzZEwYaHeAyFZTQlG3pmogMNXzjPJDGQUlbv7Q,572
|
||||
openai/types/beta/assistant_tool_param.py,sha256=xsB-Vq93uyS69m5zMoAc7keLXB_OSwEUH6XgB2g3ex4,450
|
||||
openai/types/beta/assistant_update_params.py,sha256=8YGYglHCQhoBCleaaKsDmR13LijeDgrhIhQ5Lo8B1L0,4363
|
||||
openai/types/beta/assistant_update_params.py,sha256=Z4MA4GtxZzV3a6PlUShoDmDHAIwo7AyVk9O5wUnFhe8,4422
|
||||
openai/types/beta/chat/__init__.py,sha256=OKfJYcKb4NObdiRObqJV_dOyDQ8feXekDUge2o_4pXQ,122
|
||||
openai/types/beta/chat/__pycache__/__init__.cpython-38.pyc,,
|
||||
openai/types/beta/code_interpreter_tool.py,sha256=7mgQc9OtD_ZUnZeNhoobMFcmmvtZPFCNYGB-PEnNnfs,333
|
||||
@ -294,7 +294,7 @@ openai/types/beta/file_search_tool_param.py,sha256=nAON5EUoano9jVPYZMzMYMLCxde_4
|
||||
openai/types/beta/function_tool.py,sha256=oYGJfcfPpUohKw2ikgshDjOI1HXCK-5pAWyegYNezeU,397
|
||||
openai/types/beta/function_tool_param.py,sha256=T_k2OX1OULgkrHHXw0rY_J-O0y5qA0lM-B58C64YyfM,453
|
||||
openai/types/beta/thread.py,sha256=wd00j3ogUpOa_O0Sf1m6H4f8t1Nf05DKWiK_4m33O6s,2013
|
||||
openai/types/beta/thread_create_and_run_params.py,sha256=RXTfHQiS8dktu0bkomzqrVKHopBJoqAMSAxHpgYDTs8,12692
|
||||
openai/types/beta/thread_create_and_run_params.py,sha256=fd4N3XYkRhBkBJlRePjH2ZXvJ2oAgDyMoR103j4kzXw,12751
|
||||
openai/types/beta/thread_create_params.py,sha256=yu1ChXFvm6FQV4486PWxes88_jg3-yspp2jDwGZOBlw,4509
|
||||
openai/types/beta/thread_deleted.py,sha256=MaYG_jZIjSiB9h_ZBiTtpMsRSwFKkCY83ziM5GO_oUk,292
|
||||
openai/types/beta/thread_update_params.py,sha256=RYsR88YHwReKLiLqnLlnWiReiVIGlEGvVV9-g_wptgM,1750
|
||||
@ -369,10 +369,10 @@ openai/types/beta/threads/message_delta_event.py,sha256=7SpE4Dd3Lrc_cm97SzBwZzGG
|
||||
openai/types/beta/threads/message_list_params.py,sha256=LXqc3deSkKO6VN337OlQ4fzG7dfgBE7Iv_CLzZHhbhw,1294
|
||||
openai/types/beta/threads/message_update_params.py,sha256=bw6_U-vZA4c9_CDmeGOh7IEPIm8BU3BBOKtxnii0LKA,629
|
||||
openai/types/beta/threads/required_action_function_tool_call.py,sha256=XsR4OBbxI-RWteLvhcLEDBan6eUUGvhLORFRKjPbsLg,888
|
||||
openai/types/beta/threads/run.py,sha256=D6TDDeIGMS39jc2TVY4HrVw0mpBDXhro9VIzeH2ejdg,7656
|
||||
openai/types/beta/threads/run_create_params.py,sha256=cqbzHcQIOEIustQs1YGv8bMXuVof9NevPnyn2N9Ok7A,9035
|
||||
openai/types/beta/threads/run.py,sha256=DRc46FFjOudl1VAYKM2ni63ngCg_8TTRoRoTom9KWjU,7729
|
||||
openai/types/beta/threads/run_create_params.py,sha256=J2Une2MRcK0LgPOpGuw5ngg_kZykivtNbR1-TpHmzmw,9094
|
||||
openai/types/beta/threads/run_list_params.py,sha256=73poqeRcb5TEsIVn7OzJ_g9OajNokEzpCVLzVNKZmPk,1208
|
||||
openai/types/beta/threads/run_status.py,sha256=6KPJB7l0YfGSKzx4wuIP8SDiZSiaD2nb0KOf0uRPDP4,282
|
||||
openai/types/beta/threads/run_status.py,sha256=ky3dh-uD5OhuQB7e4BMQjRXvIDOUJnecTKGXr_PNcFY,329
|
||||
openai/types/beta/threads/run_submit_tool_outputs_params.py,sha256=aDrg0FZZoJKaPVQzcFjUg4ZKaeW8KF6UJBxhJEIjC2I,1630
|
||||
openai/types/beta/threads/run_update_params.py,sha256=76dWMNa3zCUliemCdwWv6p07GNeMYCdZoJs9KNbdZSE,621
|
||||
openai/types/beta/threads/runs/__init__.py,sha256=uhxk5F1_5c5wg2_p70AjlOy9cE3Ga8-ILn4Ep-gcls4,1515
|
@@ -1,4 +1,4 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
 __title__ = "openai"
-__version__ = "1.29.0"  # x-release-please-version
+__version__ = "1.30.1"  # x-release-please-version
@@ -40,7 +40,7 @@ class Batches(SyncAPIResource):
         self,
         *,
         completion_window: Literal["24h"],
-        endpoint: Literal["/v1/chat/completions", "/v1/embeddings"],
+        endpoint: Literal["/v1/chat/completions", "/v1/embeddings", "/v1/completions"],
         input_file_id: str,
         metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
@@ -58,7 +58,9 @@ class Batches(SyncAPIResource):
              is supported.
 
          endpoint: The endpoint to be used for all requests in the batch. Currently
-             `/v1/chat/completions` and `/v1/embeddings` are supported.
+             `/v1/chat/completions`, `/v1/embeddings`, and `/v1/completions` are supported.
+             Note that `/v1/embeddings` batches are also restricted to a maximum of 50,000
+             embedding inputs across all requests in the batch.
 
          input_file_id: The ID of an uploaded file that contains requests for the new batch.
 
@@ -67,7 +69,8 @@ class Batches(SyncAPIResource):
 
              Your input file must be formatted as a
              [JSONL file](https://platform.openai.com/docs/api-reference/batch/requestInput),
-             and must be uploaded with the purpose `batch`.
+             and must be uploaded with the purpose `batch`. The file can contain up to 50,000
+             requests, and can be up to 100 MB in size.
 
          metadata: Optional custom metadata for the batch.
 
@@ -228,7 +231,7 @@ class AsyncBatches(AsyncAPIResource):
         self,
         *,
         completion_window: Literal["24h"],
-        endpoint: Literal["/v1/chat/completions", "/v1/embeddings"],
+        endpoint: Literal["/v1/chat/completions", "/v1/embeddings", "/v1/completions"],
         input_file_id: str,
         metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
@@ -246,7 +249,9 @@ class AsyncBatches(AsyncAPIResource):
              is supported.
 
          endpoint: The endpoint to be used for all requests in the batch. Currently
-             `/v1/chat/completions` and `/v1/embeddings` are supported.
+             `/v1/chat/completions`, `/v1/embeddings`, and `/v1/completions` are supported.
+             Note that `/v1/embeddings` batches are also restricted to a maximum of 50,000
+             embedding inputs across all requests in the batch.
 
          input_file_id: The ID of an uploaded file that contains requests for the new batch.
 
@@ -255,7 +260,8 @@ class AsyncBatches(AsyncAPIResource):
 
              Your input file must be formatted as a
              [JSONL file](https://platform.openai.com/docs/api-reference/batch/requestInput),
-             and must be uploaded with the purpose `batch`.
+             and must be uploaded with the purpose `batch`. The file can contain up to 50,000
+             requests, and can be up to 100 MB in size.
 
          metadata: Optional custom metadata for the batch.
 
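The batches.py hunks above widen the `endpoint` literal to accept `/v1/completions`. A usage sketch against the documented client API (it assumes `OPENAI_API_KEY` is set and that `batch_input.jsonl` already holds valid request lines; the file name and metadata are illustrative only):

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# Upload the JSONL request file with purpose="batch", then start a batch against
# the newly supported /v1/completions endpoint.
batch_file = client.files.create(file=open("batch_input.jsonl", "rb"), purpose="batch")
batch = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/completions",  # now allowed alongside /v1/chat/completions and /v1/embeddings
    completion_window="24h",  # currently the only supported window
    metadata={"job": "nightly-eval"},  # optional, illustrative
)
print(batch.id, batch.status)
```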
@ -110,8 +110,9 @@ class Assistants(SyncAPIResource):
|
||||
name: The name of the assistant. The maximum length is 256 characters.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
@ -254,8 +255,9 @@ class Assistants(SyncAPIResource):
|
||||
name: The name of the assistant. The maximum length is 256 characters.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
@ -497,8 +499,9 @@ class AsyncAssistants(AsyncAPIResource):
|
||||
name: The name of the assistant. The maximum length is 256 characters.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
@ -641,8 +644,9 @@ class AsyncAssistants(AsyncAPIResource):
|
||||
name: The name of the assistant. The maximum length is 256 characters.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
|
@ -164,8 +164,9 @@ class Runs(SyncAPIResource):
|
||||
assistant will be used.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
@ -314,8 +315,9 @@ class Runs(SyncAPIResource):
|
||||
assistant will be used.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
@ -460,8 +462,9 @@ class Runs(SyncAPIResource):
|
||||
assistant will be used.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
@@ -1097,7 +1100,7 @@ class Runs(SyncAPIResource):
         if is_given(poll_interval_ms):
             extra_headers["X-Stainless-Custom-Poll-Interval"] = str(poll_interval_ms)
 
-        terminal_states = {"requires_action", "cancelled", "completed", "failed", "expired"}
+        terminal_states = {"requires_action", "cancelled", "completed", "failed", "expired", "incomplete"}
         while True:
             response = self.with_raw_response.retrieve(
                 thread_id=thread_id,
@ -1718,8 +1721,9 @@ class AsyncRuns(AsyncAPIResource):
|
||||
assistant will be used.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
@ -1868,8 +1872,9 @@ class AsyncRuns(AsyncAPIResource):
|
||||
assistant will be used.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
@ -2014,8 +2019,9 @@ class AsyncRuns(AsyncAPIResource):
|
||||
assistant will be used.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
@@ -2653,7 +2659,7 @@ class AsyncRuns(AsyncAPIResource):
         if is_given(poll_interval_ms):
             extra_headers["X-Stainless-Custom-Poll-Interval"] = str(poll_interval_ms)
 
-        terminal_states = {"requires_action", "cancelled", "completed", "failed", "expired"}
+        terminal_states = {"requires_action", "cancelled", "completed", "failed", "expired", "incomplete"}
         while True:
             response = await self.with_raw_response.retrieve(
                 thread_id=thread_id,
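Both runs.py hunks add `incomplete` to the set of states that stop the SDK's internal polling helpers. A hand-rolled polling loop that honours the same terminal set could look like this (the thread/run IDs are placeholders and the sleep interval is arbitrary):

```python
import time

from openai import OpenAI

client = OpenAI()

TERMINAL_STATES = {"requires_action", "cancelled", "completed", "failed", "expired", "incomplete"}


def wait_for_run(thread_id: str, run_id: str, poll_interval: float = 1.0):
    # Keep retrieving the run until it reaches a state the SDK now treats as terminal.
    while True:
        run = client.beta.threads.runs.retrieve(run_id, thread_id=thread_id)
        if run.status in TERMINAL_STATES:
            return run
        time.sleep(poll_interval)
```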
@ -341,8 +341,9 @@ class Threads(SyncAPIResource):
|
||||
assistant will be used.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
@ -490,8 +491,9 @@ class Threads(SyncAPIResource):
|
||||
assistant will be used.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
@ -635,8 +637,9 @@ class Threads(SyncAPIResource):
|
||||
assistant will be used.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
@ -1331,8 +1334,9 @@ class AsyncThreads(AsyncAPIResource):
|
||||
assistant will be used.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
@ -1480,8 +1484,9 @@ class AsyncThreads(AsyncAPIResource):
|
||||
assistant will be used.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
@ -1625,8 +1630,9 @@ class AsyncThreads(AsyncAPIResource):
|
||||
assistant will be used.
|
||||
|
||||
response_format: Specifies the format that the model must output. Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
[GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
|
@ -62,14 +62,18 @@ class Files(SyncAPIResource):
|
||||
) -> FileObject:
|
||||
"""Upload a file that can be used across various endpoints.
|
||||
|
||||
The size of all the
|
||||
files uploaded by one organization can be up to 100 GB.
|
||||
Individual files can be
|
||||
up to 512 MB, and the size of all files uploaded by one organization can be up
|
||||
to 100 GB.
|
||||
|
||||
The size of individual files can be a maximum of 512 MB or 2 million tokens for
|
||||
Assistants. See the
|
||||
[Assistants Tools guide](https://platform.openai.com/docs/assistants/tools) to
|
||||
learn more about the types of files supported. The Fine-tuning API only supports
|
||||
`.jsonl` files.
|
||||
The Assistants API supports files up to 2 million tokens and of specific file
|
||||
types. See the
|
||||
[Assistants Tools guide](https://platform.openai.com/docs/assistants/tools) for
|
||||
details.
|
||||
|
||||
The Fine-tuning API only supports `.jsonl` files.
|
||||
|
||||
The Batch API only supports `.jsonl` files up to 100 MB in size.
|
||||
|
||||
Please [contact us](https://help.openai.com/) if you need to increase these
|
||||
storage limits.
|
||||
@ -335,14 +339,18 @@ class AsyncFiles(AsyncAPIResource):
|
||||
) -> FileObject:
|
||||
"""Upload a file that can be used across various endpoints.
|
||||
|
||||
The size of all the
|
||||
files uploaded by one organization can be up to 100 GB.
|
||||
Individual files can be
|
||||
up to 512 MB, and the size of all files uploaded by one organization can be up
|
||||
to 100 GB.
|
||||
|
||||
The size of individual files can be a maximum of 512 MB or 2 million tokens for
|
||||
Assistants. See the
|
||||
[Assistants Tools guide](https://platform.openai.com/docs/assistants/tools) to
|
||||
learn more about the types of files supported. The Fine-tuning API only supports
|
||||
`.jsonl` files.
|
||||
The Assistants API supports files up to 2 million tokens and of specific file
|
||||
types. See the
|
||||
[Assistants Tools guide](https://platform.openai.com/docs/assistants/tools) for
|
||||
details.
|
||||
|
||||
The Fine-tuning API only supports `.jsonl` files.
|
||||
|
||||
The Batch API only supports `.jsonl` files up to 100 MB in size.
|
||||
|
||||
Please [contact us](https://help.openai.com/) if you need to increase these
|
||||
storage limits.
|
||||
|
@@ -15,10 +15,12 @@ class BatchCreateParams(TypedDict, total=False):
     Currently only `24h` is supported.
     """
 
-    endpoint: Required[Literal["/v1/chat/completions", "/v1/embeddings"]]
+    endpoint: Required[Literal["/v1/chat/completions", "/v1/embeddings", "/v1/completions"]]
     """The endpoint to be used for all requests in the batch.
 
-    Currently `/v1/chat/completions` and `/v1/embeddings` are supported.
+    Currently `/v1/chat/completions`, `/v1/embeddings`, and `/v1/completions` are
+    supported. Note that `/v1/embeddings` batches are also restricted to a maximum
+    of 50,000 embedding inputs across all requests in the batch.
     """
 
     input_file_id: Required[str]
@@ -29,7 +31,8 @@ class BatchCreateParams(TypedDict, total=False):
 
     Your input file must be formatted as a
     [JSONL file](https://platform.openai.com/docs/api-reference/batch/requestInput),
-    and must be uploaded with the purpose `batch`.
+    and must be uploaded with the purpose `batch`. The file can contain up to 50,000
+    requests, and can be up to 100 MB in size.
     """
 
     metadata: Optional[Dict[str, str]]
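With `/v1/completions` now accepted by `BatchCreateParams`, each line of the uploaded JSONL input file targets that endpoint. A hypothetical single request line (the model, prompt, and `custom_id` values are illustrative):

```python
import json

# One request per line in the batch input file; custom_id is any unique string
# used to match results back to requests.
request_line = {
    "custom_id": "request-1",
    "method": "POST",
    "url": "/v1/completions",
    "body": {"model": "gpt-3.5-turbo-instruct", "prompt": "Say hello", "max_tokens": 16},
}
print(json.dumps(request_line))
```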
@ -85,9 +85,9 @@ class Assistant(BaseModel):
|
||||
response_format: Optional[AssistantResponseFormatOption] = None
|
||||
"""Specifies the format that the model must output.
|
||||
|
||||
Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
Compatible with [GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
|
@ -77,9 +77,9 @@ class AssistantCreateParams(TypedDict, total=False):
|
||||
response_format: Optional[AssistantResponseFormatOptionParam]
|
||||
"""Specifies the format that the model must output.
|
||||
|
||||
Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
Compatible with [GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
|
@ -45,9 +45,9 @@ class AssistantUpdateParams(TypedDict, total=False):
|
||||
response_format: Optional[AssistantResponseFormatOptionParam]
|
||||
"""Specifies the format that the model must output.
|
||||
|
||||
Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
Compatible with [GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
|
@ -108,9 +108,9 @@ class ThreadCreateAndRunParamsBase(TypedDict, total=False):
|
||||
response_format: Optional[AssistantResponseFormatOptionParam]
|
||||
"""Specifies the format that the model must output.
|
||||
|
||||
Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
Compatible with [GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
|
@@ -160,9 +160,9 @@ class Run(BaseModel):
     response_format: Optional[AssistantResponseFormatOption] = None
     """Specifies the format that the model must output.
 
-    Compatible with
-    [GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
-    all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
+    Compatible with [GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
+    [GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
+    and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
 
     Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
     message the model generates is valid JSON.
@@ -182,8 +182,8 @@ class Run(BaseModel):
     status: RunStatus
     """
     The status of the run, which can be either `queued`, `in_progress`,
-    `requires_action`, `cancelling`, `cancelled`, `failed`, `completed`, or
-    `expired`.
+    `requires_action`, `cancelling`, `cancelled`, `failed`, `completed`,
+    `incomplete`, or `expired`.
     """
 
     thread_id: str
@ -110,9 +110,9 @@ class RunCreateParamsBase(TypedDict, total=False):
|
||||
response_format: Optional[AssistantResponseFormatOptionParam]
|
||||
"""Specifies the format that the model must output.
|
||||
|
||||
Compatible with
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) and
|
||||
all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
Compatible with [GPT-4o](https://platform.openai.com/docs/models/gpt-4o),
|
||||
[GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4),
|
||||
and all GPT-3.5 Turbo models since `gpt-3.5-turbo-1106`.
|
||||
|
||||
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
|
||||
message the model generates is valid JSON.
|
||||
|
@@ -5,5 +5,13 @@ from typing_extensions import Literal
 __all__ = ["RunStatus"]
 
 RunStatus = Literal[
-    "queued", "in_progress", "requires_action", "cancelling", "cancelled", "failed", "completed", "expired"
+    "queued",
+    "in_progress",
+    "requires_action",
+    "cancelling",
+    "cancelled",
+    "failed",
+    "completed",
+    "incomplete",
+    "expired",
 ]
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: rich-click
-Version: 1.8.1
+Version: 1.8.2
 Summary: Format click help output nicely with rich
 Author-email: Phil Ewels <phil@ewels.co.uk>
 Maintainer-email: Phil Ewels <phil@ewels.co.uk>, Daniel Reeves <xdanielreeves@gmail.com>
@@ -1,12 +1,12 @@
 ../../../bin/rich-click,sha256=ueTpBQA5XZGwZsxmrQ8SOCO4y6uUvAchzTiBaaXEWmU,257
-rich_click-1.8.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
-rich_click-1.8.1.dist-info/LICENSE,sha256=1GDP5mZhei-Gy3xm-QQfCodhHIsMHy8_Z0ogIq3B8q8,1067
-rich_click-1.8.1.dist-info/METADATA,sha256=B-td1s31-T6sp-ju-AI3LJSG4JWhYeBpEXBxBmxiPVM,7880
-rich_click-1.8.1.dist-info/RECORD,,
-rich_click-1.8.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-rich_click-1.8.1.dist-info/entry_points.txt,sha256=q-JckrJEfhmzklT5lIpe1oTf68OaWJpAy1Mik7lGeXs,51
-rich_click-1.8.1.dist-info/top_level.txt,sha256=tKHPQk1z9Wd8Lu2HqxHQyF7oqOeQE5__SUDHezQZ4WE,11
-rich_click/__init__.py,sha256=_TOw4JYtBxYm4pD4_GVAk1e2ivCX0nCcnN_l1DhSq8g,4575
+rich_click-1.8.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+rich_click-1.8.2.dist-info/LICENSE,sha256=1GDP5mZhei-Gy3xm-QQfCodhHIsMHy8_Z0ogIq3B8q8,1067
+rich_click-1.8.2.dist-info/METADATA,sha256=2qjyywq8tVOwerVIev_5AWGSDArVIpTIphJDGwU0aQM,7880
+rich_click-1.8.2.dist-info/RECORD,,
+rich_click-1.8.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+rich_click-1.8.2.dist-info/entry_points.txt,sha256=q-JckrJEfhmzklT5lIpe1oTf68OaWJpAy1Mik7lGeXs,51
+rich_click-1.8.2.dist-info/top_level.txt,sha256=tKHPQk1z9Wd8Lu2HqxHQyF7oqOeQE5__SUDHezQZ4WE,11
+rich_click/__init__.py,sha256=a0dCmP1_hnPToMQJLCEzxKCLuQFDkPlhbegtVIAMfOc,4575
 rich_click/__main__.py,sha256=FvI_e9IrNyHmo39uQtXeMdv2HbkuaIUf4uKkir3LdzY,417
 rich_click/__pycache__/__init__.cpython-38.pyc,,
 rich_click/__pycache__/__main__.cpython-38.pyc,,
@@ -31,7 +31,7 @@ rich_click/rich_click.py,sha256=Qob8c8vE0BeJgZnPyJoVfTI12_NtClmM5t7LQM6iTFM,6963
 rich_click/rich_command.py,sha256=EbRaCMkg9EqfWQWnArqWyYlZ-znjq3V-EwiTKoDWREQ,16148
 rich_click/rich_context.py,sha256=HuVUWlEunWTN-CqTrkSEgdxSncs_1GYBvKWmXOgtFaY,3721
 rich_click/rich_group.py,sha256=76PCL148Jb9rd9c6KGiELBtoPXTcqlGa-xPRjPRQNAA,254
-rich_click/rich_help_configuration.py,sha256=OC3KUP8xMQNWWkzd8mCPMjVyOeSQR_JGiCVwTkBkMSU,11499
+rich_click/rich_help_configuration.py,sha256=JnjIpfRFqM1Mc3DT1GE-iMdmf76ezSlCyb6oyqcbp_I,11602
 rich_click/rich_help_formatter.py,sha256=eA1Ri-W2Oq4KsIlCkKQzr4CMlq0SnXVn5kFttdtBk7A,4409
 rich_click/rich_help_rendering.py,sha256=OTIlltVfCW_Z-xHqhwtVwGPTGWjWSQiCT_zCXYUf8u4,31289
 rich_click/utils.py,sha256=B7PEW-S9hNnPBXehkiPbgrZv8EgzhmNR7psVvbh8eC8,1463
@@ -6,7 +6,7 @@ The intention is to provide attractive help output from Click, formatted with Ri
 customisation required.
 """
 
-__version__ = "1.8.1"
+__version__ = "1.8.2"
 
 # Import the entire click API here.
 # We need to manually import these instead of `from click import *` to force
@@ -172,17 +172,19 @@ class RichHelpConfiguration:
     legacy_windows: Optional[bool] = field(default=None)
 
     def __post_init__(self) -> None:  # noqa: D105
-        if self.highlighter is not None:
-            import warnings
+        # Todo: Fix this so that the deprecation warning works properly.
 
-            warnings.warn(
-                "`highlighter` kwarg is deprecated in RichHelpConfiguration."
-                " Please do one of the following instead: either set highlighter_patterns=[...] if you want"
-                " to use regex; or for more advanced use cases where you'd like to use a different type"
-                " of rich.highlighter.Highlighter, subclass the `RichHelpFormatter` and update its `highlighter`.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
+        # if self.highlighter is not None:
+        #     import warnings
+        #
+        #     warnings.warn(
+        #         "`highlighter` kwarg is deprecated in RichHelpConfiguration."
+        #         " Please do one of the following instead: either set highlighter_patterns=[...] if you want"
+        #         " to use regex; or for more advanced use cases where you'd like to use a different type"
+        #         " of rich.highlighter.Highlighter, subclass the `RichHelpFormatter` and update its `highlighter`.",
+        #         DeprecationWarning,
+        #         stacklevel=2,
+        #     )
 
         self.__dataclass_fields__.pop("highlighter", None)
 
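The rich_help_configuration.py hunk silences the `highlighter` deprecation warning rather than removing the migration advice it carries. Following that advice, a regex-based configuration would look roughly like this; the pattern is an example only, and `highlighter_patterns` is taken from the warning text rather than from this diff:

```python
from rich_click import RichHelpConfiguration

# Prefer highlighter_patterns (a list of regex strings) over the deprecated
# `highlighter` kwarg, as the commented-out warning recommends.
config = RichHelpConfiguration(highlighter_patterns=[r"--[\w-]+"])
print(config.highlighter_patterns)
```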
@ -1,112 +0,0 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: tiktoken
|
||||
Version: 0.4.0
|
||||
Summary: tiktoken is a fast BPE tokeniser for use with OpenAI's models
|
||||
Author: Shantanu Jain
|
||||
Author-email: shantanu@openai.com
|
||||
License: MIT License
|
||||
|
||||
Copyright (c) 2022 OpenAI, Shantanu Jain
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
||||
Project-URL: homepage, https://github.com/openai/tiktoken
|
||||
Project-URL: repository, https://github.com/openai/tiktoken
|
||||
Project-URL: changelog, https://github.com/openai/tiktoken/blob/main/CHANGELOG.md
|
||||
Requires-Python: >=3.8
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
Requires-Dist: requests >=2.26.0
|
||||
Provides-Extra: blobfile
|
||||
Requires-Dist: blobfile >=2 ; extra == 'blobfile'
|
||||
|
||||
# Clone and added Pure python implementation.
|
||||
|
||||
This is a fork of https://github.com/openai/tiktoken with the tokenizer available as a pure python implementation.
|
||||
You can use it locally like this (after pip install -e .)
|
||||
|
||||
```
|
||||
import tiktoken.registry as registry
|
||||
from tiktoken.registry import _find_constructors
|
||||
from tiktoken.core import Encoding
|
||||
|
||||
_find_constructors()
|
||||
constructor = registry.ENCODING_CONSTRUCTORS['cl100k_base']
|
||||
params = constructor()
|
||||
enc = Encoding(**params, use_pure_python=True)
|
||||
enc.encode("hello world")
|
||||
```
|
||||
|
||||
The port to python (from Rust) is 99% done with the help of GPT4 and these are the tests I did
|
||||
(so use it at your own risk but it should be trivial to compare both across a much diverse charset and
|
||||
phrases)
|
||||
|
||||
```
|
||||
Encoding for 'a': [64]
|
||||
Encoding for '!': [0]
|
||||
Encoding for '1': [16]
|
||||
Encoding for '&': [5]
|
||||
Encoding for 'hello': [15339]
|
||||
Encoding for 'world': [14957]
|
||||
Encoding for 'python': [12958]
|
||||
Encoding for 'rust': [36888]
|
||||
Encoding for 'hello world': [15339, 1917]
|
||||
Encoding for 'rust is fast': [36888, 374, 5043]
|
||||
Encoding for '.': [13]
|
||||
Encoding for ',': [11]
|
||||
Encoding for '?': [30]
|
||||
Encoding for '!': [0]
|
||||
Encoding for 'Hello, world!': [9906, 11, 1917, 0]
|
||||
Encoding for 'How's it going?': [4438, 596, 433, 2133, 30]
|
||||
Encoding for '
|
||||
': [198]
|
||||
Encoding for ' ': [197]
|
||||
Encoding for '0': [15]
|
||||
Encoding for '1': [16]
|
||||
Encoding for '9': [24]
|
||||
Encoding for '10': [605]
|
||||
Encoding for '100': [1041]
|
||||
Encoding for '12345': [4513, 1774]
|
||||
Encoding for '0.1': [15, 13, 16]
|
||||
Encoding for '3.14': [18, 13, 975]
|
||||
Encoding for '10.001': [605, 13, 4119]
|
||||
Encoding for 'abc123': [13997, 4513]
|
||||
Encoding for '42rocks': [2983, 299, 14895]
|
||||
Encoding for 'HELLO': [51812, 1623]
|
||||
Encoding for 'World': [10343]
|
||||
Encoding for 'Python': [31380]
|
||||
Encoding for 'helloWorld': [15339, 10343]
|
||||
Encoding for 'rust_rocks': [36888, 27706, 14895]
|
||||
Encoding for '✓': [38798, 241]
|
||||
Encoding for '❤️': [49633, 97, 31643]
|
||||
Encoding for '©': [20644]
|
||||
Encoding for 'hola': [71, 8083]
|
||||
Encoding for 'こんにちは': [90115]
|
||||
Encoding for 'Привет': [54745, 28089, 8341]
|
||||
Encoding for 'The quick brown fox jumps over the lazy dog.': [791, 4062, 14198, 39935, 35308, 927, 279, 16053, 5679, 13]
|
||||
Encoding for '': []
|
||||
Encoding for ' ': [220]
|
||||
Encoding for ' ': [197]
|
||||
Encoding for '
|
||||
': [198]
|
||||
Encoding for '@@@': [19741, 31]
|
||||
Encoding for '###': [14711]
|
||||
```
|
||||
|
||||
|
@ -1,25 +0,0 @@
|
||||
tiktoken-0.4.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
tiktoken-0.4.0.dist-info/LICENSE,sha256=QYy0mbQ2Eo1lPXmUEzOlQ3t74uqSE9zC8E0V1dLFHYY,1078
|
||||
tiktoken-0.4.0.dist-info/METADATA,sha256=182HHps9h3oj1XnrbG_manrVy8FjK1BYBkiS8Mlvs3E,4154
|
||||
tiktoken-0.4.0.dist-info/RECORD,,
|
||||
tiktoken-0.4.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
tiktoken-0.4.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
||||
tiktoken-0.4.0.dist-info/direct_url.json,sha256=PfrcId0UxJKjhG_jK4kY5QCAFNuHX1M31KoLYQw6LPU,140
|
||||
tiktoken-0.4.0.dist-info/top_level.txt,sha256=54G5MceQnuD7EXvp7jzGxDDapA1iOwsh77jhCN9WKkc,22
|
||||
tiktoken/__init__.py,sha256=bDWpVFXWe5PRN-K6IRad2UeVs24x3hVlSPELD7MdQCo,215
|
||||
tiktoken/__pycache__/__init__.cpython-38.pyc,,
|
||||
tiktoken/__pycache__/_educational.cpython-38.pyc,,
|
||||
tiktoken/__pycache__/core.cpython-38.pyc,,
|
||||
tiktoken/__pycache__/load.cpython-38.pyc,,
|
||||
tiktoken/__pycache__/model.cpython-38.pyc,,
|
||||
tiktoken/__pycache__/python_tiktoken.cpython-38.pyc,,
|
||||
tiktoken/__pycache__/registry.cpython-38.pyc,,
|
||||
tiktoken/_educational.py,sha256=i7eHAkrpsb0yulANGwTrMZB64nU3xD28PjFMUa-iC3Q,7761
|
||||
tiktoken/core.py,sha256=_83ZiibR-9Iig7nBtY2Egn-MVT-6hR_eGBAAl9Y_fHA,15671
|
||||
tiktoken/load.py,sha256=j5kjYrCM_Lbic71WxZ6CmdwZXYaokhLfTtBKt81-_ek,4180
|
||||
tiktoken/model.py,sha256=Qv2lU2CJ-_vtXlUq4m1asZpRupDps_uGmBQQM2Bc8hg,2771
|
||||
tiktoken/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
tiktoken/python_tiktoken.py,sha256=ukYyPFHZ-6NZ0BBtsgbgV6ORfwWC1rRUtWOdZYo1jYw,8504
|
||||
tiktoken/registry.py,sha256=urGm3u0ZQlv63daO3GKt45wPpZzzMQpNZXhRgun3wU4,2549
|
||||
tiktoken_ext/__pycache__/openai_public.cpython-38.pyc,,
|
||||
tiktoken_ext/openai_public.py,sha256=FrwXijob7DBruofS9xn5GC7aG9a1f5VcKp1xTviZuc4,2798
|
@@ -1 +0,0 @@
-{"url": "https://github.com/yangbobo2021/tiktoken.git", "vcs_info": {"commit_id": "87539844cfeb6289e430804311a81cb3548636a3", "vcs": "git"}}

site-packages/tiktoken-0.7.0.dist-info/METADATA (new file, 168 lines)
@@ -0,0 +1,168 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: tiktoken
|
||||
Version: 0.7.0
|
||||
Summary: tiktoken is a fast BPE tokeniser for use with OpenAI's models
|
||||
Author: Shantanu Jain
|
||||
Author-email: shantanu@openai.com
|
||||
License: MIT License
|
||||
|
||||
Copyright (c) 2022 OpenAI, Shantanu Jain
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
||||
Project-URL: homepage, https://github.com/openai/tiktoken
|
||||
Project-URL: repository, https://github.com/openai/tiktoken
|
||||
Project-URL: changelog, https://github.com/openai/tiktoken/blob/main/CHANGELOG.md
|
||||
Requires-Python: >=3.8
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
Requires-Dist: requests >=2.26.0
|
||||
Provides-Extra: blobfile
|
||||
Requires-Dist: blobfile >=2 ; extra == 'blobfile'
|
||||
|
||||
# ⏳ tiktoken
|
||||
|
||||
tiktoken is a fast [BPE](https://en.wikipedia.org/wiki/Byte_pair_encoding) tokeniser for use with
|
||||
OpenAI's models.
|
||||
|
||||
```python
|
||||
import tiktoken
|
||||
enc = tiktoken.get_encoding("o200k_base")
|
||||
assert enc.decode(enc.encode("hello world")) == "hello world"
|
||||
|
||||
# To get the tokeniser corresponding to a specific model in the OpenAI API:
|
||||
enc = tiktoken.encoding_for_model("gpt-4o")
|
||||
```
|
||||
|
||||
The open source version of `tiktoken` can be installed from PyPI:
|
||||
```
|
||||
pip install tiktoken
|
||||
```
|
||||
|
||||
The tokeniser API is documented in `tiktoken/core.py`.
|
||||
|
||||
Example code using `tiktoken` can be found in the
|
||||
[OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
|
||||
|
||||
|
||||
## Performance
|
||||
|
||||
`tiktoken` is between 3-6x faster than a comparable open source tokeniser:
|
||||
|
||||

|
||||
|
||||
Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2TokenizerFast` from
|
||||
`tokenizers==0.13.2`, `transformers==4.24.0` and `tiktoken==0.2.0`.
|
||||
|
||||
|
||||
## Getting help
|
||||
|
||||
Please post questions in the [issue tracker](https://github.com/openai/tiktoken/issues).
|
||||
|
||||
If you work at OpenAI, make sure to check the internal documentation or feel free to contact
|
||||
@shantanu.
|
||||
|
||||
## What is BPE anyway?
|
||||
|
||||
Language models don't see text like you and I, instead they see a sequence of numbers (known as tokens).
|
||||
Byte pair encoding (BPE) is a way of converting text into tokens. It has a couple desirable
|
||||
properties:
|
||||
1) It's reversible and lossless, so you can convert tokens back into the original text
|
||||
2) It works on arbitrary text, even text that is not in the tokeniser's training data
|
||||
3) It compresses the text: the token sequence is shorter than the bytes corresponding to the
|
||||
original text. On average, in practice, each token corresponds to about 4 bytes.
|
||||
4) It attempts to let the model see common subwords. For instance, "ing" is a common subword in
|
||||
English, so BPE encodings will often split "encoding" into tokens like "encod" and "ing"
|
||||
(instead of e.g. "enc" and "oding"). Because the model will then see the "ing" token again and
|
||||
again in different contexts, it helps models generalise and better understand grammar.

`tiktoken` contains an educational submodule that is friendlier if you want to learn more about
the details of BPE, including code that helps visualise the BPE procedure:
```python
from tiktoken._educational import *

# Train a BPE tokeniser on a small amount of text
enc = train_simple_encoding()

# Visualise how the GPT-4 encoder encodes text
enc = SimpleBytePairEncoding.from_tiktoken("cl100k_base")
enc.encode("hello world aaaaaaaaaaaa")
```


## Extending tiktoken

You may wish to extend `tiktoken` to support new encodings. There are two ways to do this.


**Create your `Encoding` object exactly the way you want and simply pass it around.**

```python
cl100k_base = tiktoken.get_encoding("cl100k_base")

# In production, load the arguments directly instead of accessing private attributes
# See openai_public.py for examples of arguments for specific encodings
enc = tiktoken.Encoding(
    # If you're changing the set of special tokens, make sure to use a different name
    # It should be clear from the name what behaviour to expect.
    name="cl100k_im",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens={
        **cl100k_base._special_tokens,
        "<|im_start|>": 100264,
        "<|im_end|>": 100265,
    }
)
```
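
If you extend the special tokens this way, remember that `encode` rejects special tokens unless you opt in; a small usage sketch (the input string is arbitrary):

```python
# Special tokens are disallowed by default; allow them explicitly when encoding.
tokens = enc.encode("<|im_start|>user", allowed_special={"<|im_start|>"})
```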

**Use the `tiktoken_ext` plugin mechanism to register your `Encoding` objects with `tiktoken`.**

This is only useful if you need `tiktoken.get_encoding` to find your encoding, otherwise prefer
option 1.

To do this, you'll need to create a namespace package under `tiktoken_ext`.

Lay out your project like this, making sure to omit the `tiktoken_ext/__init__.py` file:
```
my_tiktoken_extension
├── tiktoken_ext
│   └── my_encodings.py
└── setup.py
```

`my_encodings.py` should be a module that contains a variable named `ENCODING_CONSTRUCTORS`.
This is a dictionary from an encoding name to a function that takes no arguments and returns
arguments that can be passed to `tiktoken.Encoding` to construct that encoding. For an example, see
`tiktoken_ext/openai_public.py`. For precise details, see `tiktoken/registry.py`.
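
As a concrete illustration, a hypothetical `my_encodings.py` modelled on `tiktoken_ext/openai_public.py` might look like this (the encoding name and token ids are placeholders, reusing the `cl100k_im` example from earlier):

```python
# my_tiktoken_extension/tiktoken_ext/my_encodings.py
import tiktoken


def cl100k_im():
    cl100k_base = tiktoken.get_encoding("cl100k_base")
    # In production, load these arguments directly rather than reading private attributes.
    return {
        "name": "cl100k_im",
        "pat_str": cl100k_base._pat_str,
        "mergeable_ranks": cl100k_base._mergeable_ranks,
        "special_tokens": {
            **cl100k_base._special_tokens,
            "<|im_start|>": 100264,
            "<|im_end|>": 100265,
        },
    }


ENCODING_CONSTRUCTORS = {"cl100k_im": cl100k_im}
```

Once the package is installed, `tiktoken.get_encoding("cl100k_im")` should be able to find it.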

Your `setup.py` should look something like this:
```python
from setuptools import setup, find_namespace_packages

setup(
    name="my_tiktoken_extension",
    packages=find_namespace_packages(include=['tiktoken_ext*']),
    install_requires=["tiktoken"],
    ...
)
```

Then simply `pip install ./my_tiktoken_extension` and you should be able to use your
custom encodings! Make sure **not** to use an editable install.
25 site-packages/tiktoken-0.7.0.dist-info/RECORD Normal file
@ -0,0 +1,25 @@
tiktoken-0.7.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
tiktoken-0.7.0.dist-info/LICENSE,sha256=QYy0mbQ2Eo1lPXmUEzOlQ3t74uqSE9zC8E0V1dLFHYY,1078
tiktoken-0.7.0.dist-info/METADATA,sha256=Uh4FPcMTr6wxg79NeyNXwWkImgSGn8uTqHPEcNYUwn4,6598
tiktoken-0.7.0.dist-info/RECORD,,
tiktoken-0.7.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tiktoken-0.7.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
tiktoken-0.7.0.dist-info/direct_url.json,sha256=N3P589ng7z0KBpW-bWO-8Sst21MaC5OT1LJE1SYeuGE,138
tiktoken-0.7.0.dist-info/top_level.txt,sha256=54G5MceQnuD7EXvp7jzGxDDapA1iOwsh77jhCN9WKkc,22
tiktoken/__init__.py,sha256=FNmz8KgZfaG62vRgMMkTL9jj0a2AI7JGV1b-RZ29_tY,322
tiktoken/__pycache__/__init__.cpython-38.pyc,,
tiktoken/__pycache__/_educational.cpython-38.pyc,,
tiktoken/__pycache__/_tiktoken.cpython-38.pyc,,
tiktoken/__pycache__/core.cpython-38.pyc,,
tiktoken/__pycache__/load.cpython-38.pyc,,
tiktoken/__pycache__/model.cpython-38.pyc,,
tiktoken/__pycache__/registry.cpython-38.pyc,,
tiktoken/_educational.py,sha256=l_bTeohxYJ2RHrXDFT2QfRF7aD89S38VFZndzZTI_cM,8234
tiktoken/_tiktoken.py,sha256=uSdqUIlUBtyyCwSPnVGA5eBWI74noFq-uEPdUxWGXgU,3541
tiktoken/core.py,sha256=l9ozzJP6zQ_rlPvV1ZAF1ENZBC3jyiin_rwlkmenxTQ,16129
tiktoken/load.py,sha256=YDbOfHhKn1MEWn9cWc1cVqDxZNwpGifWnuvfEcKeJ4w,5351
tiktoken/model.py,sha256=fCcuegWlKwFFmD1crVXHxFQBlBV6BGWCfwYTIhUcADs,3647
tiktoken/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tiktoken/registry.py,sha256=ksP_k8jlqLyefL1sr5OAc-yOK0McOFaHZM4oF8KQdYg,2811
tiktoken_ext/__pycache__/openai_public.cpython-38.pyc,,
tiktoken_ext/openai_public.py,sha256=pVz8DaOyPbbPzJc2xhS61vL_Ubim8n0lgTD1v4TaZBc,4515
1 site-packages/tiktoken-0.7.0.dist-info/direct_url.json Normal file
@ -0,0 +1 @@
{"url": "https://github.com/devchat-ai/tiktoken.git", "vcs_info": {"commit_id": "01de9f4ed2b290ecd33805406a3954488018a783", "vcs": "git"}}
@ -1,4 +1,6 @@
# This is the public API of tiktoken
from .core import Encoding as Encoding
from .model import encoding_for_model as encoding_for_model
from .model import encoding_name_for_model as encoding_name_for_model
from .registry import get_encoding as get_encoding
from .registry import list_encoding_names as list_encoding_names

@ -1,11 +1,8 @@
"""This is an educational implementation of the byte pair encoding algorithm."""
from __future__ import annotations

import collections
import itertools
from typing import Optional

import re as regex
import regex

import tiktoken

@ -187,11 +184,23 @@ def bpe_train(
|
||||
|
||||
|
||||
def visualise_tokens(token_values: list[bytes]) -> None:
|
||||
backgrounds = itertools.cycle(
|
||||
[f"\u001b[48;5;{i}m".encode() for i in [167, 179, 185, 77, 80, 68, 134]]
|
||||
)
|
||||
interleaved = itertools.chain.from_iterable(zip(backgrounds, token_values))
|
||||
print((b"".join(interleaved) + "\u001b[0m".encode()).decode("utf-8"))
|
||||
background = [f"\u001b[48;5;{i}m" for i in [167, 179, 185, 77, 80, 68, 134]]
|
||||
# If token boundaries do not occur at unicode character boundaries, it's unclear how best to
|
||||
# visualise the token. Here, we'll just use the unicode replacement character to represent some
|
||||
# fraction of a character.
|
||||
unicode_token_values = [x.decode("utf-8", errors="replace") for x in token_values]
|
||||
|
||||
running_length = 0
|
||||
last_color = None
|
||||
for token in unicode_token_values:
|
||||
color = background[running_length % len(background)]
|
||||
if color == last_color:
|
||||
color = background[(running_length + 1) % len(background)]
|
||||
assert color != last_color
|
||||
last_color = color
|
||||
running_length += len(token)
|
||||
print(color + token, end="")
|
||||
print("\u001b[0m")
|
||||
|
||||
|
||||
def train_simple_encoding():
|
||||
|
89 site-packages/tiktoken/_tiktoken.py Normal file
@ -0,0 +1,89 @@
|
||||
import re
|
||||
import hashlib
|
||||
from typing import Dict, List, Tuple, Union
|
||||
|
||||
Rank = int
|
||||
|
||||
def _byte_pair_merge(ranks: Dict[bytes, Rank], piece: bytes) -> List[Tuple[bytes, Rank]]:
|
||||
parts = []
|
||||
min_rank = (float('inf'), float('inf'))
|
||||
for i in range(len(piece) - 1):
|
||||
rank = ranks.get(piece[i:i + 2], float('inf'))
|
||||
if rank < min_rank[0]:
|
||||
min_rank = (rank, i)
|
||||
parts.append((piece[i:i + 2], rank))
|
||||
parts.append((piece[len(piece) - 1:], float('inf')))
|
||||
parts.append((piece[len(piece):], float('inf')))
|
||||
|
||||
while min_rank[0] != float('inf'):
|
||||
i = min_rank[1]
|
||||
if i > 0:
|
||||
parts[i - 1] = (parts[i - 1][0], get_rank_with_ranks(piece, parts, i - 1, ranks))
|
||||
parts[i] = (parts[i][0], get_rank_with_ranks(piece, parts, i, ranks))
|
||||
del parts[i + 1]
|
||||
|
||||
min_rank = (float('inf'), float('inf'))
|
||||
for j, (_, rank) in enumerate(parts[:-1]):
|
||||
if rank < min_rank[0]:
|
||||
min_rank = (rank, j)
|
||||
return parts
|
||||
|
||||
def get_rank_with_ranks(piece: bytes, parts: List[Tuple[bytes, Rank]], i: int, ranks: Dict[bytes, Rank]) -> Rank:
|
||||
if (i + 3) < len(parts):
|
||||
key = piece[parts[i][0].start:parts[i + 3][0].start]
|
||||
return ranks.get(key, float('inf'))
|
||||
else:
|
||||
return float('inf')
|
||||
|
||||
def byte_pair_encode(piece: bytes, ranks: Dict[bytes, Rank]) -> List[Rank]:
|
||||
assert len(piece) > 1
|
||||
parts = _byte_pair_merge(ranks, piece)
|
||||
tokens = []
|
||||
current_token = []
|
||||
for part in parts[:-1]:
|
||||
if len(current_token) == 0:
|
||||
current_token.append(part[0])
|
||||
elif ranks.get(b''.join(current_token + [part[0]])) is not None:
|
||||
current_token.append(part[0])
|
||||
else:
|
||||
tokens.append(ranks[b''.join(current_token)])
|
||||
current_token = [part[0]]
|
||||
tokens.append(ranks[b''.join(current_token)])
|
||||
return tokens
|
||||
|
||||
def byte_pair_split(piece: bytes, ranks: Dict[bytes, Rank]) -> List[bytes]:
|
||||
assert len(piece) > 1
|
||||
parts = _byte_pair_merge(ranks, piece)
|
||||
return [part[0] for part in parts[:-1]]
|
||||
|
||||
class CoreBPE:
|
||||
def __init__(self, encoder: Dict[bytes, Rank], special_tokens_encoder: Dict[str, Rank], pattern: str):
|
||||
self.encoder = encoder
|
||||
self.special_tokens_encoder = special_tokens_encoder
|
||||
self.decoder = {v: k for k, v in encoder.items()}
|
||||
self.special_tokens_decoder = {v: k.encode('utf-8') for k, v in special_tokens_encoder.items()}
|
||||
self.regex = re.compile(pattern)
|
||||
self.special_regex = re.compile('|'.join(map(re.escape, special_tokens_encoder.keys())))
|
||||
|
||||
def encode_ordinary(self, text: str) -> List[Rank]:
|
||||
return [self.encoder[piece.encode("utf-8")] for piece in self.regex.findall(text)]
|
||||
|
||||
def encode(self, text: str, allowed_special: set) -> List[Rank]:
|
||||
tokens = []
|
||||
start = 0
|
||||
for match in self.special_regex.finditer(text):
|
||||
if match.start() > start:
|
||||
tokens.extend(self.encode_ordinary(text[start:match.start()]))
|
||||
if match.group(0) in allowed_special:
|
||||
tokens.append(self.special_tokens_encoder[match.group(0)])
|
||||
start = match.end()
|
||||
if start < len(text):
|
||||
tokens.extend(self.encode_ordinary(text[start:]))
|
||||
return tokens
|
||||
|
||||
def decode_bytes(self, tokens: List[Rank]) -> bytes:
|
||||
return b''.join(self.decoder.get(token, self.special_tokens_decoder.get(token)) for token in tokens)
|
||||
|
||||
def token_byte_values(self) -> List[bytes]:
|
||||
return self.sorted_token_bytes
|
||||
|
@ -5,12 +5,8 @@ from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import AbstractSet, Collection, Literal, NoReturn, Optional, Union
|
||||
|
||||
import re as regex
|
||||
try:
|
||||
from tiktoken import _tiktoken
|
||||
from .python_tiktoken import CoreBPE
|
||||
except ImportError:
|
||||
# print("Unable to import rust py binding for _tiktoken, must use pure python implementation")
|
||||
from .python_tiktoken import CoreBPE
|
||||
|
||||
from tiktoken import _tiktoken
|
||||
|
||||
|
||||
class Encoding:
|
||||
@ -22,7 +18,6 @@ class Encoding:
|
||||
mergeable_ranks: dict[bytes, int],
|
||||
special_tokens: dict[str, int],
|
||||
explicit_n_vocab: Optional[int] = None,
|
||||
use_pure_python: bool = False
|
||||
):
|
||||
"""Creates an Encoding object.
|
||||
|
||||
@ -52,11 +47,7 @@ class Encoding:
|
||||
assert len(mergeable_ranks) + len(special_tokens) == explicit_n_vocab
|
||||
assert self.max_token_value == explicit_n_vocab - 1
|
||||
|
||||
if use_pure_python:
|
||||
self._core_bpe = CoreBPE(mergeable_ranks, special_tokens, pat_str)
|
||||
else:
|
||||
self._core_bpe = _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str)
|
||||
|
||||
self._core_bpe = _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<Encoding {self.name!r}>"
|
||||
@ -125,6 +116,10 @@ class Encoding:
|
||||
if match := _special_token_regex(disallowed_special).search(text):
|
||||
raise_disallowed_special_token(match.group())
|
||||
|
||||
# https://github.com/PyO3/pyo3/pull/3632
|
||||
if isinstance(allowed_special, frozenset):
|
||||
allowed_special = set(allowed_special)
|
||||
|
||||
try:
|
||||
return self._core_bpe.encode(text, allowed_special)
|
||||
except UnicodeEncodeError:
|
||||
@ -373,6 +368,26 @@ class Encoding:
|
||||
def _encode_bytes(self, text: bytes) -> list[int]:
|
||||
return self._core_bpe._encode_bytes(text)
|
||||
|
||||
def __getstate__(self) -> object:
|
||||
import tiktoken.registry
|
||||
|
||||
# As an optimisation, pickle registered encodings by reference
|
||||
if self is tiktoken.registry.ENCODINGS.get(self.name):
|
||||
return self.name
|
||||
return {
|
||||
"name": self.name,
|
||||
"pat_str": self._pat_str,
|
||||
"mergeable_ranks": self._mergeable_ranks,
|
||||
"special_tokens": self._special_tokens,
|
||||
}
|
||||
|
||||
def __setstate__(self, value: object) -> None:
|
||||
import tiktoken.registry
|
||||
|
||||
if isinstance(value, str):
|
||||
self.__dict__ = tiktoken.registry.get_encoding(value).__dict__
|
||||
return
|
||||
self.__init__(**value)
|
||||
|
||||
|
||||
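A small sketch of the pickling optimisation above (assuming a registered encoding; unregistered ones fall back to the full-state dict):

```python
import pickle

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
payload = pickle.dumps(enc)  # registered encodings should pickle as just their name
enc_again = pickle.loads(payload)
assert enc_again.name == enc.name
```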
@functools.lru_cache(maxsize=128)
|
||||
|
@ -6,6 +6,7 @@ import json
|
||||
import os
|
||||
import tempfile
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
|
||||
@ -26,13 +27,20 @@ def read_file(blobpath: str) -> bytes:
|
||||
return resp.content
|
||||
|
||||
|
||||
def read_file_cached(blobpath: str) -> bytes:
|
||||
def check_hash(data: bytes, expected_hash: str) -> bool:
|
||||
actual_hash = hashlib.sha256(data).hexdigest()
|
||||
return actual_hash == expected_hash
|
||||
|
||||
|
||||
def read_file_cached(blobpath: str, expected_hash: Optional[str] = None) -> bytes:
|
||||
user_specified_cache = True
|
||||
if "TIKTOKEN_CACHE_DIR" in os.environ:
|
||||
cache_dir = os.environ["TIKTOKEN_CACHE_DIR"]
|
||||
elif "DATA_GYM_CACHE_DIR" in os.environ:
|
||||
cache_dir = os.environ["DATA_GYM_CACHE_DIR"]
|
||||
else:
|
||||
cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
|
||||
user_specified_cache = False
|
||||
|
||||
if cache_dir == "":
|
||||
# disable caching
|
||||
@ -43,21 +51,42 @@ def read_file_cached(blobpath: str) -> bytes:
|
||||
cache_path = os.path.join(cache_dir, cache_key)
|
||||
if os.path.exists(cache_path):
|
||||
with open(cache_path, "rb") as f:
|
||||
return f.read()
|
||||
data = f.read()
|
||||
if expected_hash is None or check_hash(data, expected_hash):
|
||||
return data
|
||||
|
||||
# the cached file does not match the hash, remove it and re-fetch
|
||||
try:
|
||||
os.remove(cache_path)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
contents = read_file(blobpath)
|
||||
if expected_hash and not check_hash(contents, expected_hash):
|
||||
raise ValueError(
|
||||
f"Hash mismatch for data downloaded from {blobpath} (expected {expected_hash}). "
|
||||
f"This may indicate a corrupted download. Please try again."
|
||||
)
|
||||
|
||||
os.makedirs(cache_dir, exist_ok=True)
|
||||
tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp"
|
||||
with open(tmp_filename, "wb") as f:
|
||||
f.write(contents)
|
||||
os.rename(tmp_filename, cache_path)
|
||||
try:
|
||||
os.makedirs(cache_dir, exist_ok=True)
|
||||
tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp"
|
||||
with open(tmp_filename, "wb") as f:
|
||||
f.write(contents)
|
||||
os.rename(tmp_filename, cache_path)
|
||||
except OSError:
|
||||
# don't raise if we can't write to the default cache, e.g. issue #75
|
||||
if user_specified_cache:
|
||||
raise
|
||||
|
||||
return contents
|
||||
|
||||
|
||||
def data_gym_to_mergeable_bpe_ranks(
|
||||
vocab_bpe_file: str, encoder_json_file: str
|
||||
vocab_bpe_file: str,
|
||||
encoder_json_file: str,
|
||||
vocab_bpe_hash: Optional[str] = None,
|
||||
encoder_json_hash: Optional[str] = None,
|
||||
) -> dict[bytes, int]:
|
||||
# NB: do not add caching to this function
|
||||
rank_to_intbyte = [b for b in range(2**8) if chr(b).isprintable() and chr(b) != " "]
|
||||
@ -72,7 +101,7 @@ def data_gym_to_mergeable_bpe_ranks(
|
||||
assert len(rank_to_intbyte) == 2**8
|
||||
|
||||
# vocab_bpe contains the merges along with associated ranks
|
||||
vocab_bpe_contents = read_file_cached(vocab_bpe_file).decode()
|
||||
vocab_bpe_contents = read_file_cached(vocab_bpe_file, vocab_bpe_hash).decode()
|
||||
bpe_merges = [tuple(merge_str.split()) for merge_str in vocab_bpe_contents.split("\n")[1:-1]]
|
||||
|
||||
def decode_data_gym(value: str) -> bytes:
|
||||
@ -89,7 +118,7 @@ def data_gym_to_mergeable_bpe_ranks(
|
||||
# check that the encoder file matches the merges file
|
||||
# this sanity check is important since tiktoken assumes that ranks are ordered the same
|
||||
# as merge priority
|
||||
encoder_json = json.loads(read_file_cached(encoder_json_file))
|
||||
encoder_json = json.loads(read_file_cached(encoder_json_file, encoder_json_hash))
|
||||
encoder_json_loaded = {decode_data_gym(k): v for k, v in encoder_json.items()}
|
||||
# drop these two special tokens if present, since they're not mergeable bpe tokens
|
||||
encoder_json_loaded.pop(b"<|endoftext|>", None)
|
||||
@ -111,9 +140,11 @@ def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> No
|
||||
f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")
|
||||
|
||||
|
||||
def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict[bytes, int]:
|
||||
def load_tiktoken_bpe(
|
||||
tiktoken_bpe_file: str, expected_hash: Optional[str] = None
|
||||
) -> dict[bytes, int]:
|
||||
# NB: do not add caching to this function
|
||||
contents = read_file_cached(tiktoken_bpe_file)
|
||||
contents = read_file_cached(tiktoken_bpe_file, expected_hash)
|
||||
return {
|
||||
base64.b64decode(token): int(rank)
|
||||
for token, rank in (line.split() for line in contents.splitlines() if line)
|
||||
|
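To illustrate the new hash checking end to end, a hedged sketch that loads a BPE file with its published hash (URL and hash copied from the `r50k_base` constructor later in this diff; this downloads the file or reads it from the local cache):

```python
from tiktoken.load import load_tiktoken_bpe

ranks = load_tiktoken_bpe(
    "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
    expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
)
print(len(ranks))  # number of mergeable ranks; a hash mismatch would raise ValueError instead
```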
@ -6,17 +6,33 @@ from .registry import get_encoding
|
||||
# TODO: these will likely be replaced by an API endpoint
|
||||
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
|
||||
# chat
|
||||
"gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13
|
||||
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
|
||||
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
|
||||
"gpt-35-turbo": "cl100k_base", # Azure deployment name
|
||||
"gpt-35-turbo-": "cl100k_base", # Azure deployment name
|
||||
# fine-tuned
|
||||
"ft:gpt-4": "cl100k_base",
|
||||
"ft:gpt-3.5-turbo": "cl100k_base",
|
||||
"ft:davinci-002": "cl100k_base",
|
||||
"ft:babbage-002": "cl100k_base",
|
||||
}
|
||||
|
||||
MODEL_TO_ENCODING: dict[str, str] = {
|
||||
# chat
|
||||
"gpt-4o": "o200k_base",
|
||||
"gpt-4": "cl100k_base",
|
||||
"gpt-3.5-turbo": "cl100k_base",
|
||||
"gpt-3.5": "cl100k_base", # Common shorthand
|
||||
"gpt-35-turbo": "cl100k_base", # Azure deployment name
|
||||
# text
|
||||
# base
|
||||
"davinci-002": "cl100k_base",
|
||||
"babbage-002": "cl100k_base",
|
||||
# embeddings
|
||||
"text-embedding-ada-002": "cl100k_base",
|
||||
"text-embedding-3-small": "cl100k_base",
|
||||
"text-embedding-3-large": "cl100k_base",
|
||||
# DEPRECATED MODELS
|
||||
# text (DEPRECATED)
|
||||
"text-davinci-003": "p50k_base",
|
||||
"text-davinci-002": "p50k_base",
|
||||
"text-davinci-001": "r50k_base",
|
||||
@ -27,19 +43,17 @@ MODEL_TO_ENCODING: dict[str, str] = {
|
||||
"curie": "r50k_base",
|
||||
"babbage": "r50k_base",
|
||||
"ada": "r50k_base",
|
||||
# code
|
||||
# code (DEPRECATED)
|
||||
"code-davinci-002": "p50k_base",
|
||||
"code-davinci-001": "p50k_base",
|
||||
"code-cushman-002": "p50k_base",
|
||||
"code-cushman-001": "p50k_base",
|
||||
"davinci-codex": "p50k_base",
|
||||
"cushman-codex": "p50k_base",
|
||||
# edit
|
||||
# edit (DEPRECATED)
|
||||
"text-davinci-edit-001": "p50k_edit",
|
||||
"code-davinci-edit-001": "p50k_edit",
|
||||
# embeddings
|
||||
"text-embedding-ada-002": "cl100k_base",
|
||||
# old embeddings
|
||||
# old embeddings (DEPRECATED)
|
||||
"text-similarity-davinci-001": "r50k_base",
|
||||
"text-similarity-curie-001": "r50k_base",
|
||||
"text-similarity-babbage-001": "r50k_base",
|
||||
@ -52,11 +66,15 @@ MODEL_TO_ENCODING: dict[str, str] = {
|
||||
"code-search-ada-code-001": "r50k_base",
|
||||
# open source
|
||||
"gpt2": "gpt2",
|
||||
"gpt-2": "gpt2", # Maintains consistency with gpt-4
|
||||
}
|
||||
|
||||
|
||||
def encoding_for_model(model_name: str) -> Encoding:
|
||||
"""Returns the encoding used by a model."""
|
||||
def encoding_name_for_model(model_name: str) -> str:
|
||||
"""Returns the name of the encoding used by a model.
|
||||
|
||||
Raises a KeyError if the model name is not recognised.
|
||||
"""
|
||||
encoding_name = None
|
||||
if model_name in MODEL_TO_ENCODING:
|
||||
encoding_name = MODEL_TO_ENCODING[model_name]
|
||||
@ -66,7 +84,7 @@ def encoding_for_model(model_name: str) -> Encoding:
|
||||
# Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
|
||||
for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
|
||||
if model_name.startswith(model_prefix):
|
||||
return get_encoding(model_encoding_name)
|
||||
return model_encoding_name
|
||||
|
||||
if encoding_name is None:
|
||||
raise KeyError(
|
||||
@ -74,4 +92,12 @@ def encoding_for_model(model_name: str) -> Encoding:
|
||||
"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
|
||||
) from None
|
||||
|
||||
    return get_encoding(encoding_name)
    return encoding_name


def encoding_for_model(model_name: str) -> Encoding:
    """Returns the encoding used by a model.

    Raises a KeyError if the model name is not recognised.
    """
    return get_encoding(encoding_name_for_model(model_name))

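A hedged usage sketch of the split above, using the `gpt-4o` → `o200k_base` mapping from this file (exact behaviour depends on the installed model tables):

```python
import tiktoken

# Name lookup only; this should not need to fetch any BPE file.
assert tiktoken.encoding_name_for_model("gpt-4o") == "o200k_base"

# Constructing the encoding itself; "gpt-4o-2024-05-13" resolves via the "gpt-4o-" prefix.
enc = tiktoken.encoding_for_model("gpt-4o-2024-05-13")
print(enc.name)  # expected: o200k_base
```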
@ -1,238 +0,0 @@
|
||||
from typing import Dict, List, Callable, Union, Tuple
|
||||
from collections import defaultdict
|
||||
|
||||
import threading
|
||||
import re
|
||||
import copy
|
||||
|
||||
MAX_NUM_THREADS = 128
|
||||
|
||||
def hash_current_thread():
|
||||
return hash(threading.current_thread().ident)
|
||||
|
||||
def byte_pair_merge(piece: bytes, ranks: Dict[bytes, int], f: Callable[[slice], Union[int, str]]) -> List[Union[int, str]]:
|
||||
"""
|
||||
Translates the _byte_pair_merge function from Rust to Python.
|
||||
"""
|
||||
|
||||
# This is a list of (start, rank).
|
||||
# The rank is of the byte pair starting at position start.
|
||||
# The rank of the last item in the list is not a valid value.
|
||||
parts = [(i, float('inf')) for i in range(len(piece) + 1)]
|
||||
|
||||
def get_rank(parts: List[Tuple[int, int]], start_idx: int, skip: int) -> Union[int, None]:
|
||||
"""
|
||||
Inner function to get the rank of a byte pair or sequence.
|
||||
"""
|
||||
if start_idx + skip + 2 < len(parts):
|
||||
return ranks.get(piece[parts[start_idx][0]:parts[start_idx + skip + 2][0]])
|
||||
else:
|
||||
return None
|
||||
|
||||
# We look up the ranks once in the beginning and iteratively update
|
||||
# them during each merge, which reduces the number of rank lookups.
|
||||
for i in range(len(parts) - 2):
|
||||
rank = get_rank(parts, i, 0)
|
||||
if rank is not None:
|
||||
assert rank != float('inf') # Check if rank is not the sentinel value
|
||||
parts[i] = (parts[i][0], rank)
|
||||
|
||||
# Main merging loop
|
||||
while len(parts) > 1:
|
||||
# float('inf') is a sentinel rank value allowing us to take the min more quickly
|
||||
min_rank = (float('inf'), 0)
|
||||
for i, (_, rank) in enumerate(parts[:-1]):
|
||||
if rank < min_rank[0]:
|
||||
min_rank = (rank, i)
|
||||
|
||||
if min_rank[0] != float('inf'):
|
||||
i = min_rank[1]
|
||||
# Update ranks considering the skip
|
||||
parts[i] = (parts[i][0], get_rank(parts, i, 1) or float('inf'))
|
||||
if i > 0:
|
||||
parts[i - 1] = (parts[i - 1][0], get_rank(parts, i - 1, 1) or float('inf'))
|
||||
# Remove the part
|
||||
parts.pop(i + 1)
|
||||
else:
|
||||
break
|
||||
|
||||
# Construct the output
|
||||
out = [f(slice(parts[i][0], parts[i + 1][0])) for i in range(len(parts) - 1)]
|
||||
return out
|
||||
|
||||
def byte_pair_encode(piece, ranks):
|
||||
if len(piece) == 1:
|
||||
# return [ranks[tuple(piece)]]
|
||||
return [ranks[piece]]
|
||||
# return byte_pair_merge(piece, ranks, lambda p: ranks[tuple(piece[p.start:p.stop])])
|
||||
return byte_pair_merge(piece, ranks, lambda p: ranks[piece[p.start:p.stop]])
|
||||
|
||||
def byte_pair_split(piece, ranks):
|
||||
if len(piece) == 1:
|
||||
return [piece]
|
||||
return byte_pair_merge(piece, ranks, lambda p: piece[p.start:p.stop])
|
||||
|
||||
class CoreBaseBPE:
|
||||
def __init__(self):
|
||||
self.encoder = {}
|
||||
self.special_tokens_encoder = {}
|
||||
self.decoder = {}
|
||||
self.special_tokens_decoder = {}
|
||||
self.regex_tls = []
|
||||
self.special_regex_tls = []
|
||||
self.sorted_token_bytes = []
|
||||
|
||||
def _get_tl_regex(self):
|
||||
return self.regex_tls[hash_current_thread() % MAX_NUM_THREADS]
|
||||
|
||||
def _get_tl_special_regex(self):
|
||||
return self.special_regex_tls[hash_current_thread() % MAX_NUM_THREADS]
|
||||
|
||||
def _decode_native(self, tokens):
|
||||
ret = bytearray()
|
||||
for token in tokens:
|
||||
token_bytes = self.decoder.get(token, self.special_tokens_decoder.get(token))
|
||||
if token_bytes:
|
||||
ret.extend(token_bytes)
|
||||
return ret
|
||||
|
||||
def _encode_ordinary_native(self, text):
|
||||
regex = self._get_tl_regex()
|
||||
ret = []
|
||||
for mat in re.finditer(regex, text):
|
||||
piece = mat.group().encode('utf-8')
|
||||
token = self.encoder.get(piece)
|
||||
if token:
|
||||
ret.append(token)
|
||||
continue
|
||||
tokens = byte_pair_encode(piece, self.encoder)
|
||||
ret.extend(tokens)
|
||||
return ret
|
||||
|
||||
def _encode_native(self, text, allowed_special):
|
||||
special_regex = self._get_tl_special_regex()
|
||||
regex = self._get_tl_regex()
|
||||
ret = []
|
||||
start = 0
|
||||
last_piece_token_len = 0
|
||||
|
||||
while start < len(text):
|
||||
next_special = None
|
||||
for mat in re.finditer(special_regex, text[start:]):
|
||||
if mat.group() in allowed_special:
|
||||
next_special = mat
|
||||
break
|
||||
|
||||
for mat in re.finditer(regex, text[start:next_special.start() if next_special else None]):
|
||||
piece = mat.group().encode('utf-8')
|
||||
token = self.encoder.get(piece)
|
||||
if token:
|
||||
ret.append(token)
|
||||
continue
|
||||
tokens = byte_pair_encode(piece, self.encoder)
|
||||
last_piece_token_len = len(tokens)
|
||||
ret.extend(tokens)
|
||||
|
||||
if next_special:
|
||||
piece = next_special.group().encode('utf-8')
|
||||
token = self.special_tokens_encoder[piece]
|
||||
ret.append(token)
|
||||
start = next_special.end()
|
||||
last_piece_token_len = 0
|
||||
else:
|
||||
break
|
||||
|
||||
return ret, last_piece_token_len
|
||||
|
||||
|
||||
|
||||
class CoreBPE(CoreBaseBPE):
|
||||
|
||||
# _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str)
|
||||
def __init__(self, encoder, special_tokens_encoder, pattern):
|
||||
self.encoder = encoder
|
||||
self.special_tokens_encoder = special_tokens_encoder
|
||||
|
||||
self.regex = re.compile(pattern)
|
||||
|
||||
special_parts = [re.escape(key) for key in special_tokens_encoder.keys()]
|
||||
self.special_regex = re.compile("|".join(special_parts))
|
||||
|
||||
self.decoder = {v: k for k, v in encoder.items()}
|
||||
|
||||
assert len(encoder) == len(self.decoder), "Encoder and decoder must be of equal length; maybe you had duplicate token indices in your encoder?"
|
||||
|
||||
self.special_tokens_decoder = {v: bytes(k, 'utf-8') for k, v in special_tokens_encoder.items()}
|
||||
|
||||
self.sorted_token_bytes = sorted(list(encoder.keys()))
|
||||
|
||||
self.regex_tls = [copy.deepcopy(self.regex) for _ in range(MAX_NUM_THREADS)]
|
||||
self.special_regex_tls = [copy.deepcopy(self.special_regex) for _ in range(MAX_NUM_THREADS)]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def encode_ordinary(self, text):
|
||||
return self._encode_ordinary_native(text)
|
||||
|
||||
def encode(self, text, allowed_special):
|
||||
return self._encode_native(text, allowed_special)[0]
|
||||
|
||||
|
||||
def _encode_bytes(self, bytes):
|
||||
try:
|
||||
text = bytes.decode('utf-8')
|
||||
return self._encode_ordinary_native(text)
|
||||
except UnicodeDecodeError as e:
|
||||
text = bytes[:e.start].decode('utf-8', 'ignore')
|
||||
tokens, last_piece_token_len = self._encode_native(text, set())
|
||||
tokens, last_piece_token_len = self._increase_last_piece_token_len(tokens, last_piece_token_len)
|
||||
|
||||
if tokens and last_piece_token_len > 0:
|
||||
unstable_bytes = self._decode_native(tokens[-last_piece_token_len:])
|
||||
unstable_bytes.extend(bytes[e.start:])
|
||||
|
||||
tokens = tokens[:-last_piece_token_len]
|
||||
tokens.extend(byte_pair_encode(unstable_bytes, self.encoder)) # Assuming byte_pair_encode is defined elsewhere
|
||||
|
||||
return tokens
|
||||
|
||||
def encode_with_unstable(self, text, allowed_special):
|
||||
tokens, completions = self._encode_unstable_native(text, allowed_special)
|
||||
py_completions = [list(seq) for seq in completions]
|
||||
return tokens, py_completions
|
||||
|
||||
def encode_single_token(self, piece):
|
||||
token = self.encoder.get(piece)
|
||||
if token:
|
||||
return token
|
||||
|
||||
piece_str = piece.decode('utf-8', 'ignore')
|
||||
token = self.special_tokens_encoder.get(piece_str)
|
||||
if token:
|
||||
return token
|
||||
|
||||
raise KeyError(piece)
|
||||
|
||||
def encode_single_piece(self, piece):
|
||||
token = self.encoder.get(piece)
|
||||
if token:
|
||||
return [token]
|
||||
|
||||
return byte_pair_encode(piece, self.encoder) # Assuming byte_pair_encode is defined elsewhere
|
||||
|
||||
def decode_bytes(self, tokens):
|
||||
return self._decode_native(tokens)
|
||||
|
||||
def decode_single_token_bytes(self, token):
|
||||
bytes_val = self.decoder.get(token) or self.special_tokens_decoder.get(token)
|
||||
if bytes_val:
|
||||
return bytes_val
|
||||
raise KeyError(str(token))
|
||||
|
||||
def token_byte_values(self):
|
||||
return [bytes(x) for x in self.sorted_token_bytes]
|
||||
|
||||
|
||||
|
@ -1,9 +1,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import functools
|
||||
import importlib
|
||||
import pkgutil
|
||||
import threading
|
||||
from typing import Any, Callable, Optional
|
||||
from typing import Any, Callable, Optional, Sequence
|
||||
|
||||
import tiktoken_ext
|
||||
|
||||
@ -14,6 +15,20 @@ ENCODINGS: dict[str, Encoding] = {}
|
||||
ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None
|
||||
|
||||
|
||||
@functools.lru_cache()
|
||||
def _available_plugin_modules() -> Sequence[str]:
|
||||
# tiktoken_ext is a namespace package
|
||||
# submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes
|
||||
# - we use namespace package pattern so `pkgutil.iter_modules` is fast
|
||||
# - it's a separate top-level package because namespace subpackages of non-namespace
|
||||
# packages don't quite do what you want with editable installs
|
||||
mods = []
|
||||
plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".")
|
||||
for _, mod_name, _ in plugin_mods:
|
||||
mods.append(mod_name)
|
||||
return mods
|
||||
|
||||
|
||||
def _find_constructors() -> None:
|
||||
global ENCODING_CONSTRUCTORS
|
||||
with _lock:
|
||||
@ -21,14 +36,7 @@ def _find_constructors() -> None:
|
||||
return
|
||||
ENCODING_CONSTRUCTORS = {}
|
||||
|
||||
# tiktoken_ext is a namespace package
|
||||
# submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes
|
||||
# - we use namespace package pattern so `pkgutil.iter_modules` is fast
|
||||
# - it's a separate top-level package because namespace subpackages of non-namespace
|
||||
# packages don't quite do what you want with editable installs
|
||||
plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".")
|
||||
|
||||
for _, mod_name, _ in plugin_mods:
|
||||
for mod_name in _available_plugin_modules():
|
||||
mod = importlib.import_module(mod_name)
|
||||
try:
|
||||
constructors = mod.ENCODING_CONSTRUCTORS
|
||||
@ -57,7 +65,9 @@ def get_encoding(encoding_name: str) -> Encoding:
|
||||
assert ENCODING_CONSTRUCTORS is not None
|
||||
|
||||
if encoding_name not in ENCODING_CONSTRUCTORS:
|
||||
raise ValueError(f"Unknown encoding {encoding_name}")
|
||||
raise ValueError(
|
||||
f"Unknown encoding {encoding_name}. Plugins found: {_available_plugin_modules()}"
|
||||
)
|
||||
|
||||
constructor = ENCODING_CONSTRUCTORS[encoding_name]
|
||||
enc = Encoding(**constructor())
|
||||
|
@ -11,24 +11,30 @@ def gpt2():
|
||||
mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
|
||||
vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
|
||||
encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
|
||||
vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
|
||||
encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
|
||||
)
|
||||
return {
|
||||
"name": "gpt2",
|
||||
"explicit_n_vocab": 50257,
|
||||
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
|
||||
# The pattern in the original GPT-2 release is:
|
||||
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\w]+| ?[\d]+| ?[^\s\w]+|\s+(?!\S)|\s+"""
|
||||
# This is equivalent, but executes faster:
|
||||
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
|
||||
"mergeable_ranks": mergeable_ranks,
|
||||
"special_tokens": {"<|endoftext|>": 50256},
|
||||
"special_tokens": {ENDOFTEXT: 50256},
|
||||
}
|
||||
|
||||
|
||||
def r50k_base():
|
||||
mergeable_ranks = load_tiktoken_bpe(
|
||||
"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
|
||||
"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
|
||||
expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
|
||||
)
|
||||
return {
|
||||
"name": "r50k_base",
|
||||
"explicit_n_vocab": 50257,
|
||||
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
|
||||
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
|
||||
"mergeable_ranks": mergeable_ranks,
|
||||
"special_tokens": {ENDOFTEXT: 50256},
|
||||
}
|
||||
@ -36,12 +42,13 @@ def r50k_base():
|
||||
|
||||
def p50k_base():
|
||||
mergeable_ranks = load_tiktoken_bpe(
|
||||
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
|
||||
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
|
||||
expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
|
||||
)
|
||||
return {
|
||||
"name": "p50k_base",
|
||||
"explicit_n_vocab": 50281,
|
||||
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
|
||||
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
|
||||
"mergeable_ranks": mergeable_ranks,
|
||||
"special_tokens": {ENDOFTEXT: 50256},
|
||||
}
|
||||
@ -49,12 +56,13 @@ def p50k_base():
|
||||
|
||||
def p50k_edit():
|
||||
mergeable_ranks = load_tiktoken_bpe(
|
||||
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
|
||||
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
|
||||
expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
|
||||
)
|
||||
special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
|
||||
return {
|
||||
"name": "p50k_edit",
|
||||
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
|
||||
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\w+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+""",
|
||||
"mergeable_ranks": mergeable_ranks,
|
||||
"special_tokens": special_tokens,
|
||||
}
|
||||
@ -62,7 +70,8 @@ def p50k_edit():
|
||||
|
||||
def cl100k_base():
|
||||
mergeable_ranks = load_tiktoken_bpe(
|
||||
"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
|
||||
"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
|
||||
expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
|
||||
)
|
||||
special_tokens = {
|
||||
ENDOFTEXT: 100257,
|
||||
@ -73,7 +82,36 @@ def cl100k_base():
|
||||
}
|
||||
return {
|
||||
"name": "cl100k_base",
|
||||
"pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\w]?\w+|\d{1,3}| ?[^\s\w]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
|
||||
"pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\w]?+\w+|\d{1,3}| ?[^\s\w]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""",
|
||||
"mergeable_ranks": mergeable_ranks,
|
||||
"special_tokens": special_tokens,
|
||||
}
|
||||
|
||||
|
||||
def o200k_base():
|
||||
mergeable_ranks = load_tiktoken_bpe(
|
||||
"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
|
||||
expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
|
||||
)
|
||||
special_tokens = {
|
||||
ENDOFTEXT: 199999,
|
||||
ENDOFPROMPT: 200018,
|
||||
}
|
||||
# This regex could be made more efficient
|
||||
pat_str = "|".join(
|
||||
[
|
||||
r"""[^\r\n\w]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
|
||||
r"""[^\r\n\w]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
|
||||
r"""\d{1,3}""",
|
||||
r""" ?[^\s\w]+[\r\n/]*""",
|
||||
r"""\s*[\r\n]+""",
|
||||
r"""\s+(?!\S)""",
|
||||
r"""\s+""",
|
||||
]
|
||||
)
|
||||
return {
|
||||
"name": "o200k_base",
|
||||
"pat_str": pat_str,
|
||||
"mergeable_ranks": mergeable_ranks,
|
||||
"special_tokens": special_tokens,
|
||||
}
|
||||
@ -85,4 +123,5 @@ ENCODING_CONSTRUCTORS = {
|
||||
"p50k_base": p50k_base,
|
||||
"p50k_edit": p50k_edit,
|
||||
"cl100k_base": cl100k_base,
|
||||
"o200k_base": o200k_base,
|
||||
}
|
||||
|
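With the `o200k_base` constructor registered above, a quick hedged check that the plugin registry surfaces it (output order and exact token ids depend on the installed data files):

```python
import tiktoken

print(tiktoken.list_encoding_names())  # should include "gpt2", "r50k_base", "p50k_base", "cl100k_base", "o200k_base"
enc = tiktoken.get_encoding("o200k_base")
print(enc.encode("hello world"))
```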
@ -1,16 +1,16 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: zipp
|
||||
Version: 3.18.1
|
||||
Version: 3.18.2
|
||||
Summary: Backport of pathlib-compatible object wrapper for zip files
|
||||
Home-page: https://github.com/jaraco/zipp
|
||||
Author: Jason R. Coombs
|
||||
Author-email: jaraco@jaraco.com
|
||||
Author-email: "Jason R. Coombs" <jaraco@jaraco.com>
|
||||
Project-URL: Homepage, https://github.com/jaraco/zipp
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: MIT License
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Requires-Python: >=3.8
|
||||
Description-Content-Type: text/x-rst
|
||||
License-File: LICENSE
|
||||
Provides-Extra: docs
|
||||
Requires-Dist: sphinx >=3.5 ; extra == 'docs'
|
||||
@ -20,9 +20,10 @@ Requires-Dist: furo ; extra == 'docs'
|
||||
Requires-Dist: sphinx-lint ; extra == 'docs'
|
||||
Requires-Dist: jaraco.tidelift >=1.4 ; extra == 'docs'
|
||||
Provides-Extra: testing
|
||||
Requires-Dist: pytest >=6 ; extra == 'testing'
|
||||
Requires-Dist: pytest !=8.1.*,>=6 ; extra == 'testing'
|
||||
Requires-Dist: pytest-checkdocs >=2.4 ; extra == 'testing'
|
||||
Requires-Dist: pytest-cov ; extra == 'testing'
|
||||
Requires-Dist: pytest-mypy ; extra == 'testing'
|
||||
Requires-Dist: pytest-enabler >=2.2 ; extra == 'testing'
|
||||
Requires-Dist: pytest-ruff >=0.2.1 ; extra == 'testing'
|
||||
Requires-Dist: jaraco.itertools ; extra == 'testing'
|
||||
@ -30,7 +31,7 @@ Requires-Dist: jaraco.functools ; extra == 'testing'
|
||||
Requires-Dist: more-itertools ; extra == 'testing'
|
||||
Requires-Dist: big-O ; extra == 'testing'
|
||||
Requires-Dist: pytest-ignore-flaky ; extra == 'testing'
|
||||
Requires-Dist: pytest-mypy ; (platform_python_implementation != "PyPy") and extra == 'testing'
|
||||
Requires-Dist: jaraco.test ; extra == 'testing'
|
||||
|
||||
.. image:: https://img.shields.io/pypi/v/zipp.svg
|
||||
:target: https://pypi.org/project/zipp
|
||||
@ -71,7 +72,9 @@ were contributed to different versions in the standard library:
|
||||
|
||||
* - zipp
|
||||
- stdlib
|
||||
* - 3.15
|
||||
* - 3.18
|
||||
- 3.13
|
||||
* - 3.16
|
||||
- 3.12
|
||||
* - 3.5
|
||||
- 3.11
|
@ -1,10 +1,10 @@
|
||||
zipp-3.18.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
zipp-3.18.1.dist-info/LICENSE,sha256=htoPAa6uRjSKPD1GUZXcHOzN55956HdppkuNoEsqR0E,1023
|
||||
zipp-3.18.1.dist-info/METADATA,sha256=dxGXpoBobQO4X9colqFec8eIGSng-ohxNzeKIM0Wh6U,3461
|
||||
zipp-3.18.1.dist-info/RECORD,,
|
||||
zipp-3.18.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
||||
zipp-3.18.1.dist-info/top_level.txt,sha256=iAbdoSHfaGqBfVb2XuR9JqSQHCoOsOtG6y9C_LSpqFw,5
|
||||
zipp/__init__.py,sha256=IB08yJFuj9F0DkmfBLKQU4Cq75n-UPFeDu1qPKZTPKk,11358
|
||||
zipp-3.18.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
zipp-3.18.2.dist-info/LICENSE,sha256=htoPAa6uRjSKPD1GUZXcHOzN55956HdppkuNoEsqR0E,1023
|
||||
zipp-3.18.2.dist-info/METADATA,sha256=v_qTHO-7CH99XLvAV0kA0RtRNMuw-p_WJrzJxUuafEU,3539
|
||||
zipp-3.18.2.dist-info/RECORD,,
|
||||
zipp-3.18.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
||||
zipp-3.18.2.dist-info/top_level.txt,sha256=iAbdoSHfaGqBfVb2XuR9JqSQHCoOsOtG6y9C_LSpqFw,5
|
||||
zipp/__init__.py,sha256=s5hbthFh66EOlVTMKyZ5azMn8y2BrJbTNQx0KsIpcBI,11361
|
||||
zipp/__pycache__/__init__.cpython-38.pyc,,
|
||||
zipp/__pycache__/glob.cpython-38.pyc,,
|
||||
zipp/compat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@ -263,7 +263,7 @@ class Path:
|
||||
>>> str(path.parent)
|
||||
'mem'
|
||||
|
||||
If the zipfile has no filename, such attribtues are not
|
||||
If the zipfile has no filename, such attributes are not
|
||||
valid and accessing them will raise an Exception.
|
||||
|
||||
>>> zf.filename = None
|
||||