Merge pull request #75 from devchat-ai/switch-model
Use env var `DEVCHAT_UNIT_TESTS_USE_USER_MODEL` to switch to the user-selected model for `/unit_tests`
Commit ccc1d97c90
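The switch itself lives in the new `unit_tests/llm_conf.py` (added below). A minimal sketch of how it resolves at import time; the hard-coded fallback differs per script, and `"gpt-3.5-turbo"` shown here is the one used by the file-finder and rerank steps:

import os

# Any non-empty string enables the user model; note that even "0" is truthy here.
USE_USER_MODEL = bool(os.environ.get("DEVCHAT_UNIT_TESTS_USE_USER_MODEL", False))
USER_LLM_MODEL = os.environ.get("LLM_MODEL", "gpt-4-turbo-preview")

MODEL = USER_LLM_MODEL if USE_USER_MODEL else "gpt-3.5-turbo"

So running with `DEVCHAT_UNIT_TESTS_USE_USER_MODEL=1 LLM_MODEL=claude-3-opus` routes every step through `claude-3-opus`; with the variable unset, each step keeps its previous OpenAI default.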
@@ -4,16 +4,28 @@ from typing import Callable, List
 from assistants.directory_structure.base import DirectoryStructureBase
 from assistants.rerank_files import rerank_files
+from devchat.llm.openai import chat_completion_no_stream_return_json
+from llm_conf import (
+    CONTEXT_SIZE,
+    DEFAULT_CONTEXT_SIZE,
+    DEFAULT_ENCODING,
+    USE_USER_MODEL,
+    USER_LLM_MODEL,
+)
 from openai_util import create_chat_completion_content
 from tools.directory_viewer import ListViewer
 from tools.tiktoken_util import get_encoding

+MODEL = USER_LLM_MODEL if USE_USER_MODEL else "gpt-3.5-turbo"
+ENCODING = (
+    get_encoding(DEFAULT_ENCODING)  # Use default encoding as an approximation
+    if USE_USER_MODEL
+    else get_encoding("cl100k_base")
+)
+TOKEN_BUDGET = int(CONTEXT_SIZE.get(MODEL, DEFAULT_CONTEXT_SIZE) * 0.95)


 class RelevantFileFinder(DirectoryStructureBase):
-    model_name = "gpt-3.5-turbo-1106"
-    dir_token_budget = 16000 * 0.95
-    encoding = get_encoding("cl100k_base")

     def _paginate_dir_structure(
         self, criteria: Callable[[Path], bool], style: str = "list"
     ) -> List[str]:

@@ -38,8 +50,8 @@ class RelevantFileFinder(DirectoryStructureBase):
         # Check if each page is within the token budget
         within_budget = True
         for p in pages:
-            tokens = len(self.encoding.encode(p, disallowed_special=()))
-            if tokens > self.dir_token_budget:
+            tokens = len(ENCODING.encode(p, disallowed_special=()))
+            if tokens > TOKEN_BUDGET:
                 within_budget = False
                 break

@@ -82,15 +94,30 @@ class RelevantFileFinder(DirectoryStructureBase):
         for dir_structure in dir_structure_pages:
             user_msg = self._mk_message(objective, dir_structure)

-            response = create_chat_completion_content(
-                model=self.model_name,
-                messages=[
-                    {"role": "user", "content": user_msg},
-                ],
-                response_format={"type": "json_object"},
-                temperature=0.1,
-            )
-
-            json_res = json.loads(response)
+            json_res = {}
+            if USE_USER_MODEL:
+                # Use the wrapped api parameters
+                json_res = (
+                    chat_completion_no_stream_return_json(
+                        messages=[{"role": "user", "content": user_msg}],
+                        llm_config={
+                            "model": MODEL,
+                            "temperature": 0.1,
+                        },
+                    )
+                    or {}
+                )
+
+            else:
+                # Use the openai api parameters
+                response = create_chat_completion_content(
+                    model=MODEL,
+                    messages=[
+                        {"role": "user", "content": user_msg},
+                    ],
+                    response_format={"type": "json_object"},
+                    temperature=0.1,
+                )
+
+                json_res = json.loads(response)

             files.extend(json_res.get("files", []))
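The `or {}` guard is doing real work here: the wrapped call can come back falsy on failure, and without it the later `json_res.get("files", [])` would blow up on `None`. A reduced illustration, with a hypothetical `call()` standing in for `chat_completion_no_stream_return_json`:

def call():
    return None  # the wrapped API returned nothing usable

json_res = call() or {}           # {} instead of None
files = json_res.get("files", [])  # [] rather than AttributeError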
@@ -1,13 +1,26 @@
 import json
 from typing import List, Optional

+from devchat.llm.openai import chat_completion_no_stream_return_json
+from llm_conf import (
+    CONTEXT_SIZE,
+    DEFAULT_CONTEXT_SIZE,
+    DEFAULT_ENCODING,
+    USE_USER_MODEL,
+    USER_LLM_MODEL,
+)
 from model import FuncToTest
 from openai_util import create_chat_completion_content
+from tools.tiktoken_util import get_encoding

-MODEL = "gpt-4-1106-preview"
-ENCODING = "cl100k_base"
-# TODO: handle token budget
-TOKEN_BUDGET = int(128000 * 0.9)
+MODEL = USER_LLM_MODEL if USE_USER_MODEL else "gpt-4-turbo-preview"
+ENCODING = (
+    get_encoding(DEFAULT_ENCODING)  # Use default encoding as an approximation
+    if USE_USER_MODEL
+    else get_encoding("cl100k_base")
+)
+
+TOKEN_BUDGET = int(CONTEXT_SIZE.get(MODEL, DEFAULT_CONTEXT_SIZE) * 0.9)


 # ruff: noqa: E501

@@ -45,11 +58,13 @@ JSON Format Example:
 """


-def get_recommended_symbols(
-    func_to_test: FuncToTest, known_context: Optional[List] = None
-) -> List[str]:
-    known_context = known_context or []
-    context_content = "\n\n".join([str(c) for c in known_context])
+def _mk_user_msg(func_to_test: FuncToTest, contexts: List) -> str:
+    """
+    Create a user message to be sent to the model within the token budget.
+    """
+    msg = None
+    while msg is None:
+        context_content = "\n\n".join([str(c) for c in contexts])

         msg = recommend_symbol_context_prompt.format(
             function_content=func_to_test.func_content,

@@ -58,13 +73,44 @@ def get_recommended_symbols(
             file_path=func_to_test.file_path,
         )

+        token_count = len(ENCODING.encode(msg, disallowed_special=()))
+        if contexts and token_count > TOKEN_BUDGET:
+            # Remove the last context and try again
+            contexts.pop()
+            msg = None
+
+    return msg
+
+
+def get_recommended_symbols(
+    func_to_test: FuncToTest, known_context: Optional[List] = None
+) -> List[str]:
+    known_context = known_context or []
+    msg = _mk_user_msg(func_to_test, known_context)
+
+    json_res = {}
+    if USE_USER_MODEL:
+        # Use the wrapped api parameters
+        json_res = (
+            chat_completion_no_stream_return_json(
+                messages=[{"role": "user", "content": msg}],
+                llm_config={
+                    "model": MODEL,
+                    "temperature": 0.1,
+                },
+            )
+            or {}
+        )
+
+    else:
         response = create_chat_completion_content(
             model=MODEL,
             messages=[{"role": "user", "content": msg}],
             response_format={"type": "json_object"},
             temperature=0.1,
         )
+        json_res = json.loads(response)

-    key_symbols = json.loads(response).get("key_symbols", [])
+    key_symbols = json_res.get("key_symbols", [])

     return key_symbols
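The new `_mk_user_msg` replaces the old "TODO: handle token budget" by dropping contexts from the end of the list until the rendered prompt fits. The shape of that loop, reduced to its essentials (`render` and `encode` are stand-ins for the prompt template and the tiktoken encoder):

def fit_to_budget(contexts: list, render, encode, budget: int) -> str:
    msg = None
    while msg is None:
        msg = render(contexts)
        if contexts and len(encode(msg)) > budget:
            contexts.pop()  # drop the last context and re-render
            msg = None
    return msg

Once `contexts` is exhausted, the guard is skipped and the (possibly still over-budget) message is returned as-is, matching the committed `if contexts and token_count > TOKEN_BUDGET` check.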
@@ -1,8 +1,15 @@
 import json
 from typing import List, Tuple

+from devchat.llm.openai import chat_completion_no_stream_return_json
+from llm_conf import (
+    USE_USER_MODEL,
+    USER_LLM_MODEL,
+)
 from openai_util import create_chat_completion_content

+MODEL = USER_LLM_MODEL if USE_USER_MODEL else "gpt-3.5-turbo"
+
 # ruff: noqa: E501

 rerank_file_prompt = """

@@ -28,8 +35,6 @@ Accumulated Knowledge: {accumulated_knowledge}
 Answer:
 """

-RERANK_MODEL = "gpt-3.5-turbo-1106"
-

 def rerank_files(
     question: str,

@@ -56,8 +61,29 @@ def rerank_files(
         accumulated_knowledge=knowledge,
     )

+    result = {}
+    if USE_USER_MODEL:
+        # Use the wrapped api parameters
+        result = (
+            chat_completion_no_stream_return_json(
+                messages=[
+                    {
+                        "role": "user",
+                        "content": user_msg,
+                    },
+                ],
+                llm_config={
+                    "model": MODEL,
+                    "temperature": 0.1,
+                },
+            )
+            or {}
+        )
+
+    else:
+        # Use the openai api parameters
         response = create_chat_completion_content(
-            model=RERANK_MODEL,
+            model=MODEL,
             messages=[
                 {
                     "role": "user",

@@ -67,8 +93,8 @@ def rerank_files(
             response_format={"type": "json_object"},
             temperature=0.1,
         )

         result = json.loads(response)
-    reranked = [(i["item"], i["relevance"]) for i in result["result"]]
+    reranked = [(i["item"], i["relevance"]) for i in result.get("result", [])]

     return reranked
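Note the hardening at the end: `result["result"]` became `result.get("result", [])`, so a failed completion now yields an empty ranking instead of a `KeyError`:

result = {}  # e.g. the wrapped call failed and fell back to {}

# Old: result["result"] raises KeyError and aborts the workflow.
# New:
reranked = [(i["item"], i["relevance"]) for i in result.get("result", [])]
print(reranked)  # [] -- callers simply see no reranked files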
unit_tests/llm_conf.py (new file, 22 lines)

@@ -0,0 +1,22 @@
+import os
+
+USE_USER_MODEL = bool(os.environ.get("DEVCHAT_UNIT_TESTS_USE_USER_MODEL", False))
+USER_LLM_MODEL = os.environ.get("LLM_MODEL", "gpt-4-turbo-preview")
+
+DEFAULT_CONTEXT_SIZE = 4000
+CONTEXT_SIZE = {
+    "gpt-3.5-turbo": 16000,
+    "gpt-4": 8000,
+    "gpt-4-turbo-preview": 128000,
+    "claude-3-sonnet": 1000000,
+    "claude-3-opus": 1000000,
+    "xinghuo-3.5": 8000,
+    "GLM-4": 8000,
+    "ERNIE-Bot-4.0": 8000,
+    "togetherai/codellama/CodeLlama-70b-Instruct-hf": 4000,
+    "togetherai/mistralai/Mixtral-8x7B-Instruct-v0.1": 16000,
+    "minimax/abab6-chat": 8000,
+    "llama-2-70b-chat": 4000,
+}
+
+DEFAULT_ENCODING = "cl100k_base"
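How this table turns into per-script budgets: the file-finder keeps 95% of the context window, the other steps 90%, and a model missing from the table falls back to the conservative 4000-token default. Worked through with a trimmed copy of the table:

DEFAULT_CONTEXT_SIZE = 4000
CONTEXT_SIZE = {"gpt-3.5-turbo": 16000, "gpt-4-turbo-preview": 128000}

int(CONTEXT_SIZE.get("gpt-3.5-turbo", DEFAULT_CONTEXT_SIZE) * 0.95)       # 15200
int(CONTEXT_SIZE.get("gpt-4-turbo-preview", DEFAULT_CONTEXT_SIZE) * 0.9)  # 115200
int(CONTEXT_SIZE.get("some-unlisted-model", DEFAULT_CONTEXT_SIZE) * 0.9)  # 3600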
@@ -2,16 +2,27 @@ import json
 from functools import partial
 from typing import List, Optional

+from devchat.llm.openai import chat_completion_no_stream_return_json
 from find_context import Context
+from llm_conf import (
+    CONTEXT_SIZE,
+    DEFAULT_CONTEXT_SIZE,
+    DEFAULT_ENCODING,
+    USE_USER_MODEL,
+    USER_LLM_MODEL,
+)
 from model import FuncToTest, TokenBudgetExceededException
 from openai_util import create_chat_completion_content
 from prompts import PROPOSE_TEST_PROMPT
 from tools.tiktoken_util import get_encoding

-MODEL = "gpt-3.5-turbo-1106"
-# MODEL = "gpt-4-1106-preview"
-ENCODING = "cl100k_base"
-TOKEN_BUDGET = int(16000 * 0.9)
+MODEL = USER_LLM_MODEL if USE_USER_MODEL else "gpt-3.5-turbo"
+ENCODING = (
+    get_encoding(DEFAULT_ENCODING)  # Use default encoding as an approximation
+    if USE_USER_MODEL
+    else get_encoding("cl100k_base")
+)
+TOKEN_BUDGET = int(CONTEXT_SIZE.get(MODEL, DEFAULT_CONTEXT_SIZE) * 0.9)


 def _mk_user_msg(

@@ -23,7 +34,6 @@ def _mk_user_msg(
     """
     Create a user message to be sent to the model within the token budget.
     """
-    encoding = get_encoding(ENCODING)

     func_content = f"function code\n```\n{func_to_test.func_content}\n```\n"
     class_content = ""

@@ -61,7 +71,7 @@ def _mk_user_msg(
     prioritized_msgs = [msg_0, msg_1, msg_2]

     for msg in prioritized_msgs:
-        token_count = len(encoding.encode(msg, disallowed_special=()))
+        token_count = len(ENCODING.encode(msg, disallowed_special=()))
         if token_count <= TOKEN_BUDGET:
             return msg

@@ -97,14 +107,31 @@ def propose_test(
         chat_language=chat_language,
     )

+    json_res = {}
+    if USE_USER_MODEL:
+        # Use the wrapped api parameters
+        json_res = (
+            chat_completion_no_stream_return_json(
+                messages=[{"role": "user", "content": user_msg}],
+                llm_config={
+                    "model": MODEL,
+                    "temperature": 0.1,
+                },
+            )
+            or {}
+        )
+
+    else:
+        # Use the openai api parameters
         content = create_chat_completion_content(
             model=MODEL,
             messages=[{"role": "user", "content": user_msg}],
             response_format={"type": "json_object"},
             temperature=0.1,
         )
+        json_res = json.loads(content)

-    cases = json.loads(content).get("test_cases", [])
+    cases = json_res.get("test_cases", [])

     descriptions = []
     for case in cases:
@@ -1,16 +1,28 @@
 from functools import partial
 from typing import List, Optional

+from devchat.llm.openai import chat_completion_stream
 from find_context import Context
+from llm_conf import (
+    CONTEXT_SIZE,
+    DEFAULT_CONTEXT_SIZE,
+    DEFAULT_ENCODING,
+    USE_USER_MODEL,
+    USER_LLM_MODEL,
+)
 from model import FuncToTest, TokenBudgetExceededException
 from openai_util import create_chat_completion_chunks
 from prompts import WRITE_TESTS_PROMPT
 from tools.file_util import retrieve_file_content
 from tools.tiktoken_util import get_encoding

-MODEL = "gpt-4-1106-preview"
-ENCODING = "cl100k_base"
-TOKEN_BUDGET = int(128000 * 0.9)
+MODEL = USER_LLM_MODEL if USE_USER_MODEL else "gpt-4-turbo-preview"
+ENCODING = (
+    get_encoding(DEFAULT_ENCODING)  # Use default encoding as an approximation
+    if USE_USER_MODEL
+    else get_encoding("cl100k_base")
+)
+TOKEN_BUDGET = int(CONTEXT_SIZE.get(MODEL, DEFAULT_CONTEXT_SIZE) * 0.9)


 def _mk_write_tests_msg(

@@ -23,8 +35,6 @@ def _mk_write_tests_msg(
     symbol_contexts: Optional[List[Context]] = None,
     user_requirements: str = "",
 ) -> Optional[str]:
-    encoding = get_encoding(ENCODING)
-
     additional_requirements = user_requirements

     test_cases_str = ""

@@ -94,7 +104,7 @@ def _mk_write_tests_msg(
     prioritized_msgs = [msg_0, msg_1, msg_2, msg_3]

     for msg in prioritized_msgs:
-        tokens = len(encoding.encode(msg, disallowed_special=()))
+        tokens = len(ENCODING.encode(msg, disallowed_special=()))
         if tokens <= TOKEN_BUDGET:
             return msg

@@ -124,13 +134,26 @@ def write_and_print_tests(
         chat_language=chat_language,
     )

+    if USE_USER_MODEL:
+        # Use the wrapped api
+        res = chat_completion_stream(
+            messages=[{"role": "user", "content": user_msg}],
+            llm_config={"model": MODEL, "temperature": 0.1},
+        )
+        if res:
+            print(res.get("content", ""))
+
+    else:
+        # Use the openai api parameters
         chunks = create_chat_completion_chunks(
             model=MODEL,
             messages=[{"role": "user", "content": user_msg}],
             temperature=0.1,
         )

         for chunk in chunks:
             if chunk.choices[0].finish_reason == "stop":
                 break
-            print(chunk.choices[0].delta.content, flush=True, end="")
+            content = chunk.choices[0].delta.content
+            if content is not None:
+                print(content, flush=True, end="")
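The last hunk also fixes a small real bug: on some chunks (typically the final one) `chunk.choices[0].delta.content` is `None`, and `print(None, end="")` writes the literal text `None` into the generated tests. A reduced demonstration of the guard, with a hard-coded delta list standing in for the stream:

deltas = ["def test_foo():\n", "    assert foo() == 1\n", None]  # None as on a final chunk

for content in deltas:
    if content is not None:  # the old code printed "None" here
        print(content, flush=True, end="")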