From 6e52b4fb4200b25af5b07a8529bde0347927b941 Mon Sep 17 00:00:00 2001 From: sakher Date: Mon, 24 Jun 2024 19:58:16 +0100 Subject: [PATCH 1/3] Fixed Anthropic ContentBlock - Replaced with TextBlock --- .../providers/anthropic/anthropic_solver.py | 4 +- .../anthropic/anthropic_solver_test.py | 37 ++++++------------- 2 files changed, 13 insertions(+), 28 deletions(-) diff --git a/evals/solvers/providers/anthropic/anthropic_solver.py b/evals/solvers/providers/anthropic/anthropic_solver.py index bb7fe50e24..e0cdbebc74 100644 --- a/evals/solvers/providers/anthropic/anthropic_solver.py +++ b/evals/solvers/providers/anthropic/anthropic_solver.py @@ -2,7 +2,7 @@ import anthropic from anthropic import Anthropic -from anthropic.types import ContentBlock, MessageParam, Usage +from anthropic.types import MessageParam, TextBlock, Usage from evals.record import record_sampling from evals.solvers.solver import Solver, SolverResult @@ -99,7 +99,7 @@ def _convert_msgs_to_anthropic_format(msgs: list[Message]) -> list[MessageParam] anth_msgs = [ MessageParam( role=oai_to_anthropic_role[msg.role], - content=[ContentBlock(text=msg.content, type="text")], + content=[TextBlock(text=msg.content, type="text")], ) for msg in msgs ] diff --git a/evals/solvers/providers/anthropic/anthropic_solver_test.py b/evals/solvers/providers/anthropic/anthropic_solver_test.py index 9ba8fb1470..864c67d073 100644 --- a/evals/solvers/providers/anthropic/anthropic_solver_test.py +++ b/evals/solvers/providers/anthropic/anthropic_solver_test.py @@ -1,14 +1,11 @@ import os + import pytest +from anthropic.types import MessageParam, TextBlock, Usage from evals.record import DummyRecorder +from evals.solvers.providers.anthropic.anthropic_solver import AnthropicSolver, anth_to_openai_usage from evals.task_state import Message, TaskState -from evals.solvers.providers.anthropic.anthropic_solver import ( - AnthropicSolver, - anth_to_openai_usage, -) - -from anthropic.types import ContentBlock, MessageParam, Usage 
IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" MODEL_NAME = "claude-instant-1.2" @@ -32,9 +29,7 @@ def dummy_recorder(): yield recorder -@pytest.mark.skipif( - IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit." -) +@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.") def test_solver(dummy_recorder, anthropic_solver): """ Test that the solver generates a response coherent with the message history @@ -55,9 +50,7 @@ def test_solver(dummy_recorder, anthropic_solver): ) solver_res = solver(task_state=task_state) - assert ( - solver_res.output == answer - ), f"Expected '{answer}', but got {solver_res.output}" + assert solver_res.output == answer, f"Expected '{answer}', but got {solver_res.output}" def test_message_format(): @@ -71,9 +64,7 @@ def test_message_format(): msgs = [ Message(role="user", content="What is 2 + 2?"), Message(role="system", content="reason step by step"), - Message( - role="assistant", content="I don't need to reason for this, 2+2 is just 4" - ), + Message(role="assistant", content="I don't need to reason for this, 2+2 is just 4"), Message(role="system", content="now, given your reasoning, provide the answer"), ] anth_msgs = AnthropicSolver._convert_msgs_to_anthropic_format(msgs) @@ -82,24 +73,20 @@ def test_message_format(): MessageParam( role="user", content=[ - ContentBlock(text="What is 2 + 2?", type="text"), - ContentBlock(text="reason step by step", type="text"), + TextBlock(text="What is 2 + 2?", type="text"), + TextBlock(text="reason step by step", type="text"), ], ), MessageParam( role="assistant", content=[ - ContentBlock( - text="I don't need to reason for this, 2+2 is just 4", type="text" - ), + TextBlock(text="I don't need to reason for this, 2+2 is just 4", type="text"), ], ), MessageParam( role="user", content=[ - ContentBlock( - text="now, given your reasoning, provide the answer", type="text" - ), + TextBlock(text="now, given your reasoning, provide the 
answer", type="text"), ], ), ] @@ -126,6 +113,4 @@ def test_anth_to_openai_usage_zero_tokens(): "prompt_tokens": 0, "total_tokens": 0, } - assert ( - anth_to_openai_usage(usage) == expected - ), "Zero token cases are not handled correctly." + assert anth_to_openai_usage(usage) == expected, "Zero token cases are not handled correctly." From bd3f062f7104ff4700e9ff4f0a483eb58e1d44c9 Mon Sep 17 00:00:00 2001 From: sakher Date: Mon, 24 Jun 2024 22:09:43 +0100 Subject: [PATCH 2/3] Fixed Gemini solver unit test --- evals/solvers/providers/google/gemini_solver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/solvers/providers/google/gemini_solver.py b/evals/solvers/providers/google/gemini_solver.py index 33a8a93a25..1960c0d7fa 100644 --- a/evals/solvers/providers/google/gemini_solver.py +++ b/evals/solvers/providers/google/gemini_solver.py @@ -124,7 +124,7 @@ def _solve( else: # Get text response solver_result = SolverResult( - gen_content_resp.text, + gen_content_resp.parts[0].text, error=gen_content_resp.prompt_feedback, ) except (google.api_core.exceptions.GoogleAPIError,) as e: From 9d1d2b43d95451ab8d59c48de636d382b2d139ad Mon Sep 17 00:00:00 2001 From: sakher Date: Mon, 24 Jun 2024 22:11:34 +0100 Subject: [PATCH 3/3] Fixed OpenAI Assistants interface breaking changes: retrieval -> file_search; assistant.file_ids -> tool_resources; message.file_ids -> attachments --- .../openai/openai_assistants_solver.py | 17 +++++++++++++++-- .../openai/openai_assistants_solver_test.py | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/evals/solvers/providers/openai/openai_assistants_solver.py b/evals/solvers/providers/openai/openai_assistants_solver.py index eddfb952eb..2614754b28 100644 --- a/evals/solvers/providers/openai/openai_assistants_solver.py +++ b/evals/solvers/providers/openai/openai_assistants_solver.py @@ -6,7 +6,9 @@ import backoff import openai from openai.types.beta import Assistant +from 
openai.types.beta.assistant_create_params import ToolResourcesCodeInterpreter from openai.types.beta.thread import Thread +from openai.types.beta.threads import message_create_params from openai.types.beta.threads.run import Run from evals.record import record_sampling @@ -74,7 +76,12 @@ def __init__( name=name, description=description, tools=tools, - file_ids=file_ids, # Files attached here are available to all threads. + tool_resources={ + "code_interpreter": ToolResourcesCodeInterpreter(file_ids=file_ids), + "file_search": { + "vector_store_ids": file_ids, + }, + }, ) else: # This is a special init case for copying the solver - see `OpenAIAssistantsSolver.copy()` @@ -140,7 +147,13 @@ def _solve( thread_id=self.thread.id, role=user_message.role, content=user_message.content, - file_ids=thread_file_ids, + attachments=[ + message_create_params.Attachment( + file_id=file_id, + tools=[{"type": "code_interpreter"}, {"type": "file_search"}], + ) + for file_id in thread_file_ids + ], ) # Run Assistant on the Thread diff --git a/evals/solvers/providers/openai/openai_assistants_solver_test.py b/evals/solvers/providers/openai/openai_assistants_solver_test.py index 7a0d6b5761..f66c759eef 100644 --- a/evals/solvers/providers/openai/openai_assistants_solver_test.py +++ b/evals/solvers/providers/openai/openai_assistants_solver_test.py @@ -59,7 +59,7 @@ def code_interpreter_solver(): def retrieval_solver(): solver = OpenAIAssistantsSolver( model=MODEL, - tools=[{"type": "retrieval"}], + tools=[{"type": "file_search"}], ) return solver