Fix Unit Test Failures in OpenAI, Anthropic, and Google Gemini Solvers #1537

Open · wants to merge 3 commits into base: main
4 changes: 2 additions & 2 deletions evals/solvers/providers/anthropic/anthropic_solver.py
@@ -2,7 +2,7 @@

import anthropic
from anthropic import Anthropic
from anthropic.types import ContentBlock, MessageParam, Usage
from anthropic.types import MessageParam, TextBlock, Usage

from evals.record import record_sampling
from evals.solvers.solver import Solver, SolverResult
@@ -99,7 +99,7 @@ def _convert_msgs_to_anthropic_format(msgs: list[Message]) -> list[MessageParam]
anth_msgs = [
MessageParam(
role=oai_to_anthropic_role[msg.role],
content=[ContentBlock(text=msg.content, type="text")],
content=[TextBlock(text=msg.content, type="text")],
)
for msg in msgs
]
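The anthropic_solver.py change above tracks the anthropic SDK's replacement of the generic ContentBlock type with concrete block types such as TextBlock. A minimal sketch of the conversion pattern, assuming roles are already mapped to Anthropic's "user"/"assistant" convention (the repository's helper also merges consecutive same-role messages, which is omitted here):

from anthropic.types import MessageParam, TextBlock

def to_anthropic_messages(pairs: list[tuple[str, str]]) -> list[MessageParam]:
    # Each text string becomes a single TextBlock; MessageParam is a TypedDict.
    return [
        MessageParam(role=role, content=[TextBlock(text=text, type="text")])
        for role, text in pairs
    ]

print(to_anthropic_messages([("user", "What is 2 + 2?")]))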
37 changes: 11 additions & 26 deletions evals/solvers/providers/anthropic/anthropic_solver_test.py
@@ -1,14 +1,11 @@
import os

import pytest
from anthropic.types import MessageParam, TextBlock, Usage

from evals.record import DummyRecorder
from evals.solvers.providers.anthropic.anthropic_solver import AnthropicSolver, anth_to_openai_usage
from evals.task_state import Message, TaskState
from evals.solvers.providers.anthropic.anthropic_solver import (
AnthropicSolver,
anth_to_openai_usage,
)

from anthropic.types import ContentBlock, MessageParam, Usage

IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"
MODEL_NAME = "claude-instant-1.2"
@@ -32,9 +29,7 @@ def dummy_recorder():
yield recorder


@pytest.mark.skipif(
IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit."
)
@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.")
def test_solver(dummy_recorder, anthropic_solver):
"""
Test that the solver generates a response coherent with the message history
@@ -55,9 +50,7 @@ def test_solver(dummy_recorder, anthropic_solver):
)

solver_res = solver(task_state=task_state)
assert (
solver_res.output == answer
), f"Expected '{answer}', but got {solver_res.output}"
assert solver_res.output == answer, f"Expected '{answer}', but got {solver_res.output}"


def test_message_format():
@@ -71,9 +64,7 @@ def test_message_format():
msgs = [
Message(role="user", content="What is 2 + 2?"),
Message(role="system", content="reason step by step"),
Message(
role="assistant", content="I don't need to reason for this, 2+2 is just 4"
),
Message(role="assistant", content="I don't need to reason for this, 2+2 is just 4"),
Message(role="system", content="now, given your reasoning, provide the answer"),
]
anth_msgs = AnthropicSolver._convert_msgs_to_anthropic_format(msgs)
@@ -82,24 +73,20 @@ MessageParam(
MessageParam(
role="user",
content=[
ContentBlock(text="What is 2 + 2?", type="text"),
ContentBlock(text="reason step by step", type="text"),
TextBlock(text="What is 2 + 2?", type="text"),
TextBlock(text="reason step by step", type="text"),
],
),
MessageParam(
role="assistant",
content=[
ContentBlock(
text="I don't need to reason for this, 2+2 is just 4", type="text"
),
TextBlock(text="I don't need to reason for this, 2+2 is just 4", type="text"),
],
),
MessageParam(
role="user",
content=[
ContentBlock(
text="now, given your reasoning, provide the answer", type="text"
),
TextBlock(text="now, given your reasoning, provide the answer", type="text"),
],
),
]
@@ -126,6 +113,4 @@ def test_anth_to_openai_usage_zero_tokens():
"prompt_tokens": 0,
"total_tokens": 0,
}
assert (
anth_to_openai_usage(usage) == expected
), "Zero token cases are not handled correctly."
assert anth_to_openai_usage(usage) == expected, "Zero token cases are not handled correctly."
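The zero-token test above exercises anth_to_openai_usage, imported at the top of the file. A hedged sketch of the mapping it is expected to perform, inferred from the expected dictionary in the test (key names beyond those shown are an assumption; the real implementation lives in anthropic_solver.py):

from anthropic.types import Usage

def anth_to_openai_usage_sketch(usage: Usage) -> dict:
    # Anthropic reports input_tokens/output_tokens; the recorder expects
    # OpenAI-style prompt/completion/total counts.
    return {
        "completion_tokens": usage.output_tokens,
        "prompt_tokens": usage.input_tokens,
        "total_tokens": usage.input_tokens + usage.output_tokens,
    }

assert anth_to_openai_usage_sketch(Usage(input_tokens=0, output_tokens=0)) == {
    "completion_tokens": 0,
    "prompt_tokens": 0,
    "total_tokens": 0,
}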
2 changes: 1 addition & 1 deletion evals/solvers/providers/google/gemini_solver.py
@@ -124,7 +124,7 @@ def _solve(
else:
# Get text response
solver_result = SolverResult(
gen_content_resp.text,
gen_content_resp.parts[0].text,
error=gen_content_resp.prompt_feedback,
)
except (google.api_core.exceptions.GoogleAPIError,) as e:
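The one-line gemini_solver.py change reads the reply through the response's parts instead of the .text convenience accessor, which can raise when the response carries no simple text answer (for example, when generation is blocked). A hedged sketch of the defensive read, with illustrative names:

def first_text_part(gen_content_resp) -> str:
    # In google.generativeai responses, .parts is a shortcut for
    # candidates[0].content.parts; fall back to "" if nothing was returned.
    parts = gen_content_resp.parts
    return parts[0].text if parts else ""

Note that the solver still surfaces gen_content_resp.prompt_feedback as the error field, so blocked prompts remain visible to the caller.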
17 changes: 15 additions & 2 deletions evals/solvers/providers/openai/openai_assistants_solver.py
@@ -6,7 +6,9 @@
import backoff
import openai
from openai.types.beta import Assistant
from openai.types.beta.assistant_create_params import ToolResourcesCodeInterpreter
from openai.types.beta.thread import Thread
from openai.types.beta.threads import message_create_params
from openai.types.beta.threads.run import Run

from evals.record import record_sampling
@@ -74,7 +76,12 @@ def __init__(
name=name,
description=description,
tools=tools,
file_ids=file_ids, # Files attached here are available to all threads.
tool_resources={
"code_interpreter": ToolResourcesCodeInterpreter(file_ids=file_ids),
"file_search": {
"vector_store_ids": file_ids,
},
},
)
else:
# This is a special init case for copying the solver - see `OpenAIAssistantsSolver.copy()`
@@ -140,7 +147,13 @@ def _solve(
thread_id=self.thread.id,
role=user_message.role,
content=user_message.content,
file_ids=thread_file_ids,
attachments=[
message_create_params.Attachment(
file_id=file_id,
tools=[{"type": "code_interpreter"}, {"type": "file_search"}],
)
for file_id in thread_file_ids
],
)

# Run Assistant on the Thread
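The openai_assistants_solver.py changes above follow the Assistants API v2 migration: the removed per-assistant and per-message file_ids parameters are replaced by tool_resources on the assistant and attachments on individual messages, and the retrieval tool type is now called file_search (see the test fixture change below). A minimal sketch of the v2 calls under those assumptions, with illustrative model and file identifiers:

import openai

client = openai.OpenAI()

assistant = client.beta.assistants.create(
    model="gpt-4-turbo",
    tools=[{"type": "code_interpreter"}, {"type": "file_search"}],
    tool_resources={
        # Code-interpreter files are attached directly by file ID.
        "code_interpreter": {"file_ids": ["file-abc123"]},
    },
)

thread = client.beta.threads.create()
client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content="Summarise the attached file.",
    attachments=[
        {"file_id": "file-abc123", "tools": [{"type": "file_search"}]},
    ],
)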
@@ -59,7 +59,7 @@ def code_interpreter_solver():
def retrieval_solver():
solver = OpenAIAssistantsSolver(
model=MODEL,
tools=[{"type": "retrieval"}],
tools=[{"type": "file_search"}],
)
return solver
