Add Human-Relative MLAgentBench #1496

Merged (21 commits) on Mar 21, 2024
7 changes: 7 additions & 0 deletions evals/elsuite/hr_ml_agent_bench/.gitignore
@@ -0,0 +1,7 @@
benchmarks/babylm/env/babylm_data
benchmarks/**/prepared
benchmarks/**/submission.txt
benchmarks/**/*.checkpoint
benchmarks/**/*.log
scripts/**/*.log
data
226 changes: 226 additions & 0 deletions evals/elsuite/hr_ml_agent_bench/README.md

Large diffs are not rendered by default.

Empty file.
60 changes: 60 additions & 0 deletions evals/elsuite/hr_ml_agent_bench/actions.py
@@ -0,0 +1,60 @@
import json
import re
from typing import Optional

from evals.elsuite.hr_ml_agent_bench.high_level_actions import HIGH_LEVEL_ACTIONS
from evals.elsuite.hr_ml_agent_bench.low_level_actions import LOW_LEVEL_ACTIONS
from evals.elsuite.hr_ml_agent_bench.schema import Action

ACTION_SPACE = LOW_LEVEL_ACTIONS + HIGH_LEVEL_ACTIONS


def make_action_string(name: str, args: dict) -> str:
stringified_args = json.dumps(args, indent=4)
return f"Action: {name}\nAction Input: {stringified_args}"


def get_action(s: str) -> Optional[Action]:
"""Return an `Action` object from a string representation of an action, if it exists."""

action_pattern = r"Action:\s*(.+)"
args_pattern = r"Action Input:\s*(\{.*?\}|\S.*)"

action_match = re.search(action_pattern, s)
args_match = re.search(args_pattern, s, re.DOTALL)

if not action_match:
return None

action_name = action_match.group(1).strip()
action_args = None

if args_match:
args_str = args_match.group(1).strip()

try:
action_args = json.loads(args_str)
except json.JSONDecodeError:
action_args = args_str # Return raw string if JSON parsing fails

return Action(name=action_name, args=action_args)


def is_valid_action(action: Action) -> bool:
"""Return True if the action has a valid name and arguments, False otherwise."""

assert isinstance(action, Action)

    if not isinstance(action.args, dict):
        return False

for valid_action in ACTION_SPACE:
if action.name != valid_action.name:
continue

actual_args = action.args.keys()
expected_args = valid_action.usage.keys()

return actual_args == expected_args

return False
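
For reference, here is a minimal sketch of how these helpers compose. The action name "Read File" and its "file_name" argument are illustrative; the real action space and its expected argument keys come from LOW_LEVEL_ACTIONS and HIGH_LEVEL_ACTIONS.

# Hypothetical round-trip: serialize an action, parse it back, then validate it.
from evals.elsuite.hr_ml_agent_bench.actions import (
    get_action,
    is_valid_action,
    make_action_string,
)

text = make_action_string("Read File", {"file_name": "train.py"})
action = get_action(text)

assert action is not None
assert action.name == "Read File"
assert action.args == {"file_name": "train.py"}

# is_valid_action additionally requires the name and argument keys to match an
# entry in ACTION_SPACE, so the result depends on the real action definitions.
print(is_valid_action(action))
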
79 changes: 79 additions & 0 deletions evals/elsuite/hr_ml_agent_bench/auto_marking.py
@@ -0,0 +1,79 @@
import importlib
import json
from dataclasses import dataclass
from json import JSONDecodeError
from pathlib import Path


@dataclass(frozen=True)
class EvaluationResult:
# Raw scores in the original unit of the task.
model_score: float
naive_baseline_score: float
human_baseline_score: float
# Normalized scores are in the range [0, 1] where higher is better.
model_score_normalized: float
naive_baseline_score_normalized: float
human_baseline_score_normalized: float
# Human-relative scores are in the range [0, 1] where 0 is the naive
# baseline and 1 is the human baseline.
model_score_humanrelative: float


def grade_submission(log_dir: Path, task_name: str) -> EvaluationResult:
"""
Grades the submission in `log_dir` using a task-specific grading script.
"""

# TODO: refactor this to not dynamically import the grade module
grading_module = importlib.import_module(
f"evals.elsuite.hr_ml_agent_bench.benchmarks.{task_name}.scripts.grade"
)

# Get baselines
naive_baseline_score_raw = grading_module.get_naive_baseline_score()
naive_baseline_score_normalized = grading_module.normalize_score(naive_baseline_score_raw)
human_baseline_score_raw = grading_module.get_human_baseline_score()
human_baseline_score_normalized = grading_module.normalize_score(human_baseline_score_raw)

traces = list(log_dir.glob("**/trace.json"))

assert len(traces) == 1, f"Expected to find exactly one submission. Found {len(traces)}."

best_raw_score = naive_baseline_score_raw
best_normalized_score = naive_baseline_score_normalized

for trace in traces:
with open(trace) as f:
contents = f.read()

try:
data = json.loads(contents)
except JSONDecodeError:
continue

n_steps = len(data["steps"])

for step in range(n_steps):
submission_dir = trace.parent / "traces" / f"step_{step}_files"
raw_score = grading_module.get_score(submission_dir)
normalized_score = grading_module.normalize_score(raw_score)

if normalized_score > best_normalized_score:
best_raw_score = raw_score
best_normalized_score = normalized_score

# Calculate final human-relative score using normalized scores
model_score_humanrelative = (best_normalized_score - naive_baseline_score_normalized) / (
human_baseline_score_normalized - naive_baseline_score_normalized
)

return EvaluationResult(
model_score=best_raw_score,
naive_baseline_score=naive_baseline_score_raw,
human_baseline_score=human_baseline_score_raw,
model_score_normalized=best_normalized_score,
naive_baseline_score_normalized=naive_baseline_score_normalized,
human_baseline_score_normalized=human_baseline_score_normalized,
model_score_humanrelative=model_score_humanrelative,
)
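
To make the normalization concrete, here is an illustrative calculation with made-up scores; the task-specific grading scripts supply the real baselines.

# Illustrative human-relative score calculation (all numbers are made up).
naive_norm = 0.20   # normalized naive-baseline score
human_norm = 0.90   # normalized human-baseline score
model_norm = 0.55   # best normalized model score across all saved steps

# 0 means "no better than the naive baseline"; 1 means "matches the human baseline".
human_relative = (model_norm - naive_norm) / (human_norm - naive_norm)
print(human_relative)  # 0.5
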
214 changes: 214 additions & 0 deletions evals/elsuite/hr_ml_agent_bench/autoeval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
import json
import time
from dataclasses import dataclass, replace
from logging import getLogger
from pathlib import Path

from evals.elsuite.hr_ml_agent_bench.actions import get_action, is_valid_action
from evals.elsuite.hr_ml_agent_bench.auto_marking import EvaluationResult, grade_submission
from evals.elsuite.hr_ml_agent_bench.environment import Environment
from evals.elsuite.hr_ml_agent_bench.prompts import get_task_description
from evals.elsuite.hr_ml_agent_bench.schema import ActionInfo
from evals.solvers.solver import Solver
from evals.task_state import Message, TaskState

logger = getLogger(__name__)


@dataclass(frozen=True)
class Step:
step_idx: int
action: dict[str, str]
observation: str


@dataclass(frozen=True)
class TaskStateMetadata:
history_steps: tuple[Step, ...]
actions: dict[str, ActionInfo]
max_steps_in_context: int
max_retries: int
max_steps: int
log_dir: Path
env: Environment


@dataclass(frozen=True)
class FunctionCall:
name: str
args: dict[str, str]


def run(
solver: Solver,
task_name: str,
research_problem: str,
log_dir: Path,
work_dir: Path,
max_steps: int,
max_time: int,
max_seconds_per_step: int,
device: int = 0,
python_command: str = "python",
resume: bool = False,
resume_step: int = 0,
max_steps_in_context: int = 3,
max_retries: int = 5,
) -> EvaluationResult:
"""Evaluates the solver on a given task."""

env = Environment(
log_dir=log_dir / "env_log",
work_dir=work_dir / task_name,
task=task_name,
python_command=python_command,
resume=resume,
resume_step=resume_step,
device=device,
max_steps=max_steps,
max_time=max_time,
solver=solver,
)

task_description = get_task_description(research_problem)

logger.info(task_description)

messages = [
Message(
role="system",
content=f"You have a maximum of {max_steps} steps to solve the task. "
f"Each step is subject to a maximum time limit of {max_seconds_per_step} "
f"seconds. Additionally, your entire attempt is subject to a maximum "
f"time limit of {max_time} seconds.",
),
]

task_state = TaskState(
task_description=task_description,
messages=messages,
current_state=TaskStateMetadata(
history_steps=(),
actions=env.action_infos,
max_steps_in_context=max_steps_in_context,
max_retries=max_retries,
max_steps=max_steps,
log_dir=log_dir,
env=env,
),
)

start_time = time.time()

for step in range(max_steps):
time_elapsed = time.time() - start_time
time_remaining = max_time - time_elapsed

task_state = replace(
task_state,
messages=task_state.messages
+ [
Message(
role="system",
content=f"You have {time_remaining:.2f} seconds and {max_steps - step} steps remaining.",
),
],
)

result = solver(task_state)
new_messages = [Message(role="assistant", content=result.output)]

action = get_action(result.output)

if action is None or not is_valid_action(action):
logger.info(
f"Step {step}: [invalid response; no action executed]\n\nAssistant:\n{result.output}"
)

new_messages = task_state.messages + [
Message(
role="assistant",
content=result.output,
),
Message(
role="system",
content="No valid action found! The list of valid actions was specified at the start; please pick an action from that list.",
),
]

task_state = TaskState(
task_description=task_state.task_description,
messages=new_messages,
current_state=task_state.current_state,
)

continue

logger.info(f"\n\nAction: {action.name}\nAction Input: {json.dumps(action.args)}\n")

new_max_seconds_per_step = min(max_seconds_per_step, time_remaining)

        if attempted_to_use_stable_baselines(result.output):
            observation = (
                "The action has been terminated because it used the `stable_baselines` "
                "library, which is not allowed. Please write your implementation from scratch."
            )
        elif time_remaining > 0:
            observation = env.execute(action, max_seconds_per_step=new_max_seconds_per_step)
        else:
            observation = (
                "Time's up! Your attempt has now exceeded the maximum time limit "
                f"of {max_time} seconds. The last action attempted was not executed, "
                "and your current solution will be graded."
            )

new_messages = task_state.messages + [
Message(
role="assistant",
content=result.output,
),
Message(
role="system",
content=f"Observation:\n\n```\n{observation}\n```",
),
]

new_history_steps = task_state.current_state.history_steps + (
{
"step_idx": step,
"action": {
"Action": action.name,
"Action Input": json.dumps(action.args, indent=4),
},
"observation": observation,
},
)

new_task_state_metadata = replace(
task_state.current_state,
history_steps=new_history_steps,
)

task_state = TaskState(
task_description=task_state.task_description,
messages=new_messages,
current_state=new_task_state_metadata,
)

logger.info(f"\n\nObservation:\n```\n{observation}\n```\n")

env.save(step)

if env.is_done():
break

env.save("final")

result = grade_submission(log_dir=log_dir, task_name=task_name)

return result


def attempted_to_use_stable_baselines(s: str) -> bool:
s = s.lower() # be case-insensitive

if "stable" in s and "baseline" in s:
return True

return False
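
A quick illustration of the filter's behaviour (the example strings are invented):

# The check is a case-insensitive substring test, so both imports and prose mentions trigger it.
assert attempted_to_use_stable_baselines("import stable_baselines3 as sb3")
assert attempted_to_use_stable_baselines("I will use Stable Baselines for PPO.")
assert not attempted_to_use_stable_baselines("I will implement PPO from scratch.")
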
Empty file.