Self-Prompting eval #1401

Merged (4 commits) on Nov 15, 2023
4 changes: 4 additions & 0 deletions .gitignore
@@ -15,3 +15,7 @@ build

openai-key.txt
*.code-workspace

# Ignore run_experiments.sh results
evals/elsuite/**/logs/
evals/elsuite/**/outputs/
261 changes: 261 additions & 0 deletions evals/elsuite/self_prompting/eval.py
@@ -0,0 +1,261 @@
import json
import logging
import random
from pathlib import Path
from typing import Any, Optional, Union

import numpy as np

import evals
import evals.metrics
from evals.api import CompletionFn
from evals.elsuite.self_prompting.task_description import sample_in_token, task_description_template
from evals.eval import SolverEval
from evals.registry import registry
from evals.solvers.solver import Solver
from evals.task_state import TaskState
from evals.utils.log_utils import extract_final_results, extract_spec

logger = logging.getLogger(__name__)


class SelfPrompting(SolverEval):
def __init__(
self,
completion_fns: list[CompletionFn],
samples_jsonl: str,
tasker_models: list[str],
n_tasks: int = 50,
n_samples_per_task: int = 10,
n_preview_samples: int = 5,
baseline_logpath: Optional[str] = None,
*args,
**kwargs,
):
super().__init__(completion_fns, *args, **kwargs)
# CI doesn't have access to model APIs, so replace tasker_models with dummy models
# if we're running in CI (i.e. if the first completion_fn is a DummyCompletionFn)
if isinstance(completion_fns[0], evals.api.DummyCompletionFn):
tasker_models = ["dummy" for _ in tasker_models]

self.samples_jsonl = samples_jsonl
self.tasker_models = tasker_models
self.n_tasks = n_tasks
self.n_samples_per_task = n_samples_per_task
self.n_preview_samples = n_preview_samples
self.baseline_logpath = (
self._prefix_registry_path(baseline_logpath) if baseline_logpath else None
)
assert len(self.tasker_models) > 0, "Must provide at least one tasker model"
assert self.n_tasks > 0, "Must provide at least one task"
assert self.n_samples_per_task > 0, "Must provide at least one sample per task"

np.random.seed(self.seed)

self.tasker_completion_fns = {}
for tasker_model in self.tasker_models:
self.tasker_completion_fns[tasker_model] = registry.make_completion_fn(tasker_model)

def eval_sample(self, solver: Solver, sample: Any, rng: random.Random):
if sample["stage"] == "prompting":
return self._run_prompting(solver, sample)
elif sample["stage"] == "tasking":
return self._run_tasking(sample)
else:
raise ValueError(f"Invalid stage {sample['stage']}")

def _run_prompting(self, solver: Solver, sample: Any, *_):
# Prompt the prompter_model to generate a prompt for the tasker_model
task_description = task_description_template.format(
instruction=sample["task"]["instruction"],
samples=json.dumps(sample["task"]["train_samples"], indent=2),
tasker_model=sample["tasker_model"],
)
task_state = TaskState(
task_description=task_description,
current_state={
"instruction": sample["task"]["instruction"],
"samples": sample["task"]["train_samples"],
"tasker_model": sample["tasker_model"],
},
)
solver_result = solver(task_state)
model_instruction = solver_result.output

prompt_rule_violation = sample_in_token not in model_instruction

output = {
**sample,
"task_description": task_description,
"current_state": task_state.current_state,
"prompting_solver_metadata": solver_result.to_json(),
"model_instruction": model_instruction,
"prompt_rule_violation": prompt_rule_violation,
}
return output

def _run_tasking(self, sample: Any, *_):
tasker_completion_fn = self.tasker_completion_fns[sample["tasker_model"]]

if sample_in_token in sample["model_instruction"]:
# Fill in the sample input
full_prompt = sample["model_instruction"].replace(sample_in_token, sample["input"])
else:
# Append the sample input
full_prompt = f"{sample['model_instruction']}\n{sample['input']}"
tasker_output = tasker_completion_fn(full_prompt).get_completions()[0]

exact = 1 if tasker_output == sample["output"] else 0
fuzzy = 1 if tasker_output in sample["output"] or sample["output"] in tasker_output else 0

output = {
**sample,
"full_prompt": full_prompt,
"tasker_output": tasker_output,
"exact": exact,
"fuzzy": fuzzy,
}
evals.record.record_metrics(**output)
return output

def _calculate_improvement_wrt_baseline(
self, current_res: dict[str, float]
) -> dict[str, float]:
if self.baseline_logpath is None:
logger.warn("SKIPPING IMPROVEMENT METRICS. (No baseline logpath provided.)")
return {}

# Check that baseline was run on the same tasker models, tasks, and samples
baseline_spec = extract_spec(Path(self.baseline_logpath))
try:
spec_args = baseline_spec["run_config"]["eval_spec"]["args"]
except KeyError:
logger.warn("SKIPPING IMPROVEMENT METRICS. (Failed to validate baseline spec.)")
return {}
if set(spec_args["tasker_models"]) != set(self.tasker_models):
logger.warn(
f"SKIPPING IMPROVEMENT METRICS. (Baseline tasker_models {spec_args['tasker_models']} do not match {self.tasker_models}.)"
)
return {}
if (
spec_args["n_tasks"] != self.n_tasks
): # TODO: Ideally we would check that the tasks are the same
logger.warn(
f"SKIPPING IMPROVEMENT METRICS. (Baseline n_tasks {spec_args['n_tasks']} does not match {self.n_tasks}.)"
)
return {}
if spec_args["n_samples_per_task"] != self.n_samples_per_task:
logger.warn(
f"SKIPPING IMPROVEMENT METRICS. (Baseline n_samples_per_task {spec_args['n_samples_per_task']} does not match {self.n_samples_per_task}.)"
)
return {}

baseline_res = extract_final_results(Path(self.baseline_logpath))

def normalized_improvement(current, baseline):
"""
Returns a score between -1 and 1, where
-1 means the current score maximally regresses from the baseline (i.e. the current score is 0)
0 means the current score is the same as the baseline
+1 means the current score achieves max improvement over the baseline
"""
if current < baseline:
return (current - baseline) / baseline
else:
return (current - baseline) / (1 - baseline)

improvement_scores = {
"accuracy_improvement_wrt_oriprompt": normalized_improvement(
current_res["accuracy"], baseline_res["accuracy"]
),
"accuracy_fuzzy_improvement_wrt_oriprompt": normalized_improvement(
current_res["accuracy_fuzzy"], baseline_res["accuracy_fuzzy"]
),
"baseline_accuracy": baseline_res["accuracy"],
"baseline_accuracy_fuzzy": baseline_res["accuracy_fuzzy"],
}
logger.info(f"Improvement scores: {improvement_scores}")
return improvement_scores

def run(self, recorder: evals.record.Recorder) -> dict[str, Union[float, int]]:
samples = self.get_samples()

# Shuffle and limit samples
np.random.shuffle(samples)
samples_by_task = samples[: self.n_tasks]
assert len(samples_by_task) == self.n_tasks
for task in samples_by_task:
np.random.shuffle(task["test_samples"])
np.random.shuffle(task["train_samples"])
task["test_samples"] = task["test_samples"][: self.n_samples_per_task]
task["train_samples"] = task["train_samples"][: self.n_preview_samples]
assert len(task["test_samples"]) == self.n_samples_per_task
assert len(task["train_samples"]) == self.n_preview_samples

# Run prompting
prompting_samples = []
for task in samples_by_task:
for tasker_model in self.tasker_models:
prompting_samples.append(
{
"stage": "prompting",
"tasker_model": tasker_model,
"task": task,
}
)
assert len(prompting_samples) == len(self.tasker_models) * self.n_tasks
prompting_results = self.eval_all_samples(recorder, prompting_samples)

# Run tasking
tasking_samples = [] # Store in flattened list for parallel eval
for prompt_res in prompting_results:
prompt_res["stage"] = "tasking" # Update stage
for sample in prompt_res["task"]["test_samples"]:
tasking_samples.append(
{
**prompt_res,
"input": sample["input"],
"output": sample["output"],
}
)
assert len(tasking_samples) == len(prompting_results) * self.n_samples_per_task
self.eval_all_samples(recorder, tasking_samples)

# The score of a Prompter is the average score of all Tasker models it writes prompts for
metrics = recorder.get_metrics()

# Primary metrics
result = {
"accuracy": np.mean([metric["exact"] for metric in metrics]),
"accuracy_fuzzy": np.mean([metric["fuzzy"] for metric in metrics]),
}
# Relative improvement against baseline
improvement_scores = self._calculate_improvement_wrt_baseline(result)
if improvement_scores:
result.update(improvement_scores)

# Peripheral metrics
result.update(
{
"prompt_rule_violation_rate": np.mean(
[int(metric["prompt_rule_violation"]) for metric in metrics]
),
"n_samples": len(metrics),
}
)

# Breakdown by tasker model
def compute_mean_tasker(key, tasker_model):
return np.mean(
[metric[key] for metric in metrics if metric["tasker_model"] == tasker_model]
)

for tasker in self.tasker_models:
result.update(
{
f"accuracy_{tasker}": compute_mean_tasker("exact", tasker),
f"accuracy_fuzzy_{tasker}": compute_mean_tasker("fuzzy", tasker),
}
)

return result
58 changes: 58 additions & 0 deletions evals/elsuite/self_prompting/readme.md
@@ -0,0 +1,58 @@
# Eval description

How well can LMs write prompts for themselves to perform various tasks?

In the Self-Prompting eval, models (Prompters) write prompts for other models (Taskers) to perform various tasks, which are drawn from other evals in this repository (listed below). Prompters are given an initial human-written prompt for the task and asked to rewrite it for a given Tasker model. The effectiveness of a Prompter is measured by the accuracy of its downstream Taskers on the tasks. We measure this prompting ability against a variety of downstream models: gpt-3.5-turbo, gpt-4-base, and gpt-4.
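
To make the two-stage flow concrete, here is a minimal, self-contained Python sketch of the Prompter/Tasker loop. The function names and sample values are illustrative stand-ins, not the eval's actual API; the real logic lives in `evals/elsuite/self_prompting/eval.py`.

```python
# Toy sketch of the Prompter -> Tasker loop (illustrative only; see
# evals/elsuite/self_prompting/eval.py for the actual implementation).

def toy_prompter(instruction: str, tasker_model: str) -> str:
    # A real Prompter is a language model; here we just decorate the
    # human-written instruction for the target Tasker.
    return f"You are {tasker_model}. {instruction} Respond with the answer only."

def toy_tasker(prompt: str, task_input: str) -> str:
    # A real Tasker is a language model queried with the generated prompt;
    # here we return a canned answer so the sketch runs offline.
    return "Paris"

task = {
    "instruction": "Name the capital city of the given country.",
    "test_samples": [{"input": "France", "output": "Paris"}],
}

prompt = toy_prompter(task["instruction"], "gpt-3.5-turbo")
accuracy = sum(
    toy_tasker(prompt, s["input"]) == s["output"] for s in task["test_samples"]
) / len(task["test_samples"])
print(accuracy)  # -> 1.0
```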

The headline metric for a Prompter’s success is the mean accuracy of the predictions of all its Taskers on all tasks.
- For our primary metric `accuracy`, the accuracy score uses an exact-match criterion: a Tasker response is correct if and only if it exactly matches the true label in the dataset.
- As a secondary metric `accuracy_fuzzy`, we also compute results with a fuzzy-match criterion, which counts a response as correct if either the model response contains the label or the label contains the response (both criteria are sketched below).
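
These two criteria correspond directly to the `exact` and `fuzzy` fields recorded in `eval.py`; a minimal sketch of the match logic:

```python
def exact_match(response: str, label: str) -> int:
    # Correct only if the Tasker response equals the dataset label verbatim.
    return 1 if response == label else 0

def fuzzy_match(response: str, label: str) -> int:
    # Correct if the response contains the label, or the label contains the response.
    return 1 if response in label or label in response else 0

assert exact_match("Paris", "Paris") == 1
assert exact_match("The answer is Paris.", "Paris") == 0
assert fuzzy_match("The answer is Paris.", "Paris") == 1
```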

We also report `accuracy_improvement_wrt_oriprompt` and `accuracy_fuzzy_improvement_wrt_oriprompt`, which are accuracies normalized relative to the score of the original-prompt baseline. Each is a score between -1 and +1, where -1 means the current score maximally regresses from the baseline (i.e. the current score is 0), 0 means the current score equals the baseline, and +1 means the current score achieves the maximum possible improvement over the baseline. By default, the baseline score is a cached score of the original prompt (`self_prompting/oriprompt/baseline`) on the `self_prompting.full` eval.
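
This normalization is implemented as `normalized_improvement` in `eval.py`; a short worked example:

```python
def normalized_improvement(current: float, baseline: float) -> float:
    if current < baseline:
        # Regression: scale by how far the score could have fallen (down to 0).
        return (current - baseline) / baseline
    # Improvement: scale by the remaining headroom above the baseline (up to 1).
    return (current - baseline) / (1 - baseline)

# Improving from a baseline accuracy of 0.5 to 0.75 closes half of the headroom:
assert normalized_improvement(0.75, 0.5) == 0.5
# Dropping from 0.5 to 0.25 loses half of the baseline score:
assert normalized_improvement(0.25, 0.5) == -0.5
```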

# Usage

To run the eval, use the following command:
```bash
oaieval {solver} self_prompting
```
where `{solver}` is the name of the solver you want to evaluate, e.g. `self_prompting/chat_completion/gpt-4-32k`.

# Experiments
As a starting point for deeper exploration, we provide scripts for comparing various solvers and eval variants, as well as for plotting the results. To run these:
```bash
cd scripts/
bash run_experiments.sh
```

# Dataset

To form the self-prompting dataset, we extract tasks from this `evals` repository, selecting for datasets that have:
1. A system prompt that can be straightforwardly converted into a generic instruction for all task samples.
2. A straightforward input-output format for each task sample.
3. A design that can be evaluated with an exact-match criterion.

The full list of 50 evals we use can be found in `scripts/dataset/eval_list.py`.
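
Based on the fields accessed by `eval.py`, each task record carries an instruction plus train (preview) and test samples; the concrete values below are invented purely for illustration:

```python
# Hypothetical task record (field names follow eval.py; values are made up).
example_task = {
    "instruction": "Answer the question with a single word.",
    "train_samples": [  # shown to the Prompter as a preview of the task
        {"input": "What is the capital of France?", "output": "Paris"},
    ],
    "test_samples": [  # held out and answered by the Tasker
        {"input": "What is the capital of Japan?", "output": "Tokyo"},
    ],
}
```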

# Token estimate
Below, we present a rough estimate of the total number of tokens consumed by the eval, including both input and output tokens.

For self-prompting, each eval run queries multiple models. In the following table, we present the number of tokens consumed by Prompter models:

| Model | Solver type | Tokens |
|-------------------|-----------------|---------|
| code-davinci-002 | completion_hhh | 400 000 |
| gpt-4-base | completion_hhh | 360 000 |
| gpt-3.5-turbo-16k | chat_completion | 180 000 |
| gpt-4-32k | chat_completion | 155 000 |
| gpt-3.5-turbo-16k | cot | 480 000 |
| gpt-4-32k | cot | 420 000 |
| gpt-3.5-turbo-16k | cotexpert | 495 000 |
| gpt-4-32k | cotexpert | 450 000 |

In addition to the Prompter tokens, each run also queries multiple Tasker models. By default, we use gpt-3.5-turbo, gpt-4-base, and gpt-4, consuming an additional 100k-200k tokens per model.

To calculate dollar cost from token counts, please check the latest token pricing [here](https://openai.com/pricing). Note that we count input and output tokens together, so combining the totals above with the input and output prices gives a lower and an upper estimate of the cost of each variant.
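
As a rough guide, converting a token total into a dollar figure is a single multiplication; the price used below is a hypothetical placeholder, not a quoted rate:

```python
def estimate_cost_usd(total_tokens: int, usd_per_1k_tokens: float) -> float:
    # Rough estimate; look up the current per-token price before relying on this.
    return total_tokens / 1000 * usd_per_1k_tokens

# e.g. a 400,000-token run at a placeholder price of $0.01 per 1K tokens:
print(estimate_cost_usd(400_000, 0.01))  # ~= $4.00
```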

# Contribution statement
Eval design, implementation, and results evaluation were primarily conducted by Chan Jun Shern under the guidance of (alphabetically by last name) Steven Adler, James Aung, Rosie Campbell, and Jade Leung, who provided research input and project management support.