diff --git a/evals/elsuite/error_recovery/README.md b/evals/elsuite/error_recovery/README.md
new file mode 100644
index 0000000000..78f6ceb726
--- /dev/null
+++ b/evals/elsuite/error_recovery/README.md
@@ -0,0 +1,96 @@
+# Error Recovery
+
+This eval measures how well LMs can recover from reasoning errors made partway through solving a problem.
+
+## Usage
+
+Run with:
+
+```
+oaieval <solver> error-recovery
+```
+
+We suggest using `error_recovery/gpt-4-0613` or `error_recovery/gpt-3.5-turbo-0613` as default choices for `<solver>`.
+
+For more examples of running this eval, see `scripts/run_experiments.sh`.
+
+## Dataset
+
+The dataset consists of 1436 problems from [BIG-Bench Mistake](https://github.com/WHGTyen/BIG-Bench-Mistake), each with a reasoning trace that contains a mistake and arrives at an incorrect answer.
+
+## Evaluation Process
+
+For each example, we run the solver on three variations of the problem:
+
+1) No Reasoning (NR) variation - Give the solver the question and ask it to generate reasoning and an answer. Example below:
+```
+[SYSTEM]
+Solve the given problem, writing your reasoning along the way.
+
+[USER]
+Solve this problem for x: x = ((3 + 3) * 5) + 2
+```
+
+2) Correct Reasoning (CR) variation - Give the solver the question and some correct reasoning steps, and have it continue generating reasoning and an answer. Example below:
+
+```
+[SYSTEM]
+Solve the given problem, writing your reasoning along the way.
+
+[USER]
+Solve this problem for x: x = ((3 + 3) * 5) + 2
+
+[ASSISTANT]
+3 + 3 = 6
+```
+
+3) Incorrect Reasoning (IR) variation - Give the solver the question, the same correct reasoning steps as before, and an incorrect reasoning step, and have it continue generating reasoning and an answer. Example below:
+
+```
+[SYSTEM]
+Solve the given problem, writing your reasoning along the way.
+
+[USER]
+Solve this problem for x: x = ((3 + 3) * 5) + 2
+
+[ASSISTANT]
+3 + 3 = 6
+6 * 5 = 35
+```
+
+## Metrics
+
+The key metrics of this eval are:
+
+| Metric | Interpretation |
+| --- | --- |
+| `NR_correct_rate` | The accuracy of the solver in the NR variation. |
+| `CR_correct_rate` | The accuracy of the solver in the CR variation. |
+| `IR_correct_rate` | The accuracy of the solver in the IR variation. |
+
+## Variants
+
+| Variant | Notes |
+| --- | --- |
+| Default: `error-recovery.main` | Default setting. Includes all 1436 examples. |
+| `error-recovery.medium` | A balanced subset of 250 examples, 50 from each task. |
+| `error-recovery.small` | A balanced subset of 10 examples, 2 from each task (primarily for quick tests). |
+
+## Token Usage Estimates
+
+Below is a rough estimate of the total number of tokens consumed by the default variant:
+
+| Solver | Tokens |
+| --- | --- |
+| error_recovery/gpt-3.5-turbo-0613 | 4,750,781 |
+| error_recovery/gpt-4-0613 | 4,386,440 |
+
+## Version History
+
+- v0: Initial version released
+
+## Contribution statement
+
+Eval design, implementation, results evaluation, and report were primarily done by Ian McKenzie, based on an initial version by Oliver Jaffe. Work done under the guidance of (alphabetically by last name) Steven Adler, James Aung, and Chan Jun Shern, who provided research input, report revisions, and project management support.
diff --git a/evals/elsuite/error_recovery/defaults.py b/evals/elsuite/error_recovery/defaults.py
new file mode 100644
index 0000000000..ee5560a08d
--- /dev/null
+++ b/evals/elsuite/error_recovery/defaults.py
@@ -0,0 +1,15 @@
+DEFAULT_TASK_DESCRIPTION = "Solve the given problem, writing your reasoning along the way."
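+
+# Note: at answer-extraction time, eval.py builds the final-answer prompt by
+# concatenating DEFAULT_FINAL_ANSWER_MESSAGE (defined below) with the
+# task-specific suffix from TASK_SPECIFIC_EXTRACTION_INFO, roughly:
+#   DEFAULT_FINAL_ANSWER_MESSAGE + TASK_SPECIFIC_EXTRACTION_INFO[sample.task]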
+ +DEFAULT_MISTAKE_MESSAGE = "There might be a mistake in your reasoning." + +DEFAULT_FINAL_ANSWER_MESSAGE = ( + "Given this reasoning, write your final answer. Only write your final answer, and nothing else." +) + +TASK_SPECIFIC_EXTRACTION_INFO = { + "dyck_languages": "\n\nAnswer with just the end of the sequence, separated by spaces. Do not repeat the part of the sequence given in the question. Only write the sequence of symbols, nothing else.", + "logical_deduction": "\n\nAnswer with the selected single letter indicating your answer, wrapped with parentheses. Do not write anything else.", + "multistep_arithmetic": "\n\nAnswer with a single number.", + "tracking_shuffled_objects": "\n\nAnswer with the selected single letter indicating your answer, wrapped with parentheses. Do not write anything else.", + "word_sorting": "\n\nAnswer with the sorted words, each lower case and separated by a single space.", +} diff --git a/evals/elsuite/error_recovery/eval.py b/evals/elsuite/error_recovery/eval.py new file mode 100644 index 0000000000..89512179fe --- /dev/null +++ b/evals/elsuite/error_recovery/eval.py @@ -0,0 +1,284 @@ +import copy +import random +from dataclasses import dataclass +from typing import Any, List, Literal, Optional, Sequence + +import evals +import evals.metrics +import evals.record +from evals.api import CompletionFn +from evals.elsuite.error_recovery.defaults import ( + DEFAULT_FINAL_ANSWER_MESSAGE, + DEFAULT_MISTAKE_MESSAGE, + DEFAULT_TASK_DESCRIPTION, + TASK_SPECIFIC_EXTRACTION_INFO, +) +from evals.eval import SolverEval +from evals.solvers.solver import Solver +from evals.task_state import Message, TaskState + +# possible Mistake NOTIFiciation POSitions +MistakeNotifPos = Literal["immediate", "end"] + + +@dataclass +class Sample: + question: str + correct_steps: Sequence[str] + incorrect_step: str + target: Any + task: str + num_ground_truth_steps: int + mistake_index: int + + +class ErrorRecovery(SolverEval): + def __init__( + self, + completion_fns: Sequence[CompletionFn], + samples_jsonl: str, + n_samples: Optional[int] = None, + mistake_notification_position: Optional[MistakeNotifPos] = None, + mistake_notification_for_ir_only: bool = False, + mark_as_own_reasoning: bool = True, + final_answer_prompt_role: str = "system", + *args, + **kwargs, + ): + """Evaluate a solver on the error recovery task. + + Args: + completion_fns: The completion functions to evaluate. (should be a single solver) + samples_jsonl: The relative path to the samples jsonl file in evals/registry/data. + n_samples: The number of samples to use. If None, use all samples. + mistake_notification_position: The position of the mistake + notification. Options are "immediate" for right after the provided + reasoning, or "end" for right after the model-generated reasoning. + If None, no mistake notification is added. + mistake_notification_for_ir_only: Whether to only add the mistake notification + for the incorrect reasoning case. If True, the mistake notification is + added for the incorrect reasoning case, and not for the correct reasoning + or no reasoning cases. + mark_as_own_reasoning: Whether to include the sample reasoning as an + 'assistant' or 'user' message. + final_answer_prompt_role: The role to use for the final answer prompt. Should + be either "system" or "user". 
+ """ + super().__init__( + completion_fns=completion_fns, samples_jsonl=samples_jsonl, *args, **kwargs + ) + + self.n_samples = n_samples + self.mistake_notif_pos: Optional[MistakeNotifPos] = mistake_notification_position + self.mistake_notif_ir_only = mistake_notification_for_ir_only + + # there are some issues with passing bools in from extra_eval_params + assert isinstance(mark_as_own_reasoning, bool) + self.mark_as_own_reasoning = mark_as_own_reasoning + + self.final_answer_prompt_role = final_answer_prompt_role + assert self.final_answer_prompt_role in ["system", "user"] + + def eval_sample(self, solver: Solver, sample: Sample, rng: random.Random, extra_logging=None): + task = sample.task + + # Get the baseline with no provided reasoning + nr_task_state = self._get_no_reasoning_task_state(sample) + # only "end" makes sense for 'no reasoning' + nr_notif_pos = "end" if self.mistake_notif_pos == "end" else None + if self.mistake_notif_ir_only: + nr_notif_pos = None + + nr_answer = self._get_answer( + solver=solver, + task_state=nr_task_state, + sample=sample, + mistake_notif_pos=nr_notif_pos, + ) + + # Run with correct reasoning + cr_task_state = self._get_correct_reasoning_task_state(sample) + cr_notif_pos = self.mistake_notif_pos + if self.mistake_notif_ir_only: + cr_notif_pos = None + + cr_answer = self._get_answer( + solver=solver, + task_state=cr_task_state, + sample=sample, + mistake_notif_pos=cr_notif_pos, + ) + + # Run with incorrect reasoning + ir_task_state = self._get_incorrect_reasoning_task_state(sample) + ir_notif_pos = self.mistake_notif_pos + + ir_answer = self._get_answer( + solver=solver, + task_state=ir_task_state, + sample=sample, + mistake_notif_pos=ir_notif_pos, + ) + + assert len(sample.correct_steps) == sample.mistake_index + + metrics = { + "task": task, + "num_ground_truth_steps": sample.num_ground_truth_steps, + "mistake_index": sample.mistake_index, + "target": str(sample.target), # ground truth answer + "mistake_notification_position": self.mistake_notif_pos, + "mistake_notification_for_ir_only": self.mistake_notif_ir_only, + "NR_sampled": nr_answer, + "CR_sampled": cr_answer, + "IR_sampled": ir_answer, + "NR_correct": nr_answer == str(sample.target), + "CR_correct": cr_answer == str(sample.target), + "IR_correct": ir_answer == str(sample.target), + } + evals.record.record_metrics(**metrics) + + def _get_no_reasoning_task_state(self, sample: Sample) -> TaskState: + task_description = DEFAULT_TASK_DESCRIPTION + no_reasoning_messages = [ + Message(role="user", content=sample.question), + ] + no_reasoning_task_state = TaskState( + task_description=task_description, + messages=no_reasoning_messages, + ) + return no_reasoning_task_state + + def _get_correct_reasoning_task_state(self, sample: Sample) -> TaskState: + task_description = DEFAULT_TASK_DESCRIPTION + correct_steps = "\n".join(sample.correct_steps) + reasoning_role = "assistant" if self.mark_as_own_reasoning else "user" + correct_reasoning_messages = [ + Message(role="user", content=sample.question), + Message(role=reasoning_role, content=correct_steps), + ] + correct_reasoning_task_state = TaskState( + task_description=task_description, + messages=correct_reasoning_messages, + ) + return correct_reasoning_task_state + + def _get_incorrect_reasoning_task_state( + self, + sample: Sample, + ) -> TaskState: + task_description = DEFAULT_TASK_DESCRIPTION + correct_steps = "\n".join(sample.correct_steps) + steps_with_incorrect_reasoning = f"{correct_steps}\n{sample.incorrect_step}" + reasoning_role = 
"assistant" if self.mark_as_own_reasoning else "user" + incorrect_reasoning_messages = [ + Message(role="user", content=sample.question), + Message(role=reasoning_role, content=steps_with_incorrect_reasoning), + ] + + incorrect_reasoning_task_state = TaskState( + task_description=task_description, + messages=incorrect_reasoning_messages, + ) + return incorrect_reasoning_task_state + + def _get_answer( + self, + solver: Solver, + task_state: TaskState, + sample: Sample, + mistake_notif_pos: Optional[MistakeNotifPos], + ) -> str: + """Get a final answer from the solver for a given sample. + + Args: + solver: The solver to use. + task_state: The task state to use. + sample: The Sample being evaluated (relevant for answer extraction). + mistake_notification_position: The position of the mistake notification. + Options are "immediate" for right after the provided reasoning, or "end" for right + after the model-generated reasoning. If None, no mistake notification is added. + + TODO (ian): Work out whether to add mistake notification to 'no reasoning' baseline + """ + mistake_message = Message("user", DEFAULT_MISTAKE_MESSAGE) + if mistake_notif_pos == "immediate": + task_state.messages.append(mistake_message) + + output = solver(task_state=task_state).output + task_state.messages.append(Message("assistant", output)) + + # run solver again if mistake notification is at the end + if mistake_notif_pos == "end": + task_state.messages.append(mistake_message) + output = solver(task_state=task_state).output + task_state.messages.append(Message("assistant", output)) + + answer = self._extract_final_answer(solver=solver, task_state=task_state, sample=sample) + return answer + + def run(self, recorder: evals.record.Recorder): + samples = self.get_samples() + + self.eval_all_samples(recorder, samples) + metrics = recorder.get_metrics() + + NR_correct_rate = len([i for i in metrics if i["NR_correct"]]) / len(metrics) + CR_correct_rate = len([i for i in metrics if i["CR_correct"]]) / len(metrics) + IR_correct_rate = len([i for i in metrics if i["IR_correct"]]) / len(metrics) + + results = { + "NR_correct_rate": NR_correct_rate, + "CR_correct_rate": CR_correct_rate, + "IR_correct_rate": IR_correct_rate, + } + + # Split results per type of task + all_tasks = set([i["task"] for i in metrics]) + for task in all_tasks: + filtered_metrics = [i for i in metrics if i["task"] == task] + NR_correct_rate = len([i for i in filtered_metrics if i["NR_correct"]]) / len( + filtered_metrics + ) + CR_correct_rate = len([i for i in filtered_metrics if i["CR_correct"]]) / len( + filtered_metrics + ) + IR_correct_rate = len([i for i in filtered_metrics if i["IR_correct"]]) / len( + filtered_metrics + ) + + # we use hyphens in the task name so they can be extracted by splitting on underscores + task_string = task.replace("_", "-") + results.update( + { + f"task_{task_string}_NR_correct_rate": NR_correct_rate, + f"task_{task_string}_CR_correct_rate": CR_correct_rate, + f"task_{task_string}_IR_correct_rate": IR_correct_rate, + } + ) + + return results + + def _extract_final_answer(self, solver: Solver, task_state: TaskState, sample: Sample): + """Extract the final answer from the solver output using the same solver.""" + task_state = copy.deepcopy(task_state) + + task_specific_info = TASK_SPECIFIC_EXTRACTION_INFO[sample.task] + final_answer_prompt = DEFAULT_FINAL_ANSWER_MESSAGE + task_specific_info + + task_state.messages.append( + Message(role=self.final_answer_prompt_role, content=final_answer_prompt) + ) + answer = 
solver(task_state=task_state).output + + return answer + + def get_samples(self) -> List[Sample]: + samples = super().get_samples() + + if self.n_samples is not None: + assert ( + len(samples) >= self.n_samples + ), f"Can't get {self.n_samples} samples from a dataset with {len(samples)} samples" + samples = samples[: self.n_samples] + return [Sample(**sample_dict) for sample_dict in samples] diff --git a/evals/elsuite/error_recovery/scripts/dataset_creation.py b/evals/elsuite/error_recovery/scripts/dataset_creation.py new file mode 100644 index 0000000000..c6c14b2417 --- /dev/null +++ b/evals/elsuite/error_recovery/scripts/dataset_creation.py @@ -0,0 +1,156 @@ +import subprocess +from pathlib import Path + +import matplotlib.pyplot as plt +import pandas as pd + +TASK_PREFIX = { + "dyck_languages": ( + "Given the following sequence of opening and closing brackets, " + "provide the minimal sequence of additional brackets that would " + "balance the original sequence:\n" + ), + "logical_deduction": "", + "multistep_arithmetic": "", + "tracking_shuffled_objects": "", + "word_sorting": "Sort the following list of words alphabetically:\n", +} + + +def main(): + data = clone_and_load_data() + # plot_hist(data) + pos_data = create_positive_examples(data) + # don't use examples where last step is mistake + pos_data = pos_data[pos_data["mistake_index"] < pos_data["num_steps"] - 1] + + # only save a subset of the columns + pos_data = pos_data[ + ["input", "correct_steps", "incorrect_step", "mistake_index", "num_steps", "target", "task"] + ] + pos_data.rename( + columns={ + "input": "question", + "num_steps": "num_ground_truth_steps", + }, + inplace=True, + ) + + # save data + save_path = Path("evals/registry/data/error_recovery/main.jsonl") + pos_data.to_json(save_path, orient="records", lines=True) + + small_save_path = Path("evals/registry/data/error_recovery/small.jsonl") + # get small dataset with two examples from each task + small_data = create_data_subset(pos_data, examples_per_task=2) + small_data.to_json(small_save_path, orient="records", lines=True) + + medium_save_path = Path("evals/registry/data/error_recovery/medium.jsonl") + # get medium dataset with 50 examples from each task + medium_data = create_data_subset(pos_data, examples_per_task=50) + medium_data.to_json(medium_save_path, orient="records", lines=True) + + +def create_data_subset(data: pd.DataFrame, examples_per_task: int) -> pd.DataFrame: + # get small dataset with a subset of examples from each task + small_data = pd.DataFrame() + for task in data["task"].unique(): + task_data = data[data["task"] == task] + task_subset = task_data[:examples_per_task] + if len(task_subset) < examples_per_task: + raise ValueError( + f"Task {task} has only {len(task_subset)} examples, less than {examples_per_task}" + ) + small_data = pd.concat((small_data, task_subset)) + return small_data + + +def create_positive_examples(data: pd.DataFrame) -> pd.DataFrame: + has_incorrect_reasoning = ~data["mistake_index"].isnull() + has_incorrect_answer = data["target"] != data["answer"] + positive_condition = has_incorrect_reasoning & has_incorrect_answer + + positive_data = data.copy() + positive_data = positive_data[positive_condition].reset_index() + positive_data["label"] = "positive" + positive_data["correct_steps"] = positive_data.apply( + lambda row: row["steps"][: int(row["mistake_index"])], axis=1 + ) + positive_data["incorrect_step"] = positive_data.apply( + lambda row: row["steps"][int(row["mistake_index"])], axis=1 + ) + return positive_data + + 
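+
+# Illustrative sketch (not used by the pipeline): how `correct_steps` and
+# `incorrect_step` are sliced out of a BIG-Bench Mistake reasoning trace by
+# `mistake_index`, mirroring the apply() calls in create_positive_examples.
+# The row below is toy data assumed to follow the dataset's schema.
+def _example_split_steps():
+    row = {
+        "steps": ["3 + 3 = 6", "6 * 5 = 35", "35 + 2 = 37"],
+        "mistake_index": 1,  # the second step is the first mistake
+    }
+    correct_steps = row["steps"][: int(row["mistake_index"])]  # ["3 + 3 = 6"]
+    incorrect_step = row["steps"][int(row["mistake_index"])]  # "6 * 5 = 35"
+    return correct_steps, incorrect_step
+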
+def create_negative_examples(data: pd.DataFrame) -> pd.DataFrame: + """Create a dataset of examples with correct reasoning and answer. + + The 'negative' naming is a bit misleading, but these are the examples + we don't use. + TODO (ian): think about renaming + """ + has_correct_reasoning = data["mistake_index"].isnull() + has_correct_answer = data["target"] == data["answer"] + negative_condition = has_correct_reasoning & has_correct_answer + negative_data = data.copy() + negative_data = negative_data[negative_condition].reset_index() + negative_data["label"] = "negative" + negative_data["correct_steps"] = negative_data["steps"] + negative_data["incorrect_step"] = "" + return negative_data + + +def clone_and_load_data(): + clone_dir = Path("/tmp/BIG-Bench-Mistake") + maybe_clone_repo(clone_dir) + + data = pd.DataFrame() + for jsonl_file in clone_dir.glob("*.jsonl"): + file_data = pd.read_json(jsonl_file, lines=True) + + # Manually append task description to datasets missing one + task = jsonl_file.stem + prefix = TASK_PREFIX[task] + file_data["input"] = prefix + file_data["input"] + file_data["task"] = task + + data = pd.concat((data, file_data)) + + data["num_steps"] = data["steps"].apply(lambda x: len(x)) + return data + + +def maybe_clone_repo(clone_dir): + if not clone_dir.exists(): + subprocess.run( + ["git", "clone", "https://github.com/WHGTyen/BIG-Bench-Mistake.git", str(clone_dir)] + ) + + +def plot_hist(data): + data["num_steps"].hist(bins=max(data["num_steps"])) + plt.show() + + +def print_example(): + data = clone_and_load_data() + # printing some examples + subset_data = create_positive_examples(data) + # subset_data = create_negative_examples(data) + # # print one negative object swapping example + # neg_example = neg_data[neg_data["task"] == "tracking_shuffled_objects"].iloc[0] + # # print one negative dyck example + # neg_example = neg_data[neg_data["task"] == "dyck_languages"].iloc[0] + # neg_example = neg_data[neg_data["task"] == "logical_deduction"].iloc[0] + example = subset_data[subset_data["task"] == "multistep_arithmetic"].iloc[1] + print(f"INPUT ======\n{example['input']}") + steps = "\n".join(example["steps"]) + print(f"STEPS ======\n{steps}") + print(f"MISTAKE INDEX ======\n{example['mistake_index']}") + print(f"ANSWER ======\n{example['answer']}") + print(f"TARGET ======\n{example['target']}") + print("========") + + +if __name__ == "__main__": + main() diff --git a/evals/elsuite/error_recovery/scripts/make_plots.py b/evals/elsuite/error_recovery/scripts/make_plots.py new file mode 100644 index 0000000000..0d2dcfaa43 --- /dev/null +++ b/evals/elsuite/error_recovery/scripts/make_plots.py @@ -0,0 +1,597 @@ +import argparse +import os +from pathlib import Path +from typing import Optional + +import numpy as np +import pandas as pd +from matplotlib import pyplot as plt + +from evals.utils import log_utils + +# MODEL_NAMES = { +# "error_recovery/gpt-4-0613": "GPT-4", +# "generation/hhh/gpt-4-base": "GPT-4 Base", +# "error_recovery/gpt-3.5-turbo-0613": "GPT-3.5", +# # "gpt-4-base": "gpt-4-base", +# } +# using model checkpoint names +MODEL_NAMES = { + "error_recovery/gpt-4-0613": "gpt-4-0613", + "generation/hhh/gpt-4-base": "gpt-4-base", + "error_recovery/gpt-3.5-turbo-0613": "gpt-3.5-turbo-0613", + # "generation/direct/llama-2-13b-chat": "llama-2-13b-chat", + "generation/direct/llama-2-70b-chat": "llama-2-70b-chat", + "generation/direct/mixtral-8x7b-instruct": "mixtral-8x7b-instruct", + "generation/direct/gemini-pro": "gemini-pro-1.0", +} + +MODEL_COLOR_MAP = { + 
"error_recovery/gpt-4-0613": "purple", + "generation/hhh/gpt-4-base": "plum", + "error_recovery/gpt-3.5-turbo-0613": "g", + # "generation/direct/llama-2-13b-chat": "wheat", + "generation/direct/llama-2-70b-chat": "orange", + "generation/direct/mixtral-8x7b-instruct": "red", + "generation/direct/gemini-pro": "cornflowerblue", +} +VARIATION_NAMES = { + "nr_name": "From Scratch", + "cr_name": "Correct Basis", + "ir_name": "Incorrect Basis", +} + +VARIATION_COLOR_MAP = { + "nr_name": "blue", + "cr_name": "green", + "ir_name": "red", +} + +TASK_NAMES = { + "word_sorting": "Word Sorting", + "tracking_shuffled_objects": "Tracking Shuffled Objects", + "logical_deduction": "Logical Deduction", + "multistep_arithmetic": "Multi-Step Arithmetic", + "dyck_languages": "Dyck Languages", +} + + +def maybe_show(fig): + if DISPLAY: + fig.show() + plt.close(fig) + + +def extract_results(datadir: Path) -> pd.DataFrame: + df_rows = [] + for path, results in log_utils.get_final_results_from_dir(datadir).items(): + spec = log_utils.extract_spec(path) + model = spec["completion_fns"][0] + base_eval = spec["base_eval"] + df_rows.append( + { + "model": model, + "base_eval": base_eval, + **results, + } + ) + df = pd.DataFrame(df_rows) + return df + + +def extract_metrics(datadir: Path) -> pd.DataFrame: + df_rows = [] + for path, results in sorted(list(log_utils.get_final_results_from_dir(datadir).items())): + spec = log_utils.extract_spec(path) + solver = spec["completion_fns"][0] + for res in log_utils.extract_individual_results(path): + df_rows.append( + { + "solver": solver, + **res, + } + ) + df = pd.DataFrame(df_rows) + # Sort rows + # print(df.columns) + df.sort_values(by=["solver", "task"], inplace=True) + return df + + +def get_all_tasks(results_df: pd.DataFrame) -> list[str]: + # Find all types of tasks + all_tasks = [] + for i in results_df.columns: + if i.startswith("task_") and i.endswith("_CR_correct_rate"): + all_tasks.append(i) + + # Make ordering consistent + all_tasks.sort() + return all_tasks + + +def get_all_tasks_renamed(results_df: pd.DataFrame) -> list[str]: + all_tasks = get_all_tasks(results_df) + all_tasks_renamed = [i.split("task_")[1].split("_CR_correct_rate")[0] for i in all_tasks] + # replace hyphens with underscores + all_tasks_renamed = [i.replace("-", "_") for i in all_tasks_renamed] + return all_tasks_renamed + + +def get_unique_models(results_df: pd.DataFrame) -> list[str]: + models = results_df["model"].to_list() + # TODO: work out how to order a variable set of models + if set(models) == set(MODEL_NAMES.keys()): + unique_models = list(MODEL_NAMES.keys()) + else: + unique_models = sorted(list(set(models)), reverse=True) + return unique_models + + +def get_cleaned_model_name(model: str) -> str: + return model.replace("/", "_") + + +def corrects_to_accuracy_and_sem(corrects: pd.Series): + accuracy = corrects.mean() + sem = corrects.sem() + return accuracy, sem + + +def annotate_axes(ax, errors: Optional[pd.DataFrame]): + """Annotate each bar in the plot with its value""" + ABOVE_OFFSET = 0.01 + BELOW_OFFSET = 0.1 + if errors is not None: + # This gets it into a shape to match the order of the patch objects. + # I don't have a principled reason to transpose, this is just what works. 
+ error_values = errors.to_numpy().T.flatten() + + for i, p in enumerate(ax.patches): + # patch objects aren't typed correctly + p_height = p.get_height() # type: ignore + p_x = p.get_x() # type: ignore + p_width = p.get_width() # type: ignore + # Calculate the label position + x = p_x + p_width / 2 + if errors is not None: + error = error_values[i] + else: + error = 0 + + if p_height > 0: + y = p_height + error + ABOVE_OFFSET + else: + y = p_height - error - BELOW_OFFSET + + # Annotate the bar with its value + # ax.annotate(f"{p_height:.2f}\n±{error:.2f}", (x, y), ha="center", va="bottom") + ax.annotate(f"{p_height:.2f}", (x, y), ha="center", va="bottom") + + +def corrects_to_performance_loss_and_error(CR_corrects: pd.Series, IR_corrects: pd.Series): + CR_correct_rate = CR_corrects.mean() + IR_correct_rate = IR_corrects.mean() + + performance_recovered = IR_correct_rate / CR_correct_rate + performance_loss = 1 - performance_recovered + # propagate error from CR_corrects and IR_corrects to performance_loss + CR_correct_rate_sem = CR_corrects.sem() + IR_correct_rate_sem = IR_corrects.sem() + assert isinstance(CR_correct_rate_sem, float) + assert isinstance(IR_correct_rate_sem, float) + # using the formula for error propagation for a ratio from + # https://en.wikipedia.org/wiki/Propagation_of_uncertainty#Example_formulae + # (assuming errors in CR and IR are independent). + # NOTE: the 1 in performance_loss is a constant, + # so doesn't affect the uncertainty bounds on the ratio. + CR_term = (CR_correct_rate_sem / CR_correct_rate) ** 2 + IR_term = (IR_correct_rate_sem / IR_correct_rate) ** 2 + performance_loss_error = abs(performance_recovered) * ((CR_term + IR_term) ** 0.5) + print(f"Performance loss: {performance_loss:.2f} ± {performance_loss_error:.2f}") + return performance_loss, performance_loss_error + + +def accuracy_by_task(metrics_df, results_df: pd.DataFrame, out_dir: Path): + all_tasks = get_all_tasks(results_df) + unique_models = get_unique_models(results_df) + all_tasks_renamed = get_all_tasks_renamed(results_df) + + # Plot results separately for each model + for model in unique_models: + plot_accuracy_by_task(model, metrics_df, all_tasks, all_tasks_renamed, out_dir) + + +def accuracy_by_model_dfs(metrics_df, results_df: pd.DataFrame): + unique_models = get_unique_models(results_df) + accuracies = {} + sems = {} + for model in unique_models: + pass + # for all tasks + model_mask = metrics_df.solver == model + model_CR_corrects = metrics_df[model_mask]["CR_correct"] + model_IR_corrects = metrics_df[model_mask]["IR_correct"] + model_NR_corrects = metrics_df[model_mask]["NR_correct"] + + model_CR_accuracy, model_CR_sem = corrects_to_accuracy_and_sem(model_CR_corrects) + model_IR_accuracy, model_IR_sem = corrects_to_accuracy_and_sem(model_IR_corrects) + model_NR_accuracy, model_NR_sem = corrects_to_accuracy_and_sem(model_NR_corrects) + + pretty_model_name = MODEL_NAMES[model] + sems[pretty_model_name] = { + "nr_name": model_NR_sem, + "cr_name": model_CR_sem, + "ir_name": model_IR_sem, + } + accuracies[pretty_model_name] = { + "nr_name": model_NR_accuracy, + "cr_name": model_CR_accuracy, + "ir_name": model_IR_accuracy, + } + + order = ["nr_name", "cr_name", "ir_name"] + plot_df = pd.DataFrame(accuracies) + plot_df = plot_df.reindex(order) + sems_df = pd.DataFrame(sems) + sems_df = sems_df.reindex(order) + return plot_df, sems_df + + +def accuracy_by_model(metrics_df, results_df: pd.DataFrame, out_dir: Path): + unique_models = get_unique_models(results_df) + plot_df, sems_df = 
accuracy_by_model_dfs(metrics_df, results_df) + + fig, ax = plt.subplots(figsize=(12, 6), constrained_layout=True) + colors = [MODEL_COLOR_MAP[model] for model in unique_models] + plot_df.index = list(VARIATION_NAMES.values()) + sems_df.index = list(VARIATION_NAMES.values()) + ax = plot_df.plot.bar( + rot=0, + yerr=sems_df, + capsize=4, + ax=ax, + width=0.8, + color=colors, + ) + annotate_axes(ax, sems_df) + ax.set_ylim(top=1.0) + ax.set_xlabel("Reasoning variations") + ax.set_ylabel("Accuracy") + ax.set_title("Accuracy for each variation (higher is better)") + + outpath = os.path.join(out_dir, "accuracy_by_model.png") + fig.savefig(outpath) + maybe_show(fig) + + +def accuracy_by_model_and_reasoning( + own_metrics_df: pd.DataFrame, + own_results_df: pd.DataFrame, + other_metrics_df: pd.DataFrame, + other_results_df: pd.DataFrame, + out_dir: Path, +): + own_plot_df, own_sems_df = accuracy_by_model_dfs(own_metrics_df, own_results_df) + other_plot_df, other_sems_df = accuracy_by_model_dfs(other_metrics_df, other_results_df) + # drop the no reasoning baseline + own_plot_df = own_plot_df.drop("nr_name", axis=0) + own_sems_df = own_sems_df.drop("nr_name", axis=0) + other_plot_df = other_plot_df.drop("nr_name", axis=0) + other_sems_df = other_sems_df.drop("nr_name", axis=0) + + own_plot_df = own_plot_df.T + own_sems_df = own_sems_df.T + other_plot_df = other_plot_df.T + other_sems_df = other_sems_df.T + models = own_plot_df.index # e.g., ["No reasoning (baseline)", "Correct reasoning", ...] + n_models = len(models) + bar_width = 0.35 # width of the bars + n_variations = len(own_plot_df.columns) + assert n_variations == len(other_plot_df.columns) + group_width = 0.8 # Total width for one group of bars + bar_width = group_width / (n_variations * 2) # Width of one bar + + # Create figure and axis + fig, ax = plt.subplots(figsize=(12, 8), constrained_layout=True) + + # Set position of bar on X axis + ind = np.arange(n_models) # the x locations for the groups + + colors = [VARIATION_COLOR_MAP[variation] for variation in own_plot_df.columns] + VARIATION_OFFSET = 0.03 + for i, variation in enumerate(own_plot_df.columns): + # Position of bars for this model + # bars for a given model are grouped together, and then within that group, the bars for each variation are grouped + r1 = ind + i * VARIATION_OFFSET + i * (n_variations * bar_width) + r2 = [x + bar_width for x in r1] + + ax.bar( + r1, + own_plot_df[variation], + width=bar_width, + yerr=own_sems_df[variation], + capsize=5, + label=f"{VARIATION_NAMES[variation]} ('assistant' message)", + color=colors[i], + # add outline to bars + edgecolor="black", + ) + ax.bar( + r2, + other_plot_df[variation], + width=bar_width, + yerr=other_sems_df[variation], + capsize=5, + label=f"{VARIATION_NAMES[variation]} ('user' message)", + hatch="//", + color=colors[i], + edgecolor="black", + ) + + for j, model in enumerate(models): + x_own = r1[j] + x_other = r2[j] + y1 = own_plot_df.loc[model, variation] + y2 = other_plot_df.loc[model, variation] + y1_err = own_sems_df.loc[model, variation] + y2_err = other_sems_df.loc[model, variation] + ax.text(x_own, y1 + y1_err, f"{y1:.2f}", ha="center", va="bottom") + ax.text(x_other, y2 + y2_err, f"{y2:.2f}", ha="center", va="bottom") + + # Add xticks on the middle of the group bars + xtick_positions = ind + bar_width * n_variations + (VARIATION_OFFSET - bar_width) / 2 + ax.set_xticks(xtick_positions) + ax.set_xticklabels(models) + + # Create legend & Show graphic + ax.set_xlabel("Model") + ax.set_ylabel("Accuracy") + 
ax.set_ylim(top=1.0) + ax.legend() + ax.set_title("Accuracy for each variation (higher is better)") + outpath = os.path.join(out_dir, "accuracy_by_category_and_reasoning.png") + fig.savefig(outpath) + maybe_show(fig) + + +def plot_accuracy_by_steps_all(metrics_df, results_df, out_dir): + """ + Create plots of accuracy of: + - num_steps - mistake_index + - mistake_index / num_steps + """ + get_all_tasks(results_df) + all_tasks_renamed = get_all_tasks_renamed(results_df) + all_models = get_unique_models(results_df) + # one plot per task, one subplot per model + for task in all_tasks_renamed: + fig, axs = plt.subplots( + 1, len(all_models), figsize=(15, 6), constrained_layout=True, squeeze=False + ) + axs = axs.flatten() + for ax, model in zip(axs, all_models): + task_model_df = metrics_df[(metrics_df.solver == model) & (metrics_df.task == task)] + plot_accuracy_by_steps(task_model_df, task, model, ax) + # only put legend on last plot + final_ax = axs[-1] + final_ax.legend(loc="upper center") + outpath = os.path.join(out_dir, f"results-split-by-steps_{task}.png") + fig.suptitle(f"Accuracy by steps for {TASK_NAMES[task]} (higher is better)") + fig.savefig(outpath) + maybe_show(fig) + + +def plot_accuracy_by_steps(df, task, model, ax): + df["steps_diff"] = df["num_ground_truth_steps"] - df["mistake_index"] + + # due to the way pandas works, we have to group, then filter, then regroup + grouped_df = df.groupby("steps_diff") + + MIN_SAMPLES = 10 + filtered_groups = grouped_df.filter(lambda x: len(x) >= MIN_SAMPLES) + + # Now, re-group the filtered DataFrame by 'steps_diff' again and calculate the mean + plot_df = filtered_groups.groupby("steps_diff")[ + ["NR_correct", "CR_correct", "IR_correct"] + ].mean() + colors = [VARIATION_COLOR_MAP[variation] for variation in VARIATION_NAMES.keys()] + + # change the names of the columns to be more readable before plotting + plot_df.columns = list(VARIATION_NAMES.values()) + # now plot the three accuracies against steps_diff + assert isinstance(plot_df, pd.DataFrame) + ax = plot_df.plot(color=colors, ax=ax, legend=False) + ax.set_xlabel("Steps beyond mistake") + ax.set_ylabel("Accuracy") + ax.set_ylim(0, 1.1) + # ax.set_title(f"{MODEL_NAMES[model]} | {TASK_NAMES[task]} (higher is better)") + ax.set_title(f"{MODEL_NAMES[model]}") + # plt.tight_layout() + return ax + + +def plot_accuracy_by_task(model, metrics_df, all_tasks, all_tasks_renamed, out_dir): + all_tasks_pretty = [TASK_NAMES[i] for i in all_tasks_renamed] + accuracies = {"nr_name": [], "cr_name": [], "ir_name": []} + all_sems = [] + # for all tasks + model_mask = metrics_df.solver == model + + # and split by task type + for task in all_tasks_renamed: + + task_mask = metrics_df.task == task + CR_corrects = metrics_df[model_mask & task_mask]["CR_correct"] + IR_corrects = metrics_df[model_mask & task_mask]["IR_correct"] + NR_corrects = metrics_df[model_mask & task_mask]["NR_correct"] + + CR_accuracy, CR_sem = corrects_to_accuracy_and_sem(CR_corrects) + IR_accuracy, IR_sem = corrects_to_accuracy_and_sem(IR_corrects) + NR_accuracy, NR_sem = corrects_to_accuracy_and_sem(NR_corrects) + + accuracies["nr_name"].append(NR_accuracy) + accuracies["cr_name"].append(CR_accuracy) + accuracies["ir_name"].append(IR_accuracy) + + sems = [NR_sem, CR_sem, IR_sem] + all_sems.append(sems) + + sems_df = pd.DataFrame( + all_sems, + index=all_tasks_pretty, + columns=["nr_name", "cr_name", "ir_name"], + ) + + plot_df = pd.DataFrame(accuracies, index=all_tasks_pretty) + + fig, ax = plt.subplots(figsize=(15, 6), 
constrained_layout=True) + colors = [VARIATION_COLOR_MAP[variation] for variation in plot_df.columns] + plot_df.columns = list(VARIATION_NAMES.values()) + ax = plot_df.plot.bar(rot=0, color=colors, yerr=sems_df, capsize=4, ax=ax, width=0.8) + annotate_axes(ax, sems_df) + + # Shrink current axis by 20% to make room for the legend + box = ax.get_position() + ax.set_position((box.x0, box.y0, box.width * 0.8, box.height)) + # Place the legend outside the plot + ax.legend(loc="center left", bbox_to_anchor=(1, 0.5)) + ax.set_ylim(top=1.1) + ax.set_xlabel("Task type") + ax.set_ylabel("Accuracy") + ax.set_title(f"{MODEL_NAMES[model]} (higher is better)") + outpath = os.path.join(out_dir, f"results-split-by-task_{get_cleaned_model_name(model)}.png") + fig.savefig(outpath) + maybe_show(fig) + + +def performance_loss_per_task(metrics_df: pd.DataFrame, results_df: pd.DataFrame, out_dir: Path): + # Plot performance lost for each model + unique_models = get_unique_models(results_df) + get_all_tasks(results_df) + all_tasks_renamed = get_all_tasks_renamed(results_df) + all_tasks_pretty = [TASK_NAMES[i] for i in all_tasks_renamed] + + all_metrics = {} + all_errors = {} + for model in unique_models: + metrics = [] + errors = [] + for task in all_tasks_renamed: + model_mask = metrics_df.solver == model + task_mask = metrics_df.task == task + CR_corrects = metrics_df[model_mask & task_mask]["CR_correct"] + IR_corrects = metrics_df[model_mask & task_mask]["IR_correct"] + + performance_loss, performance_loss_error = corrects_to_performance_loss_and_error( + CR_corrects, IR_corrects + ) + metrics.append(performance_loss) + errors.append(performance_loss_error) + + pretty_model_name = MODEL_NAMES[model] + all_metrics[pretty_model_name] = metrics + all_errors[pretty_model_name] = errors + + fig, ax = plt.subplots(figsize=(20, 6), constrained_layout=True) + plot_df = pd.DataFrame(all_metrics, index=all_tasks_pretty) + errs_df = pd.DataFrame(all_errors, index=all_tasks_pretty) + colors = [MODEL_COLOR_MAP[model] for model in unique_models] + ax = plot_df.plot.bar(rot=0.0, color=colors, ax=ax, width=0.8, yerr=errs_df, capsize=4) + annotate_axes(ax, errs_df) + # Shrink current axis by 20% to make room for the legend + box = ax.get_position() + ax.set_position((box.x0, box.y0, box.width * 0.8, box.height)) + ax.set_ylim(bottom=-1, top=1.1) + ax.legend() + ax.axhline(0, 0, 1, color="black", linestyle="-") + ax.set_title("Performance loss per task (lower is better)") + ax.set_xlabel("Task type") + ax.set_ylabel("Performance loss") + + outpath = os.path.join(out_dir, "results_split_by_model.png") + fig.savefig(outpath) + maybe_show(fig) + + +def performance_loss_per_model(metrics_df: pd.DataFrame, results_df: pd.DataFrame, out_dir: Path): + unique_models = get_unique_models(results_df) + + metrics = {} + errors = {} + for model in unique_models: + model_mask = metrics_df.solver == model + + CR_corrects = metrics_df[model_mask]["CR_correct"] + IR_corrects = metrics_df[model_mask]["IR_correct"] + + performance_loss, performance_loss_error = corrects_to_performance_loss_and_error( + CR_corrects, IR_corrects + ) + + pretty_model_name = MODEL_NAMES[model] + metrics[pretty_model_name] = performance_loss + errors[pretty_model_name] = performance_loss_error + + fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True) + plot_df = pd.DataFrame(metrics, index=[0]) + errs_df = pd.DataFrame(errors, index=[0]) + colors = [MODEL_COLOR_MAP[model] for model in unique_models] + ax = plot_df.plot.bar(rot=0, color=colors, ax=ax, 
width=0.8, yerr=errs_df, capsize=4) + annotate_axes(ax, errs_df) + # Shrink current axis by 20% to make room for the legend + box = ax.get_position() + ax.set_position((box.x0, box.y0, box.width * 0.8, box.height)) + # Place the legend outside the plot + ax.legend(loc="center left", bbox_to_anchor=(1, 0.5)) + ax.set_xticklabels([]) + ax.set_xticks([]) + ax.set_ylabel("Performance loss") + ax.set_ylim(top=1.1) + ax.set_title("Average performance loss per model (lower is better)") + outpath = os.path.join(out_dir, "headline_results.png") + fig.savefig(outpath) + maybe_show(fig) + + +def main(): + parser = argparse.ArgumentParser() + # DEBUG: hacking together own_reasoning and other_reasoning plots + parser.add_argument( + "--log_dir", + "-d", + type=str, + required=True, + help="Path to log dir with primary results (if supplementary_dir is provided, this is should be 'own' reasoning)", + ) + parser.add_argument( + "--supplementary_dir", + "-s", + type=str, + help="Optional supplementary log dir with 'other' reasoning results", + ) + parser.add_argument("--out_dir", "-o", type=str, required=True) + args = parser.parse_args() + log_dir = Path(args.log_dir) + out_dir = Path(args.out_dir) + out_dir.mkdir(exist_ok=True, parents=True) + + metrics_df = extract_metrics(log_dir) + results_df = extract_results(log_dir) + if args.supplementary_dir: + other_log_dir = Path(args.supplementary_dir) + other_metrics_df = extract_metrics(other_log_dir) + other_results_df = extract_results(other_log_dir) + accuracy_by_model_and_reasoning( + metrics_df, results_df, other_metrics_df, other_results_df, out_dir + ) + accuracy_by_task(metrics_df, results_df, out_dir) + accuracy_by_model(metrics_df, results_df, out_dir) + performance_loss_per_task(metrics_df, results_df, out_dir) + performance_loss_per_model(metrics_df, results_df, out_dir) + plot_accuracy_by_steps_all(metrics_df, results_df, out_dir) + + +if __name__ == "__main__": + DISPLAY = False + main() diff --git a/evals/elsuite/error_recovery/scripts/run_experiments.sh b/evals/elsuite/error_recovery/scripts/run_experiments.sh new file mode 100755 index 0000000000..36f51faad4 --- /dev/null +++ b/evals/elsuite/error_recovery/scripts/run_experiments.sh @@ -0,0 +1,44 @@ +#!/bin/bash +logdir=./logs +outdir=./outputs + +timestamp=$(date +%Y%m%d_%H%M%S) +logpathbase=$logdir/$timestamp +outpathbase=$outdir/$timestamp +SPLIT=main + +mkdir -p ${logpathbase} + +export EVALS_THREADS=250 +echo Running full experiments and logging to $logpathbase + +declare -a SOLVERS=( + error_recovery/gpt-3.5-turbo-0613 + error_recovery/gpt-4-0613 + generation/hhh/gpt-4-base +) + +# OWN REASONING VARIANT +for solver in "${SOLVERS[@]}" +do + log_name=${SPLIT}_${solver//\//-}_own-reasoning + + oaieval $solver error-recovery.$SPLIT \ + --extra_eval_params final_answer_prompt_role=system \ + --record_path "$logpathbase/$log_name.log" +done + +# OTHER REASONING VARIANT +for solver in "${SOLVERS[@]}" +do + log_name=${SPLIT}_${solver//\//-}_other-reasoning + + oaieval $solver error-recovery.$SPLIT.other-reasoning \ + --extra_eval_params final_answer_prompt_role=system \ + --record_path "$logpathbase/$log_name.log" +done + +echo Producing plots, outputs to $outpathbase + +mkdir -p ${outpathbase} +python make_plots.py --log_dir ${logpathbase} --out_dir $outpathbase diff --git a/evals/registry/data/error_recovery/main.jsonl b/evals/registry/data/error_recovery/main.jsonl new file mode 100644 index 0000000000..77835457c7 --- /dev/null +++ b/evals/registry/data/error_recovery/main.jsonl @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fda8fddd6a63d6b84ee4b6a8934bcedcada67e3fcd5df64041f14c04d774be3 +size 1543818 diff --git a/evals/registry/data/error_recovery/medium.jsonl b/evals/registry/data/error_recovery/medium.jsonl new file mode 100644 index 0000000000..77b989dee3 --- /dev/null +++ b/evals/registry/data/error_recovery/medium.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5c591504d282ca7763d7abe407958da1ea06d6dc62be4808ba4fa97ff5f3cb2 +size 280075 diff --git a/evals/registry/data/error_recovery/small.jsonl b/evals/registry/data/error_recovery/small.jsonl new file mode 100644 index 0000000000..64172d3d10 --- /dev/null +++ b/evals/registry/data/error_recovery/small.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e55b1af640b26eff5661c83c7ff6bf52040ea062c9a71ba16069e2305fdb362 +size 10191 diff --git a/evals/registry/evals/error_recovery.yaml b/evals/registry/evals/error_recovery.yaml new file mode 100644 index 0000000000..f42e0e9243 --- /dev/null +++ b/evals/registry/evals/error_recovery.yaml @@ -0,0 +1,36 @@ +error-recovery: + id: error-recovery.main + metrics: [accuracy] + description: TODO + +error-recovery.main: + class: evals.elsuite.error_recovery.eval:ErrorRecovery + args: + samples_jsonl: error_recovery/main.jsonl + +error-recovery.small: + class: evals.elsuite.error_recovery.eval:ErrorRecovery + args: + samples_jsonl: error_recovery/small.jsonl + +error-recovery.medium: + class: evals.elsuite.error_recovery.eval:ErrorRecovery + args: + samples_jsonl: error_recovery/medium.jsonl + +# --- mark reasoning as 'user' variant --- +error-recovery.main.other-reasoning: + class: evals.elsuite.error_recovery.eval:ErrorRecovery + args: + samples_jsonl: error_recovery/main.jsonl + mark_as_own_reasoning: False +error-recovery.small.other-reasoning: + class: evals.elsuite.error_recovery.eval:ErrorRecovery + args: + samples_jsonl: error_recovery/small.jsonl + mark_as_own_reasoning: False +error-recovery.medium.other-reasoning: + class: evals.elsuite.error_recovery.eval:ErrorRecovery + args: + samples_jsonl: error_recovery/medium.jsonl + mark_as_own_reasoning: False diff --git a/evals/registry/solvers/error_recovery.yaml b/evals/registry/solvers/error_recovery.yaml new file mode 100644 index 0000000000..bef801549e --- /dev/null +++ b/evals/registry/solvers/error_recovery.yaml @@ -0,0 +1,38 @@ +# TODO: use default solvers once they are versioned +error_recovery/gpt-3.5-turbo-0613: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo-0613 + +error_recovery/gpt-4-0613: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-0613 + +error_recovery/default/gpt-4-base: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 1 + max_tokens: 512 + +# solver that continues the previous message +error_recovery/continue/gpt-4-base: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + continue_last_assistant_msg: True + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 1 + max_tokens: 512
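+
+# Illustrative sketch (assumption, not part of the experiments above): an
+# additional chat-model solver for this eval would follow the same OpenAISolver
+# pattern; "<model-name>" below is a placeholder, not a tested configuration.
+# error_recovery/<model-name>:
+#   class: evals.solvers.openai_solver:OpenAISolver
+#   args:
+#     completion_fn_options:
+#       model: <model-name>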