diff --git a/LICENSE.md b/LICENSE.md index 388bda13d5..405cc0eb38 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -108,5 +108,12 @@ NOTE: This license applies to all parts of this repository except for the datase - **License**: Creative Commons Attribution 4.0 International: https://creativecommons.org/licenses/by/4.0/ - **Source**: https://allenai.org/data/socialiqa +#### Already Said That -Please note: While efforts have been made to accurately represent the licenses associated with each dataset, users should consult the original source of the dataset to ensure compliance with any licensing terms and conditions. +- **Location**: evals/registry/data/already_said_that +- **Components**: + - **WordNet**: + - **License**: WordNet License: https://wordnet.princeton.edu/license-and-commercial-use + - **Source**: https://wordnet.princeton.edu/ + +Please note: While efforts have been made to accurately represent the licenses associated with each dataset, users should consult the original source of the dataset to ensure compliance with any licensing terms and conditions. \ No newline at end of file diff --git a/evals/elsuite/already_said_that/README.md b/evals/elsuite/already_said_that/README.md new file mode 100644 index 0000000000..bdb5274b1e --- /dev/null +++ b/evals/elsuite/already_said_that/README.md @@ -0,0 +1,185 @@ +# Already Said That
+
+This eval measures how robust models are to distractors when performing
+sequential tasks. We construct a toy task where the model needs to determine
+whether it has already seen a given word, and we inject distractor questions into
+the interaction, keeping track of model performance throughout.
+
+## Usage
+
+Run with:
+
+```bash
+oaieval <solver> already_said_that
+```
+
+We have found that `generation/direct/gpt-4-0125-preview` works well on this
+eval. For more examples of tested solvers, see
+[`./scripts/run_experiments.sh`](./scripts/run_experiments.sh).
+
+## Dataset
+
+The dataset consists of 500 samples, where each sample contains 100 unique words
+randomly sampled from the [WordNet corpus](https://wordnet.princeton.edu/) via
+the `nltk` library.
+
+We also rely on four sets of distractor questions, sourced directly from the
+datasets of pre-existing evals. Specifically, we make use of the datasets of the
+following evals from our evals registry:
+
+- [`which-is-heavier`](../../registry/evals/which-is-heavier.yaml)
+- [`first-letters`](../../registry/evals/first-letters.yaml)
+- [`ambiguous-sentences`](../../registry/evals/ambiguous-sentences.yaml)
+- [`reverse-sort-words-eng`](../../registry/evals/reverse-sort-words-eng.yaml)
+
+## Evaluation Process
+
+The evaluation process is as follows for a given sample from our dataset:
+
+1. The `TASK_DESCRIPTION` prompt is shown to the solver.
+2. For 100 turns, we either show a word to the solver or a distractor question,
+   with probability 2/3 and 1/3 respectively.
+3. If a word is shown, we prefix it with `MAIN TASK -`, to indicate that we are
+   asking the solver to perform the main task of determining whether it has seen
+   the word before.
+4. When showing a word, we randomly show previously seen words with a
+   probability of 1/2 and new words with a probability of 1/2.
+5. If we show a distractor question, we directly show the question to the
+   solver.
+6. The solver should respond with its answer wrapped in the format
+   `[answer: <answer>]`.
+7. The solver's response is parsed and compared to the correct answer (a minimal
+   sketch of the parsing logic is included at the end of this README).
+8. 
If, on a main-task turn, the solver's response is incorrect or a violation is
+   raised (i.e. the answer is not given in the expected format), we stop the
+   interaction and record the number of turns the solver lasted. Otherwise we
+   continue to the next turn. Mistakes on distractor questions do not end the
+   interaction; they are instead tracked for the distractor-accuracy metric.
+
+## Prompts
+
+We refer readers to [`./prompts.py`](./prompts.py) for the `TASK_DESCRIPTION`
+used in the eval.
+
+We refer readers to [`./distractors.py`](./distractors.py) for any cosmetic
+changes we make to the distractor questions.
+
+## Metrics
+
+Below are the metrics returned by the eval:
+
+| **Metric** | **Notes** |
+|---------------------------|-----------------------------------------------------------------------------------------------------------|
+| `avg_num_turns` | The average number of turns shown before the model fails across the samples. Higher is better. Best possible is 100. |
+| `stddev_num_turns` | The standard deviation on the above. |
+| `median_num_turns` | The median number of turns shown before the model fails across the samples. Higher is better. Best possible is 100. |
+| `max_num_turns` | The maximum number of turns shown before the model fails across the samples. |
+| `min_num_turns` | The minimum number of turns shown before the model fails across the samples. |
+| `false_positive_rate` | How often the model answers “yes” when it should have answered “no” (i.e. a new word is shown, and the model claims to have seen it already). |
+| `false_negative_rate` | How often the model answers “no” when it should have answered “yes” (i.e. a word is shown again, and the model claims to not have seen it). |
+| `avg_distractor_accuracy` | For a given sample interaction, we measure whether each model response to a distractor question is accurate, compute the accuracy over the distractor questions shown during that interaction, and then average this accuracy across all samples. |
+| `violation_rate` | How often the model responds in an invalid format, i.e. not using the `[answer: <answer>]` format. |
+| `avg_num_distractors` | The average number of distractors shown before the model fails across the samples. Higher is better. Best possible is around 33. |
+| `stddev_num_distractors` | The standard deviation on the above. |
+| `median_num_distractors` | The median number of distractors shown before the model fails across the samples. Higher is better. Best possible is around 33. |
+| `max_num_distractors` | The maximum number of distractors shown before the model fails across the samples. |
+| `min_num_distractors` | The minimum number of distractors shown before the model fails across the samples. |
+
+## Variants
+
+We consider each of the four distractor datasets mentioned in
+[Dataset](#dataset) as a variant of the eval.
+
+```bash
+oaieval <solver> already_said_that.<distractor_variant>
+```
+
+We also have a `distractorless` variant where we only show words to the solver.
+We use this as a baseline to determine how robust the solver is to distractors.
+
+```bash
+oaieval <solver> already_said_that.distractorless
+```
+
+## Custom Solvers
+
+We implement two custom solvers for this eval in [./solvers.py](./solvers.py):
+
+1. `RandomBaselineSolver`: A solver that randomly answers `yes` or `no` for any
+   input. We view this baseline as equivalent to randomly guessing.
+2. 
`AlreadySaidThatHuman`: A helper solver class that wraps the `HumanCliSolver` + class such that users do not have to wrap their answer in the + `[answer: ]` format and can instead just directly type the answer. + +## Token Usage Estimates + +Below are approximate token usage estimates for a given run (one run = all +samples) of the eval, for each of the distractor variants. + +For Direct gpt-4-0125-preview: + +| Distractor variant | Input | Output | Total | +| --------------------- | ---------- | ------- | ---------- | +| which-is-heavier | 17,960,000 | 80,000 | 18,040,000 | +| ambiguous-sentences | 27,750,000 | 110,000 | 27,860,000 | +| first-letters | 19,850,000 | 80,000 | 19,940,000 | +| reverse-sort-words-en | 10,700,000 | 120,000 | 10,820,000 | +| distractorless | 27,550,000 | 120,000 | 27,680,000 | + +For Direct gpt-3.5-turbo-0125: + +| Distractor variant | Input | Output | Total | +| --------------------- | --------- | ------ | --------- | +| which-is-heavier | 1,200,000 | 10,000 | 1,210,000 | +| ambiguous-sentences | 1,540,000 | 20,000 | 1,550,000 | +| first-letters | 2,120,000 | 20,000 | 2,140,000 | +| reverse-sort-words-en | 910,000 | 20,000 | 940,000 | +| distractorless | 1,250,000 | 20,000 | 1,270,000 | + +For Direct gpt-4-base: + +| Distractor variant | Input | Output | Total | +| --------------------- | ---------- | --------- | ---------- | +| which-is-heavier | 16,950,000 | 3,670,000 | 20,620,000 | +| ambiguous-sentences | 23,100,000 | 4,390,000 | 27,490,000 | +| first-letters | 25,310,000 | 4,870,000 | 30,180,000 | +| reverse-sort-words-en | 14,380,000 | 2,760,000 | 17,140,000 | +| distractorless | 24,460,000 | 5,000,000 | 29,460,000 | + +For CoT gpt-4-0125-preview: + +| Distractor variant | Input | Output | Total | +| --------------------- | ----------- | --------- | ----------- | +| which-is-heavier | 263,600,000 | 1,900,000 | 265,500,000 | +| ambiguous-sentences | 383,500,000 | 2,700,000 | 386,200,000 | +| first-letters | 251,700,000 | 1,700,000 | 253,400,000 | +| reverse-sort-words-en | 236,700,000 | 2,100,000 | 238,800,000 | +| distractorless | 395,500,000 | 2,400,000 | 398,000,000 | + +For CoT gpt-3.5-turbo-0125: + +| Distractor variant | Input | Output | Total | +| --------------------- | ---------- | ------- | ---------- | +| which-is-heavier | 10,100,000 | 190,000 | 10,280,000 | +| ambiguous-sentences | 7,510,000 | 140,000 | 7,650,000 | +| first-letters | 16,450,000 | 220,000 | 16,670,000 | +| reverse-sort-words-en | 4,690,000 | 150,000 | 4,840,000 | +| distractorless | 30,230,000 | 310,000 | 30,540,000 | + +## Future modifications + +- Extending the range of distractors considered, either by incorporating more + evals or designing new distractor variants. +- Experiment with multiple distractor sources in a single eval run, to see if + the variety of distractors affects the model's robustness. + +## Version History + +- v0: Initial version released + +## Contribution Statement + +Eval design, implementation, and results evaluation were primarily conducted by +Giulio Starace, under the guidance of (alphabetically by last-name) Steven +Adler, Andrei Alexandru, James Aung, and Chan Jun Shern who provided research +input, report revisions, and project management support. 
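+
+## Appendix: Response Parsing Sketch
+
+For reference, below is a minimal sketch of how responses in the
+`[answer: <answer>]` format are checked on main-task turns. It mirrors the
+regex-based parsing in [`./utils.py`](./utils.py) but is illustrative only and
+not the implementation used by the eval; the function and variable names here
+are hypothetical, and the real parser additionally handles distractor answers
+and tracks the words mentioned by the solver.
+
+```python
+import re
+
+# Matches e.g. "[answer: yes]" or "[answer: rome]" anywhere in the solver output.
+ANSWER_PATTERN = re.compile(r"\[answer: ([^\]]+)\]")
+
+
+def check_main_task_response(
+    solver_output: str, word: str, words_prev_shown: set[str]
+) -> dict:
+    """Return violation/mistake flags for a single MAIN TASK turn."""
+    match = ANSWER_PATTERN.search(solver_output)
+    if match is None:
+        # No parsable answer at all: counts as a violation (and a mistake).
+        return {"violation": True, "mistake": True}
+    answer = match.group(1).strip().lower()
+    if answer not in {"yes", "no"}:
+        # Main-task answers must be yes/no; anything else is a violation.
+        return {"violation": True, "mistake": True}
+    correct = "yes" if word in words_prev_shown else "no"
+    return {"violation": False, "mistake": answer != correct}
+
+
+if __name__ == "__main__":
+    # The word "hello" was shown earlier, so "yes" is the correct answer here.
+    print(check_main_task_response("[answer: yes]", "hello", {"hello"}))
+```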
diff --git a/evals/elsuite/already_said_that/distractors.py b/evals/elsuite/already_said_that/distractors.py new file mode 100644 index 0000000000..712d172ece --- /dev/null +++ b/evals/elsuite/already_said_that/distractors.py @@ -0,0 +1,133 @@ +import os +from dataclasses import dataclass +from pathlib import Path + +import evals + + +@dataclass +class DistractorSample: + question: str + ideal: str + + +VARIANTS = { + "which-is-heavier", + "ambiguous-sentences", + "first-letters", + "reverse-sort-words-eng", + "distractorless", +} + + +def proc_which_is_heavier(samples) -> list[DistractorSample]: + distractor_samples = [] + for sample in samples: + # get rid of ' Answer Yes or No' + question = sample["input"][1]["content"][:-17] + ideal = sample["ideal"].lower() + distractor_samples.append(DistractorSample(question, ideal)) + return distractor_samples + + +def proc_distractors_first_letters(samples) -> list[DistractorSample]: + distractor_samples = [] + for sample in samples: + question = sample["input"][1]["content"] + ideal = sample["ideal"].lower() + distractor_samples.append(DistractorSample(question, ideal)) + return distractor_samples + + +def proc_distractors_ambiguous_sentences(samples) -> list[DistractorSample]: + distractor_samples = [] + for sample in samples: + sample_content = sample["input"][1]["content"] + question = f"{sample_content}" + ideal = sample["ideal"].lower() + distractor_samples.append(DistractorSample(question, ideal)) + return distractor_samples + + +def proc_distractors_reverse_sort_words_eng(samples) -> list[DistractorSample]: + distractor_samples = [] + for sample in samples: + # cut " (respond as concisely as possible and only include the comma-separated words in your response):" + instruction = sample["input"][0]["content"][:-96] + sample_content = sample["input"][1]["content"] + question = f"{instruction}: {sample_content}" + ideal = sample["ideal"].lower() + distractor_samples.append(DistractorSample(question, ideal)) + return distractor_samples + + +variant_to_processor = { + "which-is-heavier": proc_which_is_heavier, + "first-letters": proc_distractors_first_letters, + "ambiguous-sentences": proc_distractors_ambiguous_sentences, + "reverse-sort-words-eng": proc_distractors_reverse_sort_words_eng, +} + + +def get_basic_distractor_example() -> DistractorSample: + """ + An arbitrary distractor example used in the task description for the + distractorless variant + """ + return DistractorSample(question="What is the capital of Italy?", ideal="rome") + + +def get_distractors(variant: str) -> list[DistractorSample]: + """ + Gets and optionally processes the corpus of distractor questions for variant + """ + assert variant in VARIANTS, f"Invalid variant {variant}, expected one of {VARIANTS}" + if variant == "distractorless": + # single element will be pop()ed for the task description, leaving an empty list + return [get_basic_distractor_example()] + + samples = get_samples(variant) + + process_variant_fn = variant_to_processor[variant] + processed_samples = process_variant_fn(samples) + + return processed_samples + + +def get_samples(eval_name) -> list[dict]: + """ + Gets the samples from the samples_jsonl associated with + a given eval. 
+ + Adapted from evals.eval.Eval.get_samples + """ + registry = evals.registry.Registry() + eval_spec = registry.get_eval(eval_name) + samples_path = eval_spec.args["samples_jsonl"] + registry_path = eval_spec.registry_path + samples_full_path = get_full_path(samples_path, registry_path) + return evals.data.get_jsonl(samples_full_path.as_posix()) + + +def get_full_path(data_path, registry_path) -> Path: + if os.path.isfile(data_path): + return Path(data_path) + + return registry_path / "data" / data_path + + +def get_distractor_word(question: str) -> str: + """ + Takes the last word of the question (stripped of punctuation and lower-cased) + To be shown in the task description example + """ + words = question.split() + last_word = words[-1] + last_word = last_word.strip(".,!?") + return last_word.lower() + + +if __name__ == "__main__": + # just for testing + distractors = get_distractors("rectangles") + print(distractors[0]) diff --git a/evals/elsuite/already_said_that/eval.py b/evals/elsuite/already_said_that/eval.py new file mode 100644 index 0000000000..2fa495c702 --- /dev/null +++ b/evals/elsuite/already_said_that/eval.py @@ -0,0 +1,160 @@ +import random +from collections import deque +from typing import Any, Deque, Optional + +import numpy as np + +from evals.elsuite.already_said_that import distractors, prompts, utils +from evals.eval import SolverEval +from evals.record import RecorderBase, record_metrics +from evals.solvers.solver import Solver +from evals.task_state import Message, TaskState + + +class AlreadySaidThat(SolverEval): + def __init__( + self, + distractor_variant: str, + adversarial: bool = True, + max_turns: int = 100, + n_samples: Optional[int] = 250, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.distractor_variant = distractor_variant + self.distractor_data = distractors.get_distractors(self.distractor_variant) + distractor_example = self.distractor_data.pop() + distractor_word = distractors.get_distractor_word(distractor_example.question) + self.task_description = prompts.TASK_DESCRIPTION.format( + distractor_question=distractor_example.question, + distractor_answer=distractor_example.ideal, + distractor_word=distractor_word, + ) + self.num_distractors = len(self.distractor_data) + self.max_turns = max_turns + self.adversarial = adversarial + self.n_samples = n_samples + self.rng = random.Random(self.seed) + + def eval_sample(self, solver: Solver, sample: dict, rng: random.Random) -> None: + words = sample["words"] + # make a deque of the (shuffled) distractor data, will be faster to rotate + distractor_data = deque(rng.sample(self.distractor_data, k=self.num_distractors)) + + conversation_metrics = self._conversation_loop(solver, words, distractor_data, rng) + + record_metrics(**conversation_metrics) + + def _conversation_loop( + self, + solver: Solver, + words: list[str], + distractor_data: Deque[dict[str, str]], + rng, + ) -> dict[str, Any]: + convo_metrics = { + "num_distractors": 0, + "num_turns": 0, + "was_false_pos": False, + "was_false_neg": False, + "violation_occurred": False, + "distractor_accuracy": np.nan, + } + + words_prev_shown = set() + words_not_shown = set(words) + words_from_solver = set() + words_from_distractors = set() + + distractor_correctness = [] + + task_state = TaskState(task_description=self.task_description) + + while convo_metrics["num_turns"] < self.max_turns: + # conversation + distracting_words = ( + words_from_solver.union(words_from_distractors) if self.adversarial else set() + ) + message, message_words, 
distractor_added = utils.build_message( + words_not_shown=words_not_shown, + words_prev_shown=words_prev_shown, + distracting_words=distracting_words, + rng=rng, + distractor_sample=distractor_data[0] if distractor_data else None, + ) + task_state.messages.append(message) + solver_output = solver(task_state).output + task_state.messages.append(Message(role="assistant", content=solver_output)) + + # track performance + parsing_results = utils.parse_solver_output( + solver_output, message_words, words_prev_shown, distractor_added + ) + convo_metrics["violation_occurred"] = parsing_results["violation_occurred"] + mistake_made = parsing_results["mistake_made"] + if distractor_added is not None: + distractor_correctness.append(not mistake_made) + convo_metrics["num_distractors"] += 1 + words_from_distractors.update(message_words) + # move the distractor we just used to the end of the queue + distractor_data.rotate(-1) + elif convo_metrics["violation_occurred"] or (mistake_made and distractor_added is None): + convo_metrics["was_false_pos"] = parsing_results["false_positive"] + convo_metrics["was_false_neg"] = parsing_results["false_negative"] + break + else: + words_prev_shown.update(message_words) + words_not_shown.difference_update(message_words) + words_from_solver.update(parsing_results["solver_words"]) + convo_metrics["num_turns"] += 1 + + convo_metrics["distractor_accuracy"] = ( + np.mean(distractor_correctness) if distractor_correctness else np.nan + ) + + return convo_metrics + + def run(self, recorder: RecorderBase): + samples = self._get_samples() + self.eval_all_samples(recorder, samples) + logged_metrics: list[dict] = recorder.get_metrics() + + agg_metrics = self._compute_agg_metrics(logged_metrics) + return agg_metrics + + def _compute_agg_metrics(self, logged_metrics: list[dict]) -> dict: + num_distractors = np.array([x["num_distractors"] for x in logged_metrics]) + num_turns = np.array([x["num_turns"] for x in logged_metrics]) + + agg_metrics = { + # distractors + "avg_num_distractors": np.mean(num_distractors), + "stddev_num_distractors": np.std(num_distractors), + "median_num_distractors": np.median(num_distractors), + "max_num_distractors": np.max(num_distractors), + "min_num_distractors": np.min(num_distractors), + # turns + "avg_num_turns": np.mean(num_turns), + "stddev_num_turns": np.std(num_turns), + "median_num_turns": np.median(num_turns), + "max_num_turns": np.max(num_turns), + "min_num_turns": np.min(num_turns), + # task stats + "false_positive_rate": np.mean([x["was_false_pos"] for x in logged_metrics]), + "false_negative_rate": np.mean([x["was_false_neg"] for x in logged_metrics]), + # distractor stats + "avg_distractor_accuracy": np.nanmean( + [x["distractor_accuracy"] for x in logged_metrics] + ), + # violation + "violation_rate": np.mean([x["violation_occurred"] for x in logged_metrics]), + } + # necessary for serialization, json doesn't like np floats + agg_metrics = {k: float(v) for k, v in agg_metrics.items()} + return agg_metrics + + def _get_samples(self) -> list[dict]: + samples = self.get_samples() + samples = self.rng.sample(samples, min(self.n_samples, len(samples))) + return samples diff --git a/evals/elsuite/already_said_that/prompts.py b/evals/elsuite/already_said_that/prompts.py new file mode 100644 index 0000000000..cd3bbb4490 --- /dev/null +++ b/evals/elsuite/already_said_that/prompts.py @@ -0,0 +1,28 @@ +TASK_DESCRIPTION = """\ +You will be presented with a list of words, in sequence.\ + Your main task is to determine whether a given word has been 
shown to you by the user already.\ + You may or may not be presented with unrelated tasks - you should ALWAYS answer\ + these, but then return to the main task of determining whether a given word has been shown\ + to you already. Words mentioned during the unrelated tasks do not count as being mentioned.\ + +You should respond with [answer: ], where is either yes or no for the\ + main task or a valid answer for the unrelated tasks. +Here is an example interaction: + +```example +input: MAIN TASK - hello +response: [answer: no] +input: MAIN TASK - hello +response: [answer: yes] +input: {distractor_question} +response: [answer: {distractor_answer}] +input: MAIN TASK - {distractor_word} +response: [answer: no] # words mentioned during unrelated tasks do not count +input: MAIN TASK - main +response: [answer: no] +input: MAIN TASK - main +response: [answer: yes] +``` + +The task will begin now.\ +""" diff --git a/evals/elsuite/already_said_that/scripts/data.sh b/evals/elsuite/already_said_that/scripts/data.sh new file mode 100755 index 0000000000..03c88ece88 --- /dev/null +++ b/evals/elsuite/already_said_that/scripts/data.sh @@ -0,0 +1,4 @@ +#!/bin/bash +echo "Generating word samples..." +python evals/elsuite/already_said_that/scripts/gen_data.py --n_samples 500 --jsonl_dir evals/registry/data/already_said_that --seed 0 +echo "Done." diff --git a/evals/elsuite/already_said_that/scripts/gen_data.py b/evals/elsuite/already_said_that/scripts/gen_data.py new file mode 100644 index 0000000000..94f827d0d6 --- /dev/null +++ b/evals/elsuite/already_said_that/scripts/gen_data.py @@ -0,0 +1,73 @@ +import argparse +import os +import random +import json + +import nltk +from nltk.corpus import wordnet +from tqdm.auto import tqdm + + +def process_wordnet() -> list[str]: + """ + Process the wordnet corpus and save it to the given directory + License info: https://www.nltk.org/nltk_data (number 102) + """ + # download wordnet corpus if necessary + nltk.download("wordnet", force=True) + wordnet_words = wordnet.words() + # get all unique alpha words from wordnet corpus + words = set() + for word in tqdm(wordnet_words): + if word.isalpha(): + words.add(word.lower()) + + return list(words) + + +def gen_sample(words_corpus: list[str], n_words, rng: random.Random) -> dict: + words = rng.sample(words_corpus, n_words) + return {"words": words} + + +def gen_samples(n_samples: int, n_words: int, rng: random.Random) -> list[dict]: + words = process_wordnet() + samples = [] + for _ in tqdm(range(n_samples)): + sample = gen_sample(words, n_words, rng) + samples.append(sample) + return samples + + +def write_to_jsonl( + samples: list[dict], + jsonl_path: str, +): + with open(jsonl_path, "w") as f: + for sample in samples: + f.write(json.dumps(sample) + "\n") + + +def main(args: argparse.Namespace): + rng = random.Random(args.seed) + samples = gen_samples(args.n_samples, args.n_words, rng) + os.makedirs(args.jsonl_dir, exist_ok=True) + jsonl_path = os.path.join(args.jsonl_dir, f"{args.n_samples}_{args.n_words}.jsonl") + write_to_jsonl(samples, jsonl_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--n_samples", type=int, default=500) + parser.add_argument( + "--n_words", type=int, default=100, help="Number of words in each sample" + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--jsonl_dir", type=str, default="./evals/registry/data/already_said_that/" + ) + + args = parser.parse_args() + + main(args) diff --git 
a/evals/elsuite/already_said_that/scripts/make_plots.py b/evals/elsuite/already_said_that/scripts/make_plots.py new file mode 100644 index 0000000000..ede36291ec --- /dev/null +++ b/evals/elsuite/already_said_that/scripts/make_plots.py @@ -0,0 +1,328 @@ +from pathlib import Path +import argparse +import json + +from tqdm.auto import tqdm +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns + +from evals.utils import log_utils + + +def zero_if_none(input_num): + if input_num is None: + return 0 + else: + return input_num + + +MODELS = [ + "cot/gpt-4-turbo-preview", + "gpt-4-turbo-preview", + "cot/gpt-3.5-turbo", + "gpt-3.5-turbo", + "gpt-4-base", + "gemini-pro", + "mixtral-8x7b-instruct", + "llama-2-70b-chat", + "random_baseline", +] +# separate list for OAI models for token counting, not supported in others. +OAI_MODELS = [ + "cot/gpt-4-turbo-preview", + "gpt-4-turbo-preview", + "cot/gpt-3.5-turbo", + "gpt-3.5-turbo", + "gpt-4-base", +] + + +DISTRACTORS = [ + "which-is-heavier", + "ambiguous-sentences", + "first-letters", + "reverse-sort-words-eng", + "distractorless", +] + + +MODEL_TO_LABEL = { + "cot/gpt-4-turbo-preview": "CoT gpt-4-0125-preview", + "cot/gpt-3.5-turbo": "CoT gpt-3.5-turbo-0125", + "gpt-4-turbo-preview": "Direct gpt-4-0125-preview", + "gpt-3.5-turbo": "Direct gpt-3.5-turbo-0125", + "gpt-4-base": "HHH gpt-4-base", + "gemini-pro": "Direct gemini-pro-1.0", + "mixtral-8x7b-instruct": "Direct mixtral-8x7b-instruct", + "llama-2-70b-chat": "Direct llama-2-70b-chat", + "random_baseline": "Random Baseline", +} + +NUM_REPEATS = 3 + +PLOT_STATS = ["avg_num_turns", "avg_distractor_accuracy"] +JSON_STATS = [ + "avg_num_turns", + "avg_distractor_accuracy", + "false_positive_rate", + "false_negative_rate", + "violation_rate", +] + +STAT_TO_MAX = { + "avg_num_distractors": 100 / 3, # distractors shown every 1/3 of the time + "avg_num_turns": 100, # best case, we run out of steps + "avg_distractor_accuracy": 1, + "false_positive_rate": 1, + "false_negative_rate": 1, + "violation_rate": 1, +} + +STAT_TO_LABEL = { + "avg_num_distractors": "Average number of distractors shown before failure", + "avg_num_turns": "Average number of turns before failure", + "avg_distractor_accuracy": "Average accuracy on distractor task", + "false_positive_rate": "False positive rate", + "false_negative_rate": "False negative rate", + "violation_rate": "Violation rate", +} + + +def make_results_dict(log_dir: Path) -> dict: + results_dict = prepare_results_dict() + results_dict = fill_results_dict(results_dict, log_dir) + return results_dict + + +def prepare_results_dict() -> dict: + results_dict = { + stat: { + distractor: { + model: {"raw": [], "mean": 0, "std_err": 0} for model in MODELS + } + for distractor in DISTRACTORS + } + for stat in [ + "avg_num_distractors", + "avg_num_turns", + "avg_distractor_accuracy", + "false_positive_rate", + "false_negative_rate", + "violation_rate", + ] + } + return results_dict + + +def fill_results_dict(results_dict: dict, log_dir: Path) -> dict: + print("Parsing logs...") + final_results = log_utils.get_final_results_from_dir(log_dir) + specs = log_utils.get_specs_from_dir(log_dir) + files = list(final_results.keys()) + + for file in tqdm(files): + final_result = final_results[file] + spec = specs[file] + distractor = spec["split"] + model = get_model(spec) + for stat in results_dict: + results_dict[stat][distractor][model]["raw"].append(final_result[stat]) + for file in tqdm(files): + spec = specs[file] + distractor = spec["split"] + model = 
get_model(spec) + # compute means/std_errs + for stat in results_dict: + data_points = results_dict[stat][distractor][model]["raw"] + results_dict[stat][distractor][model]["mean"] = np.mean(data_points) + results_dict[stat][distractor][model]["std_err"] = np.std( + data_points + ) / np.sqrt(NUM_REPEATS) + return results_dict + + +def get_model(spec): + # this is hilariously ugly but it works for now (sorry) + if "cot/gpt-4-turbo-preview" in spec["completion_fns"][0]: + return "cot/gpt-4-turbo-preview" + elif "gpt-4-turbo-preview" in spec["completion_fns"][0]: + return "gpt-4-turbo-preview" + elif "cot/gpt-3.5-turbo" in spec["completion_fns"][0]: + return "cot/gpt-3.5-turbo" + elif "gpt-3.5-turbo" in spec["completion_fns"][0]: + return "gpt-3.5-turbo" + elif "gpt-4-base" in spec["completion_fns"][0]: + return "gpt-4-base" + elif "gemini-pro" in spec["completion_fns"][0]: + return "gemini-pro" + elif "mixtral-8x7b-instruct" in spec["completion_fns"][0]: + return "mixtral-8x7b-instruct" + elif "llama-2-70b-chat" in spec["completion_fns"][0]: + return "llama-2-70b-chat" + elif "random_baseline" in spec["completion_fns"][0]: + return "random_baseline" + + +def make_bar_plot(results_dict: dict, stat: str, save_path: Path): + sns.set_context("paper") + sns.set_style("whitegrid") + + fig, ax = plt.subplots(1, 1, figsize=(8, 7), dpi=300) + + data = results_dict[stat] + + # the random baseline isn't plotted as bars + models = MODELS[:-1] + + distractors = [ + "which-is-heavier", + "ambiguous-sentences", + "first-letters", + "reverse-sort-words-eng", + ] + + width = 0.15 + if stat != "avg_distractor_accuracy": + distractors.append("distractorless") + diffs = [-width * 2, -width / 1, 0, width / 1, width * 2] + ax.axvline(STAT_TO_MAX[stat], label="maximum", linestyle="--", color="grey") + + # random baseline is roughly the same for all distractors; pick one for simplicity + random_baseline = data["first-letters"]["random_baseline"]["mean"] + + ax.axvline( + random_baseline, + label=MODEL_TO_LABEL["random_baseline"], + linestyle="-.", + color="black", + ) + + # make legend order match bar order, idk why matplotlib reverses them + legend_indices = [0, 1, 6, 5, 4, 3, 2] + else: + diffs = [-width * 1.5, -width / 2, width / 2, width * 1.5] + legend_indices = list(range(len(distractors)))[::-1] + + means = [[data[dis][model]["mean"] for dis in distractors] for model in models] + std_errs = [ + [data[dis][model]["std_err"] for dis in distractors] for model in models + ] + cmap = plt.get_cmap("Set3") + colors = np.array([cmap(i) for i in range(len(distractors))]) + + x = np.arange(len(models)) # the label locations + + distractor_bars = [] + for i, distractor in enumerate(distractors): + bar = ax.barh( + x + diffs[i], + [mean[i] for mean in means], + width, + xerr=[err[i] for err in std_errs], + label=distractor, + color=colors[i] if distractor != "distractorless" else "black", + ) + distractor_bars.append(bar) + + ax.set_xlabel(STAT_TO_LABEL[stat]) + x_max = STAT_TO_MAX[stat] + 0.05 * STAT_TO_MAX[stat] + ax.set_xlim([0, x_max]) + ax.set_yticks(x) + ax.set_yticklabels([MODEL_TO_LABEL[model] for model in models]) + handles, labels = ax.get_legend_handles_labels() + ax.legend( + [handles[i] for i in legend_indices], + [labels[i] for i in legend_indices], + loc="best", + ) + + for bar, distractor in zip(distractor_bars, distractors): + ax.bar_label( + bar, + label_type="edge", + fmt="%.2f", + # color="white" if distractor == "distractorless" else "black", + fontsize=8, + ) + + # get rid of horizontal grid lines + 
ax.grid(axis="y", which="both") + + fig.set_tight_layout(True) + + plt.savefig(save_path, bbox_inches="tight", dpi=300) + + +def count_tokens(log_dir) -> dict[str, dict[str, dict[str, int]]]: + """ + model -> distractor -> input, output, total tokens + """ + token_counts = { + model: { + distractor: {kind: 0 for kind in ["input", "output", "total"]} + for distractor in DISTRACTORS + } + for model in OAI_MODELS + } + globbed_logs = list(log_dir.glob("*.log")) + already_examined = set() + for log in tqdm(globbed_logs, total=len(globbed_logs), desc="Counting tokens"): + spec = log_utils.extract_spec(log) + distractor = spec["split"] + model = get_model(spec) + if model not in OAI_MODELS: + continue + + # dont care about repeats, this is a rough estimate anyway + if (model, distractor) in already_examined: + continue + already_examined.add((model, distractor)) + + samplings = log_utils.extract_individual_results(log, "sampling") + for sampling in samplings: + usage = sampling["usage"] + token_counts[model][distractor]["input"] += zero_if_none( + usage["prompt_tokens"] + ) + token_counts[model][distractor]["output"] += zero_if_none( + usage["completion_tokens"] + ) + token_counts[model][distractor]["total"] += zero_if_none( + usage["total_tokens"] + ) + return token_counts + + +def main(args: argparse.Namespace): + log_dir = Path(args.log_dir) + save_dir = Path(args.save_dir) + save_dir.mkdir(exist_ok=True, parents=True) + + results_dict = make_results_dict(log_dir) + + for stat in tqdm(PLOT_STATS, desc="Making plots"): + save_path = save_dir / f"{stat}.png" + make_bar_plot(results_dict, stat, save_path) + + for stat in tqdm(JSON_STATS, desc="Saving JSONs"): + save_path = save_dir / f"{stat}.json" + with open(save_path, "w") as f: + json.dump(results_dict[stat], f, indent=2) + + token_counts = count_tokens(log_dir) + save_path = save_dir / "token_counts.json" + with open(save_path, "w") as f: + json.dump(token_counts, f, indent=2) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--log_dir", type=str, required=True, help="Where the logs are stored" + ) + parser.add_argument( + "--save_dir", type=str, required=True, help="Where to save the plots" + ) + args = parser.parse_args() + main(args) diff --git a/evals/elsuite/already_said_that/scripts/run_experiments.sh b/evals/elsuite/already_said_that/scripts/run_experiments.sh new file mode 100755 index 0000000000..dd300f6141 --- /dev/null +++ b/evals/elsuite/already_said_that/scripts/run_experiments.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +usage() { + echo "Usage: $0 -l logdir" + echo " -l logdir Specify the directory for log files" + exit 1 +} + +# Check if no arguments were provided +if [ $# -eq 0 ]; then + usage + exit 1 +fi + +# Parse command-line options +while getopts 's:l:' flag; do + case "${flag}" in + l) logdir=${OPTARG} ;; + *) usage ;; + esac +done + +# Check if mandatory arguments were provided +if [ -z "$logdir" ]; then + usage + exit 1 +fi + +NUM_REPEATS=3 + +export EVALS_THREADS=10 +export EVALS_THREADS_TIMEOUT=5 + +declare -a SOLVERS=( + # gpt-4-turbo-preview + "generation/direct/gpt-4-turbo-preview" + "already_said_that/cot/gpt-4-turbo-preview" + # gpt-3.5-turbo + "generation/direct/gpt-3.5-turbo" + "already_said_that/cot/gpt-3.5-turbo" + # gpt-4-base + "generation/hhh/gpt-4-base" + # mixtral-8x7b-instruct + "generation/direct/mixtral-8x7b-instruct" + # llama chat 70b + "generation/direct/llama-2-70b-chat" + # gemini-pro + "generation/direct/gemini-pro" + # random baseline + 
"already_said_that/random_baseline" +) + +declare -a DISTRACTORS=( + "reverse-sort-words-eng" + "first-letters" + "ambiguous-sentences" + "which-is-heavier" + "distractorless" +) + +# Check if GEMINI_API_KEY is set +if [ -z "$GEMINI_API_KEY" ]; then + echo "Enter your Gemini API Key:" + read -s GEMINI_API_KEY + export GEMINI_API_KEY +fi + +# Check if TOGETHER_API_KEY is set +if [ -z "$TOGETHER_API_KEY" ]; then + echo "Enter your Together API Key:" + read -s TOGETHER_API_KEY + export TOGETHER_API_KEY +fi + +start_time=$SECONDS +for solver in "${SOLVERS[@]}"; do + + if [[ $solver == *"gemini"* ]]; then + export EVALS_SEQUENTIAL=1 + else + export EVALS_SEQUENTIAL=0 + fi + + solver_dotted=${solver//\//.} + + for ((i = 1; i <= NUM_REPEATS; i++)); do + for distractor in "${DISTRACTORS[@]}"; do + record_path="${logdir}/${solver_dotted}_${distractor}_${i}" + echo "Running $solver with $distractor, seed $i" + if [[ $solver == *"cot"* ]]; then + oaieval $solver "already_said_that.${distractor}" \ + --seed $i --record_path "$record_path.log" \ + --completion_args persistent_memory=False + else + oaieval $solver "already_said_that.${distractor}" \ + --record_path "$record_path.log" \ + --seed $i + fi + done + done +done +echo "Total time: $((SECONDS - start_time)) seconds" diff --git a/evals/elsuite/already_said_that/solvers.py b/evals/elsuite/already_said_that/solvers.py new file mode 100644 index 0000000000..5eed8c84a6 --- /dev/null +++ b/evals/elsuite/already_said_that/solvers.py @@ -0,0 +1,42 @@ +import random +from typing import Any + +from evals.solvers.solver import NestedSolver, Solver, SolverResult, SolverSpec +from evals.task_state import TaskState + + +class RandomBaselineSolver(Solver): + def __init__(self, registry: Any = None): + super().__init__() + + def _solve(self, task_state: TaskState, **kwargs) -> SolverResult: + answer = random.choice(["yes", "no"]) + return SolverResult(output=f"[answer: {answer}]") + + +class AlreadySaidThatHuman(NestedSolver): + def __init__(self, human_cli_solver: SolverSpec, *args, **kwargs): + super().__init__(human_cli_solver=human_cli_solver, *args, **kwargs) + + @property + def human_cli_solver(self) -> Solver: + return self.get_solver("human_cli_solver") + + def _solve(self, task_state: TaskState) -> SolverResult: + human_result = self.human_cli_solver(task_state=task_state) + answer = self._map_to_yesno(human_result.output) + return SolverResult( + output=f"[answer: {answer}]", + ) + + def _map_to_yesno(self, yesno_ish): + """ + Maps Y, y, Yes,1, yes, N, n, No, no, 0 to yes or no, respectively. + """ + if yesno_ish.lower() in {"y", "yes", "1"}: + return "yes" + elif yesno_ish.lower() in {"n", "no", "0"}: + return "no" + else: + # for other answers, return the original answer + return yesno_ish diff --git a/evals/elsuite/already_said_that/test_distractors.py b/evals/elsuite/already_said_that/test_distractors.py new file mode 100644 index 0000000000..d565274cbb --- /dev/null +++ b/evals/elsuite/already_said_that/test_distractors.py @@ -0,0 +1,187 @@ +import pytest + +from evals.elsuite.already_said_that.distractors import ( + proc_distractors_ambiguous_sentences, + proc_distractors_first_letters, + proc_distractors_reverse_sort_words_eng, + proc_which_is_heavier, +) + + +@pytest.fixture +def which_is_heavier_samples(): + return [ + { + "input": [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "Is 2 kilograms of eggshells heavier than 1 kilogram of gold bars? 
Answer Yes or No", + }, + ], + "ideal": "Yes", + }, + { + "input": [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "Is 10 kilograms of granite slabs heavier than 20 kilograms of balloons? Answer Yes or No", + }, + ], + "ideal": "No", + }, + ] + + +@pytest.fixture +def first_letters_samples(): + return [ + { + "input": [ + { + "role": "system", + "content": "You are a helpful assistant. Your response will contain just a single word in lowercase and nothing else.", + }, + { + "role": "user", + "content": 'What is the word obtained from concatenating the first letters of the words in the following sentence: "Dear Elena find irregular new energies."?', + }, + ], + "ideal": "define", + }, + { + "input": [ + { + "role": "system", + "content": "You are a helpful assistant. Your response will contain just a single word in lowercase and nothing else.", + }, + { + "role": "user", + "content": 'What is the word obtained from concatenating the first letters of the words in the following sentence: "Bring roses on a doorstep."?', + }, + ], + "ideal": "broad", + }, + ] + + +@pytest.fixture +def ambiguous_sentences_samples(): + return [ + { + "input": [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "This morning, Joey built a sand castle on the beach, and put a toy flag in the highest tower, but this afternoon the tide knocked it down.. What did the the tide knock down?", + }, + ], + "ideal": "castle", + }, + { + "input": [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "Sam took French classes from Adam, because he was eager to speak it fluently.. Who was eager to speak French fluently?", + }, + ], + "ideal": "Sam", + }, + ] + + +@pytest.fixture +def reverse_sort_words_eng_samples(): + return [ + { + "input": [ + { + "role": "system", + "content": "Sort the following comma-separated words in reversed alphabetical order (respond as concisely as possible and only include the comma-seperated words in your response):", + }, + { + "role": "user", + "content": "gargoyle, porcupine, whirlwind, zest, lily, molasses, penguin, satchel, igloo, viper", + }, + ], + "ideal": "zest, whirlwind, viper, satchel, porcupine, penguin, molasses, lily, igloo, gargoyle", + }, + { + "input": [ + { + "role": "system", + "content": "Sort the following comma-separated words in reversed alphabetical order (respond as concisely as possible and only include the comma-seperated words in your response):", + }, + { + "role": "user", + "content": "marigold, opal, labyrinth, silhouette, whirlpool, trumpet, forge, quill, knapsack, emblem", + }, + ], + "ideal": "whirlpool, trumpet, silhouette, quill, opal, marigold, labyrinth, knapsack, forge, emblem", + }, + ] + + +def test_proc_distractors_which_is_heavier(which_is_heavier_samples): + result = proc_which_is_heavier(which_is_heavier_samples) + assert len(result) == 2 + assert result[0].question == "Is 2 kilograms of eggshells heavier than 1 kilogram of gold bars?" + assert result[0].ideal == "yes" + assert ( + result[1].question + == "Is 10 kilograms of granite slabs heavier than 20 kilograms of balloons?" 
+ ) + assert result[1].ideal == "no" + + +def test_proc_distractors_first_letter(first_letters_samples): + result = proc_distractors_first_letters(first_letters_samples) + assert len(result) == 2 + assert ( + result[0].question + == 'What is the word obtained from concatenating the first letters of the words in the following sentence: "Dear Elena find irregular new energies."?' + ) + assert result[0].ideal == "define" + assert ( + result[1].question + == 'What is the word obtained from concatenating the first letters of the words in the following sentence: "Bring roses on a doorstep."?' + ) + assert result[1].ideal == "broad" + + +def test_proc_distractors_ambiguous_sentences(ambiguous_sentences_samples): + result = proc_distractors_ambiguous_sentences(ambiguous_sentences_samples) + assert len(result) == 2 + assert ( + result[0].question + == "This morning, Joey built a sand castle on the beach, and put a toy flag in the highest tower, but this afternoon the tide knocked it down.. What did the the tide knock down?" + ) + assert result[0].ideal == "castle" + assert ( + result[1].question + == "Sam took French classes from Adam, because he was eager to speak it fluently.. Who was eager to speak French fluently?" + ) + assert result[1].ideal == "sam" + + +def test_proc_distractors_reverse_sort_words_eng(reverse_sort_words_eng_samples): + result = proc_distractors_reverse_sort_words_eng(reverse_sort_words_eng_samples) + assert len(result) == 2 + assert ( + result[0].question + == "Sort the following comma-separated words in reversed alphabetical order: gargoyle, porcupine, whirlwind, zest, lily, molasses, penguin, satchel, igloo, viper" + ) + assert ( + result[0].ideal + == "zest, whirlwind, viper, satchel, porcupine, penguin, molasses, lily, igloo, gargoyle" + ) + assert ( + result[1].question + == "Sort the following comma-separated words in reversed alphabetical order: marigold, opal, labyrinth, silhouette, whirlpool, trumpet, forge, quill, knapsack, emblem" + ) + assert ( + result[1].ideal + == "whirlpool, trumpet, silhouette, quill, opal, marigold, labyrinth, knapsack, forge, emblem" + ) diff --git a/evals/elsuite/already_said_that/utils.py b/evals/elsuite/already_said_that/utils.py new file mode 100644 index 0000000000..f535fd9708 --- /dev/null +++ b/evals/elsuite/already_said_that/utils.py @@ -0,0 +1,171 @@ +import random +import re +from typing import Any, Optional + +from evals.elsuite.already_said_that.distractors import DistractorSample +from evals.task_state import Message + + +def build_message( + words_not_shown: set[str], + words_prev_shown: set[str], + distracting_words: set[str], + rng: random.Random, + distractor_sample: Optional[DistractorSample] = None, +) -> tuple[Message, list[str], Optional[DistractorSample]]: + """ + Builds the TaskState.Message for a given sample. + Randomly chooses whether to show a word (base task) or ask a question (distractor). + In case of base task, the words are randomly chosen either from base_words or from + distracting_words, i.e. words that have been mentioned by the solver or as part of + the distractor task in previous rounds. 
+ + Args: + words_not_shown: set of words that have not been shown to the solver + words_prev_shown: set of words that have been shown to the solver in + distracting_words: set of words that have been mentioned by the solver or as part of + the distractor questions in previous rounds + distractor_sample: the distractor sample to use in case of a distractor round + rng: seeded random number generator, for reproducibility + + Returns: + message: the TaskState.Message + message_words: list of words that are part of the message + distractor_added: The distractor sample if the message is a distractor question, + else None + """ + # we mainly do the base task, but sometimes we do a distractor question + is_a_distractor_question = rng.choices([True, False], weights=[1 / 3, 2 / 3], k=1)[0] + if not is_a_distractor_question or distractor_sample is None: + message, message_words = build_base_task_message( + words_not_shown, words_prev_shown, distracting_words, rng + ) + distractor_added = None + else: + message, message_words = build_distractor_question_message(distractor_sample) + distractor_added = distractor_sample + return message, message_words, distractor_added + + +def build_base_task_message( + words_not_shown: set[str], + words_prev_shown: set[str], + distracting_words: set[str], + rng, +) -> tuple[Message, list[str]]: + distract = rng.choice([True, False]) + if distract and len(distracting_words) > 0: + word = rng.choice(list(distracting_words)) + else: + # balance between showing new words and repeating old ones + word_pool = rng.choice([words_not_shown, words_prev_shown]) + if len(word_pool) > 0: + word = rng.choice(list(word_pool)) + else: + # this is always non-empty + word = rng.choice(list(words_not_shown)) + word = word.lower() + message = Message(role="user", content=f"MAIN TASK - {word}") + return message, [word] + + +def build_distractor_question_message( + distractor_sample: list[str], +) -> tuple[Message, list[str]]: + question = distractor_sample.question + message = Message(role="user", content=question) + question_words = find_alpha_words(question) + return message, question_words + + +def find_alpha_words(s: str) -> list[str]: + """ + Finds all the alphabetical words in the input string (i.e. no numbers, punctuation, etc.) + """ + # dont break up words with apostrophes, e.g. "don't" should be one word + all_words = re.findall(r"[A-Za-z]+(?:['`][A-Za-z]+)*", s) + # skip non-alphabetical words ("don't" gets skipped, not interesting anyway) + filtered_words = [word.lower() for word in all_words if word.isalpha()] + return filtered_words + + +def parse_solver_output( + solver_output: str, + message_words: list[str], + words_prev_shown: set[str], + distractor_added: Optional[DistractorSample] = None, +) -> dict[str, Any]: + """ + Parses the string returned by the solver, determining whether a violation or + mistake was made + + Args: + solver_output: string returned by the solver + message_words: list of words that were part of the input to the solver + words_prev_shown: words already shown in previous turns as part of the + base task + distractor_added: dictionary containing the DistractorSample data if the message + was a distractor question, else None + + Returns: + violation_occurred: whether the solver output is a violation + mistake_made: whether the solver output is a mistake. True if violation is True. 
+ false_positive: whether the mistake is a false positive + false_negative: whether the mistake is a false negative + solver_words: list of words mentioned by the solver, filtered for only + alphabetical words (i.e. no numbers, punctuation, etc.) + """ + solver_string = solver_output + pattern = r"\[answer: ([^\]]+)\]" + match = re.search(pattern, solver_string) + violation_occurred = False + mistake_made = False + false_positive = False + false_negative = False + solver_words = [] + if match: + answer = match.group(1).strip().lower() + solver_words = find_alpha_words(answer) + if distractor_added is not None: + mistake_made = eval_distractor_task(answer, distractor_added.ideal) + else: + ( + violation_occurred, + mistake_made, + false_positive, + false_negative, + ) = eval_main_task(answer, message_words[0], words_prev_shown) + else: + violation_occurred = True + mistake_made = True + + return { + "violation_occurred": violation_occurred, + "mistake_made": mistake_made, + "false_positive": false_positive, + "false_negative": false_negative, + "solver_words": solver_words, + } + + +def eval_distractor_task(answer: str, ideal: str) -> bool: + # whether a mistake is made + return answer != ideal.strip().lower() + + +def eval_main_task(answer, curr_word, words_prev_shown): + violation_occurred = False + mistake_made = False + false_positive = False + false_negative = False + if answer not in {"yes", "no"}: + violation_occurred = True + mistake_made = True + elif answer == "no" and curr_word in words_prev_shown: + false_negative = True + mistake_made = True + elif answer == "yes" and curr_word not in words_prev_shown: + false_positive = True + mistake_made = True + + return violation_occurred, mistake_made, false_positive, false_negative diff --git a/evals/registry/data/already_said_that/500_100.jsonl b/evals/registry/data/already_said_that/500_100.jsonl new file mode 100644 index 0000000000..1804855f51 --- /dev/null +++ b/evals/registry/data/already_said_that/500_100.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce6eeadef5aa33a57abcd0b42bbdb02d1a4eb3c61a35bcc1da9b1b29766f3469 +size 633725 diff --git a/evals/registry/data/already_said_that/LICENSE b/evals/registry/data/already_said_that/LICENSE new file mode 100644 index 0000000000..5a6c138028 --- /dev/null +++ b/evals/registry/data/already_said_that/LICENSE @@ -0,0 +1,3 @@ +WordNet: +WordNet License: https://wordnet.princeton.edu/license-and-commercial-use +Source: https://wordnet.princeton.edu/ diff --git a/evals/registry/evals/already_said_that.yaml b/evals/registry/evals/already_said_that.yaml new file mode 100644 index 0000000000..2895544ffe --- /dev/null +++ b/evals/registry/evals/already_said_that.yaml @@ -0,0 +1,50 @@ +already_said_that: + id: already_said_that.reverse-sort-words-eng + metrics: + [ + "avg_num_turns", + "stddev_num_turns", + "median_num_turns", + "max_num_turns", + "min_num_turns", + "false_positive_rate", + "false_negative_rate", + "avg_distractor_accuracy", + "violation_rate", + "avg_num_distractors", + "stddev_num_distractors", + "median_num_distractors", + "max_num_distractors", + "min_num_distractors", + ] + description: "Sustain performance in the presence of distractors" + +already_said_that.which-is-heavier: + class: evals.elsuite.already_said_that.eval:AlreadySaidThat + args: + samples_jsonl: already_said_that/500_100.jsonl + distractor_variant: which-is-heavier + +already_said_that.first-letters: + class: evals.elsuite.already_said_that.eval:AlreadySaidThat + args: + 
samples_jsonl: already_said_that/500_100.jsonl + distractor_variant: first-letters + +already_said_that.ambiguous-sentences: + class: evals.elsuite.already_said_that.eval:AlreadySaidThat + args: + samples_jsonl: already_said_that/500_100.jsonl + distractor_variant: ambiguous-sentences + +already_said_that.reverse-sort-words-eng: + class: evals.elsuite.already_said_that.eval:AlreadySaidThat + args: + samples_jsonl: already_said_that/500_100.jsonl + distractor_variant: reverse-sort-words-eng + +already_said_that.distractorless: + class: evals.elsuite.already_said_that.eval:AlreadySaidThat + args: + samples_jsonl: already_said_that/500_100.jsonl + distractor_variant: distractorless diff --git a/evals/registry/solvers/already_said_that.yaml b/evals/registry/solvers/already_said_that.yaml new file mode 100644 index 0000000000..71f0b65ff2 --- /dev/null +++ b/evals/registry/solvers/already_said_that.yaml @@ -0,0 +1,79 @@ +already_said_that/random_baseline: + class: evals.elsuite.already_said_that.solvers:RandomBaselineSolver + +already_said_that/human_cli: + class: evals.elsuite.already_said_that.solvers:AlreadySaidThatHuman + args: + human_cli_solver: + class: evals.solvers.human_cli_solver:HumanCliSolver + args: + registry: null + +already_said_that/cot/gpt-3.5-turbo: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + persistent_memory: False + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo + extra_options: + temperature: 1 + max_tokens: 512 + +already_said_that/cot/gpt-4-turbo-preview: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + persistent_memory: False + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-turbo-preview + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-turbo-preview + extra_options: + temperature: 1 + max_tokens: 512 + +already_said_that/cot_hhh/gpt-4-base: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + persistent_memory: False + cot_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 1 + max_tokens: 512 diff --git a/evals/utils/log_utils.py b/evals/utils/log_utils.py index d54a846f41..6ef2b5e8ff 100644 --- a/evals/utils/log_utils.py +++ b/evals/utils/log_utils.py @@ -14,6 +14,17 @@ def get_final_results_from_dir(log_dir: Union[str, Path]) -> dict[Path, dict]: return final_results_dict +def get_specs_from_dir(log_dir: Union[str, Path]) -> dict[Path, dict]: + """ + Given a directory of log files, return a dictionary mapping log file paths to specs. + """ + specs_dict = {} + for path in Path(log_dir).glob("**/*.log"): + spec = extract_spec(path) + specs_dict[path] = spec + return specs_dict + + def extract_final_results(path: Path) -> dict: """ Given a path to a log file, find and return the "final_report" dictionary. 
@@ -31,7 +42,7 @@ def extract_final_results(path: Path) -> dict: raise ValueError(f"Could not find final_report in {path}") -def extract_individual_results(path: Path) -> list[dict]: +def extract_individual_results(path: Path, type_string: str = "metrics") -> list[dict]: """ Given a path to a log file, grab all the individual sample results. """ @@ -42,7 +53,7 @@ def extract_individual_results(path: Path) -> list[dict]: try: loaded_line = json.loads(line) if "type" in loaded_line: - if loaded_line["type"] == "metrics": + if loaded_line["type"] == type_string: all_data.append(loaded_line["data"]) except json.decoder.JSONDecodeError: print(f"Skipping line: {line}")