From 148c09174c4c10394b7d291b2a4d9f0f46b03d49 Mon Sep 17 00:00:00 2001 From: "Justin W. Lin" Date: Tue, 7 May 2024 15:30:43 -0700 Subject: [PATCH 1/9] evals: Add `imo_solutions_only.yaml` to registry --- evals/registry/evals/imo_solutions_only.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 evals/registry/evals/imo_solutions_only.yaml diff --git a/evals/registry/evals/imo_solutions_only.yaml b/evals/registry/evals/imo_solutions_only.yaml new file mode 100644 index 0000000000..63c5e6548b --- /dev/null +++ b/evals/registry/evals/imo_solutions_only.yaml @@ -0,0 +1,9 @@ +<eval_name>: + id: imo_solutions_only.dev.v0 + description: + metrics: [accuracy] + +<eval_name>.dev.v0: + class: evals.elsuite.basic.match:Match + args: + samples_jsonl: imo_solutions_only/samples.jsonl From 1bff78dba808e1dcd3fa4bf6adfffdbbff79fef3 Mon Sep 17 00:00:00 2001 From: "Justin W. Lin" Date: Tue, 7 May 2024 15:31:01 -0700 Subject: [PATCH 2/9] evals: Add `samples.jsonl` for `imo_solutions_only` containing 19 solutions-only IMO questions --- evals/registry/data/imo_solutions_only/samples.jsonl | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 evals/registry/data/imo_solutions_only/samples.jsonl diff --git a/evals/registry/data/imo_solutions_only/samples.jsonl b/evals/registry/data/imo_solutions_only/samples.jsonl new file mode 100644 index 0000000000..08934fa15d --- /dev/null +++ b/evals/registry/data/imo_solutions_only/samples.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b09d97b7d20ed27bd441e555d40927897f10dc0b57d29d7f8026b2062ca6eff +size 17145 From 52ca817f214d22032fd54a37cb0d2d5352e95620 Mon Sep 17 00:00:00 2001 From: "Justin W. 
Lin" Date: Tue, 14 May 2024 21:41:58 -0700 Subject: [PATCH 3/9] fix: Add description --- evals/registry/evals/imo_solutions_only.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/registry/evals/imo_solutions_only.yaml b/evals/registry/evals/imo_solutions_only.yaml index 63c5e6548b..5e2b8e2972 100644 --- a/evals/registry/evals/imo_solutions_only.yaml +++ b/evals/registry/evals/imo_solutions_only.yaml @@ -1,6 +1,6 @@ <eval_name>: id: imo_solutions_only.dev.v0 - description: + description: A small set of IMO problems that have yes/no, numeric solutions as answers. metrics: [accuracy] <eval_name>.dev.v0: From 839c2672760eca4bfb4961a37b1272068b55d89b Mon Sep 17 00:00:00 2001 From: "Justin W. Lin" Date: Wed, 15 May 2024 00:35:50 -0700 Subject: [PATCH 4/9] =?UTF-8?q?fix:=20Rename=20`imo=5Fsolutions=5Fonly`=20?= =?UTF-8?q?=E2=86=92=20`imo=5Fexact=5Fanswers`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../samples.jsonl | 0 evals/registry/evals/imo_exact_answers.yaml | 9 +++++++++ evals/registry/evals/imo_solutions_only.yaml | 9 --------- 3 files changed, 9 insertions(+), 9 deletions(-) rename evals/registry/data/{imo_solutions_only => imo_exact_answers}/samples.jsonl (100%) create mode 100644 evals/registry/evals/imo_exact_answers.yaml delete mode 100644 evals/registry/evals/imo_solutions_only.yaml diff --git a/evals/registry/data/imo_solutions_only/samples.jsonl b/evals/registry/data/imo_exact_answers/samples.jsonl similarity index 100% rename from evals/registry/data/imo_solutions_only/samples.jsonl rename to evals/registry/data/imo_exact_answers/samples.jsonl diff --git a/evals/registry/evals/imo_exact_answers.yaml b/evals/registry/evals/imo_exact_answers.yaml new file mode 100644 index 0000000000..f02fa6a5d5 --- /dev/null +++ b/evals/registry/evals/imo_exact_answers.yaml @@ -0,0 +1,9 @@ +<eval_name>: + id: imo_exact_answers.dev.v0 + description: A small set of IMO problems that have exact answers (e.g. 
yes/no, number, fraction). + metrics: [accuracy] + +<eval_name>.dev.v0: + class: evals.elsuite.basic.match:Match + args: + samples_jsonl: imo_exact_answers/samples.jsonl diff --git a/evals/registry/evals/imo_solutions_only.yaml b/evals/registry/evals/imo_solutions_only.yaml deleted file mode 100644 index 5e2b8e2972..0000000000 --- a/evals/registry/evals/imo_solutions_only.yaml +++ /dev/null @@ -1,9 +0,0 @@ -<eval_name>: - id: imo_solutions_only.dev.v0 - description: A small set of IMO problems that have yes/no, numeric solutions as answers. - metrics: [accuracy] - -<eval_name>.dev.v0: - class: evals.elsuite.basic.match:Match - args: - samples_jsonl: imo_solutions_only/samples.jsonl From 67542df1e214ef4fcae3d81a0cd911a26baa038f Mon Sep 17 00:00:00 2001 From: "Justin W. Lin" Date: Fri, 17 May 2024 08:41:48 -0500 Subject: [PATCH 5/9] fix: Replace `<eval_name>` in `.yaml` file --- evals/registry/evals/imo_exact_answers.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/registry/evals/imo_exact_answers.yaml b/evals/registry/evals/imo_exact_answers.yaml index f02fa6a5d5..860c8e7fc6 100644 --- a/evals/registry/evals/imo_exact_answers.yaml +++ b/evals/registry/evals/imo_exact_answers.yaml @@ -1,9 +1,9 @@ -<eval_name>: +imo_exact_answers: id: imo_exact_answers.dev.v0 description: A small set of IMO problems that have exact answers (e.g. yes/no, number, fraction). metrics: [accuracy] -<eval_name>.dev.v0: +imo_exact_answers.dev.v0: class: evals.elsuite.basic.match:Match args: samples_jsonl: imo_exact_answers/samples.jsonl From 6fcf137e486f53ca14c3dcbbe5f0210784ae9dc4 Mon Sep 17 00:00:00 2001 From: "Justin W. 
Lin" Date: Fri, 17 May 2024 08:42:37 -0500 Subject: [PATCH 6/9] fix: Escape backslashes in LaTeX content for JSON compatibility --- evals/registry/data/imo_exact_answers/samples.jsonl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/registry/data/imo_exact_answers/samples.jsonl b/evals/registry/data/imo_exact_answers/samples.jsonl index 08934fa15d..118bbdc9a7 100644 --- a/evals/registry/data/imo_exact_answers/samples.jsonl +++ b/evals/registry/data/imo_exact_answers/samples.jsonl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b09d97b7d20ed27bd441e555d40927897f10dc0b57d29d7f8026b2062ca6eff -size 17145 +oid sha256:fad385ce8ae4b040072e5dc61a171e943716621fe1ac7d8db71a4879d64582b3 +size 17219 From be7118188e3c90af150b01c7292687053574a831 Mon Sep 17 00:00:00 2001 From: "Justin W. Lin" Date: Fri, 17 May 2024 08:44:28 -0500 Subject: [PATCH 7/9] fix: Remove instructions in system prompt to answer with `[]` --- evals/registry/data/imo_exact_answers/samples.jsonl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/registry/data/imo_exact_answers/samples.jsonl b/evals/registry/data/imo_exact_answers/samples.jsonl index 118bbdc9a7..41c4d445fb 100644 --- a/evals/registry/data/imo_exact_answers/samples.jsonl +++ b/evals/registry/data/imo_exact_answers/samples.jsonl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fad385ce8ae4b040072e5dc61a171e943716621fe1ac7d8db71a4879d64582b3 -size 17219 +oid sha256:53abc39b993f9de99a22309127bc5f16847273d8b441981d66c73d0abe9e5916 +size 16041 From bd7ab27914aff49fc7ec21ad953e9124d214f141 Mon Sep 17 00:00:00 2001 From: "Justin W. 
Lin" Date: Fri, 17 May 2024 09:03:25 -0500 Subject: [PATCH 8/9] fix: Update evaluation method to `Includes` instead of `Match` --- evals/registry/evals/imo_exact_answers.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/registry/evals/imo_exact_answers.yaml b/evals/registry/evals/imo_exact_answers.yaml index 860c8e7fc6..4d72c0e5cf 100644 --- a/evals/registry/evals/imo_exact_answers.yaml +++ b/evals/registry/evals/imo_exact_answers.yaml @@ -4,6 +4,6 @@ imo_exact_answers: metrics: [accuracy] imo_exact_answers.dev.v0: - class: evals.elsuite.basic.match:Match + class: evals.elsuite.basic.includes:Includes args: samples_jsonl: imo_exact_answers/samples.jsonl From 66a00b6955625a2ce91b9df9d72222b3e6c80772 Mon Sep 17 00:00:00 2001 From: "Justin W. Lin" Date: Fri, 17 May 2024 09:13:16 -0500 Subject: [PATCH 9/9] fix: Update system prompt to include reasoning steps and update answer/response format --- evals/registry/data/imo_exact_answers/samples.jsonl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/registry/data/imo_exact_answers/samples.jsonl b/evals/registry/data/imo_exact_answers/samples.jsonl index 41c4d445fb..ca5a5dac4a 100644 --- a/evals/registry/data/imo_exact_answers/samples.jsonl +++ b/evals/registry/data/imo_exact_answers/samples.jsonl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53abc39b993f9de99a22309127bc5f16847273d8b441981d66c73d0abe9e5916 -size 16041 +oid sha256:cc97a4694cf576bf00540fae940906f5c63547fe421210738079072801e70b8f +size 18492