Skip to content

Commit

Permalink
Add missing result in EvaluationResult
Browse files Browse the repository at this point in the history
Add option to write result and expectedOutput to file when writing Evaluation Report
Closes #1208
  • Loading branch information
Diego Ramp authored and geoand committed Jan 9, 2025
1 parent 271bd2c commit f74c1b4
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@ public double scoreForTag(String tag) {
* @throws IOException if an error occurs while writing the report
*/
public void writeReport(File output) throws IOException {
// Delegate to the two-argument overload with includeResult = false, so the
// per-sample result and expected output sections are left out of the report.
writeReport(output, false);
}

/**
* Write the report to a file using the Markdown syntax.
*
* @param output the output file, must not be {@code null}
* @param includeResult whether to include the expectedOutput and result of the evaluation in the report
* @throws IOException if an error occurs while writing the report
*/
public void writeReport(File output, boolean includeResult) throws IOException {
StringBuilder buffer = new StringBuilder();
buffer.append("# Evaluation Report\n\n");
buffer.append("**Global Score**: ").append(score).append("\n\n");
Expand All @@ -69,9 +80,14 @@ public void writeReport(File output) throws IOException {
}

buffer.append("\n## Details\n\n");
var detailHeader = includeResult ? "### " : "- ";
for (Scorer.EvaluationResult<?> evaluation : evaluations) {
buffer.append("- ").append(evaluation.sample().name()).append(": ")
buffer.append(detailHeader).append(evaluation.sample().name()).append(": ")
.append(evaluation.passed() ? "PASSED" : "FAILED").append("\n");
if (includeResult) {
buffer.append("#### Result\n").append(evaluation.result()).append("\n");
buffer.append("#### Expected Output\n").append(evaluation.sample().expectedOutput()).append("\n");
}
}

Files.write(output.toPath(), buffer.toString().getBytes());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,16 @@ public <T> EvaluationReport evaluate(Samples<T> samples, Function<Parameters, T>
var response = execute(sample, function);
LOG.infof("Evaluating sample `%s`", sample.name());
for (EvaluationStrategy<T> strategy : strategies) {
EvaluationResult<T> evaluation = new EvaluationResult<>(sample,
strategy.evaluate(sample, response));
EvaluationResult<T> evaluation = EvaluationResult.fromCompletedEvaluation(sample,
response, strategy.evaluate(sample, response));
LOG.infof("Evaluation of sample `%s` with strategy `%s`: %s", sample.name(),
strategy.getClass().getSimpleName(),
evaluation.passed() ? "OK" : "KO");
evaluations.add(evaluation);
}
} catch (Throwable e) {
LOG.errorf(e, "Failed to evaluate sample `%s`", sample.name());
evaluations.add(new EvaluationResult<>(sample, false));
evaluations.add(EvaluationResult.fromEvaluationThrowable(sample, e));
} finally {
latch.countDown();
}
Expand All @@ -66,7 +66,14 @@ public void close() {
executor.shutdown();
}

public record EvaluationResult<T>(EvaluationSample<T> sample, boolean passed) {
/**
 * Outcome of evaluating a single sample.
 * <p>
 * Instances are normally created through one of the two static factories,
 * which decide which of {@code result} / {@code thrown} is populated.
 *
 * @param sample the sample that was evaluated
 * @param result the value handed to {@link #fromCompletedEvaluation}; {@code null} when the
 *        instance was created by {@link #fromEvaluationThrowable}
 * @param thrown the throwable handed to {@link #fromEvaluationThrowable}; {@code null} when the
 *        instance was created by {@link #fromCompletedEvaluation}
 * @param passed whether the evaluation passed; always {@code false} for instances created by
 *        {@link #fromEvaluationThrowable}
 * @param <T> the type of the evaluated result, shared with the sample
 */
public record EvaluationResult<T>(EvaluationSample<T> sample, T result, Throwable thrown, boolean passed) {
/**
 * Creates a result for an evaluation that ran to completion.
 *
 * @param sample the evaluated sample
 * @param result the value that was evaluated
 * @param passed the verdict to record
 * @return a result carrying {@code result} and {@code passed}, with {@code thrown} set to {@code null}
 */
public static <T> EvaluationResult<T> fromCompletedEvaluation(EvaluationSample<T> sample, T result, boolean passed) {
return new EvaluationResult<>(sample, result, null, passed);
}

/**
 * Creates a result for an evaluation that ended with a throwable.
 *
 * @param sample the sample being evaluated
 * @param thrown the throwable to record
 * @return a result with {@code passed} set to {@code false} and {@code result} set to {@code null}
 */
public static <T> EvaluationResult<T> fromEvaluationThrowable(EvaluationSample<T> sample, Throwable thrown) {
return new EvaluationResult<>(sample, null, thrown, false);
}
}

private <T> T execute(EvaluationSample<T> sample, Function<Parameters, T> function) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package io.quarkiverse.langchain4j.testing.scorer;

import static org.assertj.core.api.Assertions.*;
import static org.assertj.core.api.Assertions.assertThat;

import java.io.File;
import java.io.IOException;
Expand All @@ -14,12 +14,14 @@ class EvaluationReportTest {
@Test
void globalScoreShouldBeCorrect() {
// Create mock evaluations.
Scorer.EvaluationResult<String> result1 = new Scorer.EvaluationResult<>(
Scorer.EvaluationResult<String> result1 = Scorer.EvaluationResult.fromCompletedEvaluation(
new EvaluationSample<>("Sample1", new Parameters(), "expected", List.of("tag1")),
"expected",
true);

Scorer.EvaluationResult<String> result2 = new Scorer.EvaluationResult<>(
Scorer.EvaluationResult<String> result2 = Scorer.EvaluationResult.fromCompletedEvaluation(
new EvaluationSample<>("Sample2", new Parameters(), "expected", List.of("tag2")),
"some-response",
false);

EvaluationReport report = new EvaluationReport(List.of(result1, result2));
Expand All @@ -31,16 +33,19 @@ void globalScoreShouldBeCorrect() {
@Test
void scoreForTagShouldBeCorrect() {
// Create mock evaluations.
Scorer.EvaluationResult<String> result1 = new Scorer.EvaluationResult<>(
Scorer.EvaluationResult<String> result1 = Scorer.EvaluationResult.fromCompletedEvaluation(
new EvaluationSample<>("Sample1", new Parameters(), "expected", List.of("tag1")),
"expected",
true);

Scorer.EvaluationResult<String> result2 = new Scorer.EvaluationResult<>(
Scorer.EvaluationResult<String> result2 = Scorer.EvaluationResult.fromCompletedEvaluation(
new EvaluationSample<>("Sample2", new Parameters(), "expected", List.of("tag2")),
"some-response",
false);

Scorer.EvaluationResult<String> result3 = new Scorer.EvaluationResult<>(
Scorer.EvaluationResult<String> result3 = Scorer.EvaluationResult.fromCompletedEvaluation(
new EvaluationSample<>("Sample3", new Parameters(), "expected", List.of("tag1", "tag2")),
"expected",
true);

EvaluationReport report = new EvaluationReport(List.of(result1, result2, result3));
Expand All @@ -53,12 +58,14 @@ void scoreForTagShouldBeCorrect() {
@Test
void writeReportShouldGenerateMarkdownFile() throws IOException {
// Create mock evaluations.
Scorer.EvaluationResult<String> result1 = new Scorer.EvaluationResult<>(
Scorer.EvaluationResult<String> result1 = Scorer.EvaluationResult.fromCompletedEvaluation(
new EvaluationSample<>("Sample1", new Parameters(), "expected", List.of("tag1")),
"expected",
true);

Scorer.EvaluationResult<String> result2 = new Scorer.EvaluationResult<>(
Scorer.EvaluationResult<String> result2 = Scorer.EvaluationResult.fromCompletedEvaluation(
new EvaluationSample<>("Sample2", new Parameters(), "expected", List.of("tag2")),
"some-response",
false);

EvaluationReport report = new EvaluationReport(List.of(result1, result2));
Expand All @@ -79,4 +86,40 @@ void writeReportShouldGenerateMarkdownFile() throws IOException {
assertThat(content).contains("- Sample1: PASSED");
assertThat(content).contains("- Sample2: FAILED");
}

@Test
void writeReportShouldGenerateMarkdownFileIncudingExpectedOutputAndResult() throws IOException {
    // Two completed evaluations: the first is recorded as passed with a result equal to
    // its expected output, the second as failed with a different result.
    Scorer.EvaluationResult<String> passing = Scorer.EvaluationResult.fromCompletedEvaluation(
            new EvaluationSample<>("Sample1", new Parameters(), "expected1", List.of("tag1")),
            "expected1",
            true);
    Scorer.EvaluationResult<String> failing = Scorer.EvaluationResult.fromCompletedEvaluation(
            new EvaluationSample<>("Sample2", new Parameters(), "expected2", List.of("tag2")),
            "some-response",
            false);
    EvaluationReport report = new EvaluationReport(List.of(passing, failing));

    // Render the report into a temporary Markdown file, asking for result details.
    File reportFile = File.createTempFile("evaluation-report", ".md");
    report.writeReport(reportFile, true);

    assertThat(reportFile).exists();
    String markdown = Files.readString(reportFile.toPath());

    // Header, global score and per-tag scores.
    assertThat(markdown)
            .contains("# Evaluation Report")
            .contains("**Global Score**: 50.0")
            .contains("## Score per tags")
            .contains("- **tag1**: 100.0")
            .contains("- **tag2**: 0.0");

    // Detail section: one heading per sample followed by its result and expected output.
    assertThat(markdown)
            .contains("## Details")
            .contains("### Sample1: PASSED")
            .contains("#### Result\nexpected1")
            .contains("#### Expected Output\nexpected1")
            .contains("### Sample2: FAILED")
            .contains("#### Result\nsome-response")
            .contains("#### Expected Output\nexpected2");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ void evaluateShouldReturnCorrectReport() {
EvaluationSample<String> sample1 = new EvaluationSample<>(
"Sample1",
new Parameters().add(new Parameter.UnnamedParameter("param1")),
"expected1",
"expected1:param1",
List.of("tag1", "tag2"));

EvaluationSample<String> sample2 = new EvaluationSample<>(
Expand All @@ -36,7 +36,7 @@ void evaluateShouldReturnCorrectReport() {
"expected2",
List.of("tag2"));

Function<Parameters, String> mockFunction = params -> "expected1";
Function<Parameters, String> mockFunction = params -> "expected1:param1";
EvaluationStrategy<String> strategy = (sample, actual) -> actual.equals(sample.expectedOutput());

Samples<String> samples = new Samples<>(sample1, sample2);
Expand All @@ -46,11 +46,12 @@ void evaluateShouldReturnCorrectReport() {
assertThat(report.score()).isEqualTo(50.0); // Only one sample should pass.
assertThat(report.evaluations()).hasSize(2);

Scorer.EvaluationResult<?> result1 = report.evaluations().get(0);
assertThat(result1.passed()).isTrue();

Scorer.EvaluationResult<?> result2 = report.evaluations().get(1);
assertThat(result2.passed()).isFalse();
var actualEvaluations = report.evaluations().stream()
.map(e -> "%s[%s;%s=%s]".formatted(e.sample().name(), e.sample().expectedOutput(), e.result(), e.passed()))
.toList();
assertThat(actualEvaluations).containsExactlyInAnyOrder(
"Sample1[expected1:param1;expected1:param1=true]",
"Sample2[expected2;expected1:param1=false]");
}

@Test
Expand Down

0 comments on commit f74c1b4

Please sign in to comment.