From 111b4ece8413342f18c608c304e691ca8144aa93 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Famke=20B=C3=A4uerle?=
 <45968370+famosab@users.noreply.github.com>
Date: Thu, 20 Jun 2024 12:49:51 +0200
Subject: [PATCH] feat: add version of benchmark as key in presets and include
 in report (#80)

* feat: add version flag to genomes

* feat: add bm version to name

* feat: add bm version as label

* fix: snakefmt

* fix: remov version from title

* feat: function to get version

* fix: remove version from label and add as param

* fix: add version to title (can also be changed to description)

* fix: snakefmt

* fix: include genome version in description

* fix: get genome name for precision and recall

* fix: f strings

* fix: change text for bm version

* fix: snakefmt
---
 .../resources/datavzrd/fp-fn-config.yte.yaml     |  6 ++++++
 .../datavzrd/precision-recall-config.yte.yaml    | 14 +++++++++++++-
 workflow/resources/presets.yaml                  |  4 ++++
 workflow/rules/common.smk                        | 16 +++++++++++++++-
 workflow/rules/eval.smk                          |  8 +++++++-
 5 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/workflow/resources/datavzrd/fp-fn-config.yte.yaml b/workflow/resources/datavzrd/fp-fn-config.yte.yaml
index 1feac08..ff75829 100644
--- a/workflow/resources/datavzrd/fp-fn-config.yte.yaml
+++ b/workflow/resources/datavzrd/fp-fn-config.yte.yaml
@@ -31,10 +31,14 @@ views:
     ?view if view == "main" else f"by {view}":
       ?if view == "main":
         desc: |
+          ?f"""
           The values 0/0, 0/1, and 1/1 represent the genotype (not present, heterozygous, homozygous).
           If a callset does not have an entry, this means that the variant matches the truth.
 
           Rows and columns are sorted according to an unsupervised hiearchical clustering with hamming distance and ward linkage.
+
+          Benchmark version: {wildcards.genome} {params.version}
+          """
       ?else:
         desc: |
           ?f"""
@@ -46,6 +50,8 @@ views:
 
           The values 0/0, 0/1, and 1/1 represent the genotype (not present, heterozygous, homozygous).
           If a callset does not have an entry, this means that the variant matches the truth.
+
+          Benchmark version: {wildcards.genome} {params.version}
           """
       dataset: ?view
       page-size: 12
diff --git a/workflow/resources/datavzrd/precision-recall-config.yte.yaml b/workflow/resources/datavzrd/precision-recall-config.yte.yaml
index d5ee2fb..23d2a3e 100644
--- a/workflow/resources/datavzrd/precision-recall-config.yte.yaml
+++ b/workflow/resources/datavzrd/precision-recall-config.yte.yaml
@@ -22,6 +22,7 @@ views:
   results-plot:
     dataset: results
     desc: |
+      ?f"""
       Precision and recall are calculated by matching variants between each callset
       and truth, stratified by coverage categories. The matching ignores genotype
       differences. Instead, genotype mismatches are displayed in the "genotype mismatch rate"
@@ -32,6 +33,9 @@ views:
       are not shown because estimates are unreliable in such cases.
 
       Zoom in by scrolling inside of the plot area, pan by dragging, and reset by double-clicking.
+
+      Benchmark version: {params.genome} {params.version}
+      """
     render-plot:
       spec: |
         {
@@ -71,8 +75,12 @@ views:
     dataset: results
     ?if params.vaf:
       desc: |
+        ?f"""
         Precision and recall are calculated by matching variants between each callset
         and truth, stratified by coverage categories. Stratified by VAF.
+
+        Benchmark version: {params.genome} {params.version}
+        """
       page-size: 12
       render-table:
         columns:
@@ -121,7 +129,7 @@ views:
                     - "#6baed6"
           vaf:
             plot:
-             heatmap:
+              heatmap:
                 scale: linear
                 custom-content:
                   function(value, row) {
@@ -152,10 +160,14 @@ views:
                   - "#1a833fff"
     ?else:
       desc: |
+        ?f"""
         Precision and recall are calculated by matching variants between each callset
         and truth, stratified by coverage categories. The matching ignores genotype
         differences. Instead, genotype mismatches are displayed in the "genotype mismatch rate"
         column.
+
+        Benchmark version: {params.genome} {params.version}
+        """
       page-size: 12
       render-table:
         columns:
diff --git a/workflow/resources/presets.yaml b/workflow/resources/presets.yaml
index 0ec9ea9..fcab238 100644
--- a/workflow/resources/presets.yaml
+++ b/workflow/resources/presets.yaml
@@ -50,6 +50,7 @@ benchmarks:
 
 genomes:
   na12878:
+    version: 4.2.1
     truth:
       grch37: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh37/HG001_GRCh37_1_22_v4.2.1_benchmark.vcf.gz
       grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz
@@ -58,6 +59,7 @@ genomes:
       grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.bed
 
   chm-eval:
+    version: 0.5
     archive: https://github.com/lh3/CHM-eval/releases/download/v0.5/CHM-evalkit-20180222.tar
     truth:
       grch38: full.38.vcf.gz
@@ -67,6 +69,7 @@ genomes:
       grch37: full.37m.bed.gz
 
   seqc2-somatic:
+    version: 1.2
     truth:
       grch38:
         snvs: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/release/latest/high-confidence_sSNV_in_HC_regions_v1.2.vcf.gz
@@ -76,6 +79,7 @@ genomes:
     somatic: true
 
   na12878-somatic:
+    version: 4.2.1
     truth:
       grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz
     confidence-regions:
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 382838d..d46c409 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -93,7 +93,7 @@ def get_plot_cov_labels():  # TODO check if ever used anywhere
     def label(name):
         lower, upper = get_cov_interval(name)
         if upper:
-            return f"{lower}-{upper-1}"
+            return f"{lower}-{upper - 1}"
         return f"≥{lower}"
 
     return {name: label(name) for name in low_coverages}
@@ -490,6 +490,20 @@ def get_collect_precision_recall_input(wildcards):
     )
 
 
+def get_genome_name(wildcards):
+    if hasattr(wildcards, "benchmark"):
+        return get_benchmark(wildcards.benchmark).get("genome")
+    if hasattr(wildcards, "callset"):
+        benchmark = config["variant-calls"][wildcards.callset]["benchmark"]
+        return get_benchmark(benchmark).get("genome")
+    else:
+        return wildcards.genome
+
+
+def get_genome_version(wildcards):
+    return genomes[get_genome_name(wildcards)].get("version")
+
+
 def get_genome_callsets(genome):
     return sorted(
         callset
diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk
index 30db27d..e99b03a 100644
--- a/workflow/rules/eval.smk
+++ b/workflow/rules/eval.smk
@@ -273,7 +273,10 @@ rule report_precision_recall:
             directory("results/report/precision-recall/{benchmark}/{vartype}"),
             htmlindex="index.html",
             category="precision/recall",
-            labels={"benchmark": "{benchmark}", "vartype": "{vartype}"},
+            labels={
+                "benchmark": "{benchmark}",
+                "vartype": "{vartype}",
+            },
         ),
     log:
         "logs/datavzrd/precision-recall/{benchmark}/{vartype}.log",
@@ -281,6 +284,8 @@ rule report_precision_recall:
         somatic=get_somatic_status,
         vaf=get_vaf_status,
         high_coverage=get_high_coverage_status,
+        genome=get_genome_name,
+        version=get_genome_version,
     wrapper:
         "v3.10.1/utils/datavzrd"
 
@@ -343,5 +348,6 @@ rule report_fp_fn:
         "logs/datavzrd/fp-fn/{genome}/{cov}/{classification}.log",
     params:
         labels=lambda w: get_callsets_labels(get_genome_callsets(w.genome)),
+        version=get_genome_version,
     wrapper:
         "v3.10.1/utils/datavzrd"