feat: add version of benchmark as key in presets and include in report (

#80) * feat: add version flag to genomes * feat: add bm version to name * feat: add bm version as label * fix: snakefmt * fix: remov version from title * feat: function to get version * fix: remove version from label and add as param * fix: add version to title (can also be changed to description) * fix: snakefmt * fix: include genome version in description * fix: get genome name for precision and recall * fix: f strings * fix: change text for bm version * fix: snakefmt
snakemake-workflows · Jun 20, 2024 · 111b4ec · 111b4ec
1 parent f4378c7
commit 111b4ec
Show file tree

Hide file tree

Showing 5 changed files with 45 additions and 3 deletions.
diff --git a/workflow/resources/datavzrd/fp-fn-config.yte.yaml b/workflow/resources/datavzrd/fp-fn-config.yte.yaml
@@ -31,10 +31,14 @@ views:
     ?view if view == "main" else f"by {view}":
       ?if view == "main":
         desc: |
+          ?f"""
           The values 0/0, 0/1, and 1/1 represent the genotype (not present, heterozygous, homozygous).
           If a callset does not have an entry, this means that the variant matches the truth.
 
           Rows and columns are sorted according to an unsupervised hiearchical clustering with hamming distance and ward linkage.
+
+          Benchmark version: {wildcards.genome} {params.version}
+          """
       ?else:
         desc: |
           ?f"""
@@ -46,6 +50,8 @@ views:
 
           The values 0/0, 0/1, and 1/1 represent the genotype (not present, heterozygous, homozygous).
           If a callset does not have an entry, this means that the variant matches the truth.
+
+          Benchmark version: {wildcards.genome} {params.version}
           """
       dataset: ?view
       page-size: 12

diff --git a/workflow/resources/datavzrd/precision-recall-config.yte.yaml b/workflow/resources/datavzrd/precision-recall-config.yte.yaml
@@ -22,6 +22,7 @@ views:
   results-plot:
     dataset: results
     desc: |
+      ?f"""
       Precision and recall are calculated by matching variants between each callset
       and truth, stratified by coverage categories. The matching ignores genotype
       differences. Instead, genotype mismatches are displayed in the "genotype mismatch rate"
@@ -32,6 +33,9 @@ views:
       are not shown because estimates are unreliable in such cases.
 
       Zoom in by scrolling inside of the plot area, pan by dragging, and reset by double-clicking.
+
+      Benchmark version: {params.genome} {params.version}
+      """
     render-plot:
       spec: |
         {
@@ -71,8 +75,12 @@ views:
     dataset: results
     ?if params.vaf:
       desc: |
+        ?f"""
         Precision and recall are calculated by matching variants between each callset
         and truth, stratified by coverage categories. Stratified by VAF.
+
+        Benchmark version: {params.genome} {params.version}
+        """
       page-size: 12
       render-table:
         columns:
@@ -121,7 +129,7 @@ views:
                     - "#6baed6"
           vaf:
             plot:
-             heatmap:
+              heatmap:
                 scale: linear
                 custom-content:
                   function(value, row) {
@@ -152,10 +160,14 @@ views:
                   - "#1a833fff"
     ?else:
       desc: |
+        ?f"""
         Precision and recall are calculated by matching variants between each callset
         and truth, stratified by coverage categories. The matching ignores genotype
         differences. Instead, genotype mismatches are displayed in the "genotype mismatch rate"
         column.
+
+        Benchmark version: {params.genome} {params.version}
+        """
       page-size: 12
       render-table:
         columns:

diff --git a/workflow/resources/presets.yaml b/workflow/resources/presets.yaml
@@ -50,6 +50,7 @@ benchmarks:
 
 genomes:
   na12878:
+    version: 4.2.1
     truth:
       grch37: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh37/HG001_GRCh37_1_22_v4.2.1_benchmark.vcf.gz
       grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz
@@ -58,6 +59,7 @@ genomes:
       grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.bed
 
   chm-eval:
+    version: 0.5
     archive: https://github.com/lh3/CHM-eval/releases/download/v0.5/CHM-evalkit-20180222.tar
     truth:
       grch38: full.38.vcf.gz
@@ -67,6 +69,7 @@ genomes:
       grch37: full.37m.bed.gz
 
   seqc2-somatic:
+    version: 1.2
     truth:
       grch38:
         snvs: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/release/latest/high-confidence_sSNV_in_HC_regions_v1.2.vcf.gz
@@ -76,6 +79,7 @@ genomes:
     somatic: true
 
   na12878-somatic:
+    version: 4.2.1
     truth:
       grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz
     confidence-regions:

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -93,7 +93,7 @@ def get_plot_cov_labels():  # TODO check if ever used anywhere
     def label(name):
         lower, upper = get_cov_interval(name)
         if upper:
-            return f"{lower}-{upper-1}"
+            return f"{lower}-{upper - 1}"
         return f"≥{lower}"
 
     return {name: label(name) for name in low_coverages}
@@ -490,6 +490,20 @@ def get_collect_precision_recall_input(wildcards):
     )
 
 
+def get_genome_name(wildcards):
+    if hasattr(wildcards, "benchmark"):
+        return get_benchmark(wildcards.benchmark).get("genome")
+    if hasattr(wildcards, "callset"):
+        benchmark = config["variant-calls"][wildcards.callset]["benchmark"]
+        return get_benchmark(benchmark).get("genome")
+    else:
+        return wildcards.genome
+
+
+def get_genome_version(wildcards):
+    return genomes[get_genome_name(wildcards)].get("version")
+
+
 def get_genome_callsets(genome):
     return sorted(
         callset

diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk
@@ -273,14 +273,19 @@ rule report_precision_recall:
             directory("results/report/precision-recall/{benchmark}/{vartype}"),
             htmlindex="index.html",
             category="precision/recall",
-            labels={"benchmark": "{benchmark}", "vartype": "{vartype}"},
+            labels={
+                "benchmark": "{benchmark}",
+                "vartype": "{vartype}",
+            },
         ),
     log:
         "logs/datavzrd/precision-recall/{benchmark}/{vartype}.log",
     params:
         somatic=get_somatic_status,
         vaf=get_vaf_status,
         high_coverage=get_high_coverage_status,
+        genome=get_genome_name,
+        version=get_genome_version,
     wrapper:
         "v3.10.1/utils/datavzrd"
 
@@ -343,5 +348,6 @@ rule report_fp_fn:
         "logs/datavzrd/fp-fn/{genome}/{cov}/{classification}.log",
     params:
         labels=lambda w: get_callsets_labels(get_genome_callsets(w.genome)),
+        version=get_genome_version,
     wrapper:
         "v3.10.1/utils/datavzrd"