From 111b4ece8413342f18c608c304e691ca8144aa93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20B=C3=A4uerle?= <45968370+famosab@users.noreply.github.com> Date: Thu, 20 Jun 2024 12:49:51 +0200 Subject: [PATCH] feat: add version of benchmark as key in presets and include in report (#80) * feat: add version flag to genomes * feat: add bm version to name * feat: add bm version as label * fix: snakefmt * fix: remov version from title * feat: function to get version * fix: remove version from label and add as param * fix: add version to title (can also be changed to description) * fix: snakefmt * fix: include genome version in description * fix: get genome name for precision and recall * fix: f strings * fix: change text for bm version * fix: snakefmt --- .../resources/datavzrd/fp-fn-config.yte.yaml | 6 ++++++ .../datavzrd/precision-recall-config.yte.yaml | 14 +++++++++++++- workflow/resources/presets.yaml | 4 ++++ workflow/rules/common.smk | 16 +++++++++++++++- workflow/rules/eval.smk | 8 +++++++- 5 files changed, 45 insertions(+), 3 deletions(-) diff --git a/workflow/resources/datavzrd/fp-fn-config.yte.yaml b/workflow/resources/datavzrd/fp-fn-config.yte.yaml index 1feac08..ff75829 100644 --- a/workflow/resources/datavzrd/fp-fn-config.yte.yaml +++ b/workflow/resources/datavzrd/fp-fn-config.yte.yaml @@ -31,10 +31,14 @@ views: ?view if view == "main" else f"by {view}": ?if view == "main": desc: | + ?f""" The values 0/0, 0/1, and 1/1 represent the genotype (not present, heterozygous, homozygous). If a callset does not have an entry, this means that the variant matches the truth. Rows and columns are sorted according to an unsupervised hiearchical clustering with hamming distance and ward linkage. + + Benchmark version: {wildcards.genome} {params.version} + """ ?else: desc: | ?f""" @@ -46,6 +50,8 @@ views: The values 0/0, 0/1, and 1/1 represent the genotype (not present, heterozygous, homozygous). If a callset does not have an entry, this means that the variant matches the truth. + + Benchmark version: {wildcards.genome} {params.version} """ dataset: ?view page-size: 12 diff --git a/workflow/resources/datavzrd/precision-recall-config.yte.yaml b/workflow/resources/datavzrd/precision-recall-config.yte.yaml index d5ee2fb..23d2a3e 100644 --- a/workflow/resources/datavzrd/precision-recall-config.yte.yaml +++ b/workflow/resources/datavzrd/precision-recall-config.yte.yaml @@ -22,6 +22,7 @@ views: results-plot: dataset: results desc: | + ?f""" Precision and recall are calculated by matching variants between each callset and truth, stratified by coverage categories. The matching ignores genotype differences. Instead, genotype mismatches are displayed in the "genotype mismatch rate" @@ -32,6 +33,9 @@ views: are not shown because estimates are unreliable in such cases. Zoom in by scrolling inside of the plot area, pan by dragging, and reset by double-clicking. + + Benchmark version: {params.genome} {params.version} + """ render-plot: spec: | { @@ -71,8 +75,12 @@ views: dataset: results ?if params.vaf: desc: | + ?f""" Precision and recall are calculated by matching variants between each callset and truth, stratified by coverage categories. Stratified by VAF. + + Benchmark version: {params.genome} {params.version} + """ page-size: 12 render-table: columns: @@ -121,7 +129,7 @@ views: - "#6baed6" vaf: plot: - heatmap: + heatmap: scale: linear custom-content: function(value, row) { @@ -152,10 +160,14 @@ views: - "#1a833fff" ?else: desc: | + ?f""" Precision and recall are calculated by matching variants between each callset and truth, stratified by coverage categories. The matching ignores genotype differences. Instead, genotype mismatches are displayed in the "genotype mismatch rate" column. + + Benchmark version: {params.genome} {params.version} + """ page-size: 12 render-table: columns: diff --git a/workflow/resources/presets.yaml b/workflow/resources/presets.yaml index 0ec9ea9..fcab238 100644 --- a/workflow/resources/presets.yaml +++ b/workflow/resources/presets.yaml @@ -50,6 +50,7 @@ benchmarks: genomes: na12878: + version: 4.2.1 truth: grch37: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh37/HG001_GRCh37_1_22_v4.2.1_benchmark.vcf.gz grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz @@ -58,6 +59,7 @@ genomes: grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.bed chm-eval: + version: 0.5 archive: https://github.com/lh3/CHM-eval/releases/download/v0.5/CHM-evalkit-20180222.tar truth: grch38: full.38.vcf.gz @@ -67,6 +69,7 @@ genomes: grch37: full.37m.bed.gz seqc2-somatic: + version: 1.2 truth: grch38: snvs: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/release/latest/high-confidence_sSNV_in_HC_regions_v1.2.vcf.gz @@ -76,6 +79,7 @@ genomes: somatic: true na12878-somatic: + version: 4.2.1 truth: grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz confidence-regions: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 382838d..d46c409 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -93,7 +93,7 @@ def get_plot_cov_labels(): # TODO check if ever used anywhere def label(name): lower, upper = get_cov_interval(name) if upper: - return f"{lower}-{upper-1}" + return f"{lower}-{upper - 1}" return f"≥{lower}" return {name: label(name) for name in low_coverages} @@ -490,6 +490,20 @@ def get_collect_precision_recall_input(wildcards): ) +def get_genome_name(wildcards): + if hasattr(wildcards, "benchmark"): + return get_benchmark(wildcards.benchmark).get("genome") + if hasattr(wildcards, "callset"): + benchmark = config["variant-calls"][wildcards.callset]["benchmark"] + return get_benchmark(benchmark).get("genome") + else: + return wildcards.genome + + +def get_genome_version(wildcards): + return genomes[get_genome_name(wildcards)].get("version") + + def get_genome_callsets(genome): return sorted( callset diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index 30db27d..e99b03a 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -273,7 +273,10 @@ rule report_precision_recall: directory("results/report/precision-recall/{benchmark}/{vartype}"), htmlindex="index.html", category="precision/recall", - labels={"benchmark": "{benchmark}", "vartype": "{vartype}"}, + labels={ + "benchmark": "{benchmark}", + "vartype": "{vartype}", + }, ), log: "logs/datavzrd/precision-recall/{benchmark}/{vartype}.log", @@ -281,6 +284,8 @@ rule report_precision_recall: somatic=get_somatic_status, vaf=get_vaf_status, high_coverage=get_high_coverage_status, + genome=get_genome_name, + version=get_genome_version, wrapper: "v3.10.1/utils/datavzrd" @@ -343,5 +348,6 @@ rule report_fp_fn: "logs/datavzrd/fp-fn/{genome}/{cov}/{classification}.log", params: labels=lambda w: get_callsets_labels(get_genome_callsets(w.genome)), + version=get_genome_version, wrapper: "v3.10.1/utils/datavzrd"