Skip to content

Commit

Permalink
feat: Report FP / FN negative variants in single table (#107)
Browse files Browse the repository at this point in the history
* feat: Change folder structure for fp-fn results, add collection of fp-fn results for all covs of a callset and all callsets of a benchmark.

* fix: fmt and clean up

* fix: fix bug with collect-fp-fn when running low and high coverage callsets together.

* feat: yte config and report rendering per benchmark

* fix: snakefmt

* fix: output naming

* feat: add new datavzrd reports to Snakefile

* fix: snakefmt

* fix: fix input files and params.labels for report_fp_fn_benchmark

* feat: show columns based on germline/somatic

* fix: report directory

* fix: snakefmt

* fix: column names to match gemrline

* feat: add reports per callset

* fix: snakefmt

* fix: reporting

* fix: remove unused output

* feat: add description to reports and fix rule names

---------

Co-authored-by: Famke Bäuerle <[email protected]>
  • Loading branch information
BiancaStoecker and famosab authored Dec 13, 2024
1 parent db68793 commit 0464700
Show file tree
Hide file tree
Showing 6 changed files with 256 additions and 9 deletions.
5 changes: 5 additions & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ if "variant-calls" in config:
benchmark=used_benchmarks,
vartype=["snvs", "indels"],
),
expand(
"results/report/fp-fn/callsets/{callset}/{classification}",
callset=used_callsets,
classification=["fp", "fn"],
),
get_fp_fn_reports,
# collect the checkpoint inputs to avoid issues when
# --all-temp is used: --all-temp leads to premature deletion
Expand Down
37 changes: 37 additions & 0 deletions workflow/resources/datavzrd/fp-fn-per-callset-config.yte.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
__use_yte__: true

__variables__:
green: "#74c476"
orange: "#fd8d3c"

name: ?f"{wildcards.classification} of {wildcards.callset}"

webview-controls: true
default-view: results-table

datasets:
results:
path: ?input.table
separator: "\t"
offer-excel: true

views:
results-table:
dataset: results
desc: |
?f"""
Rows are sorted by coverage.
Benchmark version: {params.genome} {params.version}
"""
page-size: 12
render-table:
columns:
coverage:
plot:
heatmap:
scale: ordinal
?if params.somatic:
true_genotype:
display-mode: hidden
predicted_genotype:
display-mode: hidden
49 changes: 45 additions & 4 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ callsets = config.get("variant-calls", dict())
benchmarks.update(config.get("custom-benchmarks", dict()))
used_benchmarks = {callset["benchmark"] for callset in callsets.values()}

used_callsets = {callset for callset in callsets.keys()}

used_genomes = {benchmarks[benchmark]["genome"] for benchmark in used_benchmarks}


Expand Down Expand Up @@ -403,6 +405,16 @@ def get_coverages(wildcards):
return coverages


def get_coverages_of_callset(callset):
benchmark = config["variant-calls"][callset]["benchmark"]
high_cov_status = benchmarks[benchmark].get("high-coverage", False)
if high_cov_status:
coverages = high_coverages
else:
coverages = low_coverages
return coverages


def get_somatic_status(wildcards):
if hasattr(wildcards, "benchmark"):
return genomes[benchmarks[wildcards.benchmark]["genome"]].get("somatic")
Expand Down Expand Up @@ -468,10 +480,17 @@ def get_collect_stratifications_input(wildcards):
)


def get_collect_stratifications_fp_fn_input(wildcards):
return expand(
"results/fp-fn/callsets/{{callset}}/{cov}.{{classification}}.tsv",
cov=get_nonempty_coverages(wildcards),
)


def get_fp_fn_reports(wildcards):
for genome in used_genomes:
yield from expand(
"results/report/fp-fn/{genome}/{cov}/{classification}",
"results/report/fp-fn/genomes/{genome}/{cov}/{classification}",
genome=genome,
cov={
cov
Expand All @@ -482,6 +501,15 @@ def get_fp_fn_reports(wildcards):
)


def get_fp_fn_reports_benchmarks(wildcards):
for genome in used_genomes:
yield from expand(
"results/report/fp-fn/benchmarks/{benchmark}/{classification}",
benchmark={benchmark for benchmark in used_benchmarks},
classification=["fp", "fn"],
)


def get_benchmark_callsets(benchmark):
return [
callset
Expand All @@ -497,6 +525,13 @@ def get_collect_precision_recall_input(wildcards):
)


def get_collect_fp_fn_benchmark_input(wildcards):
callsets = get_benchmark_callsets(wildcards.benchmark)
return expand(
"results/fp-fn/callsets/{callset}.{{classification}}.tsv", callset=callsets
)


def get_genome_name(wildcards):
if hasattr(wildcards, "benchmark"):
return get_benchmark(wildcards.benchmark).get("genome")
Expand Down Expand Up @@ -546,19 +581,25 @@ def get_callset_label_entries(callsets):


def get_collect_fp_fn_callsets(wildcards):
return get_genome_callsets(wildcards.genome)
callsets = get_genome_callsets(wildcards.genome)
callsets = [
callset
for callset in callsets
if wildcards.cov in get_coverages_of_callset(callset)
]
return callsets


def get_collect_fp_fn_input(wildcards):
callsets = get_collect_fp_fn_callsets(wildcards)
return expand(
"results/fp-fn/callsets/{{cov}}/{callset}/{{classification}}.tsv",
"results/fp-fn/callsets/{callset}/{{cov}}.{{classification}}.tsv",
callset=callsets,
)


def get_collect_fp_fn_labels(wildcards):
callsets = get_genome_callsets(wildcards.genome)
callsets = get_collect_fp_fn_callsets(wildcards)
return get_callset_label_entries(callsets)


Expand Down
76 changes: 71 additions & 5 deletions workflow/rules/eval.smk
Original file line number Diff line number Diff line change
Expand Up @@ -338,9 +338,9 @@ rule extract_fp_fn:
calls="results/vcfeval/{callset}/{cov}/output.vcf.gz",
common_src=common_src,
output:
"results/fp-fn/callsets/{cov}/{callset}/{classification}.tsv",
"results/fp-fn/callsets/{callset}/{cov}.{classification}.tsv",
log:
"logs/extract-fp-fn/{cov}/{callset}/{classification}.log",
"logs/extract-fp-fn/{callset}/{cov}.{classification}.log",
conda:
"../envs/vembrane.yaml"
script:
Expand Down Expand Up @@ -374,18 +374,55 @@ rule collect_fp_fn:
"../scripts/collect-fp-fn.py"


rule collect_stratifications_fp_fn:
input:
get_collect_stratifications_fp_fn_input,
output:
"results/fp-fn/callsets/{callset}.{classification}.tsv",
params:
coverages=get_nonempty_coverages,
coverage_lower_bounds=get_coverages,
log:
"logs/fp-fn/callsets/{callset}.{classification}.log",
conda:
"../envs/stats.yaml"
# This has to happen after precision/recall has been computed, otherwise we risk
# extremely high memory usage if a callset does not match the truth at all.
priority: 1
script:
"../scripts/collect-stratifications-fp-fn.py"


rule collect_fp_fn_benchmark:
input:
tables=get_collect_fp_fn_benchmark_input,
output:
"results/fp-fn/benchmarks/{benchmark}.{classification}.tsv",
params:
callsets=lambda w: get_benchmark_callsets(w.benchmark),
log:
"logs/fp-fn/benchmarks/{benchmark}.{classification}.log",
conda:
"../envs/stats.yaml"
script:
"../scripts/collect-fp-fn-benchmarks.py"


rule report_fp_fn:
input:
main_dataset="results/fp-fn/genomes/{genome}/{cov}/{classification}/main.tsv",
dependency_sorting_datasets="results/fp-fn/genomes/{genome}/{cov}/{classification}/dependency-sorting",
config=workflow.source_path("../resources/datavzrd/fp-fn-config.yte.yaml"),
output:
report(
directory("results/report/fp-fn/{genome}/{cov}/{classification}"),
directory("results/report/fp-fn/genomes/{genome}/{cov}/{classification}"),
htmlindex="index.html",
category="{classification} variants",
category="{classification} variants per genome",
subcategory=lambda w: w.genome,
labels=lambda w: {"coverage": w.cov},
labels=lambda w: {
"coverage": w.cov,
"genome": w.genome,
},
),
log:
"logs/datavzrd/fp-fn/{genome}/{cov}/{classification}.log",
Expand All @@ -394,3 +431,32 @@ rule report_fp_fn:
version=get_genome_version,
wrapper:
"v5.0.1/utils/datavzrd"


rule report_fp_fn_callset:
input:
table="results/fp-fn/callsets/{callset}.{classification}.tsv",
config=workflow.source_path(
"../resources/datavzrd/fp-fn-per-callset-config.yte.yaml"
),
output:
report(
directory("results/report/fp-fn/callsets/{callset}/{classification}"),
htmlindex="index.html",
category="{classification} variants per benchmark",
subcategory=lambda w: config["variant-calls"][w.callset]["benchmark"],
labels=lambda w: {
"callset": w.callset,
},
),
log:
"logs/datavzrd/fp-fn/{callset}/{classification}.log",
params:
labels=lambda w: get_callsets_labels(
get_benchmark_callsets(config["variant-calls"][w.callset]["benchmark"])
),
genome=get_genome_name,
version=get_genome_version,
somatic=get_somatic_status,
wrapper:
"v5.0.1/utils/datavzrd"
42 changes: 42 additions & 0 deletions workflow/scripts/collect-fp-fn-benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import sys
sys.stderr = open(snakemake.log[0], "w")

import pandas as pd


def load_data(path, callset):
d = pd.read_csv(path, sep="\t")
d.insert(0, "callset", callset)
return d


results = pd.concat(
[
load_data(f, callset)
for f, callset in zip(snakemake.input.tables, snakemake.params.callsets)
],
axis="rows",
)

def cov_key(cov_label):
# return lower bound as integer for sorting
if ".." in cov_label:
return int(cov_label.split("..")[0])
else:
return int(cov_label[1:])



def sort_key(col):
if col.name == "callset":
return col
if col.name == "coverage":
return col.apply(cov_key)
else:
return col


results.sort_values(["callset", "coverage"], inplace=True, key=sort_key)
results["sort_index"] = results["coverage"].apply(cov_key)

results.to_csv(snakemake.output[0], sep="\t", index=False)
56 changes: 56 additions & 0 deletions workflow/scripts/collect-stratifications-fp-fn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import sys

sys.stderr = open(snakemake.log[0], "w")

import pandas as pd


def get_cov_label(coverage):
lower = snakemake.params.coverage_lower_bounds[coverage]
bounds = [
bound
for bound in snakemake.params.coverage_lower_bounds.values()
if bound > lower
]
if bounds:
upper = min(bounds)
return f"{lower}..{upper}"
else:
return f"≥{lower}"


def load_data(f, coverage):
d = pd.read_csv(f, sep="\t")
d.insert(0, "coverage", get_cov_label(coverage))
return d


if snakemake.input:
report = pd.concat(
load_data(f, cov) for cov, f in zip(snakemake.params.coverages, snakemake.input)
)

# TODO With separate files for SNVs and indels with e.g. STRELKA no predicted variants for the other type are expected
# If later relevant, add annotation to the report
# if (report["tp_truth"] == 0).all():
# raise ValueError(
# f"The callset {snakemake.wildcards.callset} does not predict any variant from the truth. "
# "This is likely a technical issue in the callset and should be checked before further evaluation."
# )

report.to_csv(snakemake.output[0], sep="\t", index=False)
else:
pd.DataFrame(
{
col: []
for col in [
"coverage",
"class",
"chromosome position",
"ref_allele",
"alt_allele"
"true_genotype",
"predicted_genotype"
]
}
).to_csv(snakemake.output[0], sep="\t")

0 comments on commit 0464700

Please sign in to comment.