generated from snakemake-workflows/snakemake-workflow-template
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Report FP / FN negative variants in single table (#107)
* feat: Change folder structure for fp-fn results, add collection of fp-fn results for all covs of a callset and all callsets of a benchmark. * fix: fmt and clean up * fix: fix bug with collect-fp-fn when running low and high coverage callsets together. * feat: yte config and report rendering per benchmark * fix: snakefmt * fix: output naming * feat: add new datavzrd reports to Snakefile * fix: snakefmt * fix: fix input files and params.labels for report_fp_fn_benchmark * feat: show columns based on germline/somatic * fix: report directory * fix: snakefmt * fix: column names to match germline * feat: add reports per callset * fix: snakefmt * fix: reporting * fix: remove unused output * feat: add description to reports and fix rule names --------- Co-authored-by: Famke Bäuerle <[email protected]>
- Loading branch information
1 parent
db68793
commit 0464700
Showing
6 changed files
with
256 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
37 changes: 37 additions & 0 deletions
37
workflow/resources/datavzrd/fp-fn-per-callset-config.yte.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# datavzrd report configuration, rendered through YTE: values starting with
# "?" are evaluated as Python expressions with access to the rule's
# wildcards, input and params.
__use_yte__: true

# NOTE(review): these colour variables are not referenced anywhere in this
# file - confirm whether they are still needed.
__variables__:
  green: "#74c476"
  orange: "#fd8d3c"

name: ?f"{wildcards.classification} of {wildcards.callset}"

webview-controls: true
default-view: results-table

datasets:
  results:
    path: ?input.table
    separator: "\t"
    offer-excel: true

views:
  results-table:
    dataset: results
    desc: |
      ?f"""
      Rows are sorted by coverage.
      Benchmark version: {params.genome} {params.version}
      """
    page-size: 12
    render-table:
      columns:
        coverage:
          plot:
            heatmap:
              scale: ordinal
        # The genotype columns are hidden when params.somatic is set.
        ?if params.somatic:
          true_genotype:
            display-mode: hidden
          predicted_genotype:
            display-mode: hidden
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
import sys

# Send everything written to stderr (tracebacks, warnings) to the first log
# file declared for this Snakemake rule.
sys.stderr = open(snakemake.log[0], "w")

import pandas as pd
|
||
|
||
def load_data(path, callset):
    """Read one tab-separated table and tag every row with its callset.

    Args:
        path: Location of the tab-separated table (anything
            ``pandas.read_csv`` accepts).
        callset: Value written into the new ``callset`` column.

    Returns:
        The table as a DataFrame with ``callset`` as its first column.
    """
    table = pd.read_csv(path, sep="\t")
    # Insert at position 0 so the callset leads the columns in the output.
    table.insert(0, "callset", callset)
    return table
|
||
|
||
# Stack the per-callset tables row-wise. Input tables and callset labels are
# paired by position, so both sequences must be in the same order.
results = pd.concat(
    [
        load_data(f, callset)
        for f, callset in zip(snakemake.input.tables, snakemake.params.callsets)
    ],
    axis="rows",
)
|
||
def cov_key(cov_label):
    """Return the lower bound of a coverage label as an integer sort key.

    Two label shapes are handled: ``"<lower>..<upper>"`` (e.g. ``"10..20"``)
    and a one-character prefix followed by the bound (e.g. ``"≥30"``).

    Args:
        cov_label: Coverage label string.

    Returns:
        The lower bound parsed as ``int``.

    Raises:
        ValueError: If the extracted part is not a valid integer.
    """
    lower, separator, _ = cov_label.partition("..")
    if separator:
        return int(lower)
    # Open-ended label: drop the single prefix character before the number.
    return int(cov_label[1:])
|
||
|
||
|
||
def sort_key(col):
    """Key function for ``DataFrame.sort_values``.

    The ``coverage`` column is mapped through ``cov_key`` so it is ordered by
    its numeric lower bound; every other column (including ``callset``) is
    returned unchanged and sorted by its own values.

    Args:
        col: The column (``pandas.Series``) handed in by ``sort_values``.

    Returns:
        A Series of the same length to sort by.
    """
    if col.name == "coverage":
        return col.apply(cov_key)
    return col
|
||
|
||
# Order rows by callset, then by coverage; sort_key turns the coverage labels
# into their integer lower bound for the comparison.
results.sort_values(["callset", "coverage"], inplace=True, key=sort_key)
# Also store the integer lower bound as an explicit column in the output.
results["sort_index"] = results["coverage"].apply(cov_key)

results.to_csv(snakemake.output[0], sep="\t", index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import sys

# Send everything written to stderr (tracebacks, warnings) to the first log
# file declared for this Snakemake rule.
sys.stderr = open(snakemake.log[0], "w")

import pandas as pd
|
||
|
||
def get_cov_label(coverage):
    """Build the display label for a coverage category.

    The category's lower bound is looked up in
    ``snakemake.params.coverage_lower_bounds``; the upper end of the label is
    the smallest bound of any category that is strictly larger.

    Args:
        coverage: Key into ``snakemake.params.coverage_lower_bounds``.

    Returns:
        ``"<lower>..<upper>"``, or ``"≥<lower>"`` when no larger bound exists.
    """
    lower_bounds = snakemake.params.coverage_lower_bounds
    lower = lower_bounds[coverage]
    upper = min(
        (bound for bound in lower_bounds.values() if bound > lower),
        default=None,
    )
    if upper is None:
        # This is the highest category, so the range is open-ended.
        return f"≥{lower}"
    return f"{lower}..{upper}"
|
||
|
||
def load_data(f, coverage):
    """Read one tab-separated table and tag every row with its coverage label.

    Args:
        f: Location of the tab-separated table.
        coverage: Coverage category; converted to its display label via
            ``get_cov_label``.

    Returns:
        The table as a DataFrame with ``coverage`` as its first column.
    """
    table = pd.read_csv(f, sep="\t")
    # Insert at position 0 so the coverage label leads the columns.
    table.insert(0, "coverage", get_cov_label(coverage))
    return table
|
||
|
||
if snakemake.input:
    # Coverage names and input files are paired by position.
    report = pd.concat(
        [
            load_data(f, cov)
            for cov, f in zip(snakemake.params.coverages, snakemake.input)
        ]
    )

    # TODO With separate files for SNVs and indels with e.g. STRELKA no predicted variants for the other type are expected
    # If later relevant, add annotation to the report
    # if (report["tp_truth"] == 0).all():
    #     raise ValueError(
    #         f"The callset {snakemake.wildcards.callset} does not predict any variant from the truth. "
    #         "This is likely a technical issue in the callset and should be checked before further evaluation."
    #     )

    report.to_csv(snakemake.output[0], sep="\t", index=False)
else:
    # No input tables: write a header-only table.
    empty_columns = [
        "coverage",
        "class",
        # NOTE(review): this single name with a space may have been meant as
        # two columns ("chromosome", "position") - confirm against the
        # per-coverage input tables before changing it.
        "chromosome position",
        "ref_allele",
        # The comma after "alt_allele" was missing, which silently fused it
        # with the next literal into "alt_alleletrue_genotype".
        "alt_allele",
        "true_genotype",
        "predicted_genotype",
    ]
    # index=False matches the non-empty branch; without it the header would
    # start with an extra empty index column.
    pd.DataFrame(columns=empty_columns).to_csv(
        snakemake.output[0], sep="\t", index=False
    )