Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: improve coverage stratification for datasets with high coverage #75

Merged
merged 8 commits into from
Jun 11, 2024
17 changes: 15 additions & 2 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,19 @@ if "variant-calls" in config:
# of the checkpoint output
expand(
"results/variants/{benchmark}.truth.cov-{cov}.stats.json",
benchmark=used_benchmarks,
cov=coverages,
benchmark=[
benchmark
for benchmark in used_benchmarks
if benchmarks[benchmark].get("high-coverage")
],
cov=high_coverages,
),
expand(
"results/variants/{benchmark}.truth.cov-{cov}.stats.json",
benchmark=[
benchmark
for benchmark in used_benchmarks
if not benchmarks[benchmark].get("high-coverage")
],
cov=low_coverages,
),
64 changes: 48 additions & 16 deletions workflow/resources/datavzrd/precision-recall-config.yte.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,14 +95,30 @@ views:
plot:
heatmap:
scale: ordinal
domain:
- low
- medium
- high
range:
- "#c6dbef"
- "#9ecae1"
- "#6baed6"
?if params.high_coverage:
domain:
- very_low
- low
- medium
- upper_medium
- high
- very_high
range:
- "#d8e6f4"
- "#c6dbef"
- "#9ecae1"
- "#74b2d7"
- "#3181bd"
- "#1e68a7"
?else:
domain:
- low
- medium
- high
range:
- "#c6dbef"
- "#9ecae1"
- "#6baed6"
vaf:
plot:
heatmap:
Expand Down Expand Up @@ -171,11 +187,27 @@ views:
plot:
heatmap:
scale: ordinal
domain:
- low
- medium
- high
range:
- "#c6dbef"
- "#9ecae1"
- "#6baed6"
?if params.high_coverage:
domain:
- very_low
- low
- medium
- upper_medium
- high
- very_high
range:
- "#d8e6f4"
- "#c6dbef"
- "#9ecae1"
- "#74b2d7"
- "#3181bd"
- "#1e68a7"
?else:
domain:
- low
- medium
- high
range:
- "#c6dbef"
- "#9ecae1"
- "#6baed6"
13 changes: 9 additions & 4 deletions workflow/resources/presets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,33 +15,38 @@ benchmarks:
bam-url: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/data/WES/WES_EA_T_1.bwa.dedup.bam
target-regions: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/technical/reference_genome/Exome_Target_bed/S07604624_Covered_human_all_v6_plus_UTR.liftover.to.hg38.bed6.gz
grch37: false
vaf-field:
vaf-field:
- INFO # either FORMAT or INFO
- TVAF # name of tumor variant allele frequency
high-coverage: true

imgag-somatic-5perc:
genome: na12878-somatic
bam-url: https://download.imgag.de/public/validation_dataset_somatic/NA12878x3_23_NA12877_21_5.bam
target-regions: https://download.imgag.de/public/validation_dataset_somatic/Twist_Custom_Exome_IMGAG_v2.bed
grch37: false
high-coverage: true

imgag-somatic-10perc:
genome: na12878-somatic
bam-url: https://download.imgag.de/public/validation_dataset_somatic/NA12878x3_23_NA12877_21_10.bam
target-regions: https://download.imgag.de/public/validation_dataset_somatic/Twist_Custom_Exome_IMGAG_v2.bed
grch37: false
high-coverage: true

imgag-somatic-20perc:
genome: na12878-somatic
bam-url: https://download.imgag.de/public/validation_dataset_somatic/NA12878x3_23_NA12877_21_20.bam
target-regions: https://download.imgag.de/public/validation_dataset_somatic/Twist_Custom_Exome_IMGAG_v2.bed
grch37: false
high-coverage: true

imgag-somatic-40perc:
genome: na12878-somatic
bam-url: https://download.imgag.de/public/validation_dataset_somatic/NA12878x3_23_NA12877_21_40.bam
target-regions: https://download.imgag.de/public/validation_dataset_somatic/Twist_Custom_Exome_IMGAG_v2.bed
grch37: false
high-coverage: true

genomes:
na12878:
Expand All @@ -51,7 +56,7 @@ genomes:
confidence-regions:
grch37: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh37/HG001_GRCh37_1_22_v4.2.1_benchmark.bed
grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.bed

chm-eval:
archive: https://github.com/lh3/CHM-eval/releases/download/v0.5/CHM-evalkit-20180222.tar
truth:
Expand All @@ -60,7 +65,7 @@ genomes:
confidence-regions:
grch38: full.38.bed.gz
grch37: full.37m.bed.gz

seqc2-somatic:
truth:
grch38:
Expand All @@ -69,7 +74,7 @@ genomes:
confidence-regions:
grch38: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/release/latest/High-Confidence_Regions_v1.2.bed
somatic: true

na12878-somatic:
truth:
grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz
Expand Down
53 changes: 44 additions & 9 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,19 @@ if any(
repl_chr = "s/chr//"


coverages = {
"low": 1,
"medium": 10,
"high": 30,
low_coverages = {
"lc_low": 1,
"lc_medium": 10,
"lc_high": 30,
}

high_coverages = {
"hc_very_low": 1,
"hc_low": 10,
"hc_medium": 30,
"hc_upper_medium": 50,
"hc_high": 100,
"hc_very_high": 300,
}


Expand Down Expand Up @@ -75,18 +84,19 @@ def get_bwa_input(wildcards):
return benchmark["fastqs"]


def get_mosdepth_quantize():
def get_mosdepth_quantize(wildcards):
coverages = get_coverages(wildcards)
return ":".join(map(str, sorted(coverages.values()))) + ":"


def get_plot_cov_labels():
def get_plot_cov_labels(): # TODO check if ever used anywhere
def label(name):
lower, upper = get_cov_interval(name)
if upper:
return f"{lower}-{upper-1}"
return f"≥{lower}"

return {name: label(name) for name in coverages}
return {name: label(name) for name in low_coverages}


def get_truth_url(wildcards, input):
Expand Down Expand Up @@ -156,13 +166,14 @@ def get_happy_prefix(wildcards, output):


def get_cov_label(wildcards):
lower, upper = get_cov_interval(wildcards.cov)
coverages = get_coverages(wildcards)
lower, upper = get_cov_interval(wildcards.cov, coverages)
if upper:
return f"{lower}:{upper}"
return f"{lower}:inf"


def get_cov_interval(name):
def get_cov_interval(name, coverages):
threshold = coverages[name]
upper_bound = None

Expand Down Expand Up @@ -351,6 +362,11 @@ def get_mosdepth_input(bai=False):

def _get_nonempty_coverages(callset):
benchmark = config["variant-calls"][callset]["benchmark"]
benchmark_dict = get_benchmark(benchmark)
if benchmark_dict.get("high-coverage", False):
coverages = high_coverages
else:
coverages = low_coverages

def isempty(cov):
with checkpoints.stat_truth.get(benchmark=benchmark, cov=cov).output[
Expand All @@ -366,6 +382,20 @@ def get_nonempty_coverages(wildcards):
return _get_nonempty_coverages(wildcards.callset)


def get_coverages(wildcards):
if hasattr(wildcards, "benchmark"):
# benchmark = get_benchmark(wildcards.benchmark)
high_cov_status = benchmarks[wildcards.benchmark].get("high-coverage", False)
else:
benchmark = config["variant-calls"][wildcards.callset]["benchmark"]
high_cov_status = benchmarks[benchmark].get("high-coverage", False)
if high_cov_status:
coverages = high_coverages
else:
coverages = low_coverages
return coverages


def get_somatic_status(wildcards):
if hasattr(wildcards, "benchmark"):
return genomes[benchmarks[wildcards.benchmark]["genome"]].get("somatic")
Expand Down Expand Up @@ -417,6 +447,11 @@ def get_vaf_status(wildcards):
return False


def get_high_coverage_status(wildcards):
benchmark = get_benchmark(wildcards.benchmark)
return benchmark.get("high-coverage", False)


def get_collect_stratifications_input(wildcards):
import json

Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/download.smk
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ rule mosdepth:
"logs/mosdepth/{benchmark}.log",
params:
extra="--no-per-base --mapq 59", # we do not want low MAPQ regions end up being marked as high coverage
quantize=get_mosdepth_quantize(),
quantize=get_mosdepth_quantize,
wrapper:
"v1.7.2/bio/mosdepth"

Expand Down
3 changes: 2 additions & 1 deletion workflow/rules/eval.smk
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ rule collect_stratifications:
"results/precision-recall/callsets/{callset}.{vartype}.tsv",
params:
coverages=get_nonempty_coverages,
coverage_lower_bounds=coverages,
coverage_lower_bounds=get_coverages,
log:
"logs/collect-stratifications/{callset}/{vartype}.log",
conda:
Expand Down Expand Up @@ -280,6 +280,7 @@ rule report_precision_recall:
params:
somatic=get_somatic_status,
vaf=get_vaf_status,
high_coverage=get_high_coverage_status,
wrapper:
"v3.10.1/utils/datavzrd"

Expand Down
Loading