feat: add option to allow mapping to pangenome (#274)

snakemake-workflows · Dec 5, 2024 · e86cc81 · e86cc81
1 parent 5d41a44
commit e86cc81
Show file tree

Hide file tree

Showing 20 changed files with 218 additions and 16 deletions.
diff --git a/.test/config-chm-eval/config.yaml b/.test/config-chm-eval/config.yaml
@@ -12,6 +12,9 @@ ref:
   release: 100
   # Genome build
   build: GRCh38
+  pangenome:
+    activate: false  
+    vcf: ""
 
 primers:
   trimming:
@@ -180,4 +183,4 @@ report:
   max_read_depth: 250
   stratify:
     activate: false
-    by-column: condition
+    by-column: condition
diff --git a/.test/config-giab/config.yaml b/.test/config-giab/config.yaml
@@ -13,6 +13,9 @@ ref:
   # Genome build
   build: GRCh38
   chromosome: 1
+  pangenome:
+    activate: false
+    vcf: ""
 
 primers:
   trimming:
@@ -153,4 +156,4 @@ params:
     min_alternate_fraction: 0.05 # Reduce for calling variants with lower VAFs
 
 gene_coverage:
-  min_avg_coverage: 5
+  min_avg_coverage: 5
diff --git a/.test/config-no-candidate-filtering/config.yaml b/.test/config-no-candidate-filtering/config.yaml
@@ -12,7 +12,10 @@ ref:
   release: 100
   # Genome build
   build: R64-1-1
-
+  pangenome:
+    activate: false
+    vcf: ""
+
 primers:
   trimming:
     primers_fa1: ""

diff --git a/.test/config-simple/config.yaml b/.test/config-simple/config.yaml
@@ -12,6 +12,9 @@ ref:
   release: 100
   # Genome build
   build: R64-1-1
+  pangenome:
+    activate: false
+    vcf: ""
 
 primers:
   trimming:

diff --git a/.test/config-sra/config.yaml b/.test/config-sra/config.yaml
@@ -12,6 +12,9 @@ ref:
   release: 110
   # Genome build
   build: R64-1-1
+  pangenome:
+    activate: false
+    vcf: ""
 
 primers:
   trimming:

diff --git a/.test/config-target-regions/config.yaml b/.test/config-target-regions/config.yaml
@@ -14,6 +14,9 @@ ref:
   release: 100
   # Genome build
   build: R64-1-1
+  pangenome:
+    activate: false
+    vcf: ""
 
 primers:
   trimming:

diff --git a/.test/config-target-regions/config_multiple_beds.yaml b/.test/config-target-regions/config_multiple_beds.yaml
@@ -16,6 +16,9 @@ ref:
   release: 100
   # Genome build
   build: R64-1-1
+  pangenome:
+    activate: false
+    vcf: ""
 
 primers:
   trimming:

diff --git a/.test/config_primers/config.yaml b/.test/config_primers/config.yaml
@@ -13,6 +13,9 @@ ref:
   snpeff_release: 86
   # Genome build
   build: R64-1-1
+  pangenome:
+    activate: false
+    vcf: ""
 
 primers:
   trimming:

diff --git a/config/config.yaml b/config/config.yaml
@@ -25,6 +25,21 @@ ref:
   release: 111
   # Genome build
   build: GRCh38
+  pangenome: 
+    # if active, reads will be aligned to given pangenome instead of to the linear reference genome
+    # Important: this is only supported for homo_sapiens so far
+    activate: true
+    # URL to pangenome haplotypes (vcf-file)
+    # Graph resources v1.1: https://github.com/human-pangenomics/hpp_pangenome_resources/
+    # Graph resources v1.0: https://github.com/human-pangenomics/hpp_pangenome_resources/blob/main/hprc-v1.0-mc.md
+    # Important: ensure that the haplotype vcf is built against the same reference genome as specified above under build
+    vcf: https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.1-mc-grch38/hprc-v1.1-mc-grch38.raw.vcf.gz
+    # optional: In case contigs of the hoplotype vcf need to be renamed
+    # a list of python expressions can be defined.
+    # Contigs can be accessed by `contig`.
+    # Each expression will be successively saved into `renamed_contig`
+    rename_expressions:
+      - "'MT' if contig == 'chrM' else contig[3:]"
   # Optionally, instead of downloading the whole reference from Ensembl via the
   # parameters above, specify a specific chromosome below and uncomment the line.
   # This is usually only relevant for testing.
@@ -162,6 +177,7 @@ calling:
           # Add varlociraptor events to aggregated over.
           # The probability for the union of these events is used for controlling
           # the FDR.
+          - present
           - somatic_tumor_high
           - somatic_tumor_medium
         filter: # myfilter

diff --git a/config/samples.tsv b/config/samples.tsv
@@ -1 +1,2 @@
 sample_name	alias	group	platform	purity	panel	umi_read	umi_read_structure	datatype	calling
+SRR702070	tumor	SRR702070_group	ILLUMINA	1.0				dna	variants
diff --git a/config/units.tsv b/config/units.tsv
@@ -1 +1,2 @@
 sample_name	unit_name	fq1	fq2	sra	adapters
+SRR702070	lane1	/projects/koesterlab/orthanq/HapMap_data/SRR702070_1.fastq.gz	/projects/koesterlab/orthanq/HapMap_data/SRR702070_2.fastq.gz
diff --git a/workflow/envs/pysam.yaml b/workflow/envs/pysam.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python =3.12
+  - pysam =0.22
diff --git a/workflow/envs/varlociraptor.yaml b/workflow/envs/varlociraptor.yaml
@@ -3,6 +3,6 @@ channels:
   - bioconda
   - nodefaults
 dependencies:
-  - varlociraptor >=8.4.11,<8.5
+  - varlociraptor >=8.4.13,<8.5
   - vega-lite-cli =5.16
-  - bcftools =1.19
+  - bcftools =1.21
diff --git a/workflow/rules/benchmarking.smk b/workflow/rules/benchmarking.smk
@@ -1,4 +1,5 @@
-ruleorder: chm_eval_sample > map_reads
+# TODO Is this ruleorder of any use?!
+ruleorder: chm_eval_sample > map_reads_bwa
 
 
 rule gather_benchmark_calls:

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -30,6 +30,8 @@ genome_prefix = f"resources/{genome_name}"
 genome = f"{genome_prefix}.fasta"
 genome_fai = f"{genome}.fai"
 genome_dict = f"{genome_prefix}.dict"
+pangenome_name = f"pangenome.{species}.{build}"
+pangenome_prefix = f"resources/{pangenome_name}"
 
 # cram variables
 use_cram = config.get("use_cram", False)
@@ -263,6 +265,8 @@ def get_control_fdr_input(wildcards):
 def get_aligner(wildcards):
     if get_sample_datatype(wildcards.sample) == "rna":
         return "star"
+    elif is_activated("ref/pangenome"):
+        return "vg"
     else:
         return "bwa"
 
@@ -436,7 +440,7 @@ def get_recalibrate_quality_input(wildcards, bai=False):
 def get_consensus_input(wildcards, bai=False):
     ext = "bai" if bai else "bam"
     if sample_has_primers(wildcards):
-        return "results/trimmed/{{sample}}.trimmed.{ext}".format(ext=ext)
+        return f"results/trimmed/{{sample}}.trimmed.{ext}"
     else:
         return get_trimming_input(wildcards, bai)
 
@@ -452,6 +456,13 @@ def get_trimming_input(wildcards, bai=False):
         )
 
 
+def get_vg_autoindex_vcf():
+    if config["ref"]["pangenome"].get("rename_expressions"):
+        return f"{pangenome_prefix}.renamed.vcf.gz"
+    else:
+        return f"{pangenome_prefix}.vcf.gz"
+
+
 def get_primer_bed(wc):
     if isinstance(primer_panels, pd.DataFrame):
         if not pd.isna(primer_panels.loc[wc.panel, "fa2"]):
@@ -618,6 +629,13 @@ def get_read_group(wildcards):
     )
 
 
+def get_vg_read_group(wildcards):
+    platform = extract_unique_sample_column_value(wildcards.sample, "platform")
+    return r"--RGLB lib1 --RGPL {platform} --RGPU {sample} --RGSM {sample} --RGID {sample}".format(
+        sample=wildcards.sample, platform=platform
+    )
+
+
 def get_map_reads_sorting_params(wildcards, ordering=False):
     match (sample_has_umis(wildcards.sample), ordering):
         case (True, True):
@@ -630,6 +648,13 @@ def get_map_reads_sorting_params(wildcards, ordering=False):
             return "samtools"
 
 
+def get_add_readgroup_input(wildcards):
+    if sample_has_umis(wildcards.sample):
+        return "results/mapped/vg/{sample}.annotated.bam"
+    else:
+        return "results/mapped/vg/{sample}.mate_fixed.bam"
+
+
 def get_mutational_burden_targets():
     mutational_burden_targets = []
     if is_activated("mutational_burden"):
@@ -680,15 +705,19 @@ def get_selected_annotations():
     return selection
 
 
-def get_annotated_bcf(wildcards):
+def get_annotated_bcf(wildcards, index=False):
+    ext = ".csi" if index else ""
     selection = (
         get_selected_annotations() if wildcards.calling_type == "variants" else ""
     )
-    return "results/calls/{group}.{calling_type}.{scatteritem}{selection}.bcf".format(
-        group=wildcards.group,
-        calling_type=wildcards.calling_type,
-        selection=selection,
-        scatteritem=wildcards.scatteritem,
+    return (
+        "results/calls/{group}.{calling_type}.{scatteritem}{selection}.bcf{ext}".format(
+            group=wildcards.group,
+            calling_type=wildcards.calling_type,
+            selection=selection,
+            scatteritem=wildcards.scatteritem,
+            ext=ext,
+        )
     )
 
 

diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk
@@ -19,6 +19,7 @@ rule filter_candidates_by_annotation:
 rule filter_by_annotation:
     input:
         bcf=get_annotated_bcf,
+        csi=partial(get_annotated_bcf, index=True),
         aux=get_annotation_filter_aux_files,
     output:
         "results/calls/{group}.{event}.{calling_type}.{scatteritem}.filtered_ann.bcf",

diff --git a/workflow/rules/maf.smk b/workflow/rules/maf.smk
@@ -33,12 +33,12 @@ rule group_vcf_to_maf:
             dpath="maf/primary_alias", within=config, default="tumor"
         ),
         vcf_control_alias_option=(
-            f'--vcf-normal-id {lookup(dpath= "maf/control_alias", within= config, default= "")}'
+            f'--vcf-normal-id {lookup(dpath = "maf/control_alias", within = config , default = "")}'
             if lookup(dpath="maf/control_alias", within=config, default=False)
             else ""
         ),
         normal_id=lambda wc: (
-            f'--normal-id {wc.group}_{lookup(dpath= "maf/control_alias", within= config, default= "")}'
+            f'--normal-id {wc.group}_{lookup(dpath = "maf/control_alias", within = config , default = "")}'
             if lookup(dpath="maf/control_alias", within=config, default=False)
             else ""
         ),

diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk
@@ -1,4 +1,4 @@
-rule map_reads:
+rule map_reads_bwa:
     input:
         reads=get_map_reads_input,
         idx=rules.bwa_index.output,
@@ -15,6 +15,59 @@ rule map_reads:
         "v3.8.0/bio/bwa/mem"
 
 
+# Create distance and minimizer index before mapping
+# Otherwise it will be performed on the first execution leading to race conditions for multiple samples
+rule map_reads_vg:
+    input:
+        reads=get_map_reads_input,
+        index=rules.vg_autoindex.output,
+    output:
+        temp("results/mapped/vg/{sample}.preprocessed.bam"),
+    log:
+        "logs/mapped/vg/{sample}.log",
+    benchmark:
+        "benchmarks/vg_giraffe/{sample}.tsv"
+    params:
+        extra="",
+        sorting="fgbio",
+        sort_order="queryname",
+    threads: 8
+    wrapper:
+        "v5.3.0/bio/vg/giraffe"
+
+
+# samtools fixmate requires querysorted input
+rule fix_mate:
+    input:
+        "results/mapped/vg/{sample}.preprocessed.bam",
+    output:
+        temp("results/mapped/vg/{sample}.mate_fixed.bam"),
+    log:
+        "logs/samtools/fix_mate/{sample}.log",
+    threads: 8
+    params:
+        extra="",
+    wrapper:
+        "v4.7.2/bio/samtools/fixmate"
+
+
+# adding read groups is necessary because base recalibration throws errors
+# for not being able to find read group information
+rule add_read_group:
+    input:
+        "results/mapped/vg/{sample}.mate_fixed.bam",
+    output:
+        temp("results/mapped/vg/{sample}.bam"),
+    log:
+        "logs/picard/add_rg/{sample}.log",
+    params:
+        extra=get_vg_read_group,
+    resources:
+        mem_mb=1024,
+    wrapper:
+        "v2.3.2/bio/picard/addorreplacereadgroups"
+
+
 rule merge_untrimmed_fastqs:
     input:
         get_untrimmed_fastqs,
@@ -41,6 +94,7 @@ rule sort_untrimmed_fastqs:
         "fgbio SortFastq -i {input} -o {output} 2> {log}"
 
 
+# fgbio AnnotateBamsWithUmis requires querynamed sorted fastqs and bams
 rule annotate_umis:
     input:
         bam="results/mapped/{aligner}/{sample}.bam",
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		sample_name alias group platform purity panel umi_read umi_read_structure datatype calling
		SRR702070 tumor SRR702070_group ILLUMINA 1.0 dna variants
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		sample_name unit_name fq1 fq2 sra adapters
		SRR702070 lane1 /projects/koesterlab/orthanq/HapMap_data/SRR702070_1.fastq.gz /projects/koesterlab/orthanq/HapMap_data/SRR702070_2.fastq.gz