Skip to content

Commit

Permalink
feat: add option to allow mapping to pangenome (#274)
Browse files Browse the repository at this point in the history
  • Loading branch information
huzuner authored Dec 5, 2024
1 parent 5d41a44 commit e86cc81
Show file tree
Hide file tree
Showing 20 changed files with 218 additions and 16 deletions.
5 changes: 4 additions & 1 deletion .test/config-chm-eval/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ ref:
release: 100
# Genome build
build: GRCh38
pangenome:
activate: false
vcf: ""

primers:
trimming:
Expand Down Expand Up @@ -180,4 +183,4 @@ report:
max_read_depth: 250
stratify:
activate: false
by-column: condition
by-column: condition
5 changes: 4 additions & 1 deletion .test/config-giab/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ ref:
# Genome build
build: GRCh38
chromosome: 1
pangenome:
activate: false
vcf: ""

primers:
trimming:
Expand Down Expand Up @@ -153,4 +156,4 @@ params:
min_alternate_fraction: 0.05 # Reduce for calling variants with lower VAFs

gene_coverage:
min_avg_coverage: 5
min_avg_coverage: 5
5 changes: 4 additions & 1 deletion .test/config-no-candidate-filtering/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ ref:
release: 100
# Genome build
build: R64-1-1

pangenome:
activate: false
vcf: ""

primers:
trimming:
primers_fa1: ""
Expand Down
3 changes: 3 additions & 0 deletions .test/config-simple/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ ref:
release: 100
# Genome build
build: R64-1-1
pangenome:
activate: false
vcf: ""

primers:
trimming:
Expand Down
3 changes: 3 additions & 0 deletions .test/config-sra/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ ref:
release: 110
# Genome build
build: R64-1-1
pangenome:
activate: false
vcf: ""

primers:
trimming:
Expand Down
3 changes: 3 additions & 0 deletions .test/config-target-regions/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ ref:
release: 100
# Genome build
build: R64-1-1
pangenome:
activate: false
vcf: ""

primers:
trimming:
Expand Down
3 changes: 3 additions & 0 deletions .test/config-target-regions/config_multiple_beds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ ref:
release: 100
# Genome build
build: R64-1-1
pangenome:
activate: false
vcf: ""

primers:
trimming:
Expand Down
3 changes: 3 additions & 0 deletions .test/config_primers/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ ref:
snpeff_release: 86
# Genome build
build: R64-1-1
pangenome:
activate: false
vcf: ""

primers:
trimming:
Expand Down
16 changes: 16 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,21 @@ ref:
release: 111
# Genome build
build: GRCh38
pangenome:
# if active, reads will be aligned to given pangenome instead of to the linear reference genome
# Important: this is only supported for homo_sapiens so far
activate: true
# URL to pangenome haplotypes (vcf-file)
# Graph resources v1.1: https://github.com/human-pangenomics/hpp_pangenome_resources/
# Graph resources v1.0: https://github.com/human-pangenomics/hpp_pangenome_resources/blob/main/hprc-v1.0-mc.md
# Important: ensure that the haplotype vcf is built against the same reference genome as specified above under build
vcf: https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.1-mc-grch38/hprc-v1.1-mc-grch38.raw.vcf.gz
# optional: In case contigs of the hoplotype vcf need to be renamed
# a list of python expressions can be defined.
# Contigs can be accessed by `contig`.
# Each expression will be successively saved into `renamed_contig`
rename_expressions:
- "'MT' if contig == 'chrM' else contig[3:]"
# Optionally, instead of downloading the whole reference from Ensembl via the
# parameters above, specify a specific chromosome below and uncomment the line.
# This is usually only relevant for testing.
Expand Down Expand Up @@ -162,6 +177,7 @@ calling:
# Add varlociraptor events to aggregated over.
# The probability for the union of these events is used for controlling
# the FDR.
- present
- somatic_tumor_high
- somatic_tumor_medium
filter: # myfilter
Expand Down
1 change: 1 addition & 0 deletions config/samples.tsv
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
sample_name alias group platform purity panel umi_read umi_read_structure datatype calling
SRR702070 tumor SRR702070_group ILLUMINA 1.0 dna variants
1 change: 1 addition & 0 deletions config/units.tsv
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
sample_name unit_name fq1 fq2 sra adapters
SRR702070 lane1 /projects/koesterlab/orthanq/HapMap_data/SRR702070_1.fastq.gz /projects/koesterlab/orthanq/HapMap_data/SRR702070_2.fastq.gz
6 changes: 6 additions & 0 deletions workflow/envs/pysam.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
dependencies:
- python =3.12
- pysam =0.22
4 changes: 2 additions & 2 deletions workflow/envs/varlociraptor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ channels:
- bioconda
- nodefaults
dependencies:
- varlociraptor >=8.4.11,<8.5
- varlociraptor >=8.4.13,<8.5
- vega-lite-cli =5.16
- bcftools =1.19
- bcftools =1.21
3 changes: 2 additions & 1 deletion workflow/rules/benchmarking.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
ruleorder: chm_eval_sample > map_reads
# TODO Is this ruleorder of any use?!
ruleorder: chm_eval_sample > map_reads_bwa


rule gather_benchmark_calls:
Expand Down
43 changes: 36 additions & 7 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ genome_prefix = f"resources/{genome_name}"
genome = f"{genome_prefix}.fasta"
genome_fai = f"{genome}.fai"
genome_dict = f"{genome_prefix}.dict"
pangenome_name = f"pangenome.{species}.{build}"
pangenome_prefix = f"resources/{pangenome_name}"

# cram variables
use_cram = config.get("use_cram", False)
Expand Down Expand Up @@ -263,6 +265,8 @@ def get_control_fdr_input(wildcards):
def get_aligner(wildcards):
if get_sample_datatype(wildcards.sample) == "rna":
return "star"
elif is_activated("ref/pangenome"):
return "vg"
else:
return "bwa"

Expand Down Expand Up @@ -436,7 +440,7 @@ def get_recalibrate_quality_input(wildcards, bai=False):
def get_consensus_input(wildcards, bai=False):
ext = "bai" if bai else "bam"
if sample_has_primers(wildcards):
return "results/trimmed/{{sample}}.trimmed.{ext}".format(ext=ext)
return f"results/trimmed/{{sample}}.trimmed.{ext}"
else:
return get_trimming_input(wildcards, bai)

Expand All @@ -452,6 +456,13 @@ def get_trimming_input(wildcards, bai=False):
)


def get_vg_autoindex_vcf():
if config["ref"]["pangenome"].get("rename_expressions"):
return f"{pangenome_prefix}.renamed.vcf.gz"
else:
return f"{pangenome_prefix}.vcf.gz"


def get_primer_bed(wc):
if isinstance(primer_panels, pd.DataFrame):
if not pd.isna(primer_panels.loc[wc.panel, "fa2"]):
Expand Down Expand Up @@ -618,6 +629,13 @@ def get_read_group(wildcards):
)


def get_vg_read_group(wildcards):
platform = extract_unique_sample_column_value(wildcards.sample, "platform")
return r"--RGLB lib1 --RGPL {platform} --RGPU {sample} --RGSM {sample} --RGID {sample}".format(
sample=wildcards.sample, platform=platform
)


def get_map_reads_sorting_params(wildcards, ordering=False):
match (sample_has_umis(wildcards.sample), ordering):
case (True, True):
Expand All @@ -630,6 +648,13 @@ def get_map_reads_sorting_params(wildcards, ordering=False):
return "samtools"


def get_add_readgroup_input(wildcards):
if sample_has_umis(wildcards.sample):
return "results/mapped/vg/{sample}.annotated.bam"
else:
return "results/mapped/vg/{sample}.mate_fixed.bam"


def get_mutational_burden_targets():
mutational_burden_targets = []
if is_activated("mutational_burden"):
Expand Down Expand Up @@ -680,15 +705,19 @@ def get_selected_annotations():
return selection


def get_annotated_bcf(wildcards):
def get_annotated_bcf(wildcards, index=False):
ext = ".csi" if index else ""
selection = (
get_selected_annotations() if wildcards.calling_type == "variants" else ""
)
return "results/calls/{group}.{calling_type}.{scatteritem}{selection}.bcf".format(
group=wildcards.group,
calling_type=wildcards.calling_type,
selection=selection,
scatteritem=wildcards.scatteritem,
return (
"results/calls/{group}.{calling_type}.{scatteritem}{selection}.bcf{ext}".format(
group=wildcards.group,
calling_type=wildcards.calling_type,
selection=selection,
scatteritem=wildcards.scatteritem,
ext=ext,
)
)


Expand Down
1 change: 1 addition & 0 deletions workflow/rules/filtering.smk
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ rule filter_candidates_by_annotation:
rule filter_by_annotation:
input:
bcf=get_annotated_bcf,
csi=partial(get_annotated_bcf, index=True),
aux=get_annotation_filter_aux_files,
output:
"results/calls/{group}.{event}.{calling_type}.{scatteritem}.filtered_ann.bcf",
Expand Down
4 changes: 2 additions & 2 deletions workflow/rules/maf.smk
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,12 @@ rule group_vcf_to_maf:
dpath="maf/primary_alias", within=config, default="tumor"
),
vcf_control_alias_option=(
f'--vcf-normal-id {lookup(dpath= "maf/control_alias", within= config, default= "")}'
f'--vcf-normal-id {lookup(dpath = "maf/control_alias", within = config , default = "")}'
if lookup(dpath="maf/control_alias", within=config, default=False)
else ""
),
normal_id=lambda wc: (
f'--normal-id {wc.group}_{lookup(dpath= "maf/control_alias", within= config, default= "")}'
f'--normal-id {wc.group}_{lookup(dpath = "maf/control_alias", within = config , default = "")}'
if lookup(dpath="maf/control_alias", within=config, default=False)
else ""
),
Expand Down
56 changes: 55 additions & 1 deletion workflow/rules/mapping.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
rule map_reads:
rule map_reads_bwa:
input:
reads=get_map_reads_input,
idx=rules.bwa_index.output,
Expand All @@ -15,6 +15,59 @@ rule map_reads:
"v3.8.0/bio/bwa/mem"


# Create distance and minimizer index before mapping
# Otherwise it will be performed on the first execution leading to race conditions for multiple samples
rule map_reads_vg:
input:
reads=get_map_reads_input,
index=rules.vg_autoindex.output,
output:
temp("results/mapped/vg/{sample}.preprocessed.bam"),
log:
"logs/mapped/vg/{sample}.log",
benchmark:
"benchmarks/vg_giraffe/{sample}.tsv"
params:
extra="",
sorting="fgbio",
sort_order="queryname",
threads: 8
wrapper:
"v5.3.0/bio/vg/giraffe"


# samtools fixmate requires querysorted input
rule fix_mate:
input:
"results/mapped/vg/{sample}.preprocessed.bam",
output:
temp("results/mapped/vg/{sample}.mate_fixed.bam"),
log:
"logs/samtools/fix_mate/{sample}.log",
threads: 8
params:
extra="",
wrapper:
"v4.7.2/bio/samtools/fixmate"


# adding read groups is necessary because base recalibration throws errors
# for not being able to find read group information
rule add_read_group:
input:
"results/mapped/vg/{sample}.mate_fixed.bam",
output:
temp("results/mapped/vg/{sample}.bam"),
log:
"logs/picard/add_rg/{sample}.log",
params:
extra=get_vg_read_group,
resources:
mem_mb=1024,
wrapper:
"v2.3.2/bio/picard/addorreplacereadgroups"


rule merge_untrimmed_fastqs:
input:
get_untrimmed_fastqs,
Expand All @@ -41,6 +94,7 @@ rule sort_untrimmed_fastqs:
"fgbio SortFastq -i {input} -o {output} 2> {log}"


# fgbio AnnotateBamsWithUmis requires querynamed sorted fastqs and bams
rule annotate_umis:
input:
bam="results/mapped/{aligner}/{sample}.bam",
Expand Down
Loading

0 comments on commit e86cc81

Please sign in to comment.