-
Notifications
You must be signed in to change notification settings - Fork 0
/
igblast.smk
81 lines (70 loc) · 2.81 KB
/
igblast.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
"""
Generic IgBLAST rules.
To igblast a path from the analysis directory to one of the built-in rhesus
references in igseq, ask for:
analysis/igblast/{ref}/{path}.tsv.gz
...which will use analysis/{path} as input and treat {ref} as an argument to
igseq igblast -r.
To igblast a path to a custom VDJ database on disk:
analysis/igblast/custom-{ref}/{path}.tsv.gz
... will look for analysis/igblast/ref-custom-{ref} as an input and the
argument to igblast -r.
custom-{subject}.{locus} will automatically refer to the germline reference for
that subject and locus.
"""
# No input reference required unless prefixed with "custom-"
def input_for_igblast(w):
# special handling for .fasta: use xz-compressed FASTA directly if that's
# available on disk and the expected .fa isn't, but otherwise just use the
# input path that's implied
query = Path(f"analysis/{w.path}")
if not query.exists() and query.suffix == ".fa" and Path(str(query) + ".xz").exists():
query = str(query) + ".xz"
targets = {"query": query}
if w.ref.startswith("custom-"):
targets["ref"] = f"analysis/igblast/ref-{w.ref}"
return targets
# If there's an input named ref, use that; othewise assume it's a built-in
# rhesus reference from igseq
def ref_for_igblast(w, input):
try:
return input.ref
except AttributeError:
return f"rhesus/{w.ref}"
rule igblast:
output: "analysis/igblast/{ref,[^/]+}/{path}.tsv.gz"
input: unpack(input_for_igblast)
params:
ref=ref_for_igblast,
outfmt=19, # AIRR TSV format
species="rhesus"
log:
conda="analysis/igblast/{ref}/{path}.conda_build.txt"
conda: "envs/igseq.yaml"
threads: 8
# include special handling for xz-compressed FASTA I have for some older
# output files.
shell:
"""
conda list --explicit > {log.conda}
if [[ {input.query} =~ \.fa\.xz$ ]]; then
xzcat {input.query} | igseq igblast --input-format fa -Q - \
-t {threads} -S {params.species} -r {params.ref} \
-outfmt {params.outfmt} | gzip > {output}
else
igseq igblast -Q {input.query} \
-t {threads} -S {params.species} -r {params.ref} \
-outfmt {params.outfmt} | gzip > {output}
fi
"""
# One automatically-available custom reference: per-subject per-locus germline
# files
rule igblast_ref_germline:
output: directory("analysis/igblast/ref-custom-{subject}.{locus}")
input:
fastas=expand("analysis/germline/{{subject}}.{{locus}}/{segment}.fasta", segment=["V", "D", "J"])
params:
germline_dir="analysis/germline/{subject}.{locus}"
run:
link_target = relative_link_target(params.germline_dir, output[0])
Path(output[0]).symlink_to(link_target)