-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.nf
235 lines (179 loc) · 6.94 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#!/usr/bin/env nextflow
/*
*
* Pipeline MGAP
* Version v2.2
* Description Microbial Genome Assembly Pipeline
* Authors Derek Sarovich and Erin Price.
*
*/
log.info """
================================================================================
NF-MGAP
v2.2
================================================================================
Optional Parameters:
--fastq Input PE read file wildcard (default: *_{1,2}.fastq.gz)
Currently this is set to $params.fastq
--ref Set to the name of the reference file.
Reference file used for reference assisted assembly using
ABACAS. For best results please set this to a closely related
reference (i.e. same species and sequence type is ideal)
Currently ref is set to $params.ref
--spades Set to true/false. Spades can alternatively be used for the initial assembly
instead of Velvet. This mode is suggested if the initial assembly is
very poor or fails to produce an assembly file of the correct size.
Most often this is caused by poor quality input data that needs
to be cleaned but it is worth a shot.
Currently spades is set to $params.spades
--kraken Set to true/false. Kraken2 can be used to filter raw sequence reads if multiple
species contamination is suspected. To use this feature, please
set this flag to a specific genus and/or species. E.g. to filter
all reads that match Pseudomonas or "Pseudomonas aeruginosa". If
specifying species, please use quotes to correctly handle the space
between genus and species.
Currrently kraken is set to $params.kraken
--executor Change this flag for running in a HPC scheduler environment.
Default behavior is to run without a scheduler but a
wide range of schedulers are supported with nextflow.
Some of the supported schedulers include sge, pbs, pbspro,
slurm, lsf, moab, nqsii. For a full list please visit the
nextflow documentation
Currently executor is set to $params.executor
If you want to make changes to the default `nextflow.config` file
clone the workflow into a local directory and change parameters
in `nextflow.config`:
nextflow clone dsarov/mgap outdir/
Update to the local cache of this workflow:
nextflow pull dsarov/mgap
==================================================================
==================================================================
"""
fastq = Channel
.fromFilePairs("${params.fastq}", flat: true)
.ifEmpty { exit 1, """ Input read files could not be found.
Have you included the read files in the current directory and do they have the correct naming?
With the parameters specified, MGAP is looking for reads named ${params.fastq}.
To fix this error either rename your reads to match this formatting or specify the desired format
when initializing MGAP e.g. --fastq "*_{1,2}_sequence.fastq.gz"
"""
}
if (params.ref) {
reference_file = file(params.ref)
if( !reference_file.exists() ) {
exit 1, """
MGAP can't find the reference file.
It is currently looking for this file --> ${params.ref}
If this file doesn't exist, please download and copy to the analysis dirrectory
"""
}
}
/*
======================================================================
Part 1: join multifasta file into single contig
======================================================================
*/
if (params.ref) {
process IndexReference {
label "index"
input:
file ref from reference_file
output:
file "ref.ABACAS" into ref_index_ch
"""
contig_count=\$(grep -c '>' ${ref})
echo -e "Joining contigs for ABACAS\n"
if [ \${contig_count} == 1 ]; then
mv ${ref} ref.ABACAS
else
perl ${baseDir}/bin/joinMultifasta.pl ${ref} ref.ABACAS
fi
"""
}
}
/*
=======================================================================
Part 2A: Trim reads with light quality filter and remove adapters
=======================================================================
*/
process Trimmomatic {
label "trimmomatic"
tag {"$id"}
input:
set id, file(forward), file(reverse) from fastq
output:
set id, "${id}_1.fq.gz", "${id}_2.fq.gz" into kraken, assemble
script:
"""
$params.TRIMMOMATIC PE -threads ${task.cpus} ${forward} ${reverse} \
${id}_1.fq.gz ${id}_1_u.fq.gz ${id}_2.fq.gz ${id}_2_u.fq.gz \
ILLUMINACLIP:${baseDir}/resources/trimmomatic/all_adapters.fa:2:30:10: \
LEADING:10 TRAILING:10 SLIDINGWINDOW:4:15 MINLEN:36
rm ${id}_1_u.fq.gz ${id}_2_u.fq.gz
"""
}
/*
=======================================================================
Part 2B: Optionally filter reads for specific species
=======================================================================
*/
if (params.kraken) {
process Kraken {
label "kraken"
tag {"$id"}
input:
set id, file(forward), file(reverse) from kraken
output:
set id, "${id}_1.fq.gz", "${id}_2.fq.gz" into assemble
script:
"""
kraken2 --db ${KRAKEN_DB} --threads ${task.cpus} --use-names --output ${id}.output --paired ${id}_1.fq.gz ${id}_2.fq.gz
while read line; do
echo ${line} >> read.lst
done < <(grep "${params.kraken}" ${id}.output | awk '{ print \$2 }')
seqtk subseq ${id}_1.fq.gz read.lst > out_1.fq
seqtk subseq ${id}_2.fq.gz read.lst > out_2.fq
gzip out_1.fq
gzip out_2.fq
rm ${id}_1.fq.gz ${id}_2.fq.gz
mv out_1.fq.gz ${id}_1.fq.gz
mv out_2.fq.gz ${id}_2.fq.gz
"""
}
}
/*
=======================================================================
Part 3: Run assembly script
=======================================================================
*/
if (params.ref) {
process Assembly {
label "assembly"
tag { "$id" }
publishDir "./Outputs/", mode: 'copy', pattern: "*final.fasta", overwrite: true
input:
file reference from reference_file
set id, "${id}_1.fq.gz", "${id}_2.fq.gz" from assemble
file "ref.ABACAS" from ref_index_ch
output:
set id, file("${id}_final.fasta")
script:
"""
bash assemble.sh ${id} ${baseDir} $task.cpus no ref $params.spades $task.memory
"""
}
} else {
process Assembly_no_ref {
label "assembly"
tag { "$id" }
publishDir "./Outputs/", mode: 'copy', pattern: "*final.fasta", overwrite: true
input:
set id, "${id}_1.fq.gz", "${id}_2.fq.gz" from assemble
output:
set id, file("${id}_final.fasta")
script:
"""
bash assemble.sh ${id} ${baseDir} $task.cpus no none $params.spades $task.memory
"""
}
}