-
Notifications
You must be signed in to change notification settings - Fork 3
/
mutect2_pon.nf
executable file
·132 lines (110 loc) · 3.65 KB
/
mutect2_pon.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env nextflow
gatk4_jar = "/code/gatk/4.1.3.0/gatk-package-4.1.3.0-local.jar"
// this version is needed to build a PON from multiple VCFs, it may change in the future
gatk40_jar = "/code/gatk/4.0.12.0/gatk-package-4.0.12.0-local.jar"
picard_jar = "/code/picard/2.21.2/picard.jar"
params.help= false
params.input_files = false
params.reference = "/projects/data/gatk_bundle/hg19/ucsc.hg19.fasta" // TODO: remove this hard coded bit
params.intervals = "/projects/data/gatk_bundle/hg19/hg19_refseq_exons.sorted.merged.bed"
params.output = 'output'
def helpMessage() {
log.info"""
Usage:
mutect2_pon.nf --input_files input_files
This workflow aims to compute a panel of normals to be used with MuTect2
Input:
* input_files: the path to a file containing in each row the sample name and the path to a BAM file to be included in the PON
example:
sample1 /path/to/sample1.bam
sample2 /path/to/sample2.bam
NOTE: the sample name must be set in the @SN annotation
Optional input:
* output: the folder where to publish output
Output:
* Output combined VCF pon.vcf
"""
}
if (params.help) {
helpMessage()
exit 0
}
// checks required inputs
if (params.input_files) {
Channel
.fromPath(params.input_files)
.splitCsv(header: ['name', 'bam'], sep: "\t")
.map { row-> tuple(row.name, file(row.bam), file(row.bam.replaceAll(/.bam$/, ".bai"))) }
.set { input_files }
} else {
exit 1, "Input file not specified!"
}
if (params.intervals) {
Channel
.fromPath(params.intervals)
.splitText(by: 30000, file: true)
.set { intervals }
}
process mutect2Pon {
cpus 2
memory '16g'
module 'java/1.8.0'
errorStrategy 'finish'
input:
set val(name), file(bam), file(bai), file(interval) from input_files.combine(intervals)
output:
set val("${bam.baseName}"), file("${bam.baseName}.${interval.baseName}.mutect.vcf") into mutect_vcfs
"""
mkdir -p `pwd`/scratch/tmp
java -Xmx16g -Djava.io.tmpdir=`pwd`/scratch/tmp -jar ${gatk4_jar} \
Mutect2 \
--reference ${params.reference} \
--intervals ${interval} \
--input ${bam} \
--tumor-sample ${name} \
--max-mnp-distance 0 \
--output ${bam.baseName}.${interval.baseName}.mutect.vcf
"""
}
process gatherVcfs {
cpus 1
memory '32g'
module 'java/1.8.0'
publishDir "${params.output}", mode: "copy"
input:
set name, file(vcf_list) from mutect_vcfs.groupTuple() // group by name
output:
file("${name}.vcf") into whole_vcfs
script:
// NOTE: the input VCFs need to be provided in order by genomic coordinates
input_vcfs = "$vcf_list".split(" ")
.sort{ a, b -> a.tokenize(".")[-3].toInteger().compareTo b.tokenize(".")[-3].toInteger() }
.collect{"INPUT=" + it}.join(" ")
"""
mkdir -p `pwd`/scratch/tmp
java -Xmx32g -Djava.io.tmpdir=`pwd`/scratch/tmp -jar $picard_jar \
GatherVcfs \
${input_vcfs} \
OUTPUT=${name}.vcf
"""
}
process createPON {
cpus 1
memory '32g'
module 'java/1.8.0'
publishDir "${params.output}", mode: "copy"
input:
file(vcf_list) from whole_vcfs.collect()
output:
file("pon.vcf")
file("pon.vcf.idx")
script:
input_vcfs = "$vcf_list".split(" ").collect{"--vcfs " + it}.join(" ")
"""
# combines VCFs and keeps variants occuring in at least two VCFs
mkdir -p `pwd`/scratch/tmp
java -Xmx32g -Djava.io.tmpdir=`pwd`/scratch/tmp -jar $gatk40_jar \
CreateSomaticPanelOfNormals ${input_vcfs} \
--output pon.vcf
"""
}