-
Notifications
You must be signed in to change notification settings - Fork 0
/
BUSCO-to-Phylogeny_2.sh
502 lines (490 loc) · 27.6 KB
/
BUSCO-to-Phylogeny_2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
#!/bin/bash
#
# A pipeline created to automatize all my steps for phylogenetic analyses using the BUSCO toolkit.
# In the end you'll have a concatenated supermatrix tree and a consensus supertree.
#
#
# Does require specific conda environments!
#
# by Magnus Wolf 2021 ([email protected])
#
# In the following part you need to provide paths and desired settings for one or all of the different scripts as well as some general
# tree constructing steps:
#
#################################general dependencies and parameters#####################################################################
source ~/anaconda3/etc/profile.d/conda.sh #make sure to be able to switch between conda envs
WORKINGDIR=/path/to/your/working/dir/ #path to the Working directory
ulimit -n 2048 #get more doable jobs-temp files, I suggest not to change that.
THREADS=N #number of threads
TASKS=N #number of parallel task to run alignment or trimming processes
MAFFTenv=MAFFTenv #name of a conda environment with the newest MAFFT installation
CLIPKITenv=CLIPKITenv #name of a conda environment with the newest CLIPKIT installation
FASCONCAT=$WORKINGDIR"/FASconCAT-G_v1.04.pl" #path to the FASconCAT-G perl script by Patrick Kueck from Koenig Museum Bonn, no change necessary when the script is in the working directory
IQTREEenv=IQTREEenv #name of a conda environment with the newest IQTREE installation
ASTRAL=$WORKINGDIR"/Astral/astral.5.7.3.jar" #path to the Astral java script for supermatrix trees, no change necessary when the directory is in the working directory
#
##################################BUSCO-to-Phylogeny dependencies and parameters#########################################################
ODB10=whatever_odb10 #name of the OrthoDB database that should be used for all species
SPECIES=human #name of the species that augustus is trained for and should be used in BUSCO
export BUSCO_CONFIG_FILE=$WORKINGDIR"/config.ini" #provide the path to a BUSCO config file after the #=# sign, not necessary when config is in Working Dir
EXTRACTSCOS=$WORKINGDIR"/extract_scos_BUSCO.py" #path to my python script extract_scos_BUSCO.py, no change necessary when the script is in the working directory
BUSCOenv=BUSCOenv #name of BUSCO5 env, should have the newest BUSCO5 installed.
# #use this command to install BUSCO5: conda install -n BUSCO5env -c bioconda -c conda-forge busco=5.1.2
#
#
# Now that you provided the dependencies, you can choose which script you wanne run, everything you'll call #TRUE# will run!
# You can find a detailed discription for each step in a makeshift square down below. Additionaly, this discription will pop up if you run the respective
# step.
#
##################################Options for BUSCO-to-PHYLOGENY#########################################################################
runbusco=TRUE #this will take a loooooong time!
findsharedscos=TRUE
makealignments=TRUE
trimmgenealignments=TRUE
constructgenetrees=TRUE
filtergenetrees=TRUE
concatgenealignments=TRUE
trimmsupermatrix=TRUE
constructsupermtree=TRUE #this will take a loooooong time!
constructsuperttree=TRUE
#
#
#########################################################################################################################################
# Now you finished. Go into the Working Directory and hit #bash BUSCO-to-Phylogeny.sh#. Then everything you chosed will run one by one.
# I recommend using some sort of log pipeing since long pipelines like this one tend to produce a lot of hidden errors. Try e.g.
# #2>&1 | tee error.log# behind the bash command.
#
# After this line I wouldn't touch anything...
#########################################################################################################################################
if [[ "$runbusco" = TRUE ]]
then
echo "########################start runbusco#############################"
echo "# #"
echo "# #"
echo "# #"
echo "# #"
echo "#this is runbusco of BUSCO-to-PHYLOGENY, a script for fast #"
echo "#phylogenetic studies based on the BUSCO pipline. This part will #"
echo "#run BUSCO and keeps all single copy orthologs. For space saving #"
echo "#purposes every other results that come with a BUSCO run will be #"
echo "#removed. Requires a working conda env with BUSCO and a regarding #"
echo "#config file available from path. Most functions in this config #"
echo "#file will be overwritten by my command but make sure that the #"
echo "#config file states that it uses the #short# training. #"
echo "#please provide also a lineage database from OrthoDB10 and an #"
echo "#augustus training species above. #"
echo "# #"
echo "#good luck! #"
echo "# #"
echo "# #"
echo "###################################################################"
conda activate $BUSCOenv
mkdir BUSCO
cd BUSCO
for file in ./../*.fasta;
do
bn=`basename $file .fasta`
busco --cpu $THREADS --mode genome --in $file --out $bn"_BUSCO" --lineage $ODB10 --augustus_species $SPECIES
cd $bn"_BUSCO"
mv "run_"$ODB10 "./../run_"$bn
cd ..
rm -r $bn"_BUSCO"
cd "run_"$bn
rm -r augustus_output #apparently BUSCO5 does not produce these output directories anoymore so might be redundant
rm -r blast_output
rm -r hmmer_output
cd ./../
done
rm busco_downloads
cd ./../
conda deactivate
wait
fi
if [[ "$findsharedscos" = TRUE ]]
then
echo "########################start findsharedscos#######################"
echo "# #"
echo "# #"
echo "# #"
echo "# #"
echo "#this is findsharedscos of BUSCO-to-PHYLOGENY, a script for #"
echo "#fast phylogenetic studies based on the BUSCO pipline. This part #"
echo "#will screen through all BUSCO output directories to find #"
echo "#orthologs shared between all species resulted from the step above#"
echo "#Make sure to provide the path to my script extract_scos_BUSCO.py #"
echo "#Outputs a directory in the BUSCO folder filled with concatenated #"
echo "#fasta files. Per default, 25% missing data in SCOS is allowed #"
echo "#in the extract_scos_BUSCO.py script. If you want to change it, #"
echo "#change the line ...exludecount = counter // 4... to whatever you #"
echo "#desire. 4 means 1/4 are allowed to be absent. #"
echo "# #"
echo "#good luck! #"
echo "# #"
echo "# #"
echo "###################################################################"
cd BUSCO
mkdir SCOS
python $EXTRACTSCOS $WORKINGDIR"/BUSCO/"
mv *.fasta ./SCOS
cd ./../
wait
fi
if [[ "$makealignments" = TRUE ]]
then
echo "########################start makealignments#######################"
echo "# #"
echo "# #"
echo "# #"
echo "# #"
echo "#this is makealignments of BUSCO-to-PHYLOGENY, a script for #"
echo "#fast phylogenetic studies based on the BUSCO pipline. This part #"
echo "#will make alignments for each shared single copy ortholog found #"
echo "#in the steps above and located in the BUSCO/SCOS/ directory. #"
echo "#Requires a working conda env with mafft > v7 installed. Since #"
echo "#gene sequences are short, mafft is using the most accurate #"
echo "#option with 1000 iterative refinements incorporating local #"
echo "#pairwise alignment information. Afterwards it will also remove #"
echo "#every character of the IUPAC code that represents more then one #"
echo "#amino acid and will remove every MSA with unequal length. #"
echo "#FASTCONCAT can't handle thouse...This was the easiest method to #"
echo "#implement and there might be better ones! #"
echo "# #"
echo "#good luck! #"
echo "# #"
echo "# #"
echo "###################################################################"
conda activate $MAFFTenv
mkdir ALIGNMENTS
cd ALIGNMENTS
for file in ./../BUSCO/SCOS/*.fasta;
do
ln -s $file
done
task1(){
genename=$(echo $1 | cut -d "_" -f1)
mafft --maxiterate 1000 --localpair $1 > $genename"_alig.fasta";
}
N=$TASKS
(
for file in *complete.fasta;
do
((i=i%N)); ((i++==0)) && wait
task1 "$file" &
done
)
wait
for file in *_alig.fasta;
do
filename=$(echo $file | cut -d "_" -f1)
while IFS= read -r line;
do
if [[ $line == ">"* ]];
then
echo $line >> $filename"_alig_noIUPAC.fasta"
else
echo $line | sed "s/J/-/g" | sed "s/Z/-/g" | sed "s/B/-/g" | sed "s/X/-/g" >> $filename"_alig_noIUPAC.fasta"
fi
done < "$file"
done
for file in *_alig_noIUPAC.fasta;
do
filename=$(echo $file | cut -d "_" -f1)
seqlength=0
adder=0
while IFS= read -r line;
do
if [[ $line == ">"* ]];
then
echo $seqlength >> $filename"_noIUPAC_temp.txt"
seqlength=0
else
adder=$(echo $line | wc -c)
seqlength=$(($seqlength + $adder))
fi
done < "$file"
echo $seqlength >> $filename"_noIUPAC_temp.txt"
uniqlengths=$(cat $filename"_noIUPAC_temp.txt" | uniq | wc -l)
if [ $uniqlengths -gt 2 ];
then
#echo $file
#echo $uniqlengths
rm $file
fi
done
rm *_noIUPAC_temp.txt
rm *_alig.fasta
rm *complete.fasta
cd ./../
conda deactivate
wait
fi
if [[ "$trimmgenealignments" = TRUE ]]
then
echo "########################start trimmgenealignments##################"
echo "# #"
echo "# #"
echo "# #"
echo "# #"
echo "#this is trimmgenealignments of BUSCO-to-PHYLOGENY, a script #"
echo "#for fast phylogenetic studies based on the BUSCO pipline. This #"
echo "#part will trimm all individual SCOS alignments in the ALIGNMENTS #"
echo "#directory. Make sure to provide a working conda env with ClipKIT #"
echo "#installed. The program will only keep parsimony-informative #"
echo "#sites and completely conserved sites. It also incorporates a #"
echo "#dynamic gap filtering. Will output timmed.fasta files. #"
echo "# #"
echo "#good luck! #"
echo "# #"
echo "# #"
echo "###################################################################"
cd ALIGNMENTS
conda activate $CLIPKITenv
task2(){
genename=$(echo $1 | cut -d "_" -f1)
clipkit $1 -m kpic-smart-gap -o $genename"_alig_trimmed.fasta"
clipkit $1 -m kpi-smart-gap -o $genename"_parinfosite.fasta"
}
N=$TASKS
(
for file in *alig_noIUPAC.fasta;
do
((i=i%N)); ((i++==0)) && wait
task2 "$file" &
done
)
wait
cd ./../
wait
fi
if [[ "$constructgenetrees" = TRUE ]]
then
echo "########################start constructgenetrees###################"
echo "# #"
echo "# #"
echo "# #"
echo "# #"
echo "#this is constructgenetrees of BUSCO-to-PHYLOGENY a script #"
echo "#for fast phylogenetic studies based on the BUSCO pipline. This #"
echo "#part will construct individual gene trees for each of the #"
echo "#alignments in the ALIGNMENTS directory using IQtree. This #"
echo "#requires a working conda env with IQtree installed. Make sure #"
echo "#to provide the name of the env above. Will make 1000 bootstrap #"
echo "#replications and will use 5 threads since more will not increase #"
echo "#the speed with so short sequences. You'll find constructed trees #"
echo "#in the GENETREES directory. #"
echo "# #"
echo "#good luck! #"
echo "# #"
echo "# #"
echo "###################################################################"
conda activate $IQTREEenv
mkdir GENETREES
cd GENETREES
for file in ./../ALIGNMENTS/*.fasta;
do
ln -s $file
done
for file in *trimmed.fasta;
do
iqtree -s $file -bb 1000 -nt 5
done
cd ./../
conda deactivate
wait
fi
if [[ "$filtergenetrees" = TRUE ]]
then
echo "########################start filtergenetrees######################"
echo "# #"
echo "# #"
echo "# #"
echo "# #"
echo "#this is filtergenetrees of BUSCO-to-PHYLOGENY a script #"
echo "#for fast phylogenetic studies based on the BUSCO pipline. This #"
echo "#part uses the maximum likelihood distance of the alignment that #"
echo "#was calculated by iqtree to filter out genetrees and alignments #"
echo "#that are either too conserved to produce a meaningful tree #"
echo "#or had a too high genetic distance that might speak for a #"
echo "#miss-alignment artefact. To do so, it will calculate the #"
echo "#distribution of maximum likelihood distances of all alignments #"
echo "#and will then infere the 0.05 and 0.95 quantiles. All alignments #"
echo "#and genetrees within these quantiles will be removed. A report, #"
echo "#which files were remove is printed to the console. #"
echo "# #"
echo "#good luck! #"
echo "# #"
echo "# #"
echo "###################################################################"
cd GENETREES
numofspec=$(cat *_alig_trimmed.fasta | grep ">" | sort | uniq | wc -l)
numofspec2=$(($numofspec + 1))
for file in *.mldist;
do
sum=0
column=1
filename=$(echo $file | cut -d "." -f1)
while [[ $column -le $numofspec2 ]];
do
((column = column + 1))
adder=$(cat $file | awk FNR!=1 | sed "s/ / /g" | sed "s/ / /g" | sed "s/ / /g" | sed "s/ / /g" | sed "s/ / /g" | cut -d " " -f$column | awk '{s+=$1} END {print s}')
sum=$(echo "scale=6; $sum + $adder" | bc)
adder=0
done
filelength1=$(cat $file | wc -l)
filelength2=$(echo "scale=6; $filelength1 - 1" | bc)
relativesum=$(echo "scale=6; $sum / $filelength2" | bc)
sumint=$(echo $relativesum | cut -d "." -f1)
echo $sumint $filename >> mldist_dist.txt
relativesum=0
sumint=0
sum=0
done
for file in mldist_dist.txt;
do
numofgenetrees=$(cat $file | wc -l)
lowquantfloat=$(bc<<<$numofgenetrees*0.05)
lowquantint=$(echo $lowquantfloat | cut -d "." -f1)
cat $file | sort -k1 -h | head -$lowquantint >> mldist_toremove.txt
cat $file | sort -k1 -h | tail -$lowquantint >> mldist_toremove.txt
done
for file in mldist_toremove.txt;
do
while IFS= read -r line;
do
toremove=$(echo $line | cut -d " " -f2)
rm $toremove*
done < "$file"
done
numofremoved=$(cat mldist_toremove.txt | wc -l)
echo "Removed "$numofremoved" files because their maximum likelihood distance was either in the low 5percent quantile or the high 5percent quantile."
echo "Therefore, it might be that these files were outlier and either to conserved for a meaningful tree or had an alignment artefact that"
echo "resulted in an exeptional high genetic distance. Here a list of the files removed:"
echo "#int of total relative genetic distance #filename"
cat mldist_toremove.txt
rm mldist_dist.txt
rm mldist_toremove.txt
cd ./../
wait
fi
if [[ "$concatgenealignments" = TRUE ]]
then
echo "########################start concatgenealignments#################"
echo "# #"
echo "# #"
echo "# #"
echo "# #"
echo "#this is concatgenealignments of BUSCO-to-PHYLOGENY a script #"
echo "#for fast phylogenetic studies based on the BUSCO pipline. This #"
echo "#part will concatenate all gene alignments in the ALIGNMENTS #"
echo "#directory using the FASconCAT_v1.11.pl perl script written by #"
echo "#Patrick Kueck from the Museum Koenig (https://github #"
echo "#.com/PatrickKueck/FASconCAT) with default options. Make sure to #"
echo "#provide the path to the script above. Will output a so called #"
echo "#FcC_supermatrix.fas in a dedicated SUPERMATRIX directory. #"
echo "# #"
echo "#good luck! #"
echo "# #"
echo "# #"
echo "###################################################################"
mkdir SUPERMATRIX
cd SUPERMATRIX
for file in ./../GENETREES/*.treefile;
do
bn=`basename $file _trimmed.fasta.treefile`
ln -s "./../GENETREES/"$bn"_noIUPAC.fasta"
done
perl $FASCONCAT -s
rm *.fasta
cd ./../
wait
fi
if [[ "$trimmsupermatrix" = TRUE ]]
then
echo "########################start trimmsupermatrix#####################"
echo "# #"
echo "# #"
echo "# #"
echo "# #"
echo "#this is trimmsupermatrix of BUSCO-to-PHYLOGENY a script #"
echo "#for fast phylogenetic studies based on the BUSCO pipline. This #"
echo "#part will trimm the supermatrix made in the step above using the #"
echo "#same clipkit command as used to trimm the genealignments. This is#"
echo "#part is only necessary when no trimming was applied to the #"
echo "#genealignments. Otherwise it might be redundant. Use if desired. #"
echo "# #"
echo "#good luck! #"
echo "# #"
echo "# #"
echo "###################################################################"
cd SUPERMATRIX
conda activate $CLIPKITenv
clipkit FcC_supermatrix.fas -m kpic-smart-gap -o FcC_supermatrix_trimmed.fas
conda deactivate
cd ./../
wait
fi
if [[ "$constructsupermtree" = TRUE ]]
then
echo "########################start constructsupermtree##################"
echo "# #"
echo "# #"
echo "# #"
echo "# #"
echo "#this is constructgenetrees of BUSCO-to-PHYLOGENY a script #"
echo "#for fast phylogenetic studies based on the BUSCO pipline. This #"
echo "#part will construct a supermatrix tree using IQtree and the #"
echo "#supermatrix alignment constructed above. Make sure that the #"
echo "#alignment is trimmed and that you provided the name of the #"
echo "#working conda env above. Will make 1000 bootstrap replications. #"
echo "#Constructed tree is lated placed in the SUPERMATRIX directory. #"
echo "# #"
echo "#good luck! #"
echo "# #"
echo "# #"
echo "###################################################################"
conda activate $IQTREEenv
cd SUPERMATRIX
iqtree -s FcC_supermatrix_trimmed.fas -bb 1000 -nt $THREADS
cd ./../
conda deactivate
wait
fi
if [[ "$constructsuperttree" = TRUE ]]
then
echo "########################start constructsuperttree##################"
echo "# #"
echo "# #"
echo "# #"
echo "# #"
echo "#this is constructsuperttree of BUSCO-to-PHYLOGENY a script #"
echo "#for fast phylogenetic studies based on the BUSCO pipline. This #"
echo "#part will construct a consensus supertree tree from the gene #"
echo "#trees constructed in the steps above. This will be done by using #"
echo "#Astral-III, a java script that takes a concatenated newick file #"
echo "#and can be downloaded at https://github.com/smirarab/ASTRAL. #"
echo "#Make sure to provide the path to the java script above. If your #"
echo "#java version isn't up to date create a conda env with a new #"
echo "#java installation and then run the overall script. #"
echo "# #"
echo "# #"
echo "#good luck! #"
echo "# #"
echo "# #"
echo "###################################################################"
mkdir SUPERTREE
cd SUPERTREE
for file in ./../GENETREES/*.treefile;
do
ln -s $file
done
for file in *.treefile;
do
filename=$(echo $file | cut -d "_" -f1)
mv $file $filename"_1000bt.nex"
done
cat *_1000bt.nex > GENETREES_concat.nex
java -jar $ASTRAL -i GENETREES_concat.nex -o SUPERTREE.nex
rm *1000bt.nex
cd ./../
wait
fi