-
Notifications
You must be signed in to change notification settings - Fork 1
/
Dendrobates_auratus_bash_code_commented.txt
257 lines (184 loc) · 9.27 KB
/
Dendrobates_auratus_bash_code_commented.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#!/bin/bash
# Bash code for building a Dendrobates auratus transcriptome from data that had lain a bit dormant
# Download the data from the host
wget https://s3.amazonaws.com/NCSU_KC/frog/raw/Roberts_PFLib_FastqFiles.tar.gz
wget https://s3.amazonaws.com/NCSU_KC/frog/raw/FastqFiles_EM1.tar.gz
# Untar the data so I can use it
tar -xzvf *tar.gz
# Move it to the working directory so I don't ruin the actual data...
cp FastqFiles/*.gz .
ls *fastq.gz | wc -l
# output is 50, and I don't want to use the unknown data for now.
# Remove that data
rm U*
# Rename the end of all of the fastq files
for i in *001.fastq
do
mv -- "$i" "${i/%_001.fastq/.fastq}"
done
# Rename oddly abbreviated fastq files...doing it bit by bit because otherwise they would be renamed incorrectly...
for i in Blue-Black*; do
mv -- "$i" "${i/_S*_R/_L1_R}"
done
for i in Microspot*; do
mv -- "$i" "${i/_S*_R/_L1_R}"
done
for i in San-Felix*; do
mv -- "$i" "${i/_S*_R/_L1_R}"
done
# Now rename the odd technical replicates with shortened names. First, specify lane 2.
for i in *L001*; do
mv -- "$i" "${i/_S*_R/_L2_R}"
done
# Rename all the abbreviations
for i in bb*; do
mv -- "$i" "${i/bb-/Blue-Black}"
done
for i in sb*; do
mv -- "$i" "${i/sb-/Super-Blue}"
done
for i in sf*; do
mv -- "$i" "${i/sf-/San-Felix}"
done
for i in us*; do
mv -- "$i" "${i/us-/Microspot}"
done
Cat *R1.fq > all_reads_R1.fastq
Cat *R2.fq > all_reads_R2.fastq
seqtk sample -s100 all_reads_R1.fastq 20000000 > subsamp.R1.fastq
seqtk sample -s100 all_reads_R2.fastq 20000000 > subsamp.R2.fastq
/home/summersk/programs/Oyster_River_Protocol/oyster.mk main \
MEM=1250 \
CPU=28 \
READ1=subsamp.R1.fastq \
READ2=subsamp.R2.fastq \
RUNOUT=subsamp
# The transcriptome is in a different file, so move it and give it a somewhat more informative name
cp assemblies/subsamp.orthomerged.fasta .
mv subsamp.orthomerged.fasta auratus.subsampled.fasta
############################################################
####### Note: this transcriptome was not great #############
############################################################
## Build a random transcriptome for each different color morph, then merge them all together.
# Given the sequencing depth, I will just combine the 'tech reps' into one file.
ids=$(ls *L1_TRIM_1P.fastq | sed 's/_L1_TRIM_1P.fastq//g')
for i in $ids; do cat $i_L1_TRIM_1P.fastq $i_L2_TRIM_1P.fastq > $i_COM_R1.fastq; done
for i in $ids; do cat $i_L1_TRIM_2P.fastq $i_L2_TRIM_2P.fastq > $i_COM_R2.fastq; done
# Presumably, this should do the above in parallel...but double check...
parallel -j 12 cat {}_L1_TRIM_1P.fastq {}_L2_TRIM_1P.fastq > {}_COM_R1.fastq ::: $ids
parallel -j 12 cat {}_L1_TRIM_2P.fastq {}_L2_TRIM_2P.fastq > {}_COM_R2.fastq ::: $ids
/home/summersk/programs/Oyster_River_Protocol/oyster.mk main \
MEM=1250 \
CPU=24 \
READ1=Blue-Black1_merged_R1.fastq.gz \
READ2=Blue-Black1_merged_R2.fastq.gz \
RUNOUT=Blue-Black1
/home/summersk/programs/Oyster_River_Protocol/oyster.mk main \
MEM=1250 \
CPU=28 \
READ1=Microspot2_merged_R1.fastq.gz \
READ2=Microspot2_merged_R2.fastq.gz \
RUNOUT=Microspot2
/home/summersk/programs/Oyster_River_Protocol/oyster.mk main \
MEM=1250 \
CPU=20 \
READ1=reads/San-Felix2_merged_R1.fastq.gz \
READ2=reads/San-Felix2_merged_R2.fastq.gz \
RUNOUT=San-Felix2
/home/summersk/programs/Oyster_River_Protocol/oyster.mk main \
MEM=1250 \
CPU=24 \
READ1=reads/Super-Blue1_merged_R1.fastq.gz \
READ2=reads/Super-Blue1_merged_R2.fastq.gz \
RUNOUT=Super-Blue1
####MOVE THE ASSEMBLED TRANSCRIPTOMES INTO A COMMON FOLDER#####
mkdir orthomerged_fastas
cp assemblies/*orthomerged.fasta orthomerged_fastas/
#######RENAME TRANSCRIPTS TO WORK WTIH ORTHOFUSER##########
cd orthomerged_fastas
awk '/^>/{print ">Blue-Black1_" ++i; next}{print}' Blue-Black1.orthomerged.fasta > Blue-Black1.orthomerged.renamed.fasta
awk '/^>/{print ">Microspot2_" ++i; next}{print}' Microspot2.orthomerged.fasta > Microspot2.orthomerged.renamed.fasta
awk '/^>/{print ">San-Felix2_" ++i; next}{print}' San-Felix2.orthomerged.fasta > San-Felix2.orthomerged.renamed.fasta
awk '/^>/{print ">Super-Blue1_" ++i; next}{print}' Super-Blue1.orthomerged.fasta > Super-Blue1.orthomerged.renamed.fasta
rm *orthomerged.fasta
cd ..
#####combine other reads into one big one, also clean all the reads####
Cat *R1.fq > all_reads_R1.fastq
Cat *R2.fq > all_reads_R2.fastq
(ls reads/*fastq.gz | sed "s/_merged_R1.fastq.gz//g" | sed "s/_merged_R2.fastq.gz//g" | uniq) | \
parallel -j 10 trimmomatic-0.36.jar PE -threads 4 \
-baseout rcorr/{}_TRIM.fastq {}_R1.fastq {}_R2.fastq \
LEADING:3 TRAILING:3 ILLUMINACLIP:barcodes.fa:2:30:10 MINLEN:25
# Then run R corrector
(ls reads/*fastq.gz | sed "s/_merged_R1.fastq.gz//g" | sed "s/_merged_R2.fastq.gz//g" | uniq) | \
parallel -j 10 run_rcorrector.pl -t 10 -k 31 -1 rcorr/{}_TRIM_1P.fastq -2 rcorr/{}_TRIM_2P.fastq -od rcorr
# Merge all the cleaned reads
for i in rcorr/*1P.cor.fq; do cat $i; echo; done all_reads_R1.cor.fq
for i in rcorr/*2P.cor.fq; do cat $i; echo; done all_reads_R2.cor.fq
####### Merge all the assemblies into a single transcriptome using orthofuser
/home/summersk/programs/Oyster_River_Protocol/orthofuser.mk all \
READ1=rcorr/all_reads_R1.cor.fq \
READ2=rcorr/all_reads_R2.cor.fq \
CPU=30 RUNOUT=ortho_merged \
FASTADIR=orthomerged_fastas/ LINEAGE=eukaryota_odb9
####### Calculate BUSCO and transrate scores for the full dataset
/home/summersk/programs/mod_Oyster_River_Protocol/software/orp-transrate/transrate \
-o /home/summersk/auratus/reports/transrate_assembly-reads \
-a /home/summersk/auratus/auratus.merged.fasta \
--left /home/summersk/auratus/all_reads_R1.cor.fq \
--right /home/summersk/auratus/all_reads_R1.cor.fq \
-t 30
python /home/summersk/programs/busco/scripts/run_BUSCO.py -i /home/summersk/auratus/auratus.merged.fasta -m transcriptome --cpu 24 -l /home/summersk/busco_dbs/eukaryota_odb9 -o /home/summersk/auratus/good_busco
# Scores were really atrocious for transrate, so we are going to use only the 'good' fasta scores from it.
### Move the 'good' fasta into the main directory and rename it
cp orthofuse/ortho_merged/merged/merged/good.merged.fasta .
mv good.merged.fasta good.auratus.merged.fasta
# Rename all the transcripts because aesthetics.
awk '/^>/{print ">Transcript_" ++i; next}{print}' good.auratus2.fasta > good.auratus2.renamed.fasta
# Pseudo-quantification with kallisto, build the index first
kallisto index -i new_auratus.idx good.auratus2.renamed.fasta
# This will list all the samples
samples=$(ls *cor.fq | sed "s/_TRIM_1P.cor.fq//g" | sed "s/_TRIM_2P.cor.fq//g" | uniq |\
grep -v subsamp | grep -v all_reads)
# Make directories for each sample:
mkdir new_kallisto_quants
cd new_kallisto_quants
for i in $samples; do mkdir $i; done
cd ..
# Perform the actual pseudo-quantification for all of the samples + technical replicates
parallel -j 12 kallisto quant -i /home/summersk/auratus/new_auratus.idx -o new_kallisto_quants/{} -b 100 \
{}_TRIM_1P.cor.fq {}_TRIM_2P.cor.fq ::: $samples
#### Annotation with diamond to amphibian (xenopus, nanorana, rana peptides) + UniRef90 databases
# cat all peptides together (, makedb in diamond
cat GCA_002284835.2_RCv2.1_protein.faa Nanorana_parkeri.gene.v2.pep orthodb.fasta uniref90.fasta Xenopus_tropicalis.JGI_4.2.pep.all.fa > all_peptides.fa
diamond makedb --in all_peptides.fa -d allpep
diamond blastx -d /home/summersk/peptide_databases/allpep.dmnd -q good.auratus2.renamed.fasta -o newauratus2allpep.m8 --threads 32
# sort by top hit
sort newauratus2allpep.m8 -k 1,1 -k11,11g | sort -u -k 1,1 --merge > newallpep_tophit.txt
# xenopus only...
diamond blastx -d /home/summersk/peptide_databases/xen.dmnd -q good.auratus2.renamed.fasta -o newxen2allpep.m8 --threads 32
sort newxen2allpep.m8 -k 1,1 -k11,11g | sort -u -k 1,1 --merge > newxen_tophit.txt
## This approach yields 47% annotation rate
sort tmp.txt -k 1,1 -k11,11g | sort -u -k 1,1 --merge > tmp2.txt
# Filter out transcripts in the transcriptome that match to *Dendrbobates spp* mtDNA
# Data downloaded from NCBI, from Lyra et al 2017, The mitochondrial genomes of three species of poison frogs (Anura: Dendrobates)
cd
cd mtDNA_Dendrobates
cat * > Dendrobates_mtDNA.fa
# make the blast database
makeblastdb -in Dendrobates_mtDNA.fa -dbtype nucl -parse_seqids
# move in to the appropriate folder and run blast
cd /home/summersk/mtDNA_Dendrobates
blastn -db /home/summersk/mtDNA_Dendrobates/Dendrobates_mtDNA.fa -query good.auratus2.renamed.fasta -out mtDNAquery -outfmt 6 -num_threads 50
# only 91 transcripts map to mtDNA, so I'm not concerned about that.
# why don't we have mc1r?
cd ~/mc1r_seqs #contains sequences from Ranitomeya imitator and Oophaga histrionica
cat * > all_mc1r_seqs.fa
makeblastdb -in all_mc1r_seqs.fa -dbtype nucl -parse_seqids
# move in to the appropriate folder and run blast
cd /home/summersk/auratus
blastn -db /home/summersk/mc1r_seqs/all_mc1r_seqs.fa -query good.auratus2.renamed.fasta -out mc1r_query -outfmt 6 -num_threads 50
sort mc1r_query -k 1,1 -k11,11g | sort -u -k 1,1 --merge > mc1r_tophit.txt
# search by the transcript id...
seqs=$(awk '{print $1}' mc1r_tophit.txt)
for i in $seqs; do grep $i newallpep_tophit.txt; done