Dendrobates_auratus_bash_code_commented.txt

#!/bin/bash

# Bash code for building a Dendrobates auratus transcriptome from data that had lain a bit dormant

# Download the data from the host
wget https://s3.amazonaws.com/NCSU_KC/frog/raw/Roberts_PFLib_FastqFiles.tar.gz
wget https://s3.amazonaws.com/NCSU_KC/frog/raw/FastqFiles_EM1.tar.gz

# Untar the data so I can use it
tar -xzvf *tar.gz

# Move it to the working directory so I don't ruin the actual data...
cp FastqFiles/*.gz .

ls *fastq.gz | wc -l
# output is 50, and I don't want to use the unknown data for now.

# Remove that data
rm U*

# Rename the end of all of the fastq files
for i in *001.fastq
do
mv -- "$i" "${i/%_001.fastq/.fastq}"
done

# Rename oddly abbreviated fastq files...doing it bit by bit because otherwise they would be renamed incorrectly...
for i in Blue-Black*; do
  mv -- "$i" "${i/_S*_R/_L1_R}"
done

for i in Microspot*; do
  mv -- "$i" "${i/_S*_R/_L1_R}"
done

for i in San-Felix*; do
  mv -- "$i" "${i/_S*_R/_L1_R}"
done

# Now rename the odd technical replicates with shortened names. First, specify lane 2.
for i in *L001*; do
  mv -- "$i" "${i/_S*_R/_L2_R}"
done

# Rename all the abbreviations
for i in bb*; do
  mv -- "$i" "${i/bb-/Blue-Black}"
done

for i in sb*; do
  mv -- "$i" "${i/sb-/Super-Blue}"
done

for i in sf*; do
  mv -- "$i" "${i/sf-/San-Felix}"
done

for i in us*; do
  mv -- "$i" "${i/us-/Microspot}"
done

Cat *R1.fq > all_reads_R1.fastq
Cat *R2.fq > all_reads_R2.fastq

seqtk sample -s100 all_reads_R1.fastq 20000000 > subsamp.R1.fastq
seqtk sample -s100 all_reads_R2.fastq 20000000 > subsamp.R2.fastq

/home/summersk/programs/Oyster_River_Protocol/oyster.mk main \
MEM=1250 \
CPU=28 \
READ1=subsamp.R1.fastq \
READ2=subsamp.R2.fastq \
RUNOUT=subsamp

# The transcriptome is in a different file, so move it and give it a somewhat more informative name
cp assemblies/subsamp.orthomerged.fasta .
mv subsamp.orthomerged.fasta auratus.subsampled.fasta

############################################################
####### Note: this transcriptome was not great #############
############################################################

## Build a random transcriptome for each different color morph, then merge them all together.

# Given the sequencing depth, I will just combine the 'tech reps' into one file. 
ids=$(ls *L1_TRIM_1P.fastq | sed 's/_L1_TRIM_1P.fastq//g') 
for i in $ids; do cat $i_L1_TRIM_1P.fastq $i_L2_TRIM_1P.fastq > $i_COM_R1.fastq; done
for i in $ids; do cat $i_L1_TRIM_2P.fastq $i_L2_TRIM_2P.fastq > $i_COM_R2.fastq; done

# Presumably, this should do the above in parallel...but double check...
parallel -j 12 cat {}_L1_TRIM_1P.fastq {}_L2_TRIM_1P.fastq > {}_COM_R1.fastq ::: $ids
parallel -j 12 cat {}_L1_TRIM_2P.fastq {}_L2_TRIM_2P.fastq > {}_COM_R2.fastq ::: $ids


/home/summersk/programs/Oyster_River_Protocol/oyster.mk main \
MEM=1250 \
CPU=24 \
READ1=Blue-Black1_merged_R1.fastq.gz \
READ2=Blue-Black1_merged_R2.fastq.gz \
RUNOUT=Blue-Black1

/home/summersk/programs/Oyster_River_Protocol/oyster.mk main \
MEM=1250 \
CPU=28 \
READ1=Microspot2_merged_R1.fastq.gz \
READ2=Microspot2_merged_R2.fastq.gz \
RUNOUT=Microspot2

/home/summersk/programs/Oyster_River_Protocol/oyster.mk main \
MEM=1250 \
CPU=20 \
READ1=reads/San-Felix2_merged_R1.fastq.gz \
READ2=reads/San-Felix2_merged_R2.fastq.gz \
RUNOUT=San-Felix2

/home/summersk/programs/Oyster_River_Protocol/oyster.mk main \
MEM=1250 \
CPU=24 \
READ1=reads/Super-Blue1_merged_R1.fastq.gz \
READ2=reads/Super-Blue1_merged_R2.fastq.gz \
RUNOUT=Super-Blue1

####MOVE THE ASSEMBLED TRANSCRIPTOMES INTO A COMMON FOLDER#####
mkdir orthomerged_fastas
cp assemblies/*orthomerged.fasta orthomerged_fastas/


#######RENAME TRANSCRIPTS TO WORK WTIH ORTHOFUSER##########
cd orthomerged_fastas
awk '/^>/{print ">Blue-Black1_" ++i; next}{print}'  Blue-Black1.orthomerged.fasta > Blue-Black1.orthomerged.renamed.fasta

awk '/^>/{print ">Microspot2_" ++i; next}{print}'  Microspot2.orthomerged.fasta > Microspot2.orthomerged.renamed.fasta

awk '/^>/{print ">San-Felix2_" ++i; next}{print}'  San-Felix2.orthomerged.fasta > San-Felix2.orthomerged.renamed.fasta

awk '/^>/{print ">Super-Blue1_" ++i; next}{print}'  Super-Blue1.orthomerged.fasta > Super-Blue1.orthomerged.renamed.fasta

rm *orthomerged.fasta

cd ..

#####combine other reads into one big one, also clean all the reads####
Cat *R1.fq > all_reads_R1.fastq
Cat *R2.fq > all_reads_R2.fastq

(ls reads/*fastq.gz | sed "s/_merged_R1.fastq.gz//g" | sed "s/_merged_R2.fastq.gz//g" | uniq) | \
parallel -j 10 trimmomatic-0.36.jar PE -threads 4 \
-baseout rcorr/{}_TRIM.fastq {}_R1.fastq {}_R2.fastq \
LEADING:3 TRAILING:3 ILLUMINACLIP:barcodes.fa:2:30:10 MINLEN:25 

# Then run R corrector
(ls reads/*fastq.gz | sed "s/_merged_R1.fastq.gz//g" | sed "s/_merged_R2.fastq.gz//g" | uniq) | \
parallel -j 10 run_rcorrector.pl -t 10 -k 31 -1 rcorr/{}_TRIM_1P.fastq -2 rcorr/{}_TRIM_2P.fastq -od rcorr

# Merge all the cleaned reads
for i in rcorr/*1P.cor.fq; do cat $i; echo; done all_reads_R1.cor.fq
for i in rcorr/*2P.cor.fq; do cat $i; echo; done all_reads_R2.cor.fq


####### Merge all the assemblies into a single transcriptome using orthofuser

/home/summersk/programs/Oyster_River_Protocol/orthofuser.mk all \
READ1=rcorr/all_reads_R1.cor.fq \
READ2=rcorr/all_reads_R2.cor.fq \
CPU=30 RUNOUT=ortho_merged \
FASTADIR=orthomerged_fastas/ LINEAGE=eukaryota_odb9

####### Calculate BUSCO and transrate scores for the full dataset

/home/summersk/programs/mod_Oyster_River_Protocol/software/orp-transrate/transrate \
-o /home/summersk/auratus/reports/transrate_assembly-reads  \
-a /home/summersk/auratus/auratus.merged.fasta \
--left /home/summersk/auratus/all_reads_R1.cor.fq \
--right /home/summersk/auratus/all_reads_R1.cor.fq \
-t 30

python /home/summersk/programs/busco/scripts/run_BUSCO.py -i /home/summersk/auratus/auratus.merged.fasta -m transcriptome --cpu 24 -l /home/summersk/busco_dbs/eukaryota_odb9 -o /home/summersk/auratus/good_busco

# Scores were really atrocious for transrate, so we are going to use only the 'good' fasta scores from it.

### Move the 'good' fasta into the main directory and rename it
cp orthofuse/ortho_merged/merged/merged/good.merged.fasta .
mv good.merged.fasta good.auratus.merged.fasta

# Rename all the transcripts because aesthetics.
awk '/^>/{print ">Transcript_" ++i; next}{print}'  good.auratus2.fasta > good.auratus2.renamed.fasta


# Pseudo-quantification with kallisto, build the index first
kallisto index -i new_auratus.idx  good.auratus2.renamed.fasta

# This will list all the samples
samples=$(ls *cor.fq | sed "s/_TRIM_1P.cor.fq//g" | sed "s/_TRIM_2P.cor.fq//g" | uniq  |\
grep -v subsamp | grep -v all_reads)


# Make directories for each sample:
mkdir new_kallisto_quants
cd new_kallisto_quants
for i in $samples; do mkdir $i; done
cd ..


# Perform the actual pseudo-quantification for all of the samples + technical replicates

parallel -j 12 kallisto quant -i /home/summersk/auratus/new_auratus.idx -o new_kallisto_quants/{} -b 100 \
{}_TRIM_1P.cor.fq {}_TRIM_2P.cor.fq ::: $samples


#### Annotation with diamond to amphibian (xenopus, nanorana, rana peptides) + UniRef90 databases

# cat all peptides together (, makedb in diamond
cat GCA_002284835.2_RCv2.1_protein.faa Nanorana_parkeri.gene.v2.pep orthodb.fasta uniref90.fasta Xenopus_tropicalis.JGI_4.2.pep.all.fa > all_peptides.fa
  
  
  diamond makedb --in all_peptides.fa -d allpep
  diamond blastx -d /home/summersk/peptide_databases/allpep.dmnd -q good.auratus2.renamed.fasta -o newauratus2allpep.m8 --threads 32 

# sort by top hit
sort newauratus2allpep.m8 -k 1,1 -k11,11g | sort -u -k 1,1 --merge > newallpep_tophit.txt

# xenopus only...
diamond blastx -d /home/summersk/peptide_databases/xen.dmnd -q good.auratus2.renamed.fasta -o newxen2allpep.m8 --threads 32 
sort newxen2allpep.m8 -k 1,1 -k11,11g | sort -u -k 1,1 --merge > newxen_tophit.txt
## This approach yields 47% annotation rate

sort tmp.txt -k 1,1 -k11,11g | sort -u -k 1,1 --merge > tmp2.txt


# Filter out transcripts in the transcriptome that match to *Dendrbobates spp* mtDNA
# Data downloaded from NCBI, from Lyra et al 2017, The mitochondrial genomes of three species of poison frogs (Anura: Dendrobates)

cd 
cd mtDNA_Dendrobates
cat * > Dendrobates_mtDNA.fa
# make the blast database
makeblastdb -in Dendrobates_mtDNA.fa -dbtype nucl -parse_seqids	
# move in to the appropriate folder and run blast
cd /home/summersk/mtDNA_Dendrobates
blastn -db /home/summersk/mtDNA_Dendrobates/Dendrobates_mtDNA.fa -query good.auratus2.renamed.fasta -out mtDNAquery -outfmt 6 -num_threads 50

# only 91 transcripts map to mtDNA, so I'm not concerned about that.

# why don't we have mc1r?
cd ~/mc1r_seqs #contains sequences from Ranitomeya imitator and Oophaga histrionica
cat * > all_mc1r_seqs.fa

  makeblastdb -in all_mc1r_seqs.fa -dbtype nucl -parse_seqids	

# move in to the appropriate folder and run blast
cd /home/summersk/auratus
blastn -db /home/summersk/mc1r_seqs/all_mc1r_seqs.fa -query good.auratus2.renamed.fasta -out mc1r_query -outfmt 6 -num_threads 50
sort mc1r_query -k 1,1 -k11,11g | sort -u -k 1,1 --merge > mc1r_tophit.txt

# search by the transcript id...
seqs=$(awk '{print $1}' mc1r_tophit.txt)
for i in $seqs; do grep $i newallpep_tophit.txt; done