%%bash
echo "TODAY'S DATE:"
date
echo "------------"
echo ""
#Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
%%bash
mkdir /home/sam/data/geoduck
mkdir /home/sam/data/geoduck/transcriptomes
mkdir /home/sam/data/geoduck/transcriptomes/transdecoder_fasta_splits
%%bash
cd /home/sam/data/geoduck/transcriptomes
# Uncomment following line to retrieve file using wget
# wget http://gannet.fish.washington.edu/Atumefaciens/20181121_geo_transdecoder/20180827_trinity_geoduck.fasta.transdecoder.cds
rsync gannet:/volume1/web/Atumefaciens/20181121_geo_transdecoder/20180827_trinity_geoduck.fasta.transdecoder.cds
ls -lh
%%bash
cd /home/sam/data/geoduck/transcriptomes/
# Count sequences in FastA
echo "-------------------"
echo "NUMBER OF SEQUENCES IN ORIGINAL FASTA"
grep -c ">" 20180827_trinity_geoduck.fasta.transdecoder.cds
echo "-------------------"
echo ""
echo ""
cd /home/sam/data/geoduck/transcriptomes/transdecoder_fasta_splits/
# Split FastA
time \
/home/sam/software/bin/pyfaidx-0.5.5.2 \
--split-files \
../20180827_trinity_geoduck.fasta.transdecoder.cds
# Count number of individual FastA files
echo "-------------------"
echo "NUMBER OF INDIVIDUAL FASTA FILES"
ls -1 | wc -l
echo "-------------------"
Whoops! Ran rsync
command incorrectly. Duh!
%%bash
cd /home/sam/data/geoduck/transcriptomes
rsync \
--archive \
gannet:/volume1/web/Atumefaciens/20181121_geo_transdecoder/20180827_trinity_geoduck.fasta.transdecoder.cds .
echo "-------------------"
ls -lh
%%bash
cd /home/sam/data/geoduck/transcriptomes/
# Count sequences in FastA
echo "-------------------"
echo "NUMBER OF SEQUENCES IN ORIGINAL FASTA"
grep -c ">" 20180827_trinity_geoduck.fasta.transdecoder.cds
echo "-------------------"
echo ""
echo ""
cd /home/sam/data/geoduck/transcriptomes/transdecoder_fasta_splits/
# Split FastA
time \
/home/sam/software/bin/pyfaidx-0.5.5.2 \
--split-files \
../20180827_trinity_geoduck.fasta.transdecoder.cds
# Count number of individual FastA files
echo "-------------------"
echo "NUMBER OF INDIVIDUAL FASTA FILES"
ls -1 | wc -l
echo "-------------------"
uses grep
to search for vitellogenin terms and the word "complete"
translates commas to spaces to aid in parsing/formatting for sort command
sorts in reverse order, using the "version" flag to help sort strings that contain numbers and sorts on column 7 (i.e. the "score" column
awk prints a header line to provide column descriptions and then prints out desired columns
column -t
formats output into nicely spaced columns
tee
prints output to file and to screen
%%bash
cd /home/sam/data/geoduck/transcriptomes/
grep --ignore-case "Vitellogenin" 20180827_trinity_geoduck.fasta.transdecoder.cds \
| grep "complete" \
| tr "," " " \
| sort -Vr -k7 \
| awk 'BEGIN{print "transcript_ID", "transcript_type", "transcript_length", "strand", "score", "annotation"}; \
{print $1, $4, $5, $6, $7 , $8}' \
| column -t \
| tee 20181127_geoduck_Vitellogenin_cds_matches.txt
%%bash
cd /home/sam/data/geoduck/transcriptomes/
grep --ignore-case "Vtg" 20180827_trinity_geoduck.fasta.transdecoder.cds \
| grep "complete" \
| tr "," " " \
| sort -Vr -k7 \
| awk 'BEGIN{print "transcript_ID", "transcript_type", "transcript_length", "strand", "score", "annotation"}; \
{print $1, $4, $5, $6, $7 , $8}' \
| column -t \
| tee 20181127_geoduck_Vtg_cds_matches.txt
%%bash
cd /home/sam/data/geoduck/transcriptomes/
grep --ignore-case "Vg" 20180827_trinity_geoduck.fasta.transdecoder.cds \
| grep "complete" \
| tr "," " " \
| sort -Vr -k7 \
| awk 'BEGIN{print "transcript_ID", "transcript_type", "transcript_length", "strand", "score", "annotation"}; \
{print $1, $4, $5, $6, $7 , $8}' \
| column -t \
| tee 20181127_geoduck_Vg_cds_matches.txt
Looks like the only search that produced viable results is the "vitellogenin". Will use the top scoring (i.e. best e-value) match for primer design.
%%bash
mkdir /home/sam/analyses
mkdir /home/sam/analyses/20181129_geoduck_vtg_primers
SEQUENCE_ID=${seq_id}
SEQUENCE_TEMPLATE=${sequence}
PRIMER_TASK=generic
PRIMER_PICK_LEFT_PRIMER=3
PRIMER_PICK_RIGHT_PRIMER=3
PRIMER_OPT_SIZE=18
PRIMER_MIN_SIZE=15
PRIMER_MAX_SIZE=21
PRIMER_MAX_NS_ACCEPTED=1
PRIMER_PRODUCT_SIZE_RANGE=75-150
P3_FILE_FLAG=1
PRIMER_EXPLAIN_FLAG=1
=
Values after the "=" on each line can be changed to whatever values the user decides. The ${sequence}
must be a nucletoide sequence on a single line, with no line breaks.
The code below uses a heredoc
to write this information to a file. Everything between the following two lines gets printed (via cat
) as shown and then redirected to the indicated file (20181129_primer3_params.txt
):
cat << EOF > /home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_params.txt
EOF
--format_output
to make a nice, human-readable output format.%%bash
cd /home/sam/analyses/20181129_geoduck_vtg_primers
# Store sequence only from desired FastA.
# Print all lines after the first line and then delete newlines
# Creates a sequence that exists on a single line, which is necessary for Primer3
sequence=$(tail -n +2 /home/sam/data/geoduck/transcriptomes/transdecoder_fasta_splits/TRINITY_DN51983_c0_g1_i8.p1.cds | tr -d '\n')
# Store file name of targeted FastA file.
seq_id=$(echo "$(head -n 1 /home/sam/data/geoduck/transcriptomes/transdecoder_fasta_splits/TRINITY_DN51983_c0_g1_i8.p1.cds | tr -d '>').cds")
# Use heredoc to create Primer3 parameters file
cat << EOF > /home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_params.txt
SEQUENCE_ID=${seq_id}
SEQUENCE_TEMPLATE=${sequence}
PRIMER_TASK=generic
PRIMER_PICK_LEFT_PRIMER=3
PRIMER_PICK_RIGHT_PRIMER=3
PRIMER_OPT_SIZE=18
PRIMER_MIN_SIZE=15
PRIMER_MAX_SIZE=21
PRIMER_MAX_NS_ACCEPTED=1
PRIMER_PRODUCT_SIZE_RANGE=75-150
P3_FILE_FLAG=1
PRIMER_EXPLAIN_FLAG=1
=
EOF
# Run Primer3
/home/sam/software/primer3-2.4.0/src/primer3_core \
--format_output \
--output=/home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_primers.txt \
/home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_params.txt
%%bash
cat /home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_primers.txt
%%bash
cd /home/sam/analyses/20181129_geoduck_vtg_primers
# Store sequence only from desired FastA.
# Print all lines after the first line and then delete newlines
sequence=$(tail -n +2 /home/sam/data/geoduck/transcriptomes/transdecoder_fasta_splits/TRINITY_DN51983_c0_g1_i8.p1.cds | tr -d '\n')
#
seq_id=$(echo "$(head -n 1 /home/sam/data/geoduck/transcriptomes/transdecoder_fasta_splits/TRINITY_DN51983_c0_g1_i8.p1.cds | tr -d '>').cds")
# Use heredoc to create Primer3 parameters file
cat << EOF > /home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_params.txt
SEQUENCE_ID=${seq_id}
SEQUENCE_TEMPLATE=${sequence}
PRIMER_TASK=generic
PRIMER_PICK_LEFT_PRIMER=3
PRIMER_PICK_RIGHT_PRIMER=3
PRIMER_OPT_SIZE=18
PRIMER_MIN_SIZE=15
PRIMER_MAX_SIZE=21
PRIMER_MAX_NS_ACCEPTED=1
PRIMER_PRODUCT_SIZE_RANGE=75-150
P3_FILE_FLAG=1
PRIMER_EXPLAIN_FLAG=1
PRIMER_THERMODYNAMIC_PARAMETERS_PATH=/home/sam/software/primer3-2.4.0/src/primer3_config/
=
EOF
# Run Primer3
/home/sam/software/primer3-2.4.0/src/primer3_core \
--format_output \
--output=/home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_primers.txt \
/home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_params.txt
%%bash
cat /home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_primers.txt
%%bash
cd /home/sam/analyses/
rsync \
--archive \
--relative \
./20181129_geoduck_vtg_primers/ gannet:/volume1/web/Atumefaciens
%%bash
cd /home/sam/analyses/20181129_geoduck_vtg_primers
# Store sequence only from desired FastA.
# Print all lines after the first line and then delete newlines
sequence=$(tail -n +2 /home/sam/data/geoduck/transcriptomes/transdecoder_fasta_splits/TRINITY_DN51983_c0_g1_i8.p1.cds | tr -d '\n')
#
seq_id=$(echo "$(head -n 1 /home/sam/data/geoduck/transcriptomes/transdecoder_fasta_splits/TRINITY_DN51983_c0_g1_i8.p1.cds | tr -d '>').cds")
# Use heredoc to create Primer3 parameters file
cat << EOF > /home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_params.txt
SEQUENCE_ID=${seq_id}
SEQUENCE_TEMPLATE=${sequence}
PRIMER_TASK=generic
PRIMER_PICK_LEFT_PRIMER=3
PRIMER_PICK_RIGHT_PRIMER=3
PRIMER_OPT_SIZE=18
PRIMER_MIN_SIZE=15
PRIMER_MAX_SIZE=21
PRIMER_MAX_NS_ACCEPTED=1
PRIMER_PRODUCT_SIZE_RANGE=75-150
P3_FILE_FLAG=1
PRIMER_EXPLAIN_FLAG=1
PRIMER_THERMODYNAMIC_PARAMETERS_PATH=/home/sam/software/primer3-2.4.0/src/primer3_config/
=
EOF
# Run Primer3
/home/sam/software/primer3-2.4.0/src/primer3_core \
--output=/home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_primers_default_format.txt \
/home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_params.txt
cat /home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_primers_default_format.txt
Parses out sequence id, left, and right primers and creates the proper tab-delimited primer sequences file needed by primersearch
Runs primersearch
using the newly created primer sequences file and the target FastA file that was used to generate our primers in Primer3
%%bash
cd /home/sam/analyses/20181129_geoduck_vtg_primers/
seq_id=$(grep "SEQUENCE_ID=" /home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_primers_default_format.txt | sed 's/SEQUENCE_ID=//')
left_primer=$(grep "PRIMER_LEFT_0_SEQUENCE=" /home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_primers_default_format.txt | sed 's/PRIMER_LEFT_0_SEQUENCE=//')
right_primer=$(grep "PRIMER_RIGHT_0_SEQUENCE=" /home/sam/analyses/20181129_geoduck_vtg_primers/20181129_primer3_primers_default_format.txt | sed 's/PRIMER_RIGHT_0_SEQUENCE=//')
printf "%s\t" "${seq_id}" "${left_primer}" "${right_primer}" > /home/sam/analyses/20181129_geoduck_vtg_primers/20181129_emboss_primers.txt
# Add newline to end of file
printf "\n" >> /home/sam/analyses/20181129_geoduck_vtg_primers/20181129_emboss_primers.txt
/home/sam/software/EMBOSS-6.6.0/emboss/primersearch \
-auto \
/home/sam/data/geoduck/transcriptomes/transdecoder_fasta_splits/TRINITY_DN51983_c0_g1_i8.p1.cds \
/home/sam/analyses/20181129_geoduck_vtg_primers/20181129_emboss_primers.txt
cat /home/sam/analyses/20181129_geoduck_vtg_primers/TRINITY_DN51983_c0_g1_i8.primersearch
%%bash
cd /home/sam/analyses/20181129_geoduck_vtg_primers/
ls
%%bash
cat /home/sam/analyses/20181129_geoduck_vtg_primers/trinity_dn51983_c0_g1_i8.primersearch
Primers match up to their source sequence, as expected. Now, to test the primers on the rest of the transcriptome to ensure specificity.
Sets variables for file/folder paths
Runs for loop over all individual CDS FastA files:
tr
to convert filenames to lowercaseprimersearch
on each CDS FastA filegrep
to evaluate if the word "Amplimer" is found in the resulting output file; if it is not, the file is deleted.%%bash
cd /home/sam/analyses/20181129_geoduck_vtg_primers/
fasta_loc="/home/sam/data/geoduck/transcriptomes/transdecoder_fasta_splits/"
primersearch="/home/sam/software/EMBOSS-6.6.0/emboss/primersearch"
primers="/home/sam/analyses/20181129_geoduck_vtg_primers/20181129_emboss_primers.txt"
time \
for fasta in ${fasta_loc}*.cds
do
fasta_no_path=$(echo ${fasta##*/})
fasta_no_ext=$(echo ${no_path%%.*})
fasta_no_ext_lower=$(echo ${fasta_no_ext} | tr '[:upper:]' '[:lower:]')
${primersearch} -auto ${fasta} ${primers}
if ! grep --quiet "Amplimer" "${fasta_no_ext_lower}.primersearch"
then rm ${fasta_no_ext_lower}.primersearch
fi
done
Killed it to change code to allow primer mismatches.
Also fix extension removal.
%%bash
cd /home/sam/analyses/20181129_geoduck_vtg_primers/
fasta_loc="/home/sam/data/geoduck/transcriptomes/transdecoder_fasta_splits/"
primersearch="/home/sam/software/EMBOSS-6.6.0/emboss/primersearch"
primers="/home/sam/analyses/20181129_geoduck_vtg_primers/20181129_emboss_primers.txt"
time \
for fasta in ${fasta_loc}*.cds
do
fasta_no_path=$(echo ${fasta##*/})
fasta_no_ext=$(echo ${fasta_no_path%%.*})
fasta_no_ext_lower=$(echo ${fasta_no_ext} | tr '[:upper:]' '[:lower:]')
${primersearch} -auto ${fasta} ${primers} 20
if ! grep --quiet "Amplimer" "${fasta_no_ext_lower}.primersearch"
then rm ${fasta_no_ext_lower}.primersearch
fi
done
%%bash
cd /home/sam/analyses/20181129_geoduck_vtg_primers/
# Check contents of files with matches
for file in *.primersearch
do
echo "FILE: ${file}"
echo ""
cat ${file}
echo "----------------------------------"
echo ""
done
# Copy data to Gannet
cd /home/sam/analyses/
rsync \
--archive \
--relative \
./20181129_geoduck_vtg_primers/ gannet:/volume1/web/Atumefaciens