%%bash
# Record the date, OS release, hostname, CPU details and memory of the
# machine this notebook runs on.

# Print the dashed rule that separates report sections.
rule() { printf '%s\n' "------------"; }
# Print a single empty line.
blank() { printf '\n'; }

printf '%s\n' "TODAY'S DATE:"
date
rule
blank

# Operating system release information
lsb_release -a
blank
rule

printf '%s\n' "HOSTNAME: "
hostname
blank
rule

printf '%s\n' "Computer Specs:"
blank
lscpu
blank
rule
blank

printf '%s\n' "Memory Specs"
blank
free -mh
TODAY'S DATE: Tue Aug 6 07:38:18 PDT 2019 ------------ Distributor ID: Ubuntu Description: Ubuntu 16.04.6 LTS Release: 16.04 Codename: xenial ------------ HOSTNAME: roadrunner ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 16 On-line CPU(s) list: 0-15 Thread(s) per core: 2 Core(s) per socket: 4 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 26 Model name: Intel(R) Xeon(R) CPU E5520 @ 2.27GHz Stepping: 5 CPU MHz: 2394.000 CPU max MHz: 2394.0000 CPU min MHz: 1596.0000 BogoMIPS: 4521.81 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 8192K NUMA node0 CPU(s): 0-15 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts nopl xtopology nonstop_tsc aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm dca sse4_1 sse4_2 popcnt lahf_lm ssbd ibrs ibpb stibp pti tpr_shadow vnmi flexpriority ept vpid dtherm ida flush_l1d ------------ Memory Specs total used free shared buff/cache available Mem: 47G 1.3G 43G 443M 2.8G 44G Swap: 47G 0B 47G
No LSB modules are available.
# Notebook-wide settings, exported as environment variables so that the
# %%bash cells below can read them.
# NOTE: keep comments on their own lines; text after a %env value would
# become part of the value.

# Working directory: the bash cells cd into it and write their output there.
%env wd=/home/sam/analyses/20190805_Pgenerosa_v070_repeatmasker_all
# URL used by the wget download cell.
%env fasta_url=https://owl.fish.washington.edu/halfshell/genomic-databank/Pgenerosa_v070.fa
# File name of the genome FastA inside the working directory.
%env fasta=Pgenerosa_v070.fa
# Path to the RepeatMasker executable that is run on the FastA.
%env repeat_masker=/home/shared/RepeatMasker-4.0.7/RepeatMasker
# Value passed to RepeatMasker's -par option.
%env cpus=16
# Expected MD5 of the FastA, compared against md5sum of the downloaded file.
# Checksum taken from https://github.com/RobertsLab/resources/wiki/Genomic-Resources
%env checksum=de0958fec4b9f8845babf3717ce7168c
env: wd=/home/sam/analyses/20190805_Pgenerosa_v070_repeatmasker_all env: fasta_url=https://owl.fish.washington.edu/halfshell/genomic-databank/Pgenerosa_v070.fa env: fasta=Pgenerosa_v070.fa env: repeat_masker=/home/shared/RepeatMasker-4.0.7/RepeatMasker env: cpus=16 env: checksum=de0958fec4b9f8845babf3717ce7168c
%%bash
# Create the working directory, including any missing parent directories.
# Quoted so a path containing spaces or glob characters stays one argument.
mkdir --parents -- "${wd}"
Information about the FastA file is available here: https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome
%%bash
# Copy the genome FastA from the remote host "owl" into the working
# directory, then list the directory to confirm the transfer.

# Stop if the working directory is unavailable; otherwise rsync would
# write the file into whatever directory the cell started in.
cd "${wd}" || exit 1

rsync \
--archive \
--verbose \
--progress \
"owl:/volume1/web/halfshell/genomic-databank/${fasta}" .

echo ""
echo ""
echo "----------------------------------------------------------"
ls -lh
receiving incremental file list Pgenerosa_v070.fa 2,247,117,885 100% 17.87MB/s 0:01:59 (xfr#1, to-chk=0/1) sent 30 bytes received 2,247,392,295 bytes 18,497,056.17 bytes/sec total size is 2,247,117,885 speedup is 1.00 ---------------------------------------------------------- total 2.1G -rw-r--r-- 1 sam users 2.1G Feb 11 12:13 Pgenerosa_v070.fa
%%bash
# Download the genome FastA from ${fasta_url} into the working directory
# and list the directory afterwards.

# `time` has to be on the same logical line as the command it measures.
# On a line of its own it times an empty command and wget runs untimed.
time \
wget "${fasta_url}" \
--quiet \
--directory-prefix="${wd}"

ls -lh "${wd}"
The original MD5 checksum was taken from the GitHub Genomic Resources page linked above.
Use md5sum to generate a checksum from the downloaded FastA file and awk to print the first field (i.e. the checksum value). This is saved to the variable dl_md5.
Then, check for differences between the two variables with diff.
No output confirms that there is no difference.
%%bash
# Compare the expected MD5 checksum (${checksum}) with the checksum of the
# downloaded FastA. diff prints nothing when the two values are identical.
md5="${checksum}"
# Keep only the first field of md5sum's output, i.e. the checksum value.
dl_md5=$(md5sum "${wd}/${fasta}" | awk '{ print $1 }')
diff <(echo "$md5") <(echo "$dl_md5")
%%bash
# Trim every line of the FastA to its first two "_"-delimited fields,
# showing the first line of the file before and after the change.

# Stop if the working directory is unavailable, so no file outside it
# is read or overwritten.
cd "${wd}" || exit 1

echo "Original FastA header:"
head -n 1 "${fasta}"
echo ""

# cut prints lines that contain no "_" unchanged, so sequence lines are
# presumably unaffected (assumes sequence data never contains "_").
# Replace the original only if the trimmed copy was written successfully;
# an unconditional mv would overwrite the genome with a partial file.
cut -f 1-2 -d "_" "${fasta}" > "${fasta}.tmp" \
  && mv "${fasta}.tmp" "${fasta}"

echo "---------------------------------"
echo "Updated FastA header:"
head -n 1 "${fasta}"
ls -ltrh
Original FastA header: --------------------------------- Updated FastA header: >PGA_scaffold1 total 2.1G -rw-rw-r-- 1 sam sam 2.1G Aug 6 07:48 Pgenerosa_v070.fa
bash: line 3: head-n: command not found
-species "all" : Sets species to all.
-par ${cpus} : Use n CPU threads.
-gff : Create GFF output file (in addition to default files).
-excln : Adjusts output table calculations to exclude sequence runs of >=20 X/Ns (the threshold reported in the .tbl output below). Useful for draft genome assemblies.
1> : Send stdout to file instead of printing to notebook.
2> : Send stderr to file instead of printing to notebook.
%%bash
# Run RepeatMasker on the genome FastA inside the working directory,
# timing the run and capturing its stdout/stderr in files, then send an
# email notification.

# Stop if the working directory is unavailable, so the run and its
# output files do not end up in another directory.
cd "${wd}" || exit 1

time \
"${repeat_masker}" \
"${fasta}" \
-species "all" \
-par "${cpus}" \
-gff \
-excln \
1> stdout.out \
2> stderr.err

# Append the job name to the Subject line of the mail template and send it.
# This runs whether or not RepeatMasker succeeded.
# NOTE(review): $EMAIL is not set in the visible cells; presumably it comes
# from the user's environment. Confirm before relying on the notification.
sed '/^Subject:/ s/ / repeatmasker_pgenv070_all JOB COMPLETE/' ~/.default-subject.mail | msmtp "$EMAIL"
real 5729m38.010s user 88723m33.224s sys 786m15.300s
%%bash
# List the working directory with human-readable sizes.
# Quoted so the path is passed as a single argument.
ls -lh -- "${wd}"
total 5.1G -rw-rw-r-- 1 sam sam 2.1G Aug 6 07:48 Pgenerosa_v070.fa -rw-rw-r-- 1 sam sam 438M Aug 10 07:19 Pgenerosa_v070.fa.cat.gz -rw-rw-r-- 1 sam sam 2.2G Aug 10 07:18 Pgenerosa_v070.fa.masked -rw-rw-r-- 1 sam sam 254M Aug 10 07:19 Pgenerosa_v070.fa.out -rw-rw-r-- 1 sam sam 157M Aug 10 07:18 Pgenerosa_v070.fa.out.gff -rw-rw-r-- 1 sam sam 2.4K Aug 10 07:18 Pgenerosa_v070.fa.tbl -rw-rw-r-- 1 sam sam 0 Aug 6 07:49 stderr.err -rw-rw-r-- 1 sam sam 6.8M Aug 10 07:18 stdout.out
%%bash
# Print the .tbl file that sits next to the FastA in the working directory.
# Quoted so the path is passed as a single argument.
cat -- "${wd}/${fasta}.tbl"
================================================== file name: Pgenerosa_v070.fa sequences: 313649 total length: 2205688688 bp (2005531528 bp excl N/X-runs) GC level: 33.92 % bases masked: 175175579 bp ( 8.73 %) ================================================== number of length percentage elements* occupied of sequence -------------------------------------------------- Retroelements 565711 87788537 bp 4.38 % SINEs: 332333 39506023 bp 1.97 % Penelope 6883 788411 bp 0.04 % LINEs: 142649 32744907 bp 1.63 % CRE/SLACS 1237 100944 bp 0.01 % L2/CR1/Rex 40317 7764197 bp 0.39 % R1/LOA/Jockey 10137 2942539 bp 0.15 % R2/R4/NeSL 3825 551996 bp 0.03 % RTE/Bov-B 26939 6768723 bp 0.34 % L1/CIN4 21435 4046589 bp 0.20 % LTR elements: 90729 15537607 bp 0.77 % BEL/Pao 6594 918331 bp 0.05 % Ty1/Copia 16409 1268565 bp 0.06 % Gypsy/DIRS1 50972 11376086 bp 0.57 % Retroviral 9680 690936 bp 0.03 % DNA transposons 259955 34987123 bp 1.74 % hobo-Activator 29756 3192075 bp 0.16 % Tc1-IS630-Pogo 67456 9717356 bp 0.48 % En-Spm 0 0 bp 0.00 % MuDR-IS905 0 0 bp 0.00 % PiggyBac 1553 121136 bp 0.01 % Tourist/Harbinger 7596 1054167 bp 0.05 % Other (Mirage, 1803 123196 bp 0.01 % P-element, Transib) Rolling-circles 0 0 bp 0.00 % Unclassified: 99928 13654973 bp 0.68 % Total interspersed repeats: 136430633 bp 6.80 % Small RNA: 42601 2192413 bp 0.11 % Satellites: 33350 6282246 bp 0.31 % Simple repeats: 596607 32793030 bp 1.64 % Low complexity: 75831 3754962 bp 0.19 % ================================================== * most repeats fragmented by insertions or deletions have been counted as one element Runs of >=20 X/Ns in query were excluded in % calcs The query species was assumed to be root RepeatMasker Combined Database: Dfam_Consensus-20170127, RepBase-20170127 run with rmblastn version 2.6.0+
rsync to my folder on Gannet
%%bash
# Delete the input FastA from the working directory, then copy the whole
# analysis directory to the remote host "gannet".

# Stop if the working directory is unavailable, so rm cannot delete a
# same-named file somewhere else.
cd "${wd}" || exit 1
# Removed before the transfer, so the input genome is not copied.
rm -- "${fasta}"

# Stop if the parent directory is unavailable; the relative source path
# below only resolves correctly from here.
cd /home/sam/analyses/ || exit 1
rsync \
--archive \
--verbose \
--progress \
--relative \
./20190805_Pgenerosa_v070_repeatmasker_all \
gannet:/volume2/web/Atumefaciens
sending incremental file list ./ 20190805_Pgenerosa_v070_repeatmasker_all/ 20190805_Pgenerosa_v070_repeatmasker_all/Pgenerosa_v070.fa.cat.gz 459,015,612 100% 79.27MB/s 0:00:05 (xfr#1, to-chk=6/9) 20190805_Pgenerosa_v070_repeatmasker_all/Pgenerosa_v070.fa.masked 2,256,119,169 100% 84.20MB/s 0:00:25 (xfr#2, to-chk=5/9) 20190805_Pgenerosa_v070_repeatmasker_all/Pgenerosa_v070.fa.out 265,847,056 100% 84.79MB/s 0:00:02 (xfr#3, to-chk=4/9) 20190805_Pgenerosa_v070_repeatmasker_all/Pgenerosa_v070.fa.out.gff 164,437,744 100% 56.43MB/s 0:00:02 (xfr#4, to-chk=3/9) 20190805_Pgenerosa_v070_repeatmasker_all/Pgenerosa_v070.fa.tbl 2,423 100% 3.08kB/s 0:00:00 (xfr#5, to-chk=2/9) 20190805_Pgenerosa_v070_repeatmasker_all/stderr.err 0 100% 0.00kB/s 0:00:00 (xfr#6, to-chk=1/9) 20190805_Pgenerosa_v070_repeatmasker_all/stdout.out 7,126,317 100% 6.79MB/s 0:00:01 (xfr#7, to-chk=0/9) sent 3,153,318,561 bytes received 163 bytes 86,392,293.81 bytes/sec total size is 3,152,548,321 speedup is 1.00