%%bash
# Record when and where this notebook ran: date, OS release, host name,
# CPU details and memory, each section closed by a dashed rule.
printf '%s\n' "TODAY'S DATE:"
date
printf '%s\n\n' "------------"

# Operating system release details
lsb_release -a
printf '\n%s\n' "------------"

# Host name; the label is printed on its own line ahead of the value
printf '%s\n' "HOSTNAME: "
hostname
printf '\n%s\n' "------------"

# Processor details
printf '%s\n\n' "Computer Specs:"
lscpu
printf '\n%s\n\n' "------------"

# Memory details
printf '%s\n\n' "Memory Specs"
free -mh
TODAY'S DATE: Wed Mar 27 08:34:53 PDT 2019 ------------ Distributor ID: Ubuntu Description: Ubuntu 16.04.6 LTS Release: 16.04 Codename: xenial ------------ HOSTNAME: roadrunner ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 16 On-line CPU(s) list: 0-15 Thread(s) per core: 2 Core(s) per socket: 4 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 26 Model name: Intel(R) Xeon(R) CPU E5520 @ 2.27GHz Stepping: 5 CPU MHz: 1596.000 CPU max MHz: 2394.0000 CPU min MHz: 1596.0000 BogoMIPS: 4521.81 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 8192K NUMA node0 CPU(s): 0-15 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts nopl xtopology nonstop_tsc aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm dca sse4_1 sse4_2 popcnt lahf_lm ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida flush_l1d ------------ Memory Specs total used free shared buff/cache available Mem: 47G 1.3G 38G 584M 7.0G 44G Swap: 47G 0B 47G
No LSB modules are available.
# Notebook-wide settings, exported with %env so the %%bash cells can read them.
#   wd            - working directory for this analysis
#   fasta_url     - HTTP location of the genome FastA (read by the wget cell)
#   fasta         - FastA file name
#   repeat_masker - path to the RepeatMasker executable
#   cpus          - value passed to RepeatMasker's -par option
#   checksum      - expected MD5 of the FastA, compared after download
%env wd=/home/sam/analyses/20190327_cgig_repeatmasker_all
%env fasta_url=http://owl.fish.washington.edu/halfshell/genomic-databank/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa
%env fasta=Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa
%env repeat_masker=/home/shared/RepeatMasker-4.0.7/RepeatMasker
%env cpus=16
%env checksum=6de9d1239eb10ea0545bed6c4e746d6c
env: wd=/home/sam/analyses/20190327_cgig_repeatmasker_all env: fasta_url=http://owl.fish.washington.edu/halfshell/genomic-databank/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa env: fasta=Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa env: repeat_masker=/home/shared/RepeatMasker-4.0.7/RepeatMasker env: cpus=16 env: checksum=6de9d1239eb10ea0545bed6c4e746d6c
%%bash
# Create the analysis working directory, including any missing parents.
# The expansion is quoted so the path always reaches mkdir as one argument.
mkdir --parents "${wd}"
Info on FastA file is here: https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome
%%bash
# Using rsync
# Copy the genome FastA from the owl host into the working directory,
# then list the directory to confirm the file arrived.
# Stop here if the working directory cannot be entered, so the file is
# never downloaded into whatever directory the cell happened to start in.
cd "${wd}" || exit 1
# The file name comes from the notebook-wide fasta setting, matching the
# name the later checksum and RepeatMasker cells operate on.
rsync \
--archive \
--verbose \
--progress \
"owl:/volume1/web/halfshell/genomic-databank/${fasta}" \
.
ls -lh
receiving incremental file list Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa 567,592,991 100% 5.15MB/s 0:01:45 (xfr#1, to-chk=0/1) sent 30 bytes received 567,662,409 bytes 5,137,216.64 bytes/sec total size is 567,592,991 speedup is 1.00 total 542M -rw-rw-rw- 1 sam users 542M Aug 24 2018 Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa
%%bash
# Alternative download of the genome FastA over HTTP, saved into the
# working directory, followed by a listing of that directory.
# `time` has to be joined to wget by the continuation backslash; left on
# a line of its own it times an empty command and wget runs untimed.
time \
wget "${fasta_url}" \
--quiet \
--directory-prefix="${wd}"
ls -lh "${wd}"
Original MD5 checksum taken from GitHub Genomic Resource linked above.
Use md5sum to generate a checksum from the downloaded FastA file and awk to print the first field (i.e. the checksum value). This is saved to the variable: dl_md5
Then, check for differences between the two variables.
No output confirms no difference.
%%bash
# Compare the published MD5 (checksum) with the MD5 of the downloaded FastA.
md5="${checksum}"
# md5sum prints "<hash>  <file>"; awk keeps only the hash field.
dl_md5=$(md5sum "${wd}/${fasta}" | awk '{ print $1 }')
# diff prints nothing when the two values are identical.
diff <(echo "$md5") <(echo "$dl_md5")
-species "all"
: Sets species to all
-par ${cpus}
: Use the number of CPU threads set in the cpus variable (16 in this notebook)
-gff
: Create GFF output file (in addition to default files)
-excln
: Adjusts output table calculations to exclude sequence runs of >=20 Ns/Xs. Useful for draft genome assemblies.
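1>
: Shell redirection (not a RepeatMasker option): send stdout to a file instead of printing to the notebook.
2>
: Shell redirection (not a RepeatMasker option): send stderr to a file instead of printing to the notebook.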
%%bash
# Run RepeatMasker on the genome FastA from inside the working directory,
# timing the run. stdout and stderr are written to files in that directory.
# Stop if the working directory cannot be entered, so the run (and its
# output files) never land in an unintended location.
cd "${wd}" || exit 1
time \
"${repeat_masker}" \
"${fasta}" \
-species "all" \
-par "${cpus}" \
-gff \
-excln \
1> stdout.out \
2> stderr.err
# Email a completion notice: on the Subject: line of the mail template the
# first space is replaced with the job text, and the result is piped to msmtp.
# NOTE: the notice is sent whatever RepeatMasker's exit status was.
sed '/^Subject:/ s/ / repeatmasker_gigas_all JOB COMPLETE/' ~/.default-subject.mail | msmtp "$EMAIL"
real 1419m34.303s user 21339m49.472s sys 111m2.000s
%%bash
# List the working directory to show the files RepeatMasker produced.
# Quoted so the path is always passed to ls as a single argument.
ls -lh "${wd}"
total 1.5G -rw-rw-rw- 1 sam users 542M Aug 24 2018 Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa -rw-rw-r-- 1 sam sam 244M Mar 28 08:43 Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.cat.gz -rw-rw-r-- 1 sam sam 544M Mar 28 08:43 Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.masked -rw-rw-r-- 1 sam sam 90M Mar 28 08:43 Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.out -rw-rw-r-- 1 sam sam 58M Mar 28 08:43 Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.out.gff -rw-rw-r-- 1 sam sam 2.4K Mar 28 08:43 Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.tbl -rw-rw-r-- 1 sam sam 0 Mar 27 09:04 stderr.err -rw-rw-r-- 1 sam sam 2.1M Mar 28 08:43 stdout.out
%%bash
# Print the RepeatMasker summary table (<fasta>.tbl) from the working directory.
# Quoted so the path is always passed to cat as a single argument.
cat "${wd}/${fasta}.tbl"
================================================== file name: Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa sequences: 7658 total length: 557717710 bp (491860439 bp excl N/X-runs) GC level: 33.42 % bases masked: 160369613 bp ( 32.60 %) ================================================== number of length percentage elements* occupied of sequence -------------------------------------------------- Retroelements 48481 19773596 bp 4.02 % SINEs: 2498 317084 bp 0.06 % Penelope 5749 1808270 bp 0.37 % LINEs: 26463 10472676 bp 2.13 % CRE/SLACS 15 1289 bp 0.00 % L2/CR1/Rex 1712 307207 bp 0.06 % R1/LOA/Jockey 299 21470 bp 0.00 % R2/R4/NeSL 218 69735 bp 0.01 % RTE/Bov-B 8417 3631379 bp 0.74 % L1/CIN4 983 64189 bp 0.01 % LTR elements: 19520 8983836 bp 1.83 % BEL/Pao 2050 1349545 bp 0.27 % Ty1/Copia 2139 189535 bp 0.04 % Gypsy/DIRS1 11971 6501545 bp 1.32 % Retroviral 1263 69288 bp 0.01 % DNA transposons 299050 85782505 bp 17.44 % hobo-Activator 9348 2278556 bp 0.46 % Tc1-IS630-Pogo 32515 8695261 bp 1.77 % En-Spm 0 0 bp 0.00 % MuDR-IS905 0 0 bp 0.00 % PiggyBac 4136 747000 bp 0.15 % Tourist/Harbinger 11590 2828277 bp 0.58 % Other (Mirage, 232 14514 bp 0.00 % P-element, Transib) Rolling-circles 0 0 bp 0.00 % Unclassified: 109149 49075277 bp 9.98 % Total interspersed repeats: 154631378 bp 31.44 % Small RNA: 830 93282 bp 0.02 % Satellites: 2087 401812 bp 0.08 % Simple repeats: 110847 4687373 bp 0.95 % Low complexity: 16716 787611 bp 0.16 % ================================================== * most repeats fragmented by insertions or deletions have been counted as one element Runs of >=20 X/Ns in query were excluded in % calcs The query species was assumed to be root RepeatMasker Combined Database: Dfam_Consensus-20170127, RepBase-20170127 run with rmblastn version 2.6.0+
rsync to my folder on Gannet
%%bash
# First attempt at copying the results to Gannet. Kept as run; it FAILED
# (see the recorded output below this cell) for two reasons:
#   1. After `cd ..` the shell is in the parent of the working directory, so
#      the source ./home/sam/analyses/... is looked up beneath that parent
#      and does not exist (rsync link_stat error).
#   2. The source line has no trailing backslash, so the destination on the
#      next line is not an rsync argument; bash tries to run it as a command.
cd ${wd}
# Delete the input FastA from the working directory before copying
# (presumably so the large input is not transferred -- confirm intent).
rm ${fasta}
cd ..
rsync \
--archive \
--verbose \
--progress \
--relative \
./home/sam/analyses/20190327_cgig_repeatmasker_all
gannet:/volume2/web/Atumefaciens
sending incremental file list drwxrwxr-x 4,096 2019/03/27 08:45:17 . sent 49 bytes received 64 bytes 226.00 bytes/sec total size is 0 speedup is 0.00
rsync: link_stat "/home/sam/analyses/home/sam/analyses/20190327_cgig_repeatmasker_all" failed: No such file or directory (2) rsync error: some files/attrs were not transferred (see previous errors) (code 23) at main.c(1183) [sender=3.1.1] bash: line 10: gannet:/volume2/web/Atumefaciens: No such file or directory
%%bash
# Second attempt at copying the results to Gannet. Kept as run; the source
# path is now correct relative to the parent directory, but the copy still
# did NOT happen: the source line has no trailing backslash, so rsync gets
# no destination and bash tries to run the destination line as a command
# (see the recorded output below this cell).
cd ${wd}
# The FastA was already deleted by the earlier attempt, so this rm reports
# "No such file or directory".
rm ${fasta}
cd ..
rsync \
--archive \
--verbose \
--progress \
--relative \
./20190327_cgig_repeatmasker_all
gannet:/volume2/web/Atumefaciens
sending incremental file list drwxrwxr-x 4,096 2019/03/27 08:45:17 . drwxrwxr-x 4,096 2019/03/28 11:54:45 20190327_cgig_repeatmasker_all -rw-rw-r-- 255,343,202 2019/03/28 08:43:43 20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.cat.gz -rw-rw-r-- 569,452,085 2019/03/28 08:43:44 20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.masked -rw-rw-r-- 93,415,468 2019/03/28 08:43:44 20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.out -rw-rw-r-- 59,778,663 2019/03/28 08:43:43 20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.out.gff -rw-rw-r-- 2,444 2019/03/28 08:43:43 20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.tbl -rw-rw-r-- 0 2019/03/27 09:04:11 20190327_cgig_repeatmasker_all/stderr.err -rw-rw-r-- 2,119,050 2019/03/28 08:43:41 20190327_cgig_repeatmasker_all/stdout.out sent 353 bytes received 1,000 bytes 2,706.00 bytes/sec total size is 980,110,912 speedup is 724,398.31
rm: cannot remove 'Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa': No such file or directory bash: line 10: gannet:/volume2/web/Atumefaciens: No such file or directory
gannet shortcut wasn't loaded. However, the problem was most likely due to the missing continuation backslash after the repeatmasker directory...
%%bash
# Copy the RepeatMasker results directory to Gannet.
# Stop if the analyses directory cannot be entered; otherwise rsync would
# resolve the relative source path against the wrong directory.
cd /home/sam/analyses/ || exit 1
# --relative sends the source path as written, so the files arrive under
# 20190327_cgig_repeatmasker_all/ inside the destination directory.
# Each line of the command ends in a backslash so that the destination is
# passed to rsync as its final argument.
rsync \
--archive \
--verbose \
--progress \
--relative \
./20190327_cgig_repeatmasker_all \
gannet:/volume2/web/Atumefaciens
sending incremental file list ./ 20190327_cgig_repeatmasker_all/ 20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.cat.gz 255,343,202 100% 112.72MB/s 0:00:02 (xfr#1, to-chk=6/9) 20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.masked 569,452,085 100% 108.29MB/s 0:00:05 (xfr#2, to-chk=5/9) 20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.out 93,415,468 100% 109.04MB/s 0:00:00 (xfr#3, to-chk=4/9) 20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.out.gff 59,778,663 100% 42.96MB/s 0:00:01 (xfr#4, to-chk=3/9) 20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.tbl 2,444 100% 7.28kB/s 0:00:00 (xfr#5, to-chk=2/9) 20190327_cgig_repeatmasker_all/stderr.err 0 100% 0.00kB/s 0:00:00 (xfr#6, to-chk=1/9) 20190327_cgig_repeatmasker_all/stdout.out 2,119,050 100% 5.82MB/s 0:00:00 (xfr#7, to-chk=0/9) sent 980,350,785 bytes received 163 bytes 103,194,836.63 bytes/sec total size is 980,110,912 speedup is 1.00