%%bash
# Record a snapshot of the machine this notebook ran on: current date,
# OS release, hostname, CPU details and memory, each separated by a rule.

# Print the horizontal rule used between sections.
rule() { echo "------------"; }

# Print a single empty line.
blank() { echo ""; }

echo "TODAY'S DATE:"
date
rule
blank

# Operating system release information
lsb_release -a
blank
rule

echo "HOSTNAME: "
hostname
blank
rule

echo "Computer Specs:"
blank
lscpu
blank
rule
blank

echo "Memory Specs"
blank
free -mh
TODAY'S DATE: Wed Jun 26 14:32:38 PDT 2019 ------------ Distributor ID: Ubuntu Description: Ubuntu 16.04.6 LTS Release: 16.04 Codename: xenial ------------ HOSTNAME: roadrunner ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 16 On-line CPU(s) list: 0-15 Thread(s) per core: 2 Core(s) per socket: 4 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 26 Model name: Intel(R) Xeon(R) CPU E5520 @ 2.27GHz Stepping: 5 CPU MHz: 2394.000 CPU max MHz: 2394.0000 CPU min MHz: 1596.0000 BogoMIPS: 4521.78 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 8192K NUMA node0 CPU(s): 0-15 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts nopl xtopology nonstop_tsc aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm dca sse4_1 sse4_2 popcnt lahf_lm ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida flush_l1d ------------ Memory Specs total used free shared buff/cache available Mem: 47G 1.3G 41G 206M 4.3G 45G Swap: 47G 0B 47G
No LSB modules are available.
# Notebook-wide settings, exported as environment variables so that the
# %%bash cells below can read them.

# Working directory: the FastA is downloaded here and RepeatMasker is run here.
%env wd=/home/sam/analyses/20190626_Pgenerosa_v074_repeatmasker_all
# URL of the genome FastA (used by the wget download cell).
%env fasta_url=https://owl.fish.washington.edu/halfshell/genomic-databank/Pgenerosa_v074.fa
# File name of the genome FastA inside the working directory.
%env fasta=Pgenerosa_v074.fa
# Path to the RepeatMasker executable.
%env repeat_masker=/home/shared/RepeatMasker-4.0.7/RepeatMasker
# Value passed to RepeatMasker's -par option.
%env cpus=16
# Expected MD5 checksum of the FastA file.
# Checksum taken from https://github.com/RobertsLab/resources/wiki/Genomic-Resources
%env checksum=32976550b9030126c07920d5f2db179c
env: wd=/home/sam/analyses/20190626_Pgenerosa_v074_repeatmasker_all env: fasta_url=https://owl.fish.washington.edu/halfshell/genomic-databank/Pgenerosa_v074.fa env: fasta=Pgenerosa_v074.fa env: repeat_masker=/home/shared/RepeatMasker-4.0.7/RepeatMasker env: cpus=16 env: checksum=32976550b9030126c07920d5f2db179c
%%bash
# Create the working directory (and any missing parent directories).
# Quoted so a path containing spaces or glob characters stays one argument.
mkdir --parents "${wd}"
Info on FastA file is here: https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome
%%bash
# Copy the genome FastA from the "owl" host into the working directory,
# then list the directory contents.

# Stop if the working directory is unavailable; otherwise rsync would write
# the file into whatever directory the cell happened to start in.
cd "${wd}" || exit 1

rsync \
  --archive \
  --verbose \
  "owl:/volume1/web/halfshell/genomic-databank/${fasta}" .

ls -lh
receiving incremental file list Pgenerosa_v074.fa sent 30 bytes received 958,176,954 bytes 3,809,848.84 bytes/sec total size is 958,059,901 speedup is 1.00 total 914M -rw-rw-rw- 1 sam users 914M Jun 26 08:49 Pgenerosa_v074.fa
%%bash
# Download the genome FastA over HTTPS into the working directory, report
# how long the download took, then list the directory contents.
#
# `time` must be on the same logical line as the command it measures. With
# `time` alone on its own line, bash times an empty command and runs wget
# untimed. The trailing backslash joins the two lines.
#
# NOTE(review): an earlier cell also fetches this file via rsync. If it is
# already present, wget is expected to save a numbered copy (e.g. ".1")
# rather than overwrite it - confirm whether both download cells are meant
# to be run.
time \
wget "${fasta_url}" \
  --quiet \
  --directory-prefix="${wd}"

ls -lh "${wd}"
The original MD5 checksum is taken from the GitHub Genomic Resources page linked above.
Use md5sum to generate a checksum from the downloaded FastA file and awk to print the first field (i.e. the checksum value). This is saved to the variable: dl_md5
Then, check for differences between the two variables (md5 and dl_md5).
No output confirms that there is no difference.
%%bash
# Verify the downloaded FastA against the expected MD5 checksum.
# The cell magic has to be alone on the first line and each command on its
# own line; when everything is on one line the commands are treated as
# arguments to the magic instead of being run as a script.

# Expected checksum (from the notebook's environment settings).
md5="${checksum}"

# Checksum of the downloaded file; awk keeps only the first field, dropping
# the file name that md5sum prints after the hash.
dl_md5="$(md5sum "${wd}/${fasta}" | awk '{ print $1 }')"

# Prints nothing when the two values are identical.
diff <(echo "$md5") <(echo "$dl_md5")
%%bash
# Rewrite the FastA in place, keeping only the first two "_"-delimited fields
# of each line. Lines that contain no "_" are passed through unchanged by cut.
# NOTE(review): presumably this shortens the sequence header names - confirm.

# Stop if the working directory is unavailable rather than editing files in
# whatever directory the cell happened to start in.
cd "${wd}" || exit 1

# Write to a temporary file named after the FastA, and only replace the
# original if cut succeeded, so a failed run cannot clobber the genome file
# with partial output.
cut -f 1-2 -d "_" "${fasta}" > "${fasta}.tmp" || exit 1
mv "${fasta}.tmp" "${fasta}"

ls -ltrh
total 914M -rw-rw-r-- 1 sam sam 914M Jun 26 14:43 Pgenerosa_v074.fa
-species "all"
: Sets species to all
-par ${cpus}
: Use n CPU threads
-gff
: Create GFF output file (in addition to default files)
-excln
: Adjusts output table calculations to exclude sequence runs of >=20 Ns/Xs. Useful for draft genome assemblies.
1>
: Shell redirection (not a RepeatMasker option). Send stdout to file instead of printing to notebook.
2>
: Shell redirection (not a RepeatMasker option). Send stderr to file instead of printing to notebook.
%%bash
# Run RepeatMasker on the genome FastA from inside the working directory,
# timing the run, then send a notification email.

# Stop if the working directory is unavailable; otherwise RepeatMasker would
# run (and the email would be sent) from the wrong directory.
cd "${wd}" || exit 1

# RepeatMasker's own stdout/stderr are captured in files in the working
# directory. The report from `time` is not part of the command's stderr, so
# it still appears in the notebook output.
time \
"${repeat_masker}" \
  "${fasta}" \
  -species "all" \
  -par "${cpus}" \
  -gff \
  -excln \
  1> stdout.out \
  2> stderr.err

# Insert the job name into the Subject line of the mail template and send it.
# This runs whether or not RepeatMasker succeeded.
sed '/^Subject:/ s/ / repeatmasker_pgenv074_all JOB COMPLETE/' ~/.default-subject.mail | msmtp "$EMAIL"
real 2023m59.832s user 31453m4.236s sys 167m40.112s
%%bash
# List the working directory contents with human-readable sizes.
# Quoted so a path with spaces stays one argument, and so an empty ${wd}
# produces an error instead of silently listing the current directory.
ls -lh "${wd}"
total 2.2G -rw-rw-r-- 1 sam sam 914M Jun 26 14:43 Pgenerosa_v074.fa -rw-rw-r-- 1 sam sam 170M Jun 28 00:27 Pgenerosa_v074.fa.cat.gz -rw-rw-r-- 1 sam sam 917M Jun 28 00:27 Pgenerosa_v074.fa.masked -rw-rw-r-- 1 sam sam 91M Jun 28 00:27 Pgenerosa_v074.fa.out -rw-rw-r-- 1 sam sam 60M Jun 28 00:27 Pgenerosa_v074.fa.out.gff -rw-rw-r-- 1 sam sam 2.4K Jun 28 00:27 Pgenerosa_v074.fa.tbl -rw-rw-r-- 1 sam sam 0 Jun 26 14:43 stderr.err -rw-rw-r-- 1 sam sam 3.2M Jun 28 00:27 stdout.out
%%bash
# Print the RepeatMasker summary table (.tbl) for the genome FastA.
# Quoted so a path containing spaces stays one argument.
cat "${wd}/${fasta}.tbl"
================================================== file name: Pgenerosa_v074.fa sequences: 18 total length: 942353201 bp (784808881 bp excl N/X-runs) GC level: 33.78 % bases masked: 65221692 bp ( 8.31 %) ================================================== number of length percentage elements* occupied of sequence -------------------------------------------------- Retroelements 204336 32863590 bp 4.19 % SINEs: 127691 15752737 bp 2.01 % Penelope 2382 279223 bp 0.04 % LINEs: 49426 11965761 bp 1.52 % CRE/SLACS 453 37114 bp 0.00 % L2/CR1/Rex 13913 2779414 bp 0.35 % R1/LOA/Jockey 3341 1189171 bp 0.15 % R2/R4/NeSL 1211 165338 bp 0.02 % RTE/Bov-B 9983 2559753 bp 0.33 % L1/CIN4 6194 1146568 bp 0.15 % LTR elements: 27219 5145092 bp 0.66 % BEL/Pao 1918 317492 bp 0.04 % Ty1/Copia 4335 355225 bp 0.05 % Gypsy/DIRS1 16012 3831098 bp 0.49 % Retroviral 2945 204333 bp 0.03 % DNA transposons 89437 12061369 bp 1.54 % hobo-Activator 10103 1142451 bp 0.15 % Tc1-IS630-Pogo 24664 3657788 bp 0.47 % En-Spm 0 0 bp 0.00 % MuDR-IS905 0 0 bp 0.00 % PiggyBac 472 38428 bp 0.00 % Tourist/Harbinger 2582 369771 bp 0.05 % Other (Mirage, 628 39925 bp 0.01 % P-element, Transib) Rolling-circles 0 0 bp 0.00 % Unclassified: 38482 5369675 bp 0.68 % Total interspersed repeats: 50294634 bp 6.41 % Small RNA: 16303 859653 bp 0.11 % Satellites: 10312 1878369 bp 0.24 % Simple repeats: 239752 12742842 bp 1.62 % Low complexity: 31725 1550615 bp 0.20 % ================================================== * most repeats fragmented by insertions or deletions have been counted as one element Runs of >=20 X/Ns in query were excluded in % calcs The query species was assumed to be root RepeatMasker Combined Database: Dfam_Consensus-20170127, RepBase-20170127 run with rmblastn version 2.6.0+
rsync to my folder on Gannet
%%bash
# Copy the whole analysis directory to the "gannet" host.

# Stop if the analyses directory is unavailable: the source path below is
# relative, so from any other directory rsync would look in the wrong place.
cd /home/sam/analyses/ || exit 1

# --relative keeps the source path as given on the command line, so the
# analysis directory is recreated by name underneath the destination.
rsync \
  --archive \
  --verbose \
  --progress \
  --relative \
  ./20190626_Pgenerosa_v074_repeatmasker_all \
  gannet:/volume2/web/Atumefaciens
sending incremental file list sent 288 bytes received 16 bytes 608.00 bytes/sec total size is 2,258,210,520 speedup is 7,428,324.08