%%bash
# Record when and where this notebook was run: date, OS release, hostname,
# CPU details and memory.

# Print the separator line used between report sections.
divider() { printf '%s\n' "------------"; }

printf '%s\n' "TODAY'S DATE:"
date
divider
printf '\n'

# Operating system release details
lsb_release -a
printf '\n'
divider

printf '%s\n' "HOSTNAME: "
hostname
printf '\n'
divider

printf '%s\n' "Computer Specs:"
printf '\n'
lscpu
printf '\n'
divider
printf '\n'

printf '%s\n' "Memory Specs"
printf '\n'
free -mh
TODAY'S DATE: Mon Aug 5 15:01:13 PDT 2019 ------------ Distributor ID: Ubuntu Description: Ubuntu 16.04.6 LTS Release: 16.04 Codename: xenial ------------ HOSTNAME: swoose ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 24 On-line CPU(s) list: 0-23 Thread(s) per core: 2 Core(s) per socket: 6 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 44 Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz Stepping: 2 CPU MHz: 2925.984 BogoMIPS: 5851.89 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 12288K NUMA node0 CPU(s): 0-23 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d ------------ Memory Specs total used free shared buff/cache available Mem: 70G 9.5G 48G 593M 12G 60G Swap: 4.7G 0B 4.7G
No LSB modules are available.
# Notebook-wide settings, exported with %env so the %%bash cells can read them.
# Keep comments on their own lines: anything after the "=" becomes part of the value.

# Working directory for this analysis (created by the mkdir cell; other cells cd into it)
%env wd=/home/sam/analyses/20190805_Pgenerosa_v070_repeatmasker_all
# URL of the genome FastA, used by the wget download cell
%env fasta_url=https://owl.fish.washington.edu/halfshell/genomic-databank/Pgenerosa_v070.fa
# File name of the genome FastA inside the working directory
%env fasta=Pgenerosa_v070.fa
# Path to the RepeatMasker executable
%env repeat_masker=/home/shared/RepeatMasker-4.0.7/RepeatMasker
# Value passed to RepeatMasker's -par option
%env cpus=23
# Expected MD5 of the FastA, compared against md5sum of the downloaded file.
# Checksum taken from https://github.com/RobertsLab/resources/wiki/Genomic-Resources
%env checksum=de0958fec4b9f8845babf3717ce7168c
env: wd=/home/sam/analyses/20190805_Pgenerosa_v070_repeatmasker_all env: fasta_url=https://owl.fish.washington.edu/halfshell/genomic-databank/Pgenerosa_v070.fa env: fasta=Pgenerosa_v070.fa env: repeat_masker=/home/shared/RepeatMasker-4.0.7/RepeatMasker env: cpus=23 env: checksum=de0958fec4b9f8845babf3717ce7168c
%%bash
# Create the analysis working directory (and any missing parents).
# ${wd:?} aborts with a message instead of running mkdir with no operand
# when wd is unset or empty.
mkdir --parents -- "${wd:?wd is not set}"
Info on FastA file is here: https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome
%%bash
# Copy the genome FastA from the owl server into the working directory.
# Abort if the directory is unset or missing so rsync never writes elsewhere.
cd "${wd:?wd is not set}" || exit 1

rsync \
--archive \
--verbose \
--progress \
"owl:/volume1/web/halfshell/genomic-databank/${fasta:?fasta is not set}" .

echo ""
echo ""
echo "----------------------------------------------------------"
# List the directory to confirm the file arrived.
ls -lh
receiving incremental file list Pgenerosa_v070.fa 2,247,117,885 100% 22.06MB/s 0:01:37 (xfr#1, to-chk=0/1) sent 30 bytes received 2,247,392,295 bytes 22,586,857.54 bytes/sec total size is 2,247,117,885 speedup is 1.00 ---------------------------------------------------------- total 2.1G -rw-r--r-- 1 sam users 2.1G Feb 11 12:13 Pgenerosa_v070.fa
%%bash
# Download the genome FastA over HTTPS into the working directory.
# `time` must share a logical line with the command it measures; on a line by
# itself it times an empty command and wget runs untimed.
# NOTE(review): the rsync cell fetches the same file into ${wd}; confirm only
# one of the two download cells is meant to be run.
time \
wget "${fasta_url:?fasta_url is not set}" \
--quiet \
--directory-prefix="${wd:?wd is not set}"

# List the directory to confirm the download.
ls -lh -- "${wd}"
Original MD5 checksum taken from GitHub Genomic Resource linked above.
Use md5sum to generate a checksum from the downloaded FastA file, and awk to print the first field (i.e. the checksum value). This is saved to the variable: dl_md5
Then, check for differences between the two variables.
No output confirms no difference.
%%bash
# Compare the published MD5 (from ${checksum}) with the MD5 of the downloaded FastA.
md5="${checksum}"
# md5sum prints "<hash>  <file>"; awk keeps only the hash.
dl_md5=$(md5sum "${wd}/${fasta}" | awk '{ print $1 }')
# diff prints nothing and exits 0 when the two values match.
diff <(echo "$md5") <(echo "$dl_md5")
%%bash
# Rewrite the FastA in place, keeping only the first two "_"-delimited fields
# of each line. Lines that contain no "_" are passed through unchanged by cut.
cd "${wd:?wd is not set}" || exit 1

# Write to a temporary file and replace the original only if cut succeeded,
# so a failed cut cannot leave a truncated FastA behind.
cut -f 1-2 -d "_" "${fasta:?fasta is not set}" > "${fasta}.tmp" \
  && mv -- "${fasta}.tmp" "${fasta}"

ls -ltrh
total 2.1G -rw-rw-r-- 1 sam sam 2.1G Aug 5 15:41 Pgenerosa_v070.fa
-species "all"
: Sets species to all
-par ${cpus}
: Use n CPU threads
-gff
: Create GFF output file (in addition to default files)
-excln
: Adjusts output table calculations to exclude sequence runs of >=20 Ns/Xs. Useful for draft genome assemblies.
1>
: Send stdout to file instead of printing to notebook.
2>
: Send stderr to file instead of printing to notebook.
%%bash
# Run RepeatMasker on the genome FastA inside the working directory, then send
# an email notification. stdout and stderr of RepeatMasker go to files; the
# timing report from `time` is still printed to the notebook.
cd "${wd:?wd is not set}" || exit 1

time \
"${repeat_masker:?repeat_masker is not set}" \
"${fasta:?fasta is not set}" \
-species "all" \
-par "${cpus:?cpus is not set}" \
-gff \
-excln \
1> stdout.out \
2> stderr.err
# Capture RepeatMasker's exit status before any other command overwrites $?.
rm_status=$?

# Report the real outcome in the mail subject instead of always "JOB COMPLETE".
if [[ ${rm_status} -eq 0 ]]; then
  subject_suffix="repeatmasker_pgenv070_all JOB COMPLETE"
else
  subject_suffix="repeatmasker_pgenv070_all JOB FAILED (exit ${rm_status})"
fi

# The first space on the Subject: line of the mail template is replaced by
# " <suffix>"; the result is piped to msmtp.
sed "/^Subject:/ s/ / ${subject_suffix}/" ~/.default-subject.mail | msmtp "$EMAIL"

# Exit with RepeatMasker's status so the notebook flags a failed run even when
# the mail was sent successfully.
exit "${rm_status}"
real 0m0.001s user 0m0.000s sys 0m0.000s
--------------------------------------------------------------------------- CalledProcessError Traceback (most recent call last) <ipython-input-9-0125799e4d02> in <module> ----> 1 get_ipython().run_cell_magic('bash', '', '\ncd ${wd}\ntime \\\n${repeat_masker} \\\n${fasta} \\\n-species "all" \\\n-par ${cpus} \\\n-gff \\\n-excln \\\n1> stdout.out \\\n2> stderr.err\n') ~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell) 2350 with self.builtin_trap: 2351 args = (magic_arg_s, cell) -> 2352 result = fn(*args, **kwargs) 2353 return result 2354 ~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/magics/script.py in named_script_magic(line, cell) 140 else: 141 line = script --> 142 return self.shebang(line, cell) 143 144 # write a basic docstring: </home/sam/programs/minicocnda3/lib/python3.6/site-packages/decorator.py:decorator-gen-110> in shebang(self, line, cell) ~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k) 185 # but it's overkill for just that one bit of state. 186 def magic_deco(arg): --> 187 call = lambda f, *a, **k: f(*a, **k) 188 189 if callable(arg): ~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/magics/script.py in shebang(self, line, cell) 243 sys.stderr.flush() 244 if args.raise_error and p.returncode!=0: --> 245 raise CalledProcessError(p.returncode, cell, output=out, stderr=err) 246 247 def _run_script(self, p, cell, to_close): CalledProcessError: Command 'b'\ncd ${wd}\ntime \\\n${repeat_masker} \\\n${fasta} \\\n-species "all" \\\n-par ${cpus} \\\n-gff \\\n-excln \\\n1> stdout.out \\\n2> stderr.err\n'' returned non-zero exit status 127.
%%bash
# List the contents of the working directory.
# ${wd:?} stops an unset wd from silently listing the current directory instead.
ls -lh -- "${wd:?wd is not set}"
total 2.2G -rw-rw-r-- 1 sam sam 914M Jun 26 14:43 Pgenerosa_v074.fa -rw-rw-r-- 1 sam sam 170M Jun 28 00:27 Pgenerosa_v074.fa.cat.gz -rw-rw-r-- 1 sam sam 917M Jun 28 00:27 Pgenerosa_v074.fa.masked -rw-rw-r-- 1 sam sam 91M Jun 28 00:27 Pgenerosa_v074.fa.out -rw-rw-r-- 1 sam sam 60M Jun 28 00:27 Pgenerosa_v074.fa.out.gff -rw-rw-r-- 1 sam sam 2.4K Jun 28 00:27 Pgenerosa_v074.fa.tbl -rw-rw-r-- 1 sam sam 0 Jun 26 14:43 stderr.err -rw-rw-r-- 1 sam sam 3.2M Jun 28 00:27 stdout.out
%%bash
# Print the .tbl summary file that sits next to the FastA in the working directory.
cat -- "${wd:?wd is not set}/${fasta:?fasta is not set}.tbl"
================================================== file name: Pgenerosa_v074.fa sequences: 18 total length: 942353201 bp (784808881 bp excl N/X-runs) GC level: 33.78 % bases masked: 65221692 bp ( 8.31 %) ================================================== number of length percentage elements* occupied of sequence -------------------------------------------------- Retroelements 204336 32863590 bp 4.19 % SINEs: 127691 15752737 bp 2.01 % Penelope 2382 279223 bp 0.04 % LINEs: 49426 11965761 bp 1.52 % CRE/SLACS 453 37114 bp 0.00 % L2/CR1/Rex 13913 2779414 bp 0.35 % R1/LOA/Jockey 3341 1189171 bp 0.15 % R2/R4/NeSL 1211 165338 bp 0.02 % RTE/Bov-B 9983 2559753 bp 0.33 % L1/CIN4 6194 1146568 bp 0.15 % LTR elements: 27219 5145092 bp 0.66 % BEL/Pao 1918 317492 bp 0.04 % Ty1/Copia 4335 355225 bp 0.05 % Gypsy/DIRS1 16012 3831098 bp 0.49 % Retroviral 2945 204333 bp 0.03 % DNA transposons 89437 12061369 bp 1.54 % hobo-Activator 10103 1142451 bp 0.15 % Tc1-IS630-Pogo 24664 3657788 bp 0.47 % En-Spm 0 0 bp 0.00 % MuDR-IS905 0 0 bp 0.00 % PiggyBac 472 38428 bp 0.00 % Tourist/Harbinger 2582 369771 bp 0.05 % Other (Mirage, 628 39925 bp 0.01 % P-element, Transib) Rolling-circles 0 0 bp 0.00 % Unclassified: 38482 5369675 bp 0.68 % Total interspersed repeats: 50294634 bp 6.41 % Small RNA: 16303 859653 bp 0.11 % Satellites: 10312 1878369 bp 0.24 % Simple repeats: 239752 12742842 bp 1.62 % Low complexity: 31725 1550615 bp 0.20 % ================================================== * most repeats fragmented by insertions or deletions have been counted as one element Runs of >=20 X/Ns in query were excluded in % calcs The query species was assumed to be root RepeatMasker Combined Database: Dfam_Consensus-20170127, RepBase-20170127 run with rmblastn version 2.6.0+
rsync to my folder on Gannet
%%bash
# Copy the whole analysis directory to Gannet.
# Abort if the analyses directory is missing: the source path below is relative
# and is sent with --relative, so rsync must not run from any other directory.
cd /home/sam/analyses/ || exit 1

# NOTE(review): the destination folder is named "Atumefaciens" while this
# analysis is of Pgenerosa; confirm this is the intended target.
rsync \
--archive \
--verbose \
--progress \
--relative \
./20190805_Pgenerosa_v070_repeatmasker_all \
gannet:/volume2/web/Atumefaciens
sending incremental file list sent 288 bytes received 16 bytes 608.00 bytes/sec total size is 2,258,210,520 speedup is 7,428,324.08