%%bash
# Record the run environment: date, OS release, hostname, CPU details and
# memory, with the sections set off by dashed rules and blank lines.
rule() { printf '%s\n' "------------"; }
blank() { printf '\n'; }

printf '%s\n' "TODAY'S DATE:"
date
rule
blank

# Operating system info
lsb_release -a
blank
rule

printf '%s\n' "HOSTNAME: "
hostname
blank
rule

printf '%s\n' "Computer Specs:"
blank
lscpu
blank
rule
blank

printf '%s\n' "Memory Specs"
blank
free -mh
TODAY'S DATE: Wed Aug 22 14:10:02 PDT 2018 ------------ Distributor ID: Ubuntu Description: Ubuntu 16.04.5 LTS Release: 16.04 Codename: xenial ------------ HOSTNAME: roadrunner ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 16 On-line CPU(s) list: 0-15 Thread(s) per core: 2 Core(s) per socket: 4 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 26 Model name: Intel(R) Xeon(R) CPU E5520 @ 2.27GHz Stepping: 5 CPU MHz: 1596.000 CPU max MHz: 2394.0000 CPU min MHz: 1596.0000 BogoMIPS: 4521.80 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 8192K NUMA node0 CPU(s): 0-15 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts nopl xtopology nonstop_tsc aperfmperf eagerfpu pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm dca sse4_1 sse4_2 popcnt lahf_lm kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida ------------ Memory Specs total used free shared buff/cache available Mem: 47G 1.1G 44G 78M 1.3G 45G Swap: 47G 0B 47G
No LSB modules are available.
%%bash
# Create the genome data directory and one analysis directory per
# RepeatMasker run. -p makes the cell safe to re-run (an existing directory
# is not an error) and creates any missing parent directories.
mkdir -p -- \
  /home/sam/data/Cvirginica_genome \
  /home/sam/analyses/20180822_virginica_repeatmasker_all \
  /home/sam/analyses/20180822_virginica_repeatmasker_Cgigas \
  /home/sam/analyses/20180822_virginica_repeatmasker_Cvirginica \
  /home/sam/analyses/20180822_virginica_repeatmasker_defaults
Info on FastA file is here: https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-1
%%bash
# Download Cvirginica_v300.fa (timed) into the genome data directory,
# then list the directory contents.
fasta_url="http://owl.fish.washington.edu/halfshell/genomic-databank/Cvirginica_v300.fa"
genome_dir="/home/sam/data/Cvirginica_genome/"

# -q is the short form of --quiet; -P of --directory-prefix.
time wget -q -P "$genome_dir" "$fasta_url"

ls -lh "$genome_dir"
total 662M -rw-rw-r-- 1 sam sam 662M Jun 7 14:40 Cvirginica_v300.fa
real 0m8.624s user 0m0.504s sys 0m3.072s
Original MD5 checksum taken from GitHub Genomic Resource linked above.
Use `md5sum` to generate a checksum from the downloaded FastA file and `awk` to print the first field (i.e. the checksum value). The result is saved to the variable `dl_md5`.
Then, check for differences between the two variables.
No output confirms no difference.
%%bash
# Compare the expected MD5 checksum with the one computed from the
# downloaded FastA; diff prints nothing when the two values match.
md5=f9135e323583dc77fc726e9df2677a32
fasta=/home/sam/data/Cvirginica_genome/Cvirginica_v300.fa

# awk keeps only the first field of the md5sum output (the checksum).
dl_md5=$(md5sum "$fasta" | awk '{ print $1 }')

diff <(printf '%s\n' "$md5") <(printf '%s\n' "$dl_md5")
-species "all"
: Sets species to all
-par 15
: Use 15 CPU threads
-gff
: Create GFF output file (in addition to default files)
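-excln
: Adjusts output table calculations to exclude sequence runs of >=20 Ns/Xs (the .tbl output reports "Runs of >=20 X/Ns in query were excluded in % calcs"). Useful for draft genome assemblies.
1>
: Shell redirection (not a RepeatMasker option): send stdout to file instead of printing to notebook.
2>
: Shell redirection (not a RepeatMasker option): send stderr to file instead of printing to notebook.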
%%bash
# Run RepeatMasker on Cvirginica_v300.fa with -species "all", move the
# results into the analysis directory, then send a completion e-mail.
wd="/home/sam/analyses/20180822_virginica_repeatmasker_all"
genome_dir="/home/sam/data/Cvirginica_genome"

# The find/mv below collects files from the current directory, so stop here
# if the cd fails rather than sweeping up whatever directory the cell
# happened to start in.
cd "$genome_dir" || exit 1

# stdout/stderr of RepeatMasker go straight to files in the analysis directory.
time \
/home/shared/RepeatMasker-4.0.7/RepeatMasker \
"$genome_dir"/Cvirginica_v300.fa \
-species "all" \
-par 15 \
-gff \
-excln \
1> "$wd"/stdout.out \
2> "$wd"/stderr.err

# Move everything except the input FastA into the analysis directory.
# Without -mindepth 1, find also matches "." itself and mv fails with
# "cannot move './' ... Device or resource busy". -maxdepth 1 moves
# top-level entries whole instead of descending into them.
find . -mindepth 1 -maxdepth 1 -not -name Cvirginica_v300.fa -exec mv -- '{}' "$wd" \;

# Insert the job name at the first space of the Subject: line of the mail
# template and pipe the message to msmtp for delivery to $EMAIL.
sed '/^Subject:/ s/ / repeatmasker_virginica_all JOB COMPLETE/' ~/.default-subject.mail | msmtp "$EMAIL"
real 1806m14.965s user 27975m51.556s sys 137m10.360s mv: cannot move './' to '/home/sam/analyses/20180822_virginica_repeatmasker_all/.': Device or resource busy
%%bash
# List the genome directory to check what is left after the RepeatMasker outputs were moved out.
ls /home/sam/data/Cvirginica_genome/
Cvirginica_v300.fa
%%bash
# List the files collected in the analysis directory for the -species "all" run.
ls /home/sam/analyses/20180822_virginica_repeatmasker_all/
Cvirginica_v300.fa.cat.gz Cvirginica_v300.fa.masked Cvirginica_v300.fa.out Cvirginica_v300.fa.out.gff Cvirginica_v300.fa.tbl stderr.err stdout.out
-species "crassostrea gigas"
: Sets species to Crassostrea gigas
-par 15
: Use 15 CPU threads
-gff
: Create GFF output file (in addition to default files)
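-excln
: Adjusts output table calculations to exclude sequence runs of >=20 Ns/Xs (the .tbl output reports "Runs of >=20 X/Ns in query were excluded in % calcs"). Useful for draft genome assemblies.
1>
: Shell redirection (not a RepeatMasker option): send stdout to file instead of printing to notebook.
2>
: Shell redirection (not a RepeatMasker option): send stderr to file instead of printing to notebook.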
%%bash
# Run RepeatMasker on Cvirginica_v300.fa with -species "crassostrea gigas",
# move the results into the analysis directory, then send a completion e-mail.
wd="/home/sam/analyses/20180822_virginica_repeatmasker_Cgigas"
genome_dir="/home/sam/data/Cvirginica_genome"

# The find/mv below collects files from the current directory, so stop here
# if the cd fails rather than sweeping up whatever directory the cell
# happened to start in.
cd "$genome_dir" || exit 1

# stdout/stderr of RepeatMasker go straight to files in the analysis directory.
time \
/home/shared/RepeatMasker-4.0.7/RepeatMasker \
"$genome_dir"/Cvirginica_v300.fa \
-species "crassostrea gigas" \
-par 15 \
-gff \
-excln \
1> "$wd"/stdout.out \
2> "$wd"/stderr.err

# Move everything except the input FastA into the analysis directory.
# Without -mindepth 1, find also matches "." itself and mv fails with
# "cannot move './' ... Device or resource busy". -maxdepth 1 moves
# top-level entries whole instead of descending into them.
find . -mindepth 1 -maxdepth 1 -not -name Cvirginica_v300.fa -exec mv -- '{}' "$wd" \;

# Insert the job name at the first space of the Subject: line of the mail
# template and pipe the message to msmtp for delivery to $EMAIL.
sed '/^Subject:/ s/ / repeatmasker_virginica_gigas JOB COMPLETE/' ~/.default-subject.mail | msmtp "$EMAIL"
real 161m24.668s user 1801m15.348s sys 125m13.016s mv: cannot move './' to '/home/sam/analyses/20180822_virginica_repeatmasker_Cgigas/.': Device or resource busy
%%bash
# List the genome directory to check what is left after the RepeatMasker outputs were moved out.
ls /home/sam/data/Cvirginica_genome/
Cvirginica_v300.fa
%%bash
# List the files collected in the analysis directory for the -species "crassostrea gigas" run.
ls /home/sam/analyses/20180822_virginica_repeatmasker_Cgigas
Cvirginica_v300.fa.cat.gz Cvirginica_v300.fa.masked Cvirginica_v300.fa.out Cvirginica_v300.fa.out.gff Cvirginica_v300.fa.tbl stderr.err stdout.out
-species "crassostrea virginica"
: Sets species to Crassostrea virginica
-par 15
: Use 15 CPU threads
-gff
: Create GFF output file (in addition to default files)
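-excln
: Adjusts output table calculations to exclude sequence runs of >=20 Ns/Xs (the .tbl output reports "Runs of >=20 X/Ns in query were excluded in % calcs"). Useful for draft genome assemblies.
1>
: Shell redirection (not a RepeatMasker option): send stdout to file instead of printing to notebook.
2>
: Shell redirection (not a RepeatMasker option): send stderr to file instead of printing to notebook.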
%%bash
# Run RepeatMasker on Cvirginica_v300.fa with -species "crassostrea virginica",
# move the results into the analysis directory, then send a completion e-mail.
wd="/home/sam/analyses/20180822_virginica_repeatmasker_Cvirginica"
genome_dir="/home/sam/data/Cvirginica_genome"

# The find/mv below collects files from the current directory, so stop here
# if the cd fails rather than sweeping up whatever directory the cell
# happened to start in.
cd "$genome_dir" || exit 1

# stdout/stderr of RepeatMasker go straight to files in the analysis directory.
time \
/home/shared/RepeatMasker-4.0.7/RepeatMasker \
"$genome_dir"/Cvirginica_v300.fa \
-species "crassostrea virginica" \
-par 15 \
-gff \
-excln \
1> "$wd"/stdout.out \
2> "$wd"/stderr.err

# Move everything except the input FastA into the analysis directory.
# Without -mindepth 1, find also matches "." itself and mv fails with
# "cannot move './' ... Device or resource busy". -maxdepth 1 moves
# top-level entries whole instead of descending into them.
find . -mindepth 1 -maxdepth 1 -not -name Cvirginica_v300.fa -exec mv -- '{}' "$wd" \;

# Insert the job name at the first space of the Subject: line of the mail
# template and pipe the message to msmtp for delivery to $EMAIL.
sed '/^Subject:/ s/ / repeatmasker_virginica_virginica JOB COMPLETE/' ~/.default-subject.mail | msmtp "$EMAIL"
real 88m45.684s user 911m23.604s sys 139m47.128s mv: cannot move './' to '/home/sam/analyses/20180822_virginica_repeatmasker_Cvirginica/.': Device or resource busy
%%bash
# List the genome directory to check what is left after the RepeatMasker outputs were moved out.
ls /home/sam/data/Cvirginica_genome/
Cvirginica_v300.fa
%%bash
# List the files collected in the analysis directory for the -species "crassostrea virginica" run.
ls /home/sam/analyses/20180822_virginica_repeatmasker_Cvirginica
Cvirginica_v300.fa.cat.gz Cvirginica_v300.fa.masked Cvirginica_v300.fa.out Cvirginica_v300.fa.out.gff Cvirginica_v300.fa.tbl stderr.err stdout.out
-par 15
: Use 15 CPU threads
-gff
: Create GFF output file (in addition to default files)
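-excln
: Adjusts output table calculations to exclude sequence runs of >=20 Ns/Xs (the .tbl output reports "Runs of >=20 X/Ns in query were excluded in % calcs"). Useful for draft genome assemblies.
1>
: Shell redirection (not a RepeatMasker option): send stdout to file instead of printing to notebook.
2>
: Shell redirection (not a RepeatMasker option): send stderr to file instead of printing to notebook.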
%%bash
# Run RepeatMasker on Cvirginica_v300.fa without a -species option, move the
# results into the analysis directory, then send a completion e-mail.
wd="/home/sam/analyses/20180822_virginica_repeatmasker_defaults"
genome_dir="/home/sam/data/Cvirginica_genome"

# The find/mv below collects files from the current directory, so stop here
# if the cd fails rather than sweeping up whatever directory the cell
# happened to start in.
cd "$genome_dir" || exit 1

# stdout/stderr of RepeatMasker go straight to files in the analysis directory.
time \
/home/shared/RepeatMasker-4.0.7/RepeatMasker \
"$genome_dir"/Cvirginica_v300.fa \
-par 15 \
-gff \
-excln \
1> "$wd"/stdout.out \
2> "$wd"/stderr.err

# Move everything except the input FastA into the analysis directory.
# Without -mindepth 1, find also matches "." itself and mv fails with
# "cannot move './' ... Device or resource busy". -maxdepth 1 moves
# top-level entries whole instead of descending into them.
find . -mindepth 1 -maxdepth 1 -not -name Cvirginica_v300.fa -exec mv -- '{}' "$wd" \;

# Insert the job name at the first space of the Subject: line of the mail
# template and pipe the message to msmtp for delivery to $EMAIL.
sed '/^Subject:/ s/ / repeatmasker_virginica_defaults JOB COMPLETE/' ~/.default-subject.mail | msmtp "$EMAIL"
real 148m33.929s user 1883m54.940s sys 175m28.320s mv: cannot move './' to '/home/sam/analyses/20180822_virginica_repeatmasker_defaults/.': Device or resource busy
%%bash
# List the genome directory to check what is left after the RepeatMasker outputs were moved out.
ls /home/sam/data/Cvirginica_genome/
Cvirginica_v300.fa
%%bash
# List the files collected in the analysis directory for the run without a -species option.
ls /home/sam/analyses/20180822_virginica_repeatmasker_defaults
Cvirginica_v300.fa.cat.gz Cvirginica_v300.fa.masked Cvirginica_v300.fa.out Cvirginica_v300.fa.out.gff Cvirginica_v300.fa.tbl stderr.err stdout.out
%%bash
# Print the .tbl file from the -species "all" run.
cat /home/sam/analyses/20180822_virginica_repeatmasker_all/Cvirginica_v300.fa.tbl
================================================== file name: Cvirginica_v300.fa sequences: 11 total length: 684741128 bp (684675328 bp excl N/X-runs) GC level: 34.83 % bases masked: 113771462 bp ( 16.62 %) ================================================== number of length percentage elements* occupied of sequence -------------------------------------------------- Retroelements 97003 27946871 bp 4.08 % SINEs: 48145 9242559 bp 1.35 % Penelope 1429 256929 bp 0.04 % LINEs: 27022 10570154 bp 1.54 % CRE/SLACS 28 2219 bp 0.00 % L2/CR1/Rex 2160 316660 bp 0.05 % R1/LOA/Jockey 3058 386611 bp 0.06 % R2/R4/NeSL 511 226938 bp 0.03 % RTE/Bov-B 7377 3276312 bp 0.48 % L1/CIN4 1331 95476 bp 0.01 % LTR elements: 21836 8134158 bp 1.19 % BEL/Pao 1807 936488 bp 0.14 % Ty1/Copia 3046 296183 bp 0.04 % Gypsy/DIRS1 12789 6060883 bp 0.89 % Retroviral 2369 152228 bp 0.02 % DNA transposons 180693 29492426 bp 4.31 % hobo-Activator 12869 1114188 bp 0.16 % Tc1-IS630-Pogo 17233 2485049 bp 0.36 % En-Spm 0 0 bp 0.00 % MuDR-IS905 0 0 bp 0.00 % PiggyBac 2388 405926 bp 0.06 % Tourist/Harbinger 9302 992476 bp 0.14 % Other (Mirage, 238 15946 bp 0.00 % P-element, Transib) Rolling-circles 0 0 bp 0.00 % Unclassified: 137707 45460608 bp 6.64 % Total interspersed repeats: 102899905 bp 15.03 % Small RNA: 45243 9057873 bp 1.32 % Satellites: 3852 760316 bp 0.11 % Simple repeats: 203542 8946510 bp 1.31 % Low complexity: 26205 1281043 bp 0.19 % ================================================== * most repeats fragmented by insertions or deletions have been counted as one element Runs of >=20 X/Ns in query were excluded in % calcs The query species was assumed to be root RepeatMasker Combined Database: Dfam_Consensus-20170127, RepBase-20170127 run with rmblastn version 2.6.0+
%%bash
# Print the .tbl file from the -species "crassostrea gigas" run.
cat /home/sam/analyses/20180822_virginica_repeatmasker_Cgigas/Cvirginica_v300.fa.tbl
================================================== file name: Cvirginica_v300.fa sequences: 11 total length: 684741128 bp (684675328 bp excl N/X-runs) GC level: 34.83 % bases masked: 93923386 bp ( 13.72 %) ================================================== number of length percentage elements* occupied of sequence -------------------------------------------------- Retroelements 26397 15008601 bp 2.19 % SINEs: 4 722 bp 0.00 % Penelope 675 190160 bp 0.03 % LINEs: 17645 8922188 bp 1.30 % CRE/SLACS 0 0 bp 0.00 % L2/CR1/Rex 70 39188 bp 0.01 % R1/LOA/Jockey 0 0 bp 0.00 % R2/R4/NeSL 4 5110 bp 0.00 % RTE/Bov-B 6194 2718955 bp 0.40 % L1/CIN4 0 0 bp 0.00 % LTR elements: 8748 6085691 bp 0.89 % BEL/Pao 933 788887 bp 0.12 % Ty1/Copia 47 82743 bp 0.01 % Gypsy/DIRS1 6819 4822734 bp 0.70 % Retroviral 0 0 bp 0.00 % DNA transposons 163945 26422122 bp 3.86 % hobo-Activator 7742 720623 bp 0.11 % Tc1-IS630-Pogo 15615 2328538 bp 0.34 % En-Spm 0 0 bp 0.00 % MuDR-IS905 0 0 bp 0.00 % PiggyBac 2246 393498 bp 0.06 % Tourist/Harbinger 8431 876020 bp 0.13 % Other (Mirage, 0 0 bp 0.00 % P-element, Transib) Rolling-circles 0 0 bp 0.00 % Unclassified: 160681 41266796 bp 6.03 % Total interspersed repeats: 82697519 bp 12.08 % Small RNA: 214 40811 bp 0.01 % Satellites: 1396 217317 bp 0.03 % Simple repeats: 216869 9637447 bp 1.41 % Low complexity: 27520 1418990 bp 0.21 % ================================================== * most repeats fragmented by insertions or deletions have been counted as one element Runs of >=20 X/Ns in query were excluded in % calcs The query species was assumed to be crassostrea gigas RepeatMasker Combined Database: Dfam_Consensus-20170127, RepBase-20170127 run with rmblastn version 2.6.0+
%%bash
# Print the .tbl file from the -species "crassostrea virginica" run.
cat /home/sam/analyses/20180822_virginica_repeatmasker_Cvirginica/Cvirginica_v300.fa.tbl
================================================== file name: Cvirginica_v300.fa sequences: 11 total length: 684741128 bp (684675328 bp excl N/X-runs) GC level: 34.83 % bases masked: 46637065 bp ( 6.81 %) ================================================== number of length percentage elements* occupied of sequence -------------------------------------------------- Retroelements 43139 8952068 bp 1.31 % SINEs: 43139 8952068 bp 1.31 % Penelope 0 0 bp 0.00 % LINEs: 0 0 bp 0.00 % CRE/SLACS 0 0 bp 0.00 % L2/CR1/Rex 0 0 bp 0.00 % R1/LOA/Jockey 0 0 bp 0.00 % R2/R4/NeSL 0 0 bp 0.00 % RTE/Bov-B 0 0 bp 0.00 % L1/CIN4 0 0 bp 0.00 % LTR elements: 0 0 bp 0.00 % BEL/Pao 0 0 bp 0.00 % Ty1/Copia 0 0 bp 0.00 % Gypsy/DIRS1 0 0 bp 0.00 % Retroviral 0 0 bp 0.00 % DNA transposons 3538 1564942 bp 0.23 % hobo-Activator 0 0 bp 0.00 % Tc1-IS630-Pogo 0 0 bp 0.00 % En-Spm 0 0 bp 0.00 % MuDR-IS905 0 0 bp 0.00 % PiggyBac 0 0 bp 0.00 % Tourist/Harbinger 0 0 bp 0.00 % Other (Mirage, 0 0 bp 0.00 % P-element, Transib) Rolling-circles 0 0 bp 0.00 % Unclassified: 65151 23982146 bp 3.50 % Total interspersed repeats: 34499156 bp 5.04 % Small RNA: 43353 8992879 bp 1.31 % Satellites: 1 222 bp 0.00 % Simple repeats: 232627 10544162 bp 1.54 % Low complexity: 29762 1561018 bp 0.23 % ================================================== * most repeats fragmented by insertions or deletions have been counted as one element Runs of >=20 X/Ns in query were excluded in % calcs The query species was assumed to be crassostrea virginica RepeatMasker Combined Database: Dfam_Consensus-20170127, RepBase-20170127 run with rmblastn version 2.6.0+
%%bash
# Print the .tbl file from the run without a -species option.
cat /home/sam/analyses/20180822_virginica_repeatmasker_defaults/Cvirginica_v300.fa.tbl
================================================== file name: Cvirginica_v300.fa sequences: 11 total length: 684741128 bp (684675328 bp excl N/X-runs) GC level: 34.83 % bases masked: 13461422 bp ( 1.97 %) ================================================== number of length percentage elements* occupied of sequence -------------------------------------------------- SINEs: 2056 120820 bp 0.02 % ALUs 0 0 bp 0.00 % MIRs 240 14635 bp 0.00 % LINEs: 3408 331585 bp 0.05 % LINE1 240 16835 bp 0.00 % LINE2 728 69177 bp 0.01 % L3/CR1 1369 135234 bp 0.02 % LTR elements: 704 236625 bp 0.03 % ERVL 14 944 bp 0.00 % ERVL-MaLRs 12 892 bp 0.00 % ERV_classI 272 36695 bp 0.01 % ERV_classII 4 206 bp 0.00 % DNA elements: 1088 100026 bp 0.01 % hAT-Charlie 27 1543 bp 0.00 % TcMar-Tigger 142 9891 bp 0.00 % Unclassified: 57 6096 bp 0.00 % Total interspersed repeats: 795152 bp 0.12 % Small RNA: 3698 279669 bp 0.04 % Satellites: 73 5524 bp 0.00 % Simple repeats: 247957 10848509 bp 1.58 % Low complexity: 30084 1536314 bp 0.22 % ================================================== * most repeats fragmented by insertions or deletions have been counted as one element Runs of >=20 X/Ns in query were excluded in % calcs The query species was assumed to be homo sapiens RepeatMasker Combined Database: Dfam_Consensus-20170127, RepBase-20170127 run with rmblastn version 2.6.0+
Files copied to owl/Athaliana outside of notebook due to sudo
requirement.
%%bash
# List the four RepeatMasker analysis directories under /mnt/owl/Athaliana.
ls /mnt/owl/Athaliana/20180822_virginica_repeatmasker_*
/mnt/owl/Athaliana/20180822_virginica_repeatmasker_all: Cvirginica_v300.fa.cat.gz Cvirginica_v300.fa.masked Cvirginica_v300.fa.out Cvirginica_v300.fa.out.gff Cvirginica_v300.fa.tbl stderr.err stdout.out /mnt/owl/Athaliana/20180822_virginica_repeatmasker_Cgigas: Cvirginica_v300.fa.cat.gz Cvirginica_v300.fa.masked Cvirginica_v300.fa.out Cvirginica_v300.fa.out.gff Cvirginica_v300.fa.tbl stderr.err stdout.out /mnt/owl/Athaliana/20180822_virginica_repeatmasker_Cvirginica: Cvirginica_v300.fa.cat.gz Cvirginica_v300.fa.masked Cvirginica_v300.fa.out Cvirginica_v300.fa.out.gff Cvirginica_v300.fa.tbl stderr.err stdout.out /mnt/owl/Athaliana/20180822_virginica_repeatmasker_defaults: Cvirginica_v300.fa.cat.gz Cvirginica_v300.fa.masked Cvirginica_v300.fa.out Cvirginica_v300.fa.out.gff Cvirginica_v300.fa.tbl stderr.err stdout.out