%%bash
# Record when and where this notebook was run: date, OS release, hostname,
# CPU details and memory. Each section is separated by a dashed rule.
printf '%s\n' "TODAY'S DATE:"
date
printf '%s\n\n' "------------"

# Operating system (distribution) information
lsb_release -a
printf '\n%s\n' "------------"

printf '%s\n' "HOSTNAME: "
hostname
printf '\n%s\n' "------------"

printf '%s\n\n' "Computer Specs:"
lscpu
printf '\n%s\n\n' "------------"

printf '%s\n\n' "Memory Specs"
free -mh
TODAY'S DATE: Tue Mar 3 08:05:13 PST 2020 ------------ Distributor ID: Ubuntu Description: Ubuntu 16.04.6 LTS Release: 16.04 Codename: xenial ------------ HOSTNAME: swoose ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 24 On-line CPU(s) list: 0-23 Thread(s) per core: 2 Core(s) per socket: 6 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 44 Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz Stepping: 2 CPU MHz: 2925.931 BogoMIPS: 5851.96 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 12288K NUMA node0 CPU(s): 0-23 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d ------------ Memory Specs total used free shared buff/cache available Mem: 70G 5.6G 51G 626M 14G 63G Swap: 4.7G 0B 4.7G
No LSB modules are available.
Variables set with `%env` are exported to the environment, which makes them good for passing to bash cells.
# Set workding directory
%env wd=/home/sam/analyses/20200228_swoose_olur_v081_fasta_renaming
wd="/home/sam/analyses/20200228_swoose_olur_v081_fasta_renaming"
%env rsync_owl=owl:/volume1/web/halfshell/genomic-databank/
%env wget_command=--directory-prefix=${wd} --quiety --no-directories --no-check-certificate https://owl.fish.washington.edu/halfshell/genomic-databank/
%env og_fasta=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta
%env og_fai=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta.fai
%env og_gff=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff
%env gene_gff=Olurida_v081-20190709.gene.gff
%env genes_fasta=Olurida_v081.bedtools.genes.fasta
%env genes_fai=Olurida_v081.bedtools.genes.fasta.fai
%env final_genes_fasta=Olurida_v081.genes.fasta
%env final_genes_fai=Olurida_v081.genes.fasta.fai
%env getfasta=/home/sam/programs/bedtools-2.28.0/bin/fastaFromBed
%env samtools=/home/sam/programs/samtools-1.9/samtools
env: wd=/home/sam/analyses/20200228_swoose_olur_v081_fasta_renaming env: rsync_owl=owl:/volume1/web/halfshell/genomic-databank/ env: wget_command=--directory-prefix=$/home/sam/analyses/20200228_swoose_olur_v081_fasta_renaming --quiety --no-directories --no-check-certificate https://owl.fish.washington.edu/halfshell/genomic-databank/ env: og_fasta=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta env: og_fai=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta.fai env: og_gff=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff env: gene_gff=Olurida_v081-20190709.gene.gff env: genes_fasta=Olurida_v081.bedtools.genes.fasta env: genes_fai=Olurida_v081.bedtools.genes.fasta.fai env: final_genes_fasta=Olurida_v081.genes.fasta env: final_genes_fai=Olurida_v081.genes.fasta.fai env: getfasta=/home/sam/programs/bedtools-2.28.0/bin/fastaFromBed env: samtools=/home/sam/programs/samtools-1.9/samtools
%%bash
# Create the working directory (and any missing parents).
# Quoted so a path containing spaces or glob characters stays one argument;
# `--` stops option parsing in case the path starts with a dash.
mkdir --parents -- "${wd}"
# NOTE(review): `{wd}` is IPython variable expansion, not bash syntax, so this
# presumably ran as the IPython `cd` automagic in its own cell (changing the
# notebook's working directory for later cells) - confirm the cell boundary.
cd {wd}
/home/sam/analyses/20200228_swoose_olur_v081_fasta_renaming
Taken from: https://owl.fish.washington.edu/halfshell/genomic-databank/
%%bash
# Download the two input GFFs from the server given by ${rsync_owl} into the
# current directory.
# The file names come from the environment variables set earlier in the
# notebook instead of being repeated here, so the names cannot drift apart.
files_array=("${og_gff}" "${gene_gff}")

for file in "${files_array[@]}"; do
  # Stop at the first failed transfer so later cells do not run on
  # missing or partial files.
  rsync \
    --archive \
    --progress \
    --verbose \
    "${rsync_owl}${file}" \
    . \
    || { printf 'rsync failed for %s\n' "${file}" >&2; exit 1; }
done

ls -lh
receiving incremental file list Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff 3,104,658,743 100% 24.69MB/s 0:01:59 (xfr#1, to-chk=0/1) sent 30 bytes received 3,105,037,889 bytes 24,353,238.58 bytes/sec total size is 3,104,658,743 speedup is 1.00 receiving incremental file list Olurida_v081-20190709.gene.gff 9,248,086 100% 38.18MB/s 0:00:00 (xfr#1, to-chk=0/1) sent 30 bytes received 9,249,332 bytes 2,642,674.86 bytes/sec total size is 9,248,086 speedup is 1.00 total 3.0G -rw-r--r-- 1 sam users 8.9M Jul 16 2019 Olurida_v081-20190709.gene.gff -rw-rw-r-- 1 sam users 2.9G Dec 13 21:30 Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff
%%bash
# Pull the FastA sequences out of the GFF.
# Find the first line in the file that begins with ">" and print from there
# to the end of the file. --quiet suppresses sed's automatic printing; 'p'
# prints the lines in the matched range, including the first matching line.
# Expansions are quoted so file names are passed as single arguments.
sed --quiet '/^>/,$p' -- "${og_gff}" > "${og_fasta}"
%%bash
# Sanity check: the number of FastA headers (lines starting with ">") should
# be the same in the source GFF and in the extracted FastA.
for fasta_source in "${og_gff}" "${og_fasta}"; do
  grep --with-filename --count "^>" -- "${fasta_source}"
done
Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff:159429 Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:159429
%%bash
# Index the genome FastA.
# With no region argument, `samtools faidx` writes the index itself to
# "<fasta>.fai" and prints nothing to stdout, so redirecting stdout to
# ${og_fai} did nothing useful; it only appeared to work because ${og_fai}
# is that same "<fasta>.fai" path. The redirect is dropped.
"${samtools}" faidx "${og_fasta}"

# Peek at the index (${og_fai} must be "${og_fasta}.fai").
head -- "${og_fai}"
Contig56127 12532 13 60 61 Contig81695 22750 12767 60 61 Contig130560 1525 35911 60 61 Contig155059 2121 37476 60 61 Contig61093 10407 39646 60 61 Contig79811 2248 50240 60 61 Contig89862 14043 52539 60 61 Contig1111 28792 66829 60 61 Contig160984 4700 96115 60 61 Contig214118 1068 100908 60 61
Splits input FastA based on GFF coordinates
%%bash
# Extract one FastA record per feature in the gene GFF from the genome FastA.
# Expansions are quoted so each path is passed as a single argument.
"${getfasta}" -fi "${og_fasta}" -bed "${gene_gff}" > "${genes_fasta}"

# Show the first few headers of the result.
grep ">" -- "${genes_fasta}" | head
>Contig61093:7492-7946 >Contig1111:24967-28696 >Contig214118:200-926 >Contig58217:9735-11541 >Contig2046:2294-18394 >Contig9540:4302-10179 >Contig52254:8907-11733 >Contig36645:2184-3340 >Contig3008:531-6482 >Contig67269:2368-12743
%%bash
#######################################
# Rename FastA headers of the form ">scaffold:start-end" to gene names.
#
# Replaces the earlier approach of generating one sed substitution per gene
# and applying all of them to every line of the FastA:
#   - a stray trailing `done` made bash report a syntax error once the loop
#     had finished;
#   - the substitutions were unanchored, so "Contig1:100-200" could also
#     match the start of "Contig1:100-2000";
#   - every line was tested against every rule, which is why it was slow.
# Here the old-name -> new-name table is built once and each header line is
# looked up by exact match; sequence lines are passed through untouched.
#
# Arguments:
#   $1 - gene GFF. The first line is treated as a header and skipped, as are
#        any other lines starting with "#". Columns are tab-separated;
#        scaffold, start and end are columns 1, 4 and 5.
#   $2 - FastA whose headers are ">scaffold:(start-1)-end".
# Outputs:
#   The FastA with renamed headers, on stdout.
#######################################
rename_fasta_headers() {
  local gff=$1
  local fasta=$2

  awk '
    # First file: build the lookup table from the GFF.
    FILENAME == ARGV[1] {
      if (FNR == 1 || /^#/) {
        next
      }
      split($0, col, "\t")
      # Splitting on "=" and ";" puts the new ID in the 4th piece
      # (same extraction as the original awk -F"[=;]" {print $4}).
      split($0, attr, /[=;]/)
      # Subtract 1 from the start to match the 0-based start in the headers.
      old_id = ">" col[1] ":" (col[4] - 1) "-" col[5]
      # Keep the first name seen for a given coordinate set.
      if (!(old_id in new_id)) {
        new_id[old_id] = ">" attr[4]
      }
      next
    }
    # Second file: only header lines are candidates for renaming.
    /^>/ && ($0 in new_id) {
      $0 = new_id[$0]
    }
    { print }
  ' "${gff}" "${fasta}"
}

# Overwrite (">") rather than append (">>") so re-running the cell does not
# duplicate records in the output.
rename_fasta_headers "${gene_gff}" "${genes_fasta}" > "${final_genes_fasta}"

ls -ltrh
bash: line 20: syntax error near unexpected token `done' bash: line 20: `done'
--------------------------------------------------------------------------- CalledProcessError Traceback (most recent call last) <ipython-input-10-5a4d4adbd931> in <module> ----> 1 get_ipython().run_cell_magic('bash', '', '\n# Array of "old" scaffold names\n# Formats names to match FastA headers.\n# Subtracts "1" from the start position to match bedTools 0-based counting\nmapfile -t orig_scaffold_names < <(awk -F"\\t" \'NR > 1 {print $1":"$4-1"-"$5}\' ${gene_gff})\n\n# Array of new scaffold names\n# Separators set as "=" and ";" to pull out new IDs\n# NR > 1 skips the first line (i.e. header)\nmapfile -t new_scaffold_names < <(awk -F"[=;]" \'NR > 1 {print $4}\' ${gene_gff})\n\n# sed substitution\n# creates sed script to find original scaffold names and replace them with new scafold names\n# and passes to sed via stdin\nfor index in "${!orig_scaffold_names[@]}"\n do\n printf "s/%s/%s/\\n" "${orig_scaffold_names[index]}" "${new_scaffold_names[index]}"\n done | sed --file - "${genes_fasta}" \\\n >> "${final_genes_fasta}"\ndone\n\nls -ltrh\n') ~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell) 2350 with self.builtin_trap: 2351 args = (magic_arg_s, cell) -> 2352 result = fn(*args, **kwargs) 2353 return result 2354 ~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/magics/script.py in named_script_magic(line, cell) 140 else: 141 line = script --> 142 return self.shebang(line, cell) 143 144 # write a basic docstring: </home/sam/programs/minicocnda3/lib/python3.6/site-packages/decorator.py:decorator-gen-110> in shebang(self, line, cell) ~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k) 185 # but it's overkill for just that one bit of state. 
186 def magic_deco(arg): --> 187 call = lambda f, *a, **k: f(*a, **k) 188 189 if callable(arg): ~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/magics/script.py in shebang(self, line, cell) 243 sys.stderr.flush() 244 if args.raise_error and p.returncode!=0: --> 245 raise CalledProcessError(p.returncode, cell, output=out, stderr=err) 246 247 def _run_script(self, p, cell, to_close): CalledProcessError: Command 'b'\n# Array of "old" scaffold names\n# Formats names to match FastA headers.\n# Subtracts "1" from the start position to match bedTools 0-based counting\nmapfile -t orig_scaffold_names < <(awk -F"\\t" \'NR > 1 {print $1":"$4-1"-"$5}\' ${gene_gff})\n\n# Array of new scaffold names\n# Separators set as "=" and ";" to pull out new IDs\n# NR > 1 skips the first line (i.e. header)\nmapfile -t new_scaffold_names < <(awk -F"[=;]" \'NR > 1 {print $4}\' ${gene_gff})\n\n# sed substitution\n# creates sed script to find original scaffold names and replace them with new scafold names\n# and passes to sed via stdin\nfor index in "${!orig_scaffold_names[@]}"\n do\n printf "s/%s/%s/\\n" "${orig_scaffold_names[index]}" "${new_scaffold_names[index]}"\n done | sed --file - "${genes_fasta}" \\\n >> "${final_genes_fasta}"\ndone\n\nls -ltrh\n'' returned non-zero exit status 2.
Hmmm, this threw an error? Weird.
Also, this took a little over 17 hours to run and then this happens??!! Maybe it would've run faster if I told it just to process lines that began with a `>`, which would, theoretically, prevent `sed` from searching every single line in the file?
Ugh. Let's check 'em out to see if things look OK or not.
%%bash
# Sanity check: the renamed FastA should have the same number of headers
# as the bedtools output it was made from.
# Uses the full option name `--with-filename`; the misspelt `--with-filenam`
# only worked because GNU grep accepts unambiguous option prefixes.
for fasta in "${genes_fasta}" "${final_genes_fasta}"; do
  grep --with-filename --count "^>" -- "${fasta}"
done
Olurida_v081.bedtools.genes.fasta:32210 Olurida_v081.genes.fasta:32210
%%bash
# Show the first ten headers of each FastA to compare the naming at each step:
# genome FastA, bedtools output, and the renamed genes FastA.
# Uses the full option name `--with-filename`; the misspelt `--with-filenam`
# only worked because GNU grep accepts unambiguous option prefixes.
for fasta in "${og_fasta}" "${genes_fasta}" "${final_genes_fasta}"; do
  grep --with-filename "^>" -- "${fasta}" | head
done
Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig56127 Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig81695 Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig130560 Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig155059 Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig61093 Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig79811 Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig89862 Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig1111 Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig160984 Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig214118 Olurida_v081.bedtools.genes.fasta:>Contig61093:7492-7946 Olurida_v081.bedtools.genes.fasta:>Contig1111:24967-28696 Olurida_v081.bedtools.genes.fasta:>Contig214118:200-926 Olurida_v081.bedtools.genes.fasta:>Contig58217:9735-11541 Olurida_v081.bedtools.genes.fasta:>Contig2046:2294-18394 Olurida_v081.bedtools.genes.fasta:>Contig9540:4302-10179 Olurida_v081.bedtools.genes.fasta:>Contig52254:8907-11733 Olurida_v081.bedtools.genes.fasta:>Contig36645:2184-3340 Olurida_v081.bedtools.genes.fasta:>Contig3008:531-6482 Olurida_v081.bedtools.genes.fasta:>Contig67269:2368-12743 Olurida_v081.genes.fasta:>OLUR_00020575 Olurida_v081.genes.fasta:>OLUR_00006628 Olurida_v081.genes.fasta:>OLUR_00032161 Olurida_v081.genes.fasta:>OLUR_00019127 Olurida_v081.genes.fasta:>OLUR_00011450 Olurida_v081.genes.fasta:>OLUR_00018391 Olurida_v081.genes.fasta:>OLUR_00011614 Olurida_v081.genes.fasta:>OLUR_00022996 Olurida_v081.genes.fasta:>OLUR_00018754 Olurida_v081.genes.fasta:>OLUR_00017261
Well, the counts and substitutions all look fine, so I guess we're good to go?
%%bash
# Index the renamed genes FastA.
# With no region argument, `samtools faidx` writes the index itself to
# "<fasta>.fai" and prints nothing to stdout, so the previous redirect to
# ${final_genes_fai} was a no-op that only appeared to work because that
# variable is the same "<fasta>.fai" path. The redirect is dropped.
"${samtools}" faidx "${final_genes_fasta}"

# Peek at the index (${final_genes_fai} must be "${final_genes_fasta}.fai").
head -- "${final_genes_fai}"
OLUR_00020575 454 15 454 455 OLUR_00006628 3729 485 3729 3730 OLUR_00032161 726 4230 726 727 OLUR_00019127 1806 4972 1806 1807 OLUR_00011450 16100 6794 16100 16101 OLUR_00018391 5877 22910 5877 5878 OLUR_00011614 2826 28803 2826 2827 OLUR_00022996 1156 31645 1156 1157 OLUR_00018754 5951 32817 5951 5952 OLUR_00017261 10375 38784 10375 10376
%%bash
# Remove the intermediate and downloaded files, keeping only the final genes
# FastA and its index.
# --force: ${genes_fai} is never created by this notebook, so a plain `rm`
# reported "No such file or directory"; missing files are not an error here.
# `--` and quoting keep every name a single, non-option argument.
rm --force -- \
  "${genes_fasta}" \
  "${genes_fai}" \
  "${og_fasta}" \
  "${og_fai}" \
  "${og_gff}" \
  "${gene_gff}"

ls -ltrh
total 213M -rw-rw-r-- 1 sam sam 212M Mar 4 01:22 Olurida_v081.genes.fasta -rw-rw-r-- 1 sam sam 1.2M Mar 4 05:32 Olurida_v081.genes.fasta.fai
rm: cannot remove 'Olurida_v081.bedtools.genes.fasta.fai': No such file or directory