#!/usr/bin/env python
# coding: utf-8

# Jupyter notebook export (IPython magics wrapping bash cells).
#
# Workflow, as performed by the cells below:
#   1. Record system information.
#   2. Set environment variables shared by the bash cells.
#   3. rsync two GFF files into the working directory.
#   4. Extract the FastA portion embedded in the MAKER GFF and index it.
#   5. Extract gene sequences with bedtools (fastaFromBed).
#   6. Rename the gene FastA headers using IDs taken from the gene GFF.
#   7. Index the renamed FastA and remove intermediate files.
#
# Must be run under IPython/Jupyter: get_ipython() is not defined in plain Python.

# In[1]:


get_ipython().run_cell_magic('bash', '', r'''echo "TODAY'S DATE:"
date
echo "------------"
echo ""
#Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
''')


# ### Set variables
# `%env` variables are good for passing to bash cells

# In[2]:


# Set working directory
# (set twice: as an environment variable for the bash cells and as a Python
# variable for the `cd` magic below)
get_ipython().run_line_magic('env', 'wd=/home/sam/analyses/20200228_swoose_olur_v081_fasta_renaming')
wd="/home/sam/analyses/20200228_swoose_olur_v081_fasta_renaming"
get_ipython().run_line_magic('env', 'rsync_owl=owl:/volume1/web/halfshell/genomic-databank/')
# NOTE: wget_command is not referenced by the cells below (files are fetched with rsync).
get_ipython().run_line_magic('env', 'wget_command=--directory-prefix=${wd} --quiet --no-directories --no-check-certificate https://owl.fish.washington.edu/halfshell/genomic-databank/')
get_ipython().run_line_magic('env', 'og_fasta=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta')
get_ipython().run_line_magic('env', 'og_fai=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta.fai')
get_ipython().run_line_magic('env', 'og_gff=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff')
get_ipython().run_line_magic('env', 'gene_gff=Olurida_v081-20190709.gene.gff')
get_ipython().run_line_magic('env', 'genes_fasta=Olurida_v081.bedtools.genes.fasta')
get_ipython().run_line_magic('env', 'genes_fai=Olurida_v081.bedtools.genes.fasta.fai')
get_ipython().run_line_magic('env', 'final_genes_fasta=Olurida_v081.genes.fasta')
get_ipython().run_line_magic('env', 'final_genes_fai=Olurida_v081.genes.fasta.fai')
get_ipython().run_line_magic('env', 'getfasta=/home/sam/programs/bedtools-2.28.0/bin/fastaFromBed')
get_ipython().run_line_magic('env', 'samtools=/home/sam/programs/samtools-1.9/samtools')


# #### Create necessary directories

# In[3]:


get_ipython().run_cell_magic('bash', '', r'''mkdir --parents "${wd}"
''')


# In[4]:


# Change the notebook's working directory so the relative paths used by the
# bash cells below resolve inside ${wd}.
get_ipython().run_line_magic('cd', wd)


# #### Download Olur_v081 genes gff
#
# Taken from: https://owl.fish.washington.edu/halfshell/genomic-databank/

# In[5]:


get_ipython().run_cell_magic('bash', '', r'''# Create array of files from list
files_array=(Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff Olurida_v081-20190709.gene.gff)


for file in "${files_array[@]}"
do
  rsync \
  --archive \
  --progress \
  --verbose \
  "${rsync_owl}${file}" \
  .
done

ls -lh
''')


# ### Extract FastA from MAKER GFF

# In[6]:


get_ipython().run_cell_magic('bash', '', r'''
# Find the first line in the file that begins with ">"
# and print to the end of the file ('p' enables printing the first matching line, which would be skipped by default)
sed --quiet '/^>/,$p' "${og_gff}" > "${og_fasta}"
''')


# #### Compare number of FastA header lines

# In[7]:


get_ipython().run_cell_magic('bash', '', r'''
grep --with-filename --count "^>" "${og_gff}"
grep --with-filename --count "^>" "${og_fasta}"
''')


# #### Create FastA index file

# In[8]:


get_ipython().run_cell_magic('bash', '', r'''
"${samtools}" faidx "${og_fasta}" > "${og_fai}"

head "${og_fai}"
''')


# ### Extract just gene sequences
#
# Splits input FastA based on GFF coordinates

# In[9]:


get_ipython().run_cell_magic('bash', '', r'''"${getfasta}" -fi "${og_fasta}" -bed "${gene_gff}" > "${genes_fasta}"

grep ">" "${genes_fasta}" | head
''')


# ### Change scaffold names and file names

# In[10]:


get_ipython().run_cell_magic('bash', '', r'''
# Rename the FastA headers in a single awk pass.
#
# While reading the gene GFF (first file, tab-delimited):
#   - Build the "old" scaffold name, formatted to match the FastA headers.
#     Subtracts "1" from the start position to match bedTools 0-based counting.
#   - Pull out the new ID: the 4th token when the line is split on "=" and ";".
#   - FNR > 1 skips the first line (i.e. header).
#   - If the same old name occurs more than once, the first new ID is kept.
#
# While reading the genes FastA (second file):
#   - Header lines whose text after ">" exactly matches an old scaffold name
#     are rewritten with the new ID; every other line is printed unchanged.
#     (Assumes headers are exactly ">seqid:start-end", as shown by the
#     "grep | head" check in the previous cell.)
awk -F"\t" '
  FILENAME == ARGV[1] {
    if (FNR > 1) {
      old_name = $1 ":" ($4 - 1) "-" $5
      split($0, tokens, /[=;]/)
      if (!(old_name in new_names)) {
        new_names[old_name] = tokens[4]
      }
    }
    next
  }
  /^>/ {
    header = substr($0, 2)
    if (header in new_names) {
      print ">" new_names[header]
      next
    }
  }
  { print }
' "${gene_gff}" "${genes_fasta}" > "${final_genes_fasta}"

ls -ltrh
''')


# Historical note: the first version of the cell above threw an error and took
# a little over 17 _hours_ to run. It generated one `sed` substitution per gene
# and piped that script to `sed`; the error came from a stray extra `done` after
# the loop, and the run time was presumably because `sed` tried every
# substitution against every single line in the file rather than only lines
# that began with a "`>`". The cell now uses a single lookup-table pass instead.
#
# Let's check 'em out to see if things look OK or not.

# #### Check old and new FastAs to confirm substitutions

# In[11]:


get_ipython().run_cell_magic('bash', '', r'''
grep --with-filename --count "^>" "${genes_fasta}"

grep --with-filename --count "^>" "${final_genes_fasta}"
''')


# In[12]:


get_ipython().run_cell_magic('bash', '', r'''
grep --with-filename "^>" "${og_fasta}" | head

grep --with-filename "^>" "${genes_fasta}" | head

grep --with-filename "^>" "${final_genes_fasta}" | head
''')


# Well, the counts and substitutions all look fine, so I guess we're good to go?

# #### Make FastA index file for new FastA

# In[13]:


get_ipython().run_cell_magic('bash', '', r'''
"${samtools}" faidx "${final_genes_fasta}" > "${final_genes_fai}"

head "${final_genes_fai}"
''')


# ### Cleanup

# In[15]:


get_ipython().run_cell_magic('bash', '', r'''# --force: ${genes_fai} is never created by the cells above, so a plain rm
# would report an error for it.
rm --force -- "${genes_fasta}" "${genes_fai}" "${og_fasta}" "${og_fai}" "${og_gff}" "${gene_gff}"

ls -ltrh
''')