#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Record when and where the notebook was run: date, OS release, hostname,
# CPU details and memory, each separated by a dashed rule.
get_ipython().run_cell_magic(
    'bash',
    '',
    'echo "TODAY\'S DATE:"\n'
    'date\n'
    'echo "------------"\n'
    'echo ""\n'
    '#Display operating system info\n'
    'lsb_release -a\n'
    'echo ""\n'
    'echo "------------"\n'
    'echo "HOSTNAME: "; hostname \n'
    'echo ""\n'
    'echo "------------"\n'
    'echo "Computer Specs:"\n'
    'echo ""\n'
    'lscpu\n'
    'echo ""\n'
    'echo "------------"\n'
    'echo ""\n'
    'echo "Memory Specs"\n'
    'echo ""\n'
    'free -mh\n',
)


# ### Set variables
# `%env` variables are good for passing to bash cells

# In[2]:


# Set working directory.
# The path is stored twice: as an environment variable for the bash cells and
# as a Python variable for IPython expansion (e.g. the `cd` cell below).
get_ipython().run_line_magic('env', 'wd=/home/sam/analyses/20200228_swoose_olur_v081_fasta_renaming')
wd="/home/sam/analyses/20200228_swoose_olur_v081_fasta_renaming"

# Remote locations of the genomic data bank (rsync target and wget arguments).
get_ipython().run_line_magic('env', 'rsync_owl=owl:/volume1/web/halfshell/genomic-databank/')
# `$wd` is expanded by IPython from the Python variable defined above; the
# previous `${wd}` form left a stray "$" in front of the path. `--quiet` is
# the correct spelling of the wget option (was `--quiety`).
get_ipython().run_line_magic('env', 'wget_command=--directory-prefix=$wd --quiet --no-directories --no-check-certificate  https://owl.fish.washington.edu/halfshell/genomic-databank/')

# Input, intermediate and final file names used by the bash cells.
get_ipython().run_line_magic('env', 'og_fasta=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta')
get_ipython().run_line_magic('env', 'og_fai=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta.fai')
get_ipython().run_line_magic('env', 'og_gff=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff')
get_ipython().run_line_magic('env', 'gene_gff=Olurida_v081-20190709.gene.gff')
get_ipython().run_line_magic('env', 'genes_fasta=Olurida_v081.bedtools.genes.fasta')
get_ipython().run_line_magic('env', 'genes_fai=Olurida_v081.bedtools.genes.fasta.fai')
get_ipython().run_line_magic('env', 'final_genes_fasta=Olurida_v081.genes.fasta')
get_ipython().run_line_magic('env', 'final_genes_fai=Olurida_v081.genes.fasta.fai')

# Absolute paths to the external programs called from the bash cells.
get_ipython().run_line_magic('env', 'getfasta=/home/sam/programs/bedtools-2.28.0/bin/fastaFromBed')
get_ipython().run_line_magic('env', 'samtools=/home/sam/programs/samtools-1.9/samtools')


# #### Create necessary directories

# In[3]:


# Create the working directory (and any missing parent directories) so the
# following cells have somewhere to run.
get_ipython().run_cell_magic(
    'bash',
    '',
    'mkdir --parents ${wd}\n',
)


# In[4]:


# Change the kernel's working directory to the analysis directory so that the
# relative file names used by the bash cells resolve there.
# Written as an explicit magic call: the bare automagic form (`cd {wd}`) is
# not valid Python and prevents this exported script from being parsed.
get_ipython().run_line_magic('cd', wd)


# #### Download Olur_v081 genes gff
# 
# Taken from: https://owl.fish.washington.edu/halfshell/genomic-databank/

# In[5]:


# Copy the two GFF files from the remote data bank into the current directory
# with rsync, one file per loop iteration, then list the directory contents.
get_ipython().run_cell_magic(
    'bash',
    '',
    '# Create array of files from list\n'
    'files_array=(Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff Olurida_v081-20190709.gene.gff)\n'
    '\n'
    '\n'
    'for file in "${files_array[@]}"\n'
    'do\n'
    '  rsync \\\n'
    '  --archive \\\n'
    '  --progress \\\n'
    '  --verbose \\\n'
    '  "${rsync_owl}${file}" \\\n'
    '  .\n'
    'done\n'
    '\n'
    'ls -lh\n',
)


# ### Extract FastA from MAKER GFF

# In[6]:


# Write everything from the first ">" line of the GFF through the end of the
# file into the FastA file.
get_ipython().run_cell_magic(
    'bash',
    '',
    '\n'
    '# Find the first line in the file that begins with ">"\n'
    '# and print to the end of the file (\'p\' enables printing the first matching line, which would be skipped by default)\n'
    'sed --quiet \'/^>/,$p\' ${og_gff} > ${og_fasta}\n',
)


# #### Compare number of FastA header lines

# In[7]:


# Count the lines starting with ">" in the source GFF and in the extracted
# FastA so the two numbers can be compared by eye.
get_ipython().run_cell_magic(
    'bash',
    '',
    '\n'
    'grep --with-filename --count "^>" ${og_gff}\n'
    'grep --with-filename --count "^>" ${og_fasta}\n',
)


# #### Create FastA index file

# In[8]:


# Index the extracted FastA and preview the index.
# `samtools faidx <file>` writes `<file>.fai` on its own, which is the same
# path as ${og_fai}. The earlier `> ${og_fai}` redirect was dropped: it only
# captured an empty stdout, and it left a zero-byte index behind whenever
# samtools could not be started.
get_ipython().run_cell_magic(
    'bash',
    '',
    '\n'
    '${samtools} faidx ${og_fasta}\n'
    '\n'
    'head ${og_fai}\n',
)


# ### Extract just gene sequences
# 
# Splits input FastA based on GFF coordinates

# In[9]:


# Cut the gene regions listed in the genes GFF out of the FastA with
# fastaFromBed, then show the first few headers of the result.
get_ipython().run_cell_magic(
    'bash',
    '',
    '${getfasta} -fi ${og_fasta} -bed ${gene_gff} > ${genes_fasta}\n'
    '\n'
    'grep ">" ${genes_fasta} | head\n',
)


# ### Change scaffold names and file names

# In[10]:


# Rename the FastA headers produced by bedtools ("scaffold:start-end") to the
# IDs stored in the genes GFF, writing the result to ${final_genes_fasta}.
#
# Changes compared with the earlier generated-sed-script version:
#   - removed the stray trailing `done` that made bash abort with a syntax
#     error before `ls` could run
#   - the output is written with `>` instead of `>>`, so re-running the cell
#     no longer appends a second copy of every record
#   - headers are matched as exact strings through a lookup table, so
#     "scaf:10-20" cannot partially match "scaf:10-200", "." is not a regex
#     wildcard, and sequence lines are never scanned for substitutions
get_ipython().run_cell_magic('bash', '', r'''
awk -F"\t" '
  # First file (genes GFF): build the old-header -> new-ID lookup table.
  # FNR > 1 skips the first line (i.e. header).
  FILENAME == ARGV[1] {
    if (FNR > 1) {
      # Format the name to match the FastA headers.
      # Subtracts 1 from the start position to match bedtools 0-based counting.
      old_name = $1 ":" ($4 - 1) "-" $5

      # Split the whole record on "=" and ";" to pull out the new ID,
      # which is the 4th piece.
      split($0, pieces, /[=;]/)

      # Keep the first ID seen for any given header.
      if (!(old_name in new_name)) {
        new_name[old_name] = pieces[4]
      }
    }
    next
  }

  # Second file (genes FastA): replace headers that have a table entry.
  /^>/ {
    key = substr($0, 2)
    if (key in new_name) {
      print ">" new_name[key]
      next
    }
  }

  # Sequence lines and unmatched headers pass through unchanged.
  { print }
' "${gene_gff}" "${genes_fasta}" > "${final_genes_fasta}"

ls -ltrh
''')


# Hmmm, this threw an error? Weird.
# 
# Also, this took a little over 17 _hours_ to run and then this happens??!! Maybe it would've run faster if I told it just to process lines that began with a "`>`", which would, theoretically, prevent `sed` from searching every single line in the file?
# 
# Ugh. Let's check 'em out to see if things look OK or not.

# #### Check old and new FastAs to confirm substitutions

# In[11]:


# Count the header lines in the bedtools FastA and in the renamed FastA so
# the two numbers can be compared.
# The second command previously used the misspelled `--with-filenam`, which
# only worked through GNU long-option abbreviation; it is now spelled out.
get_ipython().run_cell_magic(
    'bash',
    '',
    '\n'
    'grep --with-filename --count "^>" ${genes_fasta}\n'
    '\n'
    'grep --with-filename --count "^>" ${final_genes_fasta}\n',
)


# In[12]:


# Show the first few headers of the original, bedtools and renamed FastA
# files, each prefixed with its file name.
# The third command previously used the misspelled `--with-filenam`, which
# only worked through GNU long-option abbreviation; it is now spelled out.
get_ipython().run_cell_magic(
    'bash',
    '',
    '\n'
    'grep --with-filename "^>" ${og_fasta} | head\n'
    '\n'
    'grep --with-filename "^>" ${genes_fasta} | head\n'
    '\n'
    'grep --with-filename "^>" ${final_genes_fasta} | head\n',
)


# Well, the counts and substitutions all look fine, so I guess we're good to go?

# #### Make FastA index file for new FastA

# In[13]:


# Index the renamed genes FastA and preview the index.
# `samtools faidx <file>` writes `<file>.fai` on its own, which is the same
# path as ${final_genes_fai}. The earlier `> ${final_genes_fai}` redirect was
# dropped: it only captured an empty stdout, and it left a zero-byte index
# behind whenever samtools could not be started.
get_ipython().run_cell_magic(
    'bash',
    '',
    '\n'
    '${samtools} faidx ${final_genes_fasta}\n'
    '\n'
    'head ${final_genes_fai}\n',
)


# ### Cleanup

# In[15]:


# Remove the downloaded inputs and intermediate files, leaving only the final
# genes FastA and its index, then list what remains.
# `--force` keeps rm quiet about files that do not exist: no cell in this
# notebook creates ${genes_fai}, and the cell may be re-run after a cleanup.
get_ipython().run_cell_magic(
    'bash',
    '',
    'rm --force ${genes_fasta} ${genes_fai} ${og_fasta} ${og_fai} ${og_gff} ${gene_gff}\n'
    '\n'
    'ls -ltrh\n',
)