#!/usr/bin/env python
# coding: utf-8

# ## Generate _P.generosa_ FastA files from CDS, gene, and mRNA GFFs
# 
# See this [GitHub Issue](https://github.com/RobertsLab/resources/issues/1439)
# 
# This notebook relies on [GFFutils](https://gffutils.readthedocs.io/en/v0.12.0/index.html) to be installed and available in your `$PATH`.

# ### List computer specs

# In[1]:


get_ipython().run_cell_magic('bash', '', 'echo "TODAY\'S DATE"\ndate\necho "------------"\necho ""\nlsb_release -a\necho ""\necho "------------"\necho "HOSTNAME: "\nhostname\necho ""\necho "------------"\necho "Computer Specs:"\necho ""\nlscpu\necho ""\necho "------------"\necho ""\necho "Memory Specs"\necho ""\nfree -mh\n')


# ### Set variables
# - `%env` indicates a bash variable
# - without `%env` is Python variable

# In[2]:


# Set directories
get_ipython().run_line_magic('env', 'data_dir=/home/sam/data/P_generosa/genomes')
get_ipython().run_line_magic('env', 'analysis_dir=/home/sam/analyses/20220324-pgen-gffs_to_fastas')
analysis_dir="/home/samb/analyses/20220324-pgen-gffs_to_fastas"

# Input GFFs
get_ipython().run_line_magic('env', 'gff_url=https://gannet.fish.washington.edu/Atumefaciens/20191105_swoose_pgen_v074_renaming/')
get_ipython().run_line_magic('env', 'gff_prefix=Panopea-generosa-v1.0.a4.')

# Genome FastA
get_ipython().run_line_magic('env', 'genome_fasta=Panopea-generosa-v1.0.fa')

# Programs
get_ipython().run_line_magic('env', 'bedtools=/home/sam/programs/bedtools-2.29.1/bin/bedtools')
get_ipython().run_line_magic('env', 'samtools=/home/sam/programs/samtools-1.12/samtools')

# Formatting
get_ipython().run_line_magic('env', 'line_break="----------------------------------------------------------------------------------------------"')


# ### Make data and analysis directories if they don't exist

# In[3]:


get_ipython().run_cell_magic('bash', '', 'mkdir --parents "${analysis_dir}" "${data_dir}"\n')


# ### Download FastA and GFFs
# 
# If needing to download via `wget`, be sure to include `--no-check-certificate` option to avoid error.

# In[4]:


get_ipython().run_cell_magic('bash', '', '\ncd "${data_dir}"\n\n# Array of GFF files.\ngff_array=(Panopea-generosa-v1.0.a4.CDS.gff3 Panopea-generosa-v1.0.a4.mRNA.gff3 Panopea-generosa-v1.0.a4.gene.gff3)\n\n# Download GFFs\nfor gff in "${gff_array[@]}"\ndo\n    wget \\\n    --no-check-certificate \\\n    --continue \\\n    --quiet \\\n    ${gff_url}${gff}\ndone\n\n# Download FastA\nwget \\\n--no-check-certificate \\\n--continue \\\n--quiet \\\n${gff_url}${genome_fasta}\n\nls -lh\n')


# ### Generate MD5 checksums for reference

# In[5]:


get_ipython().run_cell_magic('bash', '', 'cd "${data_dir}"\n\nmd5sum *\n')


# ### Examine GFF files

# In[6]:


get_ipython().run_cell_magic('bash', '', 'cd "${data_dir}"\n\n# Array of GFF files\ngff_array=(Panopea-generosa-v1.0.a4.CDS.gff3 Panopea-generosa-v1.0.a4.mRNA.gff3 Panopea-generosa-v1.0.a4.gene.gff3)\n\n# Make a list so subsequent head command lists filenames in output\ngff_list=$(echo "${gff_array[@]}")\n\nhead ${gff_list}\n')


# ### Create customized BED file from GFFs
# 
# - Formatted to use the "name" column in the BED format for use with `bedtools` later...
# 
#   - 4th column will be: `geneID|parentID` or `geneID`

# In[7]:


get_ipython().run_cell_magic('bash', '', 'cd "${data_dir}"\n\n# Array of GFF files\ngff_array=(Panopea-generosa-v1.0.a4.CDS.gff3 Panopea-generosa-v1.0.a4.mRNA.gff3 Panopea-generosa-v1.0.a4.gene.gff3)\n\n\nfor gff in "${gff_array[@]}"\ndo\n\n    # Trim of filename prefix\n    trimmed_name=${gff/Panopea-generosa-v1.0.a4./}\n    \n    # Trim off filename suffix to get genome feature\n    feature=${trimmed_name/.gff3/}\n    \n    if [[ "${feature}" = "mRNA" ]]\n    then\n        # Run gtf_extractor on GFF files\n        gtf_extract \\\n        --gff \\\n        --fields=chr,start,end,ID,Parent \\\n        "${gff}" \\\n        | awk \'{print $1 "\\t" $2 "\\t" $3 "\\t" $4 "|" $5}\' \\\n        > ${analysis_dir}/"${feature}".bed.tmp\n    \n    # Adds gene parent ID by rmoving mRNA ".m[0-9][0-9]" at end of line\n    elif [[ "${feature}" = "CDS" ]]\n    then\n        # Run gtf_extractor on GFF files\n        gtf_extract \\\n        --gff \\\n        --fields=chr,start,end,ID,Parent,Parent \\\n        "${gff}" \\\n        | awk \'{print $1 "\\t" $2 "\\t" $3 "\\t" $4 "|" $5 "|" $6}\' | sed \'s/.m[0-9][0-9]$//\' \\\n        > ${analysis_dir}/"${feature}".bed.tmp\n    \n    # Excludes Parent ID, since genes don\'t have a Parent ID\n    elif [[ "${feature}" = "gene" ]]\n    then\n        # Run gtf_extractor on GFF files\n        gtf_extract \\\n        --gff \\\n        --fields=chr,start,end,ID \\\n        "${gff}" \\\n        | awk \'{print $1 "\\t" $2 "\\t" $3 "\\t" $4}\' \\\n        > ${analysis_dir}/"${feature}".bed.tmp\n    fi\ndone\n\nls -lh ${analysis_dir}/*.bed.tmp\n')


# ### Examine temporary BED files

# In[8]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nfor bed in *.bed.tmp\ndo\n    echo ""\n    echo "${bed}"\n    echo ""\n    head "${bed}"\n    echo ""\n    echo "${line_break}"\ndone\n')


# ### Create FastA files

# In[9]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nfor bed in *.bed.tmp\ndo\n    # Get feature by removing strings after first period\n    feature=${bed%%.*}\n    \n    # Used BEDTOOLS getfasta to make FastAs from GFFs\n    ${bedtools} getfasta \\\n    -name \\\n    -fi ${data_dir}/${genome_fasta} \\\n    -bed ${bed} \\\n    > ${gff_prefix}${feature}.fasta\n    \n    # Remove tmp BED file\n    echo ""\n    echo "Removing ${bed}."\n    rm "${bed}"\ndone\n\nls -lh *.fasta\n')


# ### Create FastA index files

# In[10]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nfor fasta in *.fasta\ndo\n   ${samtools} faidx "${fasta}"\ndone\n\nls -ltrh\n')


# ### Examine FastAs

# In[11]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nfor fasta in *.fasta\ndo\n    grep --with-filename "^>" "${fasta}" | head\ndone\n')


# ### Generate MD5 checksums

# In[12]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nmd5sum * | tee --append checksums.md5\n')


# ### Remove unneeded data files

# In[13]:


get_ipython().run_cell_magic('bash', '', 'cd "${data_dir}"\n\n# Array of GFF files\ngff_array=(Panopea-generosa-v1.0.a4.CDS.gff3 Panopea-generosa-v1.0.a4.mRNA.gff3 Panopea-generosa-v1.0.a4.gene.gff3)\n\n# Remove genome FastA\necho "Removing ${genome_fasta}."\nrm "${genome_fasta}"\n\n# Remove GFFs\nfor gff in "${gff_array[@]}"\ndo\n  echo ""\n  echo "Removing ${gff}."\n  rm "${gff}"\ndone\n\nls -lh\n')


# ### Program options

# In[14]:


get_ipython().run_cell_magic('bash', '', '\ngtf_extract -h\n\necho ""\necho "${line_break}"\necho "${line_break}"\necho ""\n\n${samtools} faidx -h\n\necho ""\necho "${line_break}"\necho "${line_break}"\necho ""\n\n${bedtools} getfasta -h\n')


# In[ ]: