#!/usr/bin/env python
# coding: utf-8

# ## Identify lncRNA Transcript IDs
# 
# Use [CPC2 Standalone](https://github.com/gao-lab/CPC2_standalone) to idenitfy _P.generosa_ lncRNAs with no coding potential
# from the Panopea-generosa-v1.0 annotated GTF generated by [gffcompare](https://ccb.jhu.edu/software/stringtie/gffcompare.shtml).
# 
# Notebook uses an aribtrary minimum length cutoff of 200bp.
# 
# #### Notebook relies on:
# 
# - [CPC2 Standalone](https://github.com/gao-lab/CPC2_standalone)
# 
# - [bedtools getfasta](https://bedtools.readthedocs.io/en/latest/content/tools/getfasta.html)
# 
# 

# ### List computer specs

# In[1]:


get_ipython().run_cell_magic('bash', '', 'echo "TODAY\'S DATE:"\ndate\necho "------------"\necho ""\n#Display operating system info\nlsb_release -a\necho ""\necho "------------"\necho "HOSTNAME: "; hostname \necho ""\necho "------------"\necho "Computer Specs:"\necho ""\nlscpu\necho ""\necho "------------"\necho ""\necho "Memory Specs"\necho ""\nfree -mh\n')


# ### Set variables
# - `%env` indicates a bash variable
# 
# - without `%env` is Python variable

# In[2]:


# Set directories
get_ipython().run_line_magic('env', 'transcriptomes_dir=/home/sam/data/P_generosa/transcriptomes')
get_ipython().run_line_magic('env', 'genomes_dir=/home/sam/data/P_generosa/genomes')
get_ipython().run_line_magic('env', 'analysis_dir=/home/sam/analyses/20230502-pgen-lncRNA-identification')
analysis_dir="20230502-pgen-lncRNA-identification"

# Set lncRNA minimum length
get_ipython().run_line_magic('env', 'min_lncRNA_length=200')

# Input files
get_ipython().run_line_magic('env', 'gffcompare_gtf=Panopea-generosa-v1.0-gffcmp.annotated.gtf')
get_ipython().run_line_magic('env', 'genome_fasta=Panopea-generosa-v1.0.fasta')
# Genome FastA URL
# https://gannet.fish.washington.edu/Atumefaciens/20191105_swoose_pgen_v074_renaming/Panopea-generosa-v1.0.fa
get_ipython().run_line_magic('env', 'genome_url=gannet:/volume2/web/Atumefaciens/20191105_swoose_pgen_v074_renaming/Panopea-generosa-v1.0.fa')

# URL of file directory
# https://gannet.fish.washington.edu/Atumefaciens/20230426-pgen-HISAT2-stringtie-gffcompare-RNAseq/gffcompare
get_ipython().run_line_magic('env', 'gff_comp_gtf_url=gannet:/volume2/web/Atumefaciens/20230426-pgen-HISAT2-stringtie-gffcompare-RNAseq/gffcompare')

# Output file(s)
get_ipython().run_line_magic('env', 'lncRNAs_gtf=20230502-pgen-lncRNA-IDs.gtf')
get_ipython().run_line_magic('env', 'lncRNA_candidates=lncRNA_candidates.gtf')
get_ipython().run_line_magic('env', 'lncRNA_candidates_bed=lncRNA_candidates.bed')
get_ipython().run_line_magic('env', 'lncRNA_candidates_fasta=lncRNA_candidates.fasta')
get_ipython().run_line_magic('env', 'lncRNA_ids=lncRNA-IDs.txt')
get_ipython().run_line_magic('env', 'cpc2_table=cpc2_output_table')


# Set program locations
get_ipython().run_line_magic('env', 'cpc2=/home/sam/programs/CPC2_standalone-1.0.1/bin/CPC2.py')
get_ipython().run_line_magic('env', 'bedtools=/home/sam/programs/bedtools-2.29.1/bin/bedtools')

# Line for formatting

get_ipython().run_line_magic('env', 'line=-------------------------------------------------------------------------------------')


# ### Create analysis directory

# In[3]:


get_ipython().run_cell_magic('bash', '', '# Make analysis and data directory, if doesn\'t exist\nmkdir --parents "${analysis_dir}"\n\nmkdir --parents "${transcriptomes_dir}"\n')


# ### Download GTF

# In[4]:


get_ipython().run_cell_magic('bash', '', 'cd "${transcriptomes_dir}"\n\nrsync "${gff_comp_gtf_url}/${gffcompare_gtf}" .\n\n\nls -ltrh "${gffcompare_gtf}"\n')


# ### Inspect GTF

# In[5]:


get_ipython().run_cell_magic('bash', '', 'head ${transcriptomes_dir}/"${gffcompare_gtf}"\n\necho ""\necho "${line}"\necho ""\n\necho "Number of lines:"\nwc -l ${transcriptomes_dir}/*.gtf\n\necho ""\necho "${line}"\necho ""\n\necho "Number of transcripts in ${gffcompare_gtf}:"\nawk \'$3 == "transcript" {print}\' ${transcriptomes_dir}/"${gffcompare_gtf}" | wc -l\n')


# ## Download genome FastA

# In[6]:


get_ipython().run_cell_magic('bash', '', '\ncd "${genomes_dir}"\n\nrsync "${genome_url}" .\n\n\nls -ltrh "${genome_fasta}"\n')


# ### Inspect genome FastA

# In[7]:


get_ipython().run_cell_magic('bash', '', 'cd "${genomes_dir}"\n\necho "Number of sequences:"\ngrep "^>" -c "${genome_fasta}"\n')


# ## Parse out transcripts >= 200bp _and_ no overlap with reference transcripts
# 
# string class_code “u” indicates no overlap

# In[8]:


get_ipython().run_cell_magic('bash', '', 'cd "${transcriptomes_dir}"\n\nawk \'$3 == "transcript" {print}\' "${gffcompare_gtf}" | grep \'class_code "u"\' \\\n| awk -v min_lncRNA_length="${min_lncRNA_length}" \'$5 - $4 >= min_lncRNA_length {print}\' \\\n> "${analysis_dir}/${lncRNA_candidates}"\n\nwc -l "${analysis_dir}/${lncRNA_candidates}"\n\necho ""\necho "${line}"\necho ""\n\nhead "${analysis_dir}/${lncRNA_candidates}"\n')


# ## Convert GTF to BED
# 
# This is needed so the subsequent `bedtools getfasta` step will have a unique name for each transcript.
# 
# Otherwise, the generated names generate duplicates because they're based off of the scaffold name and the start/stop sites. As such, when there are multiple transcripts identified for the same locations, they end up with the same names.

# In[9]:


get_ipython().run_cell_magic('bash', '', '\nwhile read -r line\ndo\n  stringtie_transcript=$(echo "${line}" | awk -F "\\"" \'{print $2}\')\n\n  chr=$(echo "${line}" | awk \'{print $1}\')\n\n  start=$(echo "${line}" | awk \'{print $4}\')\n    \n  end=$(echo "${line}" | awk \'{print $5}\')\n\n  score="0"\n    \n  strand=$(echo "${line}" | awk \'{print $7}\')\n\n  printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n" "${chr}" "${start}" "${end}" "${stringtie_transcript}" "${score}" "${strand}"\n\ndone < "${analysis_dir}/${lncRNA_candidates}" > "${analysis_dir}/${lncRNA_candidates_bed}"\n')


# ### Inspect BED

# In[10]:


get_ipython().run_cell_magic('bash', '', '\nhead "${analysis_dir}/${lncRNA_candidates_bed}"\n\necho ""\necho "${line}"\necho ""\n\nwc -l "${analysis_dir}/${lncRNA_candidates_bed}"\n')


# ## Extract FastA sequences from candidate lncRNAs
# 
# Use `bedtools` to extract lncRNA sequences as FastA.
# 
# - `-name` option to use FastA sequence ID and coordinates as the names of the output sequences

# In[11]:


get_ipython().run_cell_magic('bash', '', '\n${bedtools} getfasta -fi "${genomes_dir}/${genome_fasta}" \\\n-bed "${analysis_dir}/${lncRNA_candidates_bed}" \\\n-fo "${analysis_dir}"/"${lncRNA_candidates_fasta}" \\\n-name\n')


# ### Inspect lncRNA FastA
# 
# Number of sequences should match number of transcripts from above (`14076`)

# In[12]:


get_ipython().run_cell_magic('bash', '', '\necho "Number of sequences:"\ngrep -c "^>" "${analysis_dir}"/"${lncRNA_candidates_fasta}"\n\necho ""\necho "${line}"\necho ""\necho "Definition lines formatting:"\ngrep "^>" "${analysis_dir}"/"${lncRNA_candidates_fasta}" | head\n')


# ## Run CPC2 to identify non-coding RNAs

# In[13]:


get_ipython().run_cell_magic('bash', '', '\n"${cpc2}" \\\n-i "${analysis_dir}"/"${lncRNA_candidates_fasta}" \\\n-o "${analysis_dir}/${cpc2_table}"\n')


# ### Inspect CPC2 table

# In[14]:


get_ipython().run_cell_magic('bash', '', '\nhead "${analysis_dir}/${cpc2_table}.txt"\n\necho ""\necho "${line}"\necho ""\n\necho "Number of entries:"\n\n# Skips header line\ntail -n +2 "${analysis_dir}/${cpc2_table}.txt" | wc -l\n\n# Count number of columns\necho ""\necho "${line}"\necho ""\necho "Number of columns in ${cpc2_table}.txt:"\nawk \'{print NF}\' "${analysis_dir}/${cpc2_table}.txt" | sort --unique\n')


# ## Capture noncoding IDs
# 
# The `label` column, which is column 8 (`$8`), will be used to pull out all lncRNA IDs.

# In[15]:


get_ipython().run_cell_magic('bash', '', '\nawk \'$8 == "noncoding" {print $1}\' "${analysis_dir}/${cpc2_table}.txt" |\nawk -F":" \'{print $1}\' \\\n> "${analysis_dir}/${lncRNA_ids}"\n\nwc -l "${analysis_dir}/${lncRNA_ids}"\n\necho ""\nhead "${analysis_dir}/${lncRNA_ids}"\n')


# ## Extract lncRNAs from GTF

# In[16]:


get_ipython().run_cell_magic('bash', '', 'grep --fixed-strings --file="${analysis_dir}/${lncRNA_ids}" "${analysis_dir}/${lncRNA_candidates}" \\\n> "${analysis_dir}/${lncRNAs_gtf}"\n\nwc -l "${analysis_dir}/${lncRNAs_gtf}"\n')


# ### Inspect lncRNA GTF

# In[17]:


get_ipython().run_cell_magic('bash', '', '\nhead "${analysis_dir}/${lncRNAs_gtf}"\n')


# ### Generate checksum(s)

# In[18]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nfor file in *\ndo\n  md5sum "${file}" | tee --append checksums.md5\ndone\n')


# ### Document GffRead program options

# In[19]:


get_ipython().run_cell_magic('bash', '', '${cpc2} -h\n')