#!/usr/bin/env python # coding: utf-8 # ## Identify lncRNA Transcript IDs # # Use [CPC2 Standalone](https://github.com/gao-lab/CPC2_standalone) to idenitfy _P.generosa_ lncRNAs with no coding potential # from the Panopea-generosa-v1.0 annotated GTF generated by [gffcompare](https://ccb.jhu.edu/software/stringtie/gffcompare.shtml). # # Notebook uses an aribtrary minimum length cutoff of 200bp. # # #### Notebook relies on: # # - [CPC2 Standalone](https://github.com/gao-lab/CPC2_standalone) # # - [bedtools getfasta](https://bedtools.readthedocs.io/en/latest/content/tools/getfasta.html) # # # ### List computer specs # In[1]: get_ipython().run_cell_magic('bash', '', 'echo "TODAY\'S DATE:"\ndate\necho "------------"\necho ""\n#Display operating system info\nlsb_release -a\necho ""\necho "------------"\necho "HOSTNAME: "; hostname \necho ""\necho "------------"\necho "Computer Specs:"\necho ""\nlscpu\necho ""\necho "------------"\necho ""\necho "Memory Specs"\necho ""\nfree -mh\n') # ### Set variables # - `%env` indicates a bash variable # # - without `%env` is Python variable # In[2]: # Set directories get_ipython().run_line_magic('env', 'transcriptomes_dir=/home/sam/data/P_generosa/transcriptomes') get_ipython().run_line_magic('env', 'genomes_dir=/home/sam/data/P_generosa/genomes') get_ipython().run_line_magic('env', 'analysis_dir=/home/sam/analyses/20230502-pgen-lncRNA-identification') analysis_dir="20230502-pgen-lncRNA-identification" # Set lncRNA minimum length get_ipython().run_line_magic('env', 'min_lncRNA_length=200') # Input files get_ipython().run_line_magic('env', 'gffcompare_gtf=Panopea-generosa-v1.0-gffcmp.annotated.gtf') get_ipython().run_line_magic('env', 'genome_fasta=Panopea-generosa-v1.0.fasta') # Genome FastA URL # https://gannet.fish.washington.edu/Atumefaciens/20191105_swoose_pgen_v074_renaming/Panopea-generosa-v1.0.fa get_ipython().run_line_magic('env', 'genome_url=gannet:/volume2/web/Atumefaciens/20191105_swoose_pgen_v074_renaming/Panopea-generosa-v1.0.fa') # URL of file directory # https://gannet.fish.washington.edu/Atumefaciens/20230426-pgen-HISAT2-stringtie-gffcompare-RNAseq/gffcompare get_ipython().run_line_magic('env', 'gff_comp_gtf_url=gannet:/volume2/web/Atumefaciens/20230426-pgen-HISAT2-stringtie-gffcompare-RNAseq/gffcompare') # Output file(s) get_ipython().run_line_magic('env', 'lncRNAs_gtf=20230502-pgen-lncRNA-IDs.gtf') get_ipython().run_line_magic('env', 'lncRNA_candidates=lncRNA_candidates.gtf') get_ipython().run_line_magic('env', 'lncRNA_candidates_bed=lncRNA_candidates.bed') get_ipython().run_line_magic('env', 'lncRNA_candidates_fasta=lncRNA_candidates.fasta') get_ipython().run_line_magic('env', 'lncRNA_ids=lncRNA-IDs.txt') get_ipython().run_line_magic('env', 'cpc2_table=cpc2_output_table') # Set program locations get_ipython().run_line_magic('env', 'cpc2=/home/sam/programs/CPC2_standalone-1.0.1/bin/CPC2.py') get_ipython().run_line_magic('env', 'bedtools=/home/sam/programs/bedtools-2.29.1/bin/bedtools') # Line for formatting get_ipython().run_line_magic('env', 'line=-------------------------------------------------------------------------------------') # ### Create analysis directory # In[3]: get_ipython().run_cell_magic('bash', '', '# Make analysis and data directory, if doesn\'t exist\nmkdir --parents "${analysis_dir}"\n\nmkdir --parents "${transcriptomes_dir}"\n') # ### Download GTF # In[4]: get_ipython().run_cell_magic('bash', '', 'cd "${transcriptomes_dir}"\n\nrsync "${gff_comp_gtf_url}/${gffcompare_gtf}" .\n\n\nls -ltrh "${gffcompare_gtf}"\n') # ### Inspect GTF # In[5]: get_ipython().run_cell_magic('bash', '', 'head ${transcriptomes_dir}/"${gffcompare_gtf}"\n\necho ""\necho "${line}"\necho ""\n\necho "Number of lines:"\nwc -l ${transcriptomes_dir}/*.gtf\n\necho ""\necho "${line}"\necho ""\n\necho "Number of transcripts in ${gffcompare_gtf}:"\nawk \'$3 == "transcript" {print}\' ${transcriptomes_dir}/"${gffcompare_gtf}" | wc -l\n') # ## Download genome FastA # In[6]: get_ipython().run_cell_magic('bash', '', '\ncd "${genomes_dir}"\n\nrsync "${genome_url}" .\n\n\nls -ltrh "${genome_fasta}"\n') # ### Inspect genome FastA # In[7]: get_ipython().run_cell_magic('bash', '', 'cd "${genomes_dir}"\n\necho "Number of sequences:"\ngrep "^>" -c "${genome_fasta}"\n') # ## Parse out transcripts >= 200bp _and_ no overlap with reference transcripts # # string class_code ā€œuā€ indicates no overlap # In[8]: get_ipython().run_cell_magic('bash', '', 'cd "${transcriptomes_dir}"\n\nawk \'$3 == "transcript" {print}\' "${gffcompare_gtf}" | grep \'class_code "u"\' \\\n| awk -v min_lncRNA_length="${min_lncRNA_length}" \'$5 - $4 >= min_lncRNA_length {print}\' \\\n> "${analysis_dir}/${lncRNA_candidates}"\n\nwc -l "${analysis_dir}/${lncRNA_candidates}"\n\necho ""\necho "${line}"\necho ""\n\nhead "${analysis_dir}/${lncRNA_candidates}"\n') # ## Convert GTF to BED # # This is needed so the subsequent `bedtools getfasta` step will have a unique name for each transcript. # # Otherwise, the generated names generate duplicates because they're based off of the scaffold name and the start/stop sites. As such, when there are multiple transcripts identified for the same locations, they end up with the same names. # In[9]: get_ipython().run_cell_magic('bash', '', '\nwhile read -r line\ndo\n stringtie_transcript=$(echo "${line}" | awk -F "\\"" \'{print $2}\')\n\n chr=$(echo "${line}" | awk \'{print $1}\')\n\n start=$(echo "${line}" | awk \'{print $4}\')\n \n end=$(echo "${line}" | awk \'{print $5}\')\n\n score="0"\n \n strand=$(echo "${line}" | awk \'{print $7}\')\n\n printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n" "${chr}" "${start}" "${end}" "${stringtie_transcript}" "${score}" "${strand}"\n\ndone < "${analysis_dir}/${lncRNA_candidates}" > "${analysis_dir}/${lncRNA_candidates_bed}"\n') # ### Inspect BED # In[10]: get_ipython().run_cell_magic('bash', '', '\nhead "${analysis_dir}/${lncRNA_candidates_bed}"\n\necho ""\necho "${line}"\necho ""\n\nwc -l "${analysis_dir}/${lncRNA_candidates_bed}"\n') # ## Extract FastA sequences from candidate lncRNAs # # Use `bedtools` to extract lncRNA sequences as FastA. # # - `-name` option to use FastA sequence ID and coordinates as the names of the output sequences # In[11]: get_ipython().run_cell_magic('bash', '', '\n${bedtools} getfasta -fi "${genomes_dir}/${genome_fasta}" \\\n-bed "${analysis_dir}/${lncRNA_candidates_bed}" \\\n-fo "${analysis_dir}"/"${lncRNA_candidates_fasta}" \\\n-name\n') # ### Inspect lncRNA FastA # # Number of sequences should match number of transcripts from above (`14076`) # In[12]: get_ipython().run_cell_magic('bash', '', '\necho "Number of sequences:"\ngrep -c "^>" "${analysis_dir}"/"${lncRNA_candidates_fasta}"\n\necho ""\necho "${line}"\necho ""\necho "Definition lines formatting:"\ngrep "^>" "${analysis_dir}"/"${lncRNA_candidates_fasta}" | head\n') # ## Run CPC2 to identify non-coding RNAs # In[13]: get_ipython().run_cell_magic('bash', '', '\n"${cpc2}" \\\n-i "${analysis_dir}"/"${lncRNA_candidates_fasta}" \\\n-o "${analysis_dir}/${cpc2_table}"\n') # ### Inspect CPC2 table # In[14]: get_ipython().run_cell_magic('bash', '', '\nhead "${analysis_dir}/${cpc2_table}.txt"\n\necho ""\necho "${line}"\necho ""\n\necho "Number of entries:"\n\n# Skips header line\ntail -n +2 "${analysis_dir}/${cpc2_table}.txt" | wc -l\n\n# Count number of columns\necho ""\necho "${line}"\necho ""\necho "Number of columns in ${cpc2_table}.txt:"\nawk \'{print NF}\' "${analysis_dir}/${cpc2_table}.txt" | sort --unique\n') # ## Capture noncoding IDs # # The `label` column, which is column 8 (`$8`), will be used to pull out all lncRNA IDs. # In[15]: get_ipython().run_cell_magic('bash', '', '\nawk \'$8 == "noncoding" {print $1}\' "${analysis_dir}/${cpc2_table}.txt" |\nawk -F":" \'{print $1}\' \\\n> "${analysis_dir}/${lncRNA_ids}"\n\nwc -l "${analysis_dir}/${lncRNA_ids}"\n\necho ""\nhead "${analysis_dir}/${lncRNA_ids}"\n') # ## Extract lncRNAs from GTF # In[16]: get_ipython().run_cell_magic('bash', '', 'grep --fixed-strings --file="${analysis_dir}/${lncRNA_ids}" "${analysis_dir}/${lncRNA_candidates}" \\\n> "${analysis_dir}/${lncRNAs_gtf}"\n\nwc -l "${analysis_dir}/${lncRNAs_gtf}"\n') # ### Inspect lncRNA GTF # In[17]: get_ipython().run_cell_magic('bash', '', '\nhead "${analysis_dir}/${lncRNAs_gtf}"\n') # ### Generate checksum(s) # In[18]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nfor file in *\ndo\n md5sum "${file}" | tee --append checksums.md5\ndone\n') # ### Document GffRead program options # In[19]: get_ipython().run_cell_magic('bash', '', '${cpc2} -h\n')