#!/usr/bin/env python
# coding: utf-8

# ## Create _C.virginica_ long, non-coding RNA files.
#
# ### Downloads files from NCBI.
#
# ### Notebook relies on:
#
# - [GffRead](https://github.com/gpertea/gffread)
#
# - [GFFutils](https://gffutils.readthedocs.io/en/v0.12.0/index.html) available in your `$PATH`.
#
#   - I accomplished this by creating/activating a conda environment for [GFFutils](https://gffutils.readthedocs.io/en/v0.12.0/index.html) and running this notebook from within that environment.
#
# - [samtools](http://www.htslib.org/).
#
# ### Resulting files will be used for [_C.virginica_ RNAseq/DML sex/OA project](https://github.com/epigeneticstoocean/2018_L18-adult-methylation) (GitHub repo)
#
# NOTE: This file is a notebook export. Every step calls `get_ipython()`,
# which is not imported here, so the script has to be run by IPython/Jupyter.
# Each bash cell is passed as adjacent string literals (one per script line)
# which Python joins into a single script string.

# ### List computer specs

# In[1]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'echo "TODAY\'S DATE:"\n'
    'date\n'
    'echo "------------"\n'
    'echo ""\n'
    '#Display operating system info\n'
    'lsb_release -a\n'
    'echo ""\n'
    'echo "------------"\n'
    'echo "HOSTNAME: "; hostname \n'
    'echo ""\n'
    'echo "------------"\n'
    'echo "Computer Specs:"\n'
    'echo ""\n'
    'lscpu\n'
    'echo ""\n'
    'echo "------------"\n'
    'echo ""\n'
    'echo "Memory Specs"\n'
    'echo ""\n'
    'free -mh\n'
)


# ### Set variables
# - `%env` indicates a bash variable
#
# - without `%env` is Python variable

# In[2]:


# Set directories, input/output files
get_ipython().run_line_magic('env', 'data_dir=/home/sam/data/C_virginica/genomes')
get_ipython().run_line_magic('env', 'analysis_dir=/home/sam/analyses/20220217-cvir-lncRNA_subsetting')
# Python-side variable: holds only the directory name (not the full path used
# by the bash `analysis_dir` above) and is not referenced by any cell below.
analysis_dir = "20220217-cvir-lncRNA_subsetting"

# Input files (from NCBI)
get_ipython().run_line_magic('env', 'ncbi_fasta=GCF_002022765.2_C_virginica-3.0_genomic.fna')
get_ipython().run_line_magic('env', 'ncbi_fasta_index=GCF_002022765.2_C_virginica-3.0_genomic.fna.fai')
get_ipython().run_line_magic('env', 'ncbi_fasta_gz=GCF_002022765.2_C_virginica-3.0_genomic.fna.gz')
get_ipython().run_line_magic('env', 'ncbi_gff=GCF_002022765.2_C_virginica-3.0_genomic.gff')
get_ipython().run_line_magic('env', 'ncbi_gff_gz=GCF_002022765.2_C_virginica-3.0_genomic.gff.gz')

# URL to download files from NCBI
get_ipython().run_line_magic('env', 'ncbi_url=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/022/765/GCF_002022765.2_C_virginica-3.0')

# Output files
get_ipython().run_line_magic('env', 'lncRNA_bed=GCF_002022765.2_C_virginica-3.0_lncRNA.bed')
get_ipython().run_line_magic('env', 'lncRNA_gff=GCF_002022765.2_C_virginica-3.0_lncRNA.gff')
get_ipython().run_line_magic('env', 'lncRNA_gtf=GCF_002022765.2_C_virginica-3.0_lncRNA.gtf')
get_ipython().run_line_magic('env', 'lncRNA_fasta=GCF_002022765.2_C_virginica-3.0_lncRNA.fa')
get_ipython().run_line_magic('env', 'lncRNA_fasta_index=GCF_002022765.2_C_virginica-3.0_lncRNA.fa.fai')

# Set program locations
get_ipython().run_line_magic('env', 'gffread=/home/sam/programs/gffread-0.12.7.Linux_x86_64/gffread')
get_ipython().run_line_magic('env', 'samtools=/home/sam/programs/samtools-1.12/samtools')


# ### Create analysis directory

# In[3]:


get_ipython().run_cell_magic(
    'bash',
    '',
    '# Make analysis directory, if it doesn\'t exist\n'
    'mkdir --parents "${analysis_dir}"\n'
)


# ### Download GFF

# In[4]:


# The download/unzip is skipped when the unzipped GFF is already present, so
# that re-running this cell neither downloads the file again nor makes gunzip
# fail on the existing output file.
get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${data_dir}"\n'
    '\n'
    '# Only download/unzip when the unzipped GFF is not already present.\n'
    '# gunzip deletes the .gz after unzipping, so without this check a re-run would\n'
    '# download the file again and gunzip would refuse to overwrite the existing GFF.\n'
    'if [ ! -f "${ncbi_gff}" ]\n'
    'then\n'
    '  # Download with wget.\n'
    '  # Use --quiet option to prevent wget output from printing too many lines to notebook\n'
    '  # Use --continue to resume a partially downloaded file.\n'
    '  wget --quiet \\\n'
    '  --continue \\\n'
    '  ${ncbi_url}/${ncbi_gff_gz}\n'
    '\n'
    '  # Unzip downloaded GFF\n'
    '  gunzip "${ncbi_gff_gz}"\n'
    'fi\n'
    '\n'
    'ls -ltrh "${ncbi_gff}"\n'
)


# ### Examine GFF

# In[5]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'head -n 20 "${data_dir}"/"${ncbi_gff}"\n'
)


# ### Download NCBI genomic FastA

# In[6]:


# Same re-run guard as the GFF download: skip when the unzipped FastA exists.
get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${data_dir}"\n'
    '\n'
    '# Only download/unzip when the unzipped FastA is not already present.\n'
    '# gunzip deletes the .gz after unzipping, so without this check a re-run would\n'
    '# download the file again and gunzip would refuse to overwrite the existing FastA.\n'
    'if [ ! -f "${ncbi_fasta}" ]\n'
    'then\n'
    '  # Download with wget.\n'
    '  # Use --quiet option to prevent wget output from printing too many lines to notebook\n'
    '  # Use --continue to resume a partially downloaded file.\n'
    '  wget --quiet \\\n'
    '  --continue \\\n'
    '  ${ncbi_url}/${ncbi_fasta_gz}\n'
    '\n'
    '  # Unzip downloaded FastA\n'
    '  gunzip "${ncbi_fasta_gz}"\n'
    'fi\n'
    '\n'
    'ls -ltrh "${ncbi_fasta}"\n'
)


# ### Create FastA index with [Samtools](http://www.htslib.org/)

# In[7]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${data_dir}"\n'
    '\n'
    '${samtools} faidx "${ncbi_fasta}"\n'
    '\n'
    'ls -ltrh "${ncbi_fasta_index}"\n'
)


# ### Inspect NCBI genomic FastA index

# In[8]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${data_dir}"\n'
    '\n'
    'head "${ncbi_fasta_index}"\n'
)


# ### Extracts lncRNAs from genomic GFF using `gtf_extract` from [GFFutils](https://gffutils.readthedocs.io/en/v0.12.0/index.html)

# In[9]:


# The output GFF is built in three steps: the first 7 lines of the NCBI GFF,
# then a "#!" note line with the creation date, then the `gtf_extract` output
# for `lnc_RNA` features (appended).
get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${data_dir}"\n'
    '\n'
    '# Capture GFF header from NCBI gff\n'
    'head -n 7 "${ncbi_gff}" > ${analysis_dir}/"${lncRNA_gff}"\n'
    '\n'
    '# Add note about modification\n'
    'printf "#%s%s\\n" "!" "lncRNA only - created by Sam White $(date)" >> ${analysis_dir}/"${lncRNA_gff}"\n'
    '\n'
    '\n'
    '# Finds lncRNAs in NCBI GFF\n'
    'gtf_extract \\\n'
    '--feature lnc_RNA \\\n'
    '--gff "${ncbi_gff}" \\\n'
    '>> ${analysis_dir}/"${lncRNA_gff}"\n'
    '\n'
    '\n'
    'head ${analysis_dir}/"${lncRNA_gff}"\n'
)


# ### Extract lncRNAs to BED using [GffRead](https://github.com/gpertea/gffread)

# In[10]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${data_dir}"\n'
    '\n'
    '${gffread} --bed \\\n'
    '${analysis_dir}/"${lncRNA_gff}" \\\n'
    '> ${analysis_dir}/"${lncRNA_bed}"\n'
)


# ### Inspect lncRNA BED

# In[11]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'head ${analysis_dir}/"${lncRNA_bed}"\n'
)


# ### Convert lncRNA GFF to GTF

# In[12]:


# stdout (the GTF) and stderr are redirected to separate files in the
# analysis directory.
get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${data_dir}"\n'
    '\n'
    '${gffread} -E \\\n'
    '${analysis_dir}/"${lncRNA_gff}" -T \\\n'
    '1> ${analysis_dir}/"${lncRNA_gtf}" \\\n'
    '2> ${analysis_dir}/gffread-lncRNA_gff-to-lncRNA_gtf.stderr\n'
)


# ### Inspect lncRNA GTF

# In[13]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'head ${analysis_dir}/"${lncRNA_gtf}"\n'
)


# ### Extract lncRNAs to FastA
#
# Explanation of GffRead options used below:
#
# - `-w`: specifies output FastA file
#
# - `-W`: specifies to write coordinates of all exons spliced in FastA deflines
#
# - `-g`: specifies input FastA (needs to have a corresponding FastA index file in same directory)

# In[14]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${data_dir}"\n'
    '\n'
    '${gffread} -E \\\n'
    '-w ${analysis_dir}/"${lncRNA_fasta}" -W \\\n'
    '-g "${ncbi_fasta}" \\\n'
    '${analysis_dir}/"${lncRNA_gtf}" \\\n'
    '2> ${analysis_dir}/gffread_lncRNA-fasta-extraction.stderr\n'
)


# ### Inspect lncRNA FastA

# In[15]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'head ${analysis_dir}/"${lncRNA_fasta}"\n'
)


# ### Create lncRNA FastA index

# In[16]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${analysis_dir}"\n'
    '\n'
    '${samtools} faidx "${lncRNA_fasta}"\n'
    '\n'
    'ls -ltrh "${lncRNA_fasta_index}"\n'
)


# ### Inspect lncRNA FastA index

# In[17]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${analysis_dir}"\n'
    '\n'
    'head "${lncRNA_fasta_index}"\n'
)


# ### Generate checksums

# In[18]:


# Any checksums.md5 left by an earlier run is removed before the loop. The
# `*` glob is expanded when the loop starts, so the checksum file is neither
# hashed itself nor appended to with duplicate entries on a re-run.
get_ipython().run_cell_magic(
    'bash',
    '',
    '# Stop if the analysis directory is missing, so nothing is removed or written elsewhere\n'
    'cd "${analysis_dir}" || exit 1\n'
    '\n'
    '# Remove checksum file from any previous run\n'
    'rm -f checksums.md5\n'
    '\n'
    'for file in *\n'
    'do\n'
    '  md5sum "${file}" | tee --append checksums.md5\n'
    'done\n'
)


# ### Document GffRead program options

# In[19]:


get_ipython().run_cell_magic(
    'bash',
    '',
    '${gffread} -h\n'
)


# ### Document `gtf_extract` options

# In[20]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'gtf_extract -h\n'
)


# In[ ]: