#!/usr/bin/env python
# coding: utf-8

# ## Convert _P.generosa_ GFF to GTF
#
# ### Notebook relies on:
#
# - [GffRead](https://github.com/gpertea/gffread)
#
# ### Addresses [this GitHub Issue](https://github.com/RobertsLab/resources/issues/1411)

# NOTE: this file is a notebook export. It must be run inside IPython/Jupyter,
# because `get_ipython()` is only defined there. Each `run_cell_magic('bash', ...)`
# call runs its script in a separate bash process; values set with `%env` are
# exported to those processes as environment variables.

# ### List computer specs

# In[1]:


get_ipython().run_cell_magic(
    "bash",
    "",
    r"""echo "TODAY'S DATE:"
date
echo "------------"
echo ""
#Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
""",
)


# ### Set variables
# - `%env` indicates a bash variable
#
# - without `%env` is Python variable

# In[1]:


# Set directories, input/output files
get_ipython().run_line_magic("env", "data_dir=/home/sam/data/P_generosa/genomes")
get_ipython().run_line_magic(
    "env", "analysis_dir=/home/sam/analyses/20220301-pgen-gff_to_gtf"
)
# Python-side variable. It holds only the directory name, NOT the absolute path
# exported to bash on the line above, and none of the cells in this file read it.
analysis_dir = "20220301-pgen-gff_to_gtf"

# Input file (GFF3); downloaded from the base URL set below
get_ipython().run_line_magic("env", "gff=Panopea-generosa-v1.0.a4.gff3")

# Base URL to download the input file from (gannet server)
get_ipython().run_line_magic(
    "env",
    "url=https://gannet.fish.washington.edu/Atumefaciens/20191105_swoose_pgen_v074_renaming",
)

# Output file(s)
get_ipython().run_line_magic("env", "gtf=Panopea-generosa-v1.0.a4.gtf")

# Set program locations
get_ipython().run_line_magic(
    "env", "gffread=/home/sam/programs/gffread-0.12.7.Linux_x86_64/gffread"
)


# ### Create analysis directory

# In[2]:


get_ipython().run_cell_magic(
    "bash",
    "",
    r"""# Make analysis and data directory, if doesn't exist
mkdir --parents "${analysis_dir}"

mkdir --parents "${data_dir}"
""",
)


# ### Download GFF

# In[3]:


# `cd ... || exit 1` stops the cell if the data directory is missing, so wget
# never downloads into an unintended directory. The URL is quoted to prevent
# word splitting/globbing. Note that --no-check-certificate disables TLS
# certificate verification for this download.
get_ipython().run_cell_magic(
    "bash",
    "",
    r"""cd "${data_dir}" || exit 1

# Download with wget.
# Use --quiet option to prevent wget output from printing too many lines to notebook
# Use --continue to prevent re-downloading fie if it's already been downloaded.
# Use --no-check-certificate to avoid download error from gannet
wget --quiet \
--continue \
--no-check-certificate \
"${url}/${gff}"

ls -ltrh "${gff}"
""",
)


# ### Examine GFF

# In[4]:


get_ipython().run_cell_magic(
    "bash",
    "",
    r"""head -n 20 "${data_dir}/${gff}"
""",
)


# ### Convert GFF to GTF

# In[5]:


# gffread options (per the GffRead documentation): -T writes GTF instead of
# GFF3; -E reports potential problems found in the input records.
# stdout (the GTF) and stderr (gffread messages) go to separate files in the
# analysis directory. All expansions are quoted so paths cannot be split.
get_ipython().run_cell_magic(
    "bash",
    "",
    r"""cd "${data_dir}" || exit 1

"${gffread}" -E \
"${data_dir}/${gff}" -T \
1> "${analysis_dir}/${gtf}" \
2> "${analysis_dir}/gffread-gff_to_gtf.stderr"
""",
)


# ### Inspect GTF

# In[6]:


get_ipython().run_cell_magic(
    "bash",
    "",
    r"""head "${analysis_dir}/${gtf}"
""",
)


# ### Generate checksum(s)

# In[7]:


# The checksum file is removed before the loop so that re-running this cell
# neither checksums checksums.md5 itself nor appends duplicate entries. The
# `*` glob is expanded once, after the removal, so it cannot match the file
# that `tee` creates inside the loop.
get_ipython().run_cell_magic(
    "bash",
    "",
    r"""cd "${analysis_dir}" || exit 1

# Start from a clean checksum file so re-runs stay idempotent
rm -f checksums.md5

for file in *
do
  md5sum "${file}" | tee --append checksums.md5
done
""",
)


# ### Document GffRead program options

# In[8]:


get_ipython().run_cell_magic(
    "bash",
    "",
    r""""${gffread}" -h
""",
)


# In[ ]: