#!/usr/bin/env python
# coding: utf-8

# ## Convert _P.meandrina_ GFF to GTF
#
# _P.meandrina_ GFF (and genome files) were originally obtained from here:
#
# http://cyanophora.rutgers.edu/Pocillopora_meandrina/
#
# #### Notebook relies on:
#
# - [GffRead](https://github.com/gpertea/gffread)
#
# NOTE: `get_ipython()` is not imported anywhere in this file, so the script
# has to be run under IPython/Jupyter, which provides that name. Each bash
# cell below is written as one string literal per shell line (implicit
# concatenation) so the shell code can be read top to bottom.

# ### List computer specs

# In[1]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'echo "TODAY\'S DATE:"\n'
    'date\n'
    'echo "------------"\n'
    'echo ""\n'
    '#Display operating system info\n'
    'lsb_release -a\n'
    'echo ""\n'
    'echo "------------"\n'
    'echo "HOSTNAME: "; hostname \n'
    'echo ""\n'
    'echo "------------"\n'
    'echo "Computer Specs:"\n'
    'echo ""\n'
    'lscpu\n'
    'echo ""\n'
    'echo "------------"\n'
    'echo ""\n'
    'echo "Memory Specs"\n'
    'echo ""\n'
    'free -mh\n',
)


# ### Set variables
# - `%env` indicates a bash variable
#
# - without `%env` is Python variable

# In[2]:


# Set directories, input/output files
get_ipython().run_line_magic('env', 'data_dir=/home/sam/data/P_meandrina/genomes')
get_ipython().run_line_magic('env', 'analysis_dir=/home/sam/analyses/20230519-pmea-gff_to_gtf')

# Python-side directory name. The bash cells below only read the `%env`
# value of `analysis_dir` (an absolute path), not this variable.
analysis_dir = "20230519-pmea-gff_to_gtf"

# Input file (downloaded with rsync from `url` below; see the notebook header
# for where the GFF originally came from)
get_ipython().run_line_magic('env', 'gff=Pocillopora_meandrina_HIv1.genes.gff3')

# URL of file directory
get_ipython().run_line_magic('env', 'url=owl:/volume1/web/halfshell/genomic-databank')

# Output file(s)
get_ipython().run_line_magic('env', 'gtf=Pocillopora_meandrina_HIv1.genes.gtf')

# Set program locations
get_ipython().run_line_magic('env', 'gffread=/home/sam/programs/gffread-0.12.7.Linux_x86_64/gffread')

# Set some formatting stuff
get_ipython().run_line_magic('env', 'break_line=--------------------------------------------------------------------------')


# ### Create analysis directory

# In[3]:


get_ipython().run_cell_magic(
    'bash',
    '',
    '# Make analysis and data directory, if doesn\'t exist\n'
    'mkdir --parents "${analysis_dir}"\n'
    '\n'
    'mkdir --parents "${data_dir}"\n',
)


# ### Download GFF

# In[4]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${data_dir}"\n'
    '\n'
    'rsync "${url}/${gff}" .\n'
    '\n'
    '\n'
    'ls -ltrh "${gff}"\n',
)


# ### Examine GFF

# In[5]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'head -n 20 "${data_dir}"/"${gff}"\n',
)


# #### Count unique number of fields
#
# This identifies if there are rows with >9 fields (which there shouldn't be in a [GFF3](http://gmod.org/wiki/GFF3)).

# In[6]:


# First loop lists every distinct tab-separated field count found in the GFF;
# second loop previews (via `head`) the lines that have each field count.
get_ipython().run_cell_magic(
    'bash',
    '',
    '# Capture number of fields (NF) in each row in array.\n'
    'field_count_array=($(awk -F "\\t" \'{print NF}\' "${data_dir}/${gff}" | sort --unique))\n'
    '\n'
    '# Check array contents\n'
    'echo "List of number of fields in ${data_dir}/${gff}:"\n'
    'echo ""\n'
    'for field_count in "${field_count_array[@]}"\n'
    'do\n'
    ' echo "${field_count}"\n'
    'done\n'
    '\n'
    'echo ""\n'
    'echo "${break_line}"\n'
    'echo ""\n'
    '\n'
    '# Preview of each line "type" with a given number of fields\n'
    '# Check array contents\n'
    'echo ""\n'
    'for field_count in "${field_count_array[@]}"\n'
    'do\n'
    ' echo "Preview of lines with ${field_count} field(s):"\n'
    ' echo ""\n'
    ' awk \\\n'
    ' -v field_count="${field_count}" \\\n'
    ' -F "\\t" \\\n'
    ' \'NF == field_count\' \\\n'
    ' "${data_dir}/${gff}" \\\n'
    ' | head\n'
    ' echo ""\n'
    ' echo "${break_line}"\n'
    ' echo ""\n'
    'done\n',
)


# ### Convert GFF to GTF

# In[7]:


# stdout (the GTF) and stderr (GffRead messages) are written to separate files
# in the analysis directory. Every expansion is quoted, matching the other
# cells, so paths containing spaces or glob characters are not word-split.
# See the GffRead usage text printed by the last cell for what `-E` and `-T` do.
get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${data_dir}"\n'
    '\n'
    '"${gffread}" -E \\\n'
    '"${data_dir}/${gff}" -T \\\n'
    '1> "${analysis_dir}/${gtf}" \\\n'
    '2> "${analysis_dir}/gffread-gff_to_gtf.stderr"\n',
)


# ### Inspect GTF

# In[8]:


get_ipython().run_cell_magic(
    'bash',
    '',
    'head "${analysis_dir}/${gtf}"\n',
)


# ### Generate checksum(s)

# In[9]:


# The checksum file lives in the directory being globbed, so on a re-run the
# loop would otherwise checksum `checksums.md5` while appending to it. The
# `cd` is checked so a missing directory cannot cause the current working
# directory to be checksummed instead.
get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${analysis_dir}" || exit 1\n'
    '\n'
    'for file in *\n'
    'do\n'
    ' # Only checksum regular files, and never the checksum file itself\n'
    ' [ -f "${file}" ] || continue\n'
    ' [ "${file}" = "checksums.md5" ] && continue\n'
    ' md5sum "${file}" | tee --append checksums.md5\n'
    'done\n',
)


# ### Document GffRead program options

# In[10]:


get_ipython().run_cell_magic(
    'bash',
    '',
    '"${gffread}" -h\n',
)