#!/usr/bin/env python
# coding: utf-8

# ## Validate and Convert _P.verrucosa_ GFF to GTF
#
# ### Notebook relies on:
#
# - [GffRead](https://github.com/gpertea/gffread)

# ### List computer specs

# In[1]:

get_ipython().run_cell_magic('bash', '', r'''
echo "TODAY'S DATE:"
date
echo "------------"
echo ""
# Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
''')

# ### Set variables
# - `%env` indicates a bash variable
#
# - without `%env` is Python variable

# In[14]:

# Set directories, input/output files.
# %env sets os.environ, so these are inherited by every %%bash cell below.
get_ipython().run_line_magic('env', 'data_dir=/home/sam/data/M_capitata/genomes')
get_ipython().run_line_magic('env', 'analysis_dir=/home/sam/analyses/20230127-pver-gff_to_gtf')

# NOTE(review): this Python variable is never read by the bash cells (they see
# the %env value above, which is a different, absolute path) — confirm it is
# intentional before relying on it elsewhere.
analysis_dir = "20230127-pver-gff_to_gtf"

# Input files (from NCBI)
get_ipython().run_line_magic('env', 'gff=Pver_genome_assembly_v1.0.gff3')

# URL of file directory
get_ipython().run_line_magic('env', 'url=https://owl.fish.washington.edu/halfshell/genomic-databank')

# Output file(s)
get_ipython().run_line_magic('env', 'valid_gff=Pver_genome_assembly_v1.0-valid.gff3')
get_ipython().run_line_magic('env', 'gtf=Pver_genome_assembly_v1.0-valid.gtf')

# Set program locations
get_ipython().run_line_magic('env', 'gffread=/home/sam/programs/gffread-0.12.7.Linux_x86_64/gffread')

# Set some formatting stuff
get_ipython().run_line_magic('env', 'break_line=--------------------------------------------------------------------------')

# ### Create analysis directory

# In[3]:

get_ipython().run_cell_magic('bash', '', r'''
# Make analysis and data directory, if doesn't exist
mkdir --parents "${analysis_dir}"

mkdir --parents "${data_dir}"
''')

# ### Download GFF

# In[4]:

get_ipython().run_cell_magic('bash', '', r'''
cd "${data_dir}"

# Download with wget.
# Use --quiet option to prevent wget output from printing too many lines to notebook
# Use --continue to prevent re-downloading file if it's already been downloaded.
# Use --no-check-certificate to avoid download error from gannet
wget --quiet \
--continue \
--no-check-certificate \
"${url}/${gff}"

ls -ltrh "${gff}"
''')

# ### Examine GFF

# #### Check first 20 lines

# In[5]:

get_ipython().run_cell_magic('bash', '', r'''
head -n 20 "${data_dir}"/"${gff}"
''')

# #### Count unique number of fields
#
# This identifies if there are rows with >9 fields (which there shouldn't be in a [GFF3](http://gmod.org/wiki/GFF3)).

# In[15]:

get_ipython().run_cell_magic('bash', '', r'''
# Capture number of fields (NF) in each row in array.
field_count_array=($(awk -F "\t" '{print NF}' "${data_dir}/${gff}" | sort --unique))

# Check array contents
echo "List of number of fields in ${data_dir}/${gff}:"
echo ""
for field_count in "${field_count_array[@]}"
do
  echo "${field_count}"
done

echo ""
echo "${break_line}"
echo ""

# Preview of each line "type" with a given number of fields
# Check array contents
echo ""
for field_count in "${field_count_array[@]}"
do
  echo "Preview of lines with ${field_count} field(s):"
  echo ""
  awk \
  -v field_count="${field_count}" \
  -F "\t" \
  'NF == field_count' \
  "${data_dir}/${gff}" \
  | head
  echo ""
  echo "${break_line}"
  echo ""
done
''')

# In the above results, we can see that there are rows with 10 and 11 fields, due to incorrect tabs entered in Field 9. This creates an invalid GFF3 file.

# ### Fix invalid tabs in Field 9
#
# This will remove tabs in Field 9 and join those extra fields into Field 9 using a semi-colon separator with the `Note=` attribute.

# In[19]:

get_ipython().run_cell_magic('bash', '', r'''
# Single awk pass instead of a bash while-read loop.
#
# The previous implementation read the GFF line-by-line in bash, spawning
# echo/awk/cut subprocesses for every row. Besides being orders of magnitude
# slower, `read -r` without `IFS=` strips leading/trailing whitespace —
# including tabs, the GFF field delimiter — which can silently corrupt rows.
# It also only handled rows with exactly 10 or 11 fields; anything with more
# fell through unchanged and remained invalid.
#
# This awk program handles any number of extra fields: rows with <= 9 fields
# pass through untouched; for rows with > 9 fields, every field past the 9th
# is folded into Field 9 as a ";Note=<field>" attribute.
time \
awk '
BEGIN { FS = OFS = "\t" }
NF <= 9 { print; next }
{
  out = $1
  for (i = 2; i <= 9; i++) out = out OFS $i
  for (i = 10; i <= NF; i++) out = out ";Note=" $i
  print out
}
' "${data_dir}/${gff}" > "${analysis_dir}/${valid_gff}"
''')

# #### Compare line numbers between the two GFFs to make sure we didn't lose anything

# In[20]:

get_ipython().run_cell_magic('bash', '', r'''
wc -l "${data_dir}/${gff}"
wc -l "${analysis_dir}/${valid_gff}"
''')

# #### Count fields to make sure no more rows with >9 fields

# In[21]:

get_ipython().run_cell_magic('bash', '', r'''
awk -F "\t" '{print NF}' "${analysis_dir}/${valid_gff}" | sort --unique
''')

# ### Convert GFF to GTF

# In[22]:

get_ipython().run_cell_magic('bash', '', r'''
cd "${data_dir}"

# -E: report potential issues with the input GFF; -T: output GTF format.
"${gffread}" -E \
"${analysis_dir}/${valid_gff}" -T \
1> "${analysis_dir}/${gtf}" \
2> "${analysis_dir}/gffread-gff_to_gtf.stderr"
''')

# ### Inspect GTF

# In[23]:

get_ipython().run_cell_magic('bash', '', r'''
head "${analysis_dir}/${gtf}"
''')

# ### Generate checksum(s)

# In[24]:

get_ipython().run_cell_magic('bash', '', r'''
cd "${analysis_dir}"

# Skip the checksum file itself so re-running this cell doesn't fold
# checksums.md5 into its own checksum list.
for file in *
do
  [ "${file}" = "checksums.md5" ] && continue
  md5sum "${file}" | tee --append checksums.md5
done
''')

# ### Document GffRead program options

# In[25]:

get_ipython().run_cell_magic('bash', '', r'''
"${gffread}" -h
''')