#!/usr/bin/env python # coding: utf-8 # ## Generate _P.generosa_ FastA files from CDS, gene, and mRNA GFFs # # See this [GitHub Issue](https://github.com/RobertsLab/resources/issues/1439) # # This notebook relies on [GFFutils](https://gffutils.readthedocs.io/en/v0.12.0/index.html) to be installed and available in your `$PATH`. # ### List computer specs # In[1]: get_ipython().run_cell_magic('bash', '', 'echo "TODAY\'S DATE"\ndate\necho "------------"\necho ""\nlsb_release -a\necho ""\necho "------------"\necho "HOSTNAME: "\nhostname\necho ""\necho "------------"\necho "Computer Specs:"\necho ""\nlscpu\necho ""\necho "------------"\necho ""\necho "Memory Specs"\necho ""\nfree -mh\n') # ### Set variables # - `%env` indicates a bash variable # - without `%env` is Python variable # In[2]: # Set directories get_ipython().run_line_magic('env', 'data_dir=/home/sam/data/P_generosa/genomes') get_ipython().run_line_magic('env', 'analysis_dir=/home/sam/analyses/20220324-pgen-gffs_to_fastas') analysis_dir="/home/samb/analyses/20220324-pgen-gffs_to_fastas" # Input GFFs get_ipython().run_line_magic('env', 'gff_url=https://gannet.fish.washington.edu/Atumefaciens/20191105_swoose_pgen_v074_renaming/') get_ipython().run_line_magic('env', 'gff_prefix=Panopea-generosa-v1.0.a4.') # Genome FastA get_ipython().run_line_magic('env', 'genome_fasta=Panopea-generosa-v1.0.fa') # Programs get_ipython().run_line_magic('env', 'bedtools=/home/sam/programs/bedtools-2.29.1/bin/bedtools') get_ipython().run_line_magic('env', 'samtools=/home/sam/programs/samtools-1.12/samtools') # Formatting get_ipython().run_line_magic('env', 'line_break="----------------------------------------------------------------------------------------------"') # ### Make data and analysis directories if they don't exist # In[3]: get_ipython().run_cell_magic('bash', '', 'mkdir --parents "${analysis_dir}" "${data_dir}"\n') # ### Download FastA and GFFs # # If needing to download via `wget`, be sure to include `--no-check-certificate` option to avoid error. # In[4]: get_ipython().run_cell_magic('bash', '', '\ncd "${data_dir}"\n\n# Array of GFF files.\ngff_array=(Panopea-generosa-v1.0.a4.CDS.gff3 Panopea-generosa-v1.0.a4.mRNA.gff3 Panopea-generosa-v1.0.a4.gene.gff3)\n\n# Download GFFs\nfor gff in "${gff_array[@]}"\ndo\n wget \\\n --no-check-certificate \\\n --continue \\\n --quiet \\\n ${gff_url}${gff}\ndone\n\n# Download FastA\nwget \\\n--no-check-certificate \\\n--continue \\\n--quiet \\\n${gff_url}${genome_fasta}\n\nls -lh\n') # ### Generate MD5 checksums for reference # In[5]: get_ipython().run_cell_magic('bash', '', 'cd "${data_dir}"\n\nmd5sum *\n') # ### Examine GFF files # In[6]: get_ipython().run_cell_magic('bash', '', 'cd "${data_dir}"\n\n# Array of GFF files\ngff_array=(Panopea-generosa-v1.0.a4.CDS.gff3 Panopea-generosa-v1.0.a4.mRNA.gff3 Panopea-generosa-v1.0.a4.gene.gff3)\n\n# Make a list so subsequent head command lists filenames in output\ngff_list=$(echo "${gff_array[@]}")\n\nhead ${gff_list}\n') # ### Create customized BED file from GFFs # # - Formatted to use the "name" column in the BED format for use with `bedtools` later... # # - 4th column will be: `geneID|parentID` or `geneID` # In[7]: get_ipython().run_cell_magic('bash', '', 'cd "${data_dir}"\n\n# Array of GFF files\ngff_array=(Panopea-generosa-v1.0.a4.CDS.gff3 Panopea-generosa-v1.0.a4.mRNA.gff3 Panopea-generosa-v1.0.a4.gene.gff3)\n\n\nfor gff in "${gff_array[@]}"\ndo\n\n # Trim of filename prefix\n trimmed_name=${gff/Panopea-generosa-v1.0.a4./}\n \n # Trim off filename suffix to get genome feature\n feature=${trimmed_name/.gff3/}\n \n if [[ "${feature}" = "mRNA" ]]\n then\n # Run gtf_extractor on GFF files\n gtf_extract \\\n --gff \\\n --fields=chr,start,end,ID,Parent \\\n "${gff}" \\\n | awk \'{print $1 "\\t" $2 "\\t" $3 "\\t" $4 "|" $5}\' \\\n > ${analysis_dir}/"${feature}".bed.tmp\n \n # Adds gene parent ID by rmoving mRNA ".m[0-9][0-9]" at end of line\n elif [[ "${feature}" = "CDS" ]]\n then\n # Run gtf_extractor on GFF files\n gtf_extract \\\n --gff \\\n --fields=chr,start,end,ID,Parent,Parent \\\n "${gff}" \\\n | awk \'{print $1 "\\t" $2 "\\t" $3 "\\t" $4 "|" $5 "|" $6}\' | sed \'s/.m[0-9][0-9]$//\' \\\n > ${analysis_dir}/"${feature}".bed.tmp\n \n # Excludes Parent ID, since genes don\'t have a Parent ID\n elif [[ "${feature}" = "gene" ]]\n then\n # Run gtf_extractor on GFF files\n gtf_extract \\\n --gff \\\n --fields=chr,start,end,ID \\\n "${gff}" \\\n | awk \'{print $1 "\\t" $2 "\\t" $3 "\\t" $4}\' \\\n > ${analysis_dir}/"${feature}".bed.tmp\n fi\ndone\n\nls -lh ${analysis_dir}/*.bed.tmp\n') # ### Examine temporary BED files # In[8]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nfor bed in *.bed.tmp\ndo\n echo ""\n echo "${bed}"\n echo ""\n head "${bed}"\n echo ""\n echo "${line_break}"\ndone\n') # ### Create FastA files # In[9]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nfor bed in *.bed.tmp\ndo\n # Get feature by removing strings after first period\n feature=${bed%%.*}\n \n # Used BEDTOOLS getfasta to make FastAs from GFFs\n ${bedtools} getfasta \\\n -name \\\n -fi ${data_dir}/${genome_fasta} \\\n -bed ${bed} \\\n > ${gff_prefix}${feature}.fasta\n \n # Remove tmp BED file\n echo ""\n echo "Removing ${bed}."\n rm "${bed}"\ndone\n\nls -lh *.fasta\n') # ### Create FastA index files # In[10]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nfor fasta in *.fasta\ndo\n ${samtools} faidx "${fasta}"\ndone\n\nls -ltrh\n') # ### Examine FastAs # In[11]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nfor fasta in *.fasta\ndo\n grep --with-filename "^>" "${fasta}" | head\ndone\n') # ### Generate MD5 checksums # In[12]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nmd5sum * | tee --append checksums.md5\n') # ### Remove unneeded data files # In[13]: get_ipython().run_cell_magic('bash', '', 'cd "${data_dir}"\n\n# Array of GFF files\ngff_array=(Panopea-generosa-v1.0.a4.CDS.gff3 Panopea-generosa-v1.0.a4.mRNA.gff3 Panopea-generosa-v1.0.a4.gene.gff3)\n\n# Remove genome FastA\necho "Removing ${genome_fasta}."\nrm "${genome_fasta}"\n\n# Remove GFFs\nfor gff in "${gff_array[@]}"\ndo\n echo ""\n echo "Removing ${gff}."\n rm "${gff}"\ndone\n\nls -lh\n') # ### Program options # In[14]: get_ipython().run_cell_magic('bash', '', '\ngtf_extract -h\n\necho ""\necho "${line_break}"\necho "${line_break}"\necho ""\n\n${samtools} faidx -h\n\necho ""\necho "${line_break}"\necho "${line_break}"\necho ""\n\n${bedtools} getfasta -h\n') # In[ ]: