#!/usr/bin/env python # coding: utf-8 # ## Create _P.generosa_ primary gene annotations mapping file # # See this [GitHub Issue](https://github.com/RobertsLab/resources/issues/1443). # # This notebook will utilize NCBI BLAST and DIAMOND BLAST annotations generated by our [GenSas _P.generosa_ genome annotation](https://robertslab.github.io/sams-notebook/2019/09/28/Genome-Annotation-Pgenerosa_v074-a4-Using-GenSAS.html). # # It will compare the two sets of SwissProt ID annotations (SPIDs) to determine lowest E-value and use that entry as the representative entry for a gene. It will then use that canonical list of SPIDs to pull gene names and gene ontology (GO) IDs from UniProt, and create a tab-deltimited annotation mapping file. # ### List computer specs # In[1]: get_ipython().run_cell_magic('bash', '', 'echo "TODAY\'S DATE"\ndate\necho "------------"\necho ""\nlsb_release -a\necho ""\necho "------------"\necho "HOSTNAME: "\nhostname\necho ""\necho "------------"\necho "Computer Specs:"\necho ""\nlscpu\necho ""\necho "------------"\necho ""\necho "Memory Specs"\necho ""\nfree -mh\n') # ### Set variables # - `%env` indicates a bash variable # - without `%env` is Python variablec # In[2]: ###################################################################### ### Set directories get_ipython().run_line_magic('env', 'data_dir=/home/sam/data/P_generosa/genomes') get_ipython().run_line_magic('env', 'analysis_dir=/home/sam/analyses/20220419-pgen-gene_annotation_mapping') analysis_dir="/home/sam/analyses/20220419-pgen-gene_annotation_mapping" ##################################################################### ### Input files get_ipython().run_line_magic('env', 'base_url=https://gannet.fish.washington.edu/Atumefaciens/20190928_Pgenerosa_v074.a4_gensas_annotation') get_ipython().run_line_magic('env', 'blast_annotations=Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab') get_ipython().run_line_magic('env', 'diamond_annotations=Panopea-generosa-vv0.74.a4.5d951bcf45b4b-diamond_functional.tab') ###################################################################### ### Output files # UniProt batch output get_ipython().run_line_magic('env', 'uniprot_output=20220419-pgen-uniprot_batch-results.txt') # Gene name list for UniProt batch submission get_ipython().run_line_magic('env', 'spid_list=Panopea-generosa-v1.0.a4-blast-diamond-functional-SPIDs.txt') # Genome IDs and SPIDs get_ipython().run_line_magic('env', 'genome_IDs_SPIDs=Panopea-generosa-v1.0.a4-blast-diamond-functional-genome_IDs-SPIDs.txt') get_ipython().run_line_magic('env', 'blast_diamond_cat=Panopea-generosa-v1.0.a4-blast-diamond-functional.tab') get_ipython().run_line_magic('env', 'blast_diamond_cat_best=Panopea-generosa-v1.0.a4-blast-diamond-functional_best.tab') # Parsed UniProt get_ipython().run_line_magic('env', 'parsed_uniprot=20220419-pgen-accession-gene_name-gene_description-go_ids.tab') # Final output get_ipython().run_line_magic('env', 'joined_output=20220419-pgen-gene-accessions-gene_id-gene_name-gene_description-alt_gene_description-go_ids.tab') ###################################################################### ### Programs # UniProt batch submission/retrieval script get_ipython().run_line_magic('env', 'uniprot_mapping_script=/home/sam/programs/uniprot_mapping.pl') # ### Make input/output directories # In[3]: get_ipython().run_cell_magic('bash', '', '# If directories don\'t exist, make them\nmkdir --parents "${data_dir}" "${analysis_dir}"\n') # ### Download and inspect annotation files # # `--quiet`: Prevents `wget` output from overwhelming Jupyter Notebook # # `--continue`: If download was previously initiated, will continue where leftoff and will not create a second file if one already exists. # In[4]: get_ipython().run_cell_magic('bash', '', 'cd "${data_dir}"\n\nwget --quiet --continue ${base_url}/${blast_annotations}\nwget --quiet --continue ${base_url}/${diamond_annotations}\n\nls -ltrh\n\necho ""\necho "---------------------------------------------------------"\necho ""\nhead -n 25 *.tab\n') # ### Count number of header lines (i.e. beginning with a `#` # In[5]: get_ipython().run_cell_magic('bash', '', 'cd "${data_dir}"\n\ngrep -c "^#" "${blast_annotations}" "${diamond_annotations}"\n') # ### Concatenate annotation files and keep only one with best `e-value` # # Also modifies mRNA names to generate gene names instead. # # - `awk 'NR > 13'`: Skips first 13 header lines # - `sort -k1,1 -k9,9`: Sorts on first field (mRNA name), then on 9th field (e-value) # - `sed 's/^21910-//'`: Removes leading info from each mRNA name, at the beginning of each line (`^`) # - `sed 's/.m0[0-9]//'`: Removes `.m0N` from each mRNA name. # - `awk '!array[$1]++'`: `awk` array that only prints line if it's the first occurrence of gene name (first field; `$1` (i.e. no duplicates) # # --- # # Also replaces two obsolete SPIDs, as [identified in previous notebook run](http://localhost:8888/lab/workspaces/auto-W/tree/20220419-pgen-gene_annotation_mapping.ipynb#Manually-look-up-SPIDs-in-UniProt). # In[50]: get_ipython().run_cell_magic('bash', '', 'cd "${data_dir}"\n\n# Concatenate both annotation files\nfor file in ${blast_annotations} ${diamond_annotations}\ndo\n awk \'NR > 13\' ${file}\ndone \\\n>> "${analysis_dir}"/"${blast_diamond_cat}"\n\n# Sort for best e-value and perform formatting of genome IDs\nsort -k1,1 -k9,9 "${analysis_dir}"/"${blast_diamond_cat}" \\\n| sed \'s/^21910-//\' \\\n| sed \'s/.m0[0-9]//\' \\\n| awk \'!array[$1]++\' \\\n>> "${analysis_dir}"/"${blast_diamond_cat_best}"\n\n# Find/replace two obsolete SPIDs\nsed -i \'s/Q6ZRR9/M0R2J8/g\' "${analysis_dir}"/"${blast_diamond_cat_best}"\nsed -i \'s/Q9NPA5/Q9NTW7/g\' "${analysis_dir}"/"${blast_diamond_cat_best}"\n\necho ""\necho "Line count:"\n\nwc -l "${analysis_dir}"/"${blast_diamond_cat}"\n\necho "--------------------------------------------------"\n\necho ""\necho "Line count:"\n\nwc -l "${analysis_dir}"/"${blast_diamond_cat_best}"\n\necho "--------------------------------------------------"\necho ""\n\nhead -n 25 "${analysis_dir}"/"${blast_diamond_cat_best}"\n') # ### Create list of genome IDs and SwissProt IDs # In[51]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nawk \'{print $5,"\\t",$1}\' "${blast_diamond_cat_best}" > "$genome_IDs_SPIDs"\n\necho ""\necho "Line count:"\n\nwc -l "$genome_IDs_SPIDs"\n\necho "--------------------------------------------------"\n\nhead "$genome_IDs_SPIDs"\n') # ### Create list os SwissProt IDs # In[52]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nawk \'{print $5}\' "${blast_diamond_cat_best}" > "${spid_list}"\n\necho ""\necho "Line count:"\n\nwc -l "${spid_list}"\n\necho "--------------------------------------------------"\n\necho ""\n\nhead "${spid_list}"\n') # ### Batch submission/retrieval to/from UniProt # # Perl script obtained from UniProt: https://www.uniprot.org/help/api_batch_retrieval # # Modified to accept file with list of IDs and to map SPID to UniProt Accession # In[53]: get_ipython().run_cell_magic('bash', '', '# Print script for viewing\ncat "${uniprot_mapping_script}"\n') # In[54]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\n# Run UniProt Prel mapping script and time how long it takes\ntime \\\nperl "${uniprot_mapping_script}" "${spid_list}" > "${uniprot_output}"\n\nls -ltrh\n\necho ""\necho ""\necho "--------------------------------------------------"\necho ""\necho "Line count:"\n\nwc -l "${uniprot_output}"\n\necho "--------------------------------------------------"\n') # ### Check mapping output # # Counting Accession lines (beginning with `AC`) should show a _lower_ count than the number of SwissProt IDs submitted, as UniProt automatically removes duplicates upon submission. # In[55]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nhead -n 30 "${uniprot_output}"\n\necho ""\n\necho "----------------------------------------------------"\n\necho ""\n\necho "Number of accessions:"\n\necho ""\n\ngrep -c "^AC" "${uniprot_output}"\n') # ### Parse the stuff we want # # - UniProt accession # # - Gene name/abbreviation # # - Gene description # # - GO IDs # #### Check DE descriptor lines to decide pattern matching # # Checks lines beginning with `DE` to identify values in the 2nd field with `Name` in them. # # Identifies unique values. This will determine how to parse properly after this. # In[56]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\ngrep "^DE" "${uniprot_output}" | awk \'$2 ~ /Name/ { print $2 }\' | sort -u\n') # In[57]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\n# Loop through UniProt records\ntime \\\nwhile read -r line\ndo\n # Get record line descriptor\n descriptor=$(echo "${line}" | awk \'{print $1}\')\n\n # Capture second field for evaluation\n go_line=$(echo "${line}" | awk \'{print $2}\')\n\n # Append GO IDs to array\n if [[ "${go_line}" == "GO;" ]]; then\n go_id=$(echo "${line}" | awk \'{print $3}\')\n go_ids_array+=("${go_id}")\n elif [[ "${go_line}" == "GeneID;" ]]; then\n # Uses sed to strip trailing semi-colon\n gene_id=$(echo "${line}" | awk \'{print $3}\' | sed \'s/;$//\')\n fi\n\n # Get gene description\n if [[ "${descriptor}" == "DE" ]] && [[ "${go_line}" == "RecName:" ]]; then\n # Uses sed to strip trailing spaces at end of line and remove commas\n gene_description=$(echo "${line}" | awk -F "[={]" \'{print $2}\' | sed \'s/[[:blank:]]*$//\' | sed \'s/,//g\' | sed \'s/;$//\')\n\n # Get alternate name\n elif [[ "${descriptor}" == "DE" ]] && [[ "${go_line}" == "AltName:" ]]; then\n # Uses sed to strip trailing spaces at end of line and remove commas\n alt_gene_description=$(echo "${line}" | awk -F "[={]" \'{print $2}\' | sed \'s/[[:blank:]]*$//\' | sed \'s/,//g\' | sed \'s/;$//\')\n\n # Get gene name\n elif [[ "${descriptor}" == "GN" ]] && [[ $(echo "${line}" | awk -F "=" \'{print $1}\') == "GN Name" ]]; then\n # Uses sed to strip trailing spaces at end of line\n gene=$(echo "${line}" | awk -F \'Name=|{|;\' \'{print $2}\' | sed \'s/[[:blank:]]*$//\')\n\n # Get UniProt accession\n elif [[ "${descriptor}" == "AC" ]]; then\n # awk removes "AC" denotation\n # sed removes all spaces\n # sed removes trailing semi-colon\n # Uses array to handle accessions being on multiple lines of UniProt records file\n accession=$(echo "${line}" | awk \'{$1="";print $0}\' | sed \'s/[[:space:]]*//g\' | sed \'s/;$//\')\n accessions_array+=("${accession}")\n\n # Identify beginning on new record\n elif [[ "${descriptor}" == "//" ]]; then\n\n # Prints other comma-separated variables, then GOID1;GOID2;GOIDn\n # IFS prevents spaces from being added between GO IDs\n # sed removes ";" after final GO ID\n (IFS=; printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n" "${accessions_array[*]}" "${gene_id}" "${gene}" "${gene_description}" "${alt_gene_description}" "${go_ids_array[*]}" | sed \'s/;$//\')\n\n # Re-initialize variables\n accession="" \n accessions_array=()\n descriptor=""\n gene=""\n gene_description=""\n gene_id=""\n go_id=""\n go_ids_array=()\n fi\n\n\ndone < "${uniprot_output}" >> "${parsed_uniprot}"\n') # ### Inspect parsed UniProt file # In[58]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nwc -l "${parsed_uniprot}"\n\necho ""\necho "------------------------------------------------------------------"\necho ""\n\nhead -n 25 "${parsed_uniprot}"\n') # In[59]: get_ipython().run_cell_magic('html', '', '# Sets markdown table align left in subsequent cell\n\n') # ### Combine with original list of genes and SPIDs # # Output format (tab-delimited): # # | gene_ID | SPIDs | UniProt_gene_ID | gene | gene_description | alternate_gene_description | GO_IDs | # |---------|-------|-----------------|------|------------------|----------------------------|--------| # # # Explanation: # # - `awk -v FS='[;[:space:]]+'`: Sets the Field Separator variable to handle `; ` in UniProt accessions. Allows for proper searching. # # - `FNR == NR`: Restricts next block (designated by `{}`) to work only on first input file. # # - `{array[$1]=$0; next}`: Adds the entire line (`$0`) of the first file to the array names `array` and then moves on to the next set of commands for the second input file. # # - `($1 in array)`: Looks for the value of the first column (`$1`, which is SPID) from the second file to see if there's a match from the array (which contains the line from the first file). # # - `{print $2,array[$1]}'`: If there's a match, print the second column (`$2`, which is gene ID) from the second file, followed by the line from the first file. # # - `"${parsed_uniprot}" "${spid_list}"`: The first and second input files. # # - `"${joined_output}"`: Result of the join. # In[60]: get_ipython().run_cell_magic('bash', '', '\ncd "${analysis_dir}"\n\nawk \\\n-v FS=\'[;[:space:]]+\' \\\n\'NR==FNR \\\n{array[$1]=$0; next} \\\n($1 in array) \\\n{print $2"\\t"array[$1]}\' \\\n"${parsed_uniprot}" "${genome_IDs_SPIDs}" \\\n> "${joined_output}"\n') # ### Inspect final annotation file # In[61]: get_ipython().run_cell_magic('bash', '', '\ncd "${analysis_dir}"\n\nwc -l "${joined_output}"\n\necho ""\necho "------------------------------------------------------------------"\necho ""\n\nhead -n 25 "${joined_output}"\n') # ## DO NOT EDIT! CELLS BELOW THIS!! # # The cells below are for reference, as they were used to identify a set of obsolete/missing SPIDs. These were then used in a subsequent re-run of the noteook to refine things. The remaining cells document this process. # # The original number of SPIDs was 14676, but we ended up with only 14668 matches. # # Let's see if we can figure out why... # ### Identify differences in lists of genome IDs # In[36]: get_ipython().run_cell_magic('bash', '', '\ncd "${analysis_dir}"\n\ndiff <(awk \'{print $1}\' "${joined_output}") <(awk \'{print $2}\' "${genome_IDs_SPIDs}")\n') # ### Get SPIDs of "missing" genome IDs # In[21]: get_ipython().run_cell_magic('bash', '', '\ncd "${analysis_dir}"\n\nfor id in PGEN_.00g050810 PGEN_.00g119250 PGEN_.00g162250 PGEN_.00g185820 PGEN_.00g209090 PGEN_.00g222140 PGEN_.00g258380 PGEN_.00g304800\ndo\n grep "${id}" "${genome_IDs_SPIDs}"\ndone\n') # ### Manually look up SPIDs in UniProt # - [Q3UN21](https://www.uniprot.org/uniprot/Q3UN21): Obsolete/deleted from UniProt. # # - [P0DN79](https://www.uniprot.org/uniprot/P0DN79): Obsolete/deleted from UniProt. # # - [Q6ZRR9](https://www.uniprot.org/uniprot/M0R2J8): Redirects to SPID M0R2J8. # # - [Q9NPA5](https://www.uniprot.org/uniprot/Q9NTW7): Redirects to SPID Q9NTW7. # # - [Q6ZR98](https://www.uniprot.org/uniprot/Q6ZR98): Obsolete/deleted from UniProt. # # - [Q5T699](https://www.uniprot.org/uniprot/Q5T699): Obsolete/deleted from UniProt. # # Using information above, will add code to find/replace the redirected SPIDs and then re-run rest of notebook. # In[ ]: