#!/usr/bin/env python # coding: utf-8 # ## Create _S.salar_ UniProt annotations file for Shelly # # See this [GitHub Issue](https://github.com/RobertsLab/resources/issues/1220) # # This notebook relies on [GFFutils](https://gffutils.readthedocs.io/en/v0.12.0/index.html) to be installed and available in your `$PATH`. # ### List computer specs # In[1]: get_ipython().run_cell_magic('bash', '', 'echo "TODAY\'S DATE:"\ndate\necho "------------"\necho ""\n#Display operating system info\nlsb_release -a\necho ""\necho "------------"\necho "HOSTNAME: "; hostname \necho ""\necho "------------"\necho "Computer Specs:"\necho ""\nlscpu\necho ""\necho "------------"\necho ""\necho "Memory Specs"\necho ""\nfree -mh\n') # ### Set variables # - `%env` indicates a bash variable; without `%env` is Python variable # In[1]: # Set directories, input/output files get_ipython().run_line_magic('env', 'data_dir=/home/samb/data/S_salar/genomes') get_ipython().run_line_magic('env', 'analysis_dir=/home/samb/analyses/20210601_ssal_gff-annotations') analysis_dir="/home/samb/analyses/20210601_ssal_gff-annotations" # Input GFF get_ipython().run_line_magic('env', 'orig_gff=GCF_000233375.1_ICSASG_v2_genomic.gff') get_ipython().run_line_magic('env', 'orig_gff_url=https://gannet.fish.washington.edu/metacarcinus/Salmo_Calig/GENOMES/v2/RefSeq') # UniProt batch output get_ipython().run_line_magic('env', 'perl_output=20210601_ssal_uniprot_batch_results.txt') # GTF extractor output get_ipython().run_line_magic('env', 'gtf_extractor_output=20210601_ssal_chrom-start-end-Dbxref.csv') # Gene name list for UniProt batch submission get_ipython().run_line_magic('env', 'gene_list=20210601_ssal_gene-list.txt') # Parsed UniProt get_ipython().run_line_magic('env', 'parsed_uniprot=20210601_ssal_accession-gene_id-gene-gene_description-go_ids.csv') # Final output get_ipython().run_line_magic('env', 'joined_output=20210601_ssal_chrom-gene_id_start-end-acc-gene-gene_description-go_ids.csv') # ### Download GFF # In[3]: get_ipython().run_cell_magic('bash', '', 'cd "${data_dir}"\n\n# Download with wget. Use --no-check-certificate to avoid issues with Gannet certificate\n# Use --quiet option to prevent wget output from printing too many lines to notebook\nwget --no-check-certificate --quiet ${orig_gff_url}/${orig_gff}\n\nls -ltrh "${orig_gff}"\n') # ### Examine GFF # In[4]: get_ipython().run_cell_magic('bash', '', 'head -n 20 "${data_dir}"/"${orig_gff}"\n') # ### Use GFFutils to extract gene features # In[7]: get_ipython().run_cell_magic('bash', '', '# Extract just gene features\n# Extract chromosome name, start, end, and Dbxref fields\n# Dbxref is the NCBI gene name, in this particular instance\n# Specify input as GFF\n# Use awk to format as comma-delimited output to help with downstream parsing/joining\ntime \\\ngtf_extract \\\n--feature gene \\\n--fields=chrom,start,end,Dbxref \\\n--gff ${data_dir}/${orig_gff} \\\n| awk \'BEGIN { OFS = ","; FS="[\\t:]"} {print $1, $2, $3, $5}\' \\\n> ${analysis_dir}/${gtf_extractor_output}\n') # #### Check results # In[8]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\nls -ltrh ${gtf_extractor_output}\n\necho ""\n\nhead ${gtf_extractor_output}\n') # #### Confirm that GFFutils output seem okay # In[10]: get_ipython().run_cell_magic('bash', '', '# Count gene features via GFFutils\necho "GFFutils number of extracted genes:"\ngtf_extract -f gene --fields=Dbxref --gff ${data_dir}/${orig_gff} | wc -l\n\necho ""\n\n# Count gene features via awk\necho "awk number of extracted genes:"\nawk \'$3 == "gene" { print $0 }\' ${data_dir}/${orig_gff} | wc -l\n') # ### Extract gene ids for batch submission to UniProt # In[11]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\nawk -F"," \'{print $4}\' "${gtf_extractor_output}" > "${gene_list}"\n') # #### Check gene list # In[12]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\nhead "${gene_list}"\n') # ### Batch submission to UniProt # # Perl script obtained from UniProt: https://www.uniprot.org/help/api_batch_retrieval # # Modified to map NCIB gene ID to UniProt accession. # In[13]: get_ipython().run_cell_magic('bash', '', '# Print the script for viewing\ncat /home/samb/programs/uniprot_mapping.pl\n') # In[16]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\ntime \\\nperl /home/samb/programs/uniprot_mapping.pl "${gene_list}" > "${perl_output}"\nls -ltrh\n\necho ""\necho ""\necho "--------------------------------------------------"\necho ""\necho "Line count:"\nwc -l "${perl_output}"\n') # #### Check mapping output # In[17]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\nhead -n 30 "${perl_output}"\n') # ### Parse out the stuff we want: # # - UniProt accession # # - Gene ID (NCBI gene ID) # # - Gene name/abbraviation # # - Gene description # # - GO terms # In[18]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\ntime \\\nwhile read -r line\ndo\n # Get record line descriptor\n descriptor=$(echo "${line}" | awk \'{print $1}\' )\n\n # Capture second field for evaluation\n go_line=$(echo "${line}" | awk \'{print $2}\')\n\n # Append GO IDs to array\n if [[ "${go_line}" == "GO;" ]]; then\n go_id=$(echo "${line}" | awk \'{print $3}\')\n go_ids_array+=("${go_id}")\n elif [[ "${go_line}" == "GeneID;" ]]; then\n # Uses sed to strip trailing semi-colon\n gene_id=$(echo "${line}" | awk \'{print $3}\' | sed \'s/;$//\')\n fi\n\n # Get gene description\n if [[ "${descriptor}" == "DE" ]] && [[ "${go_line}" == "SubName:" ]]; then\n # Uses sed to strip trailing spaces at end of line and remove commas\n gene_description=$(echo "${line}" | awk -F"[={]" \'{print $2}\' | sed \'s/[[:blank:]]*$//\' | sed \'s/,//g\')\n\n # Get gene name\n elif [[ "${descriptor}" == "GN" ]] && [[ $(echo "${line}" | awk -F "=" \'{print $1}\') == "GN Name" ]]; then\n # Uses sed to strip trailing spaces at end of line\n gene=$(echo "${line}" | awk -F\'Name=|{\' \'{print $2}\' | sed \'s/[[:blank:]]*$//\')\n\n # Get UniProt accession\n elif [[ "${descriptor}" == "AC" ]]; then\n # Uses sed to strip trailing semi-colon\n accession=$(echo "${line}" | awk \'{print $2}\' | sed \'s/;$//\')\n\n # Identify beginning on new record\n elif [[ "${descriptor}" == "//" ]]; then\n\n # Prints other comma-separated variables, then GOID1;GOID2;GOIDn\n # IFS prevents spaces from being added between GO IDs\n # sed removes ";" after final GO ID\n (IFS=; printf "%s,%s,%s,%s,%s\\n" "${accession}" "${gene_id}" "${gene}" "${gene_description}" "${go_ids_array[*]}" | sed \'s/;$//\')\n\n # Re-initialize variables\n accession="" \n descriptor=""\n gene=""\n gene_description\n gene_id=""\n go_id=""\n go_ids_array=()\n fi\n\n\ndone < "${perl_output}" >> "${parsed_uniprot}"\n') # Despite notebook error message, if you check the time stamps on the files below, it looks like this took nearly 6.5hrs!! # #### Check parsed file # In[20]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\nls -ltrh\necho ""\nhead "${parsed_uniprot}"\n\necho ""\necho ""\necho "--------------------------------------------------"\necho ""\necho "Line count:"\nwc -l "${parsed_uniprot}"\n') # Line count looks reasonable, as I know that some NCBI gene IDs are associated with multiple UniProt accessions, so we should end up with more results than were submitted. # ### Join parsed UniProt info with chromosome names # This will sort the both files on the columns with the NCBI gene ID for joining. # # Then, it will replace the commas with tabs and re-order the columns so that the NCBI chromosome is in the first column. # In[22]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\njoin \\\n-t "," \\\n-1 4 \\\n-2 2 \\\n<(sort -t "," -k 4,4 "${gtf_extractor_output}") \\\n<(sort -t "," -k2,2 "${parsed_uniprot}") \\\n| awk \'BEGIN {FS=","; OFS="\\t"} {print $2, $1, $3, $4, $5, $6, $7, $8}\' \\\n> "${joined_output}"\n') # In[23]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\nls -ltrh\n\necho ""\n\necho "Line count:"\nwc -l "${joined_output}"\n') # Hmmm, does this line count make sense? # # Submitted to UniProt: 79030 # # Returned from UniProt: 82393 # # Joined: 82570 # Look at the file. Columns will be tab-separated in this order: # # | chromosome | NCBI gene ID | start | end | UniProt accession | gene abbreviation/name | gene description | GO IDs | # |---|----|----|----|---|----|----|----| # In[24]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nhead "${joined_output}"\n') # ### Generate cheksums # # Forgot to do this before closing notebook. # In[2]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nfor file in *\ndo\n md5sum "${file}" | tee --append checksums.md5\ndone\n') # In[ ]: