#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_cell_magic('bash', '', 'echo "TODAY\'S DATE:"\ndate\necho "------------"\necho ""\n#Display operating system info\nlsb_release -a\necho ""\necho "------------"\necho "HOSTNAME: "; hostname \necho ""\necho "------------"\necho "Computer Specs:"\necho ""\nlscpu\necho ""\necho "------------"\necho ""\necho "Memory Specs"\necho ""\nfree -mh\n') # #### Set variables # In[1]: get_ipython().run_line_magic('env', 'seqkit=/home/sam/programs/seqkit-v0.9.3') get_ipython().run_line_magic('env', 'data_dir=/home/sam/data') get_ipython().run_line_magic('env', 'analyses_dir=/home/sam/analyses/20190225_cpg_oe') # #### Download data # In[3]: get_ipython().run_cell_magic('bash', '', 'cd ${data_dir}\ntime \\\nwget \\\n--recursive \\\n--no-directories \\\n--no-parent \\\n--quiet \\\n--accept "*GENE.fa" \\\n"http://gannet.fish.washington.edu/seashell/bu-serine-wd/19-01-08/"\n\nls -lhtr *.fa\n') # Prepared at request of this GitHub Issue: # # https://github.com/RobertsLab/resources/issues/593 # # Code is modified from the following link in order to loop through a large number of files: # # http://htmlpreview.github.io/?https://github.com/hputnam/EastOyEpi/blob/master/02-Cpg-test.html # In[4]: get_ipython().run_cell_magic('bash', '', 'fa_array=(${data_dir}/*GENE.fa)\n\nfor fa in ${fa_array[@]}\ndo\n fn=$(basename ${fa} .fa)\n mkdir ${fn}_analysis\n cd ${fn}_analysis\n fx2tab \\\n --length \\\n ${fa} \\\n > ${fa}_tab\n awk \'{ print $2 }\' ${fa}_tab > ${fa}_tab2\n awk -F\\[Cc][Gg] \'{print NF-1}\' ${fa}_tab_2 > CG\n awk -F\\[Cc] \'{print NF-1}\' ${fa}_tab_2 > C\n awk -F\\[Gg] \'{print NF-1}\' ${fa}_tab_2 > G\n paste ${fa} \\\n CG \\\n C \\\n G \\\n > comb\n awk \'{print $1, "\\t", (($4)/($5*$6))*(($3^2)/($3-1))}\' comb \\\n > ID_CpG\ndone\n') # In[7]: get_ipython().run_cell_magic('bash', '', 'mkdir ${analyses_dir}\n\nfa_array=(${data_dir}/*GENE.fa)\n\nfor fa in "${fa_array[@]}"\ndo\n cd ${analyses_dir}\n fn=$(basename ${fa} .fa)\n mkdir ${fn}_analysis\n cd ${fn}_analysis\n ${seqkit} fx2tab \\\n --length \\\n ${fa} \\\n > ${analyses_dir}/${fa}_tab\n awk \'{ print $2 }\' ${fa}_tab > ${fa}_tab2\n awk -F\\[Cc][Gg] \'{print NF-1}\' ${fa}_tab_2 > CG\n awk -F\\[Cc] \'{print NF-1}\' ${fa}_tab_2 > C\n awk -F\\[Gg] \'{print NF-1}\' ${fa}_tab_2 > G\n paste ${fa} \\\n CG \\\n C \\\n G \\\n > comb\n awk \'{print $1, "\\t", (($4)/($5*$6))*(($3^2)/($3-1))}\' comb \\\n > ID_CpG\ndone\n') # In[9]: get_ipython().run_cell_magic('bash', '', 'mkdir ${analyses_dir}\n\nfa_array=(${data_dir}/*GENE.fa)\n\nfor fa in "${fa_array[@]}"\ndo\n cd ${analyses_dir}\n fn=$(basename ${fa} .fa)\n mkdir ${fn}_analysis\n cd ${fn}_analysis\n ${seqkit} fx2tab \\\n --length \\\n ${fa} \\\n > ${fn}_analysis/${fn}_tab\n awk \'{ print $2 }\' ${fn}_tab > ${fn}_tab2\n awk -F\\[Cc][Gg] \'{print NF-1}\' ${fn}_tab_2 > CG\n awk -F\\[Cc] \'{print NF-1}\' ${fn}_tab_2 > C\n awk -F\\[Gg] \'{print NF-1}\' ${fn}_tab_2 > G\n paste ${fa} \\\n CG \\\n C \\\n G \\\n > comb\n awk \'{print $1, "\\t", (($4)/($5*$6))*(($3^2)/($3-1))}\' comb \\\n > ID_CpG\ndone\n') # In[10]: get_ipython().run_cell_magic('bash', '', '${seqkit}\n') # In[13]: get_ipython().run_cell_magic('bash', '', 'mkdir ${analyses_dir}\n\nfa_array=(${data_dir}/*GENE.fa)\n\nfor fa in "${fa_array[@]}"\ndo\n cd ${analyses_dir}\n fn=$(basename ${fa} .fa)\n mkdir ${fn}_analysis\n \n cd ${fn}_analysis\n \n ${seqkit} fx2tab \\\n --length \\\n ${fa} \\\n > ${fn}_tab\n \n awk \'{ print $2 }\' ${fn}_tab > ${fn}_tab2\n \n awk -F\\[Cc][Gg] \'{print NF-1}\' ${fn}_tab2 > CG\n \n awk -F\\[Cc] \'{print NF-1}\' ${fn}_tab2 > C\n \n awk -F\\[Gg] \'{print NF-1}\' ${fn}_tab2 > G\n \n paste ${fn}_tab \\\n CG \\\n C \\\n G \\\n > comb\n \n awk \'{print $1, "\\t", (($4)/($5*$6))*(($3^2)/($3-1))}\' comb \\\n > ID_CpG\ndone\n') # #### Ugh. Turns out awk has some field limit. Solution: install and use ```gawk```. # # Installed ```gawk``` and restarted notebook. # In[2]: get_ipython().run_cell_magic('bash', '', 'mkdir ${analyses_dir}\n\n# Create arrays of all FastA files\nfa_array=(${data_dir}/*GENE.fa)\n\n\ntime \\\nfor fa in "${fa_array[@]}"\ndo\n # Change to proper directory\n cd ${analyses_dir}\n # Remove file path and extension from the FastA and save as variable\n fn=$(basename ${fa} .fa)\n \n # Make subdirectory using filename\n mkdir ${fn}_analysis\n cd ${fn}_analysis\n \n # Use seqkit to convert FastA to tab-delimited and print sequence length \n ${seqkit} fx2tab \\\n --length \\\n ${fa} \\\n > ${fn}_tab\n \n # Print only sequences to new file\n gawk \'{ print $2 }\' ${fn}_tab > ${fn}_tab2\n \n # Delimit sequences on CGs and print the number of fields minus 1 to get the number of CGs present.\n gawk -F\\[Cc][Gg] \'{print NF-1}\' ${fn}_tab2 > CG\n \n # Delimit sequences on CGs and print the number of fields minus 1 to get the number of Cs present.\n gawk -F\\[Cc] \'{print NF-1}\' ${fn}_tab2 > C\n \n # Delimit sequences on CGs and print the number of fields minus 1 to get the number of Gs present.\n gawk -F\\[Gg] \'{print NF-1}\' ${fn}_tab2 > G\n \n # Paste these together to have file with the following fields:\n # - FastA header\n # - Sequence\n # - Sequence length\n # - Number of CGs\n # - Number of Cs\n # - Number of Gs\n paste ${fn}_tab \\\n CG \\\n C \\\n G \\\n > comb\n \n # Do some math to calculate CpG O/E ratio (observed vs expected)\n gawk \'{print $1, "\\t", (($4)/($5*$6))*(($3^2)/($3-1))}\' comb \\\n > ID_CpG\ndone\n') # In[3]: get_ipython().run_cell_magic('bash', '', 'cd /home/sam/analyses/\n\nrsync \\\n--archive \\\n--relative \\\n./20190225_cpg_oe \\\ngannet:/volume1/web/Atumefaciens\n') # In[ ]: