#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_cell_magic('bash', '', 'echo "TODAY\'S DATE:"\ndate\necho "------------"\necho ""\n#Display operating system info\nlsb_release -a\necho ""\necho "------------"\necho "HOSTNAME: "; hostname \necho ""\necho "------------"\necho "Computer Specs:"\necho ""\nlscpu\necho ""\necho "------------"\necho ""\necho "Memory Specs"\necho ""\nfree -mh\n')


# #### Set variables

# In[1]:


get_ipython().run_line_magic('env', 'seqkit=/home/sam/programs/seqkit-v0.9.3')
get_ipython().run_line_magic('env', 'data_dir=/home/sam/data')
get_ipython().run_line_magic('env', 'analyses_dir=/home/sam/analyses/20190225_cpg_oe')


# #### Download data

# In[3]:


get_ipython().run_cell_magic('bash', '', 'cd ${data_dir}\ntime \\\nwget \\\n--recursive \\\n--no-directories \\\n--no-parent \\\n--quiet \\\n--accept "*GENE.fa" \\\n"http://gannet.fish.washington.edu/seashell/bu-serine-wd/19-01-08/"\n\nls -lhtr *.fa\n')


# Prepared at request of this GitHub Issue: 
# 
# https://github.com/RobertsLab/resources/issues/593
# 
# Code is modified from the following link in order to loop through a large number of files:
# 
# http://htmlpreview.github.io/?https://github.com/hputnam/EastOyEpi/blob/master/02-Cpg-test.html

# In[4]:


get_ipython().run_cell_magic('bash', '', 'fa_array=(${data_dir}/*GENE.fa)\n\nfor fa in ${fa_array[@]}\ndo\n  fn=$(basename ${fa} .fa)\n  mkdir ${fn}_analysis\n  cd ${fn}_analysis\n  fx2tab \\\n  --length \\\n  ${fa} \\\n  > ${fa}_tab\n  awk \'{ print $2 }\' ${fa}_tab > ${fa}_tab2\n  awk -F\\[Cc][Gg] \'{print NF-1}\' ${fa}_tab_2 > CG\n  awk -F\\[Cc] \'{print NF-1}\' ${fa}_tab_2 > C\n  awk -F\\[Gg] \'{print NF-1}\' ${fa}_tab_2 > G\n  paste ${fa} \\\n  CG \\\n  C \\\n  G \\\n  > comb\n  awk \'{print $1, "\\t", (($4)/($5*$6))*(($3^2)/($3-1))}\' comb \\\n  > ID_CpG\ndone\n')


# In[7]:


get_ipython().run_cell_magic('bash', '', 'mkdir ${analyses_dir}\n\nfa_array=(${data_dir}/*GENE.fa)\n\nfor fa in "${fa_array[@]}"\ndo\n  cd ${analyses_dir}\n  fn=$(basename ${fa} .fa)\n  mkdir ${fn}_analysis\n  cd ${fn}_analysis\n  ${seqkit} fx2tab \\\n  --length \\\n  ${fa} \\\n  > ${analyses_dir}/${fa}_tab\n  awk \'{ print $2 }\' ${fa}_tab > ${fa}_tab2\n  awk -F\\[Cc][Gg] \'{print NF-1}\' ${fa}_tab_2 > CG\n  awk -F\\[Cc] \'{print NF-1}\' ${fa}_tab_2 > C\n  awk -F\\[Gg] \'{print NF-1}\' ${fa}_tab_2 > G\n  paste ${fa} \\\n  CG \\\n  C \\\n  G \\\n  > comb\n  awk \'{print $1, "\\t", (($4)/($5*$6))*(($3^2)/($3-1))}\' comb \\\n  > ID_CpG\ndone\n')


# In[9]:


get_ipython().run_cell_magic('bash', '', 'mkdir ${analyses_dir}\n\nfa_array=(${data_dir}/*GENE.fa)\n\nfor fa in "${fa_array[@]}"\ndo\n  cd ${analyses_dir}\n  fn=$(basename ${fa} .fa)\n  mkdir ${fn}_analysis\n  cd ${fn}_analysis\n  ${seqkit} fx2tab \\\n  --length \\\n  ${fa} \\\n  > ${fn}_analysis/${fn}_tab\n  awk \'{ print $2 }\' ${fn}_tab > ${fn}_tab2\n  awk -F\\[Cc][Gg] \'{print NF-1}\' ${fn}_tab_2 > CG\n  awk -F\\[Cc] \'{print NF-1}\' ${fn}_tab_2 > C\n  awk -F\\[Gg] \'{print NF-1}\' ${fn}_tab_2 > G\n  paste ${fa} \\\n  CG \\\n  C \\\n  G \\\n  > comb\n  awk \'{print $1, "\\t", (($4)/($5*$6))*(($3^2)/($3-1))}\' comb \\\n  > ID_CpG\ndone\n')


# In[10]:


get_ipython().run_cell_magic('bash', '', '${seqkit}\n')


# In[13]:


get_ipython().run_cell_magic('bash', '', 'mkdir ${analyses_dir}\n\nfa_array=(${data_dir}/*GENE.fa)\n\nfor fa in "${fa_array[@]}"\ndo\n  cd ${analyses_dir}\n  fn=$(basename ${fa} .fa)\n  mkdir ${fn}_analysis\n  \n  cd ${fn}_analysis\n  \n  ${seqkit} fx2tab \\\n  --length \\\n  ${fa} \\\n  > ${fn}_tab\n  \n  awk \'{ print $2 }\' ${fn}_tab > ${fn}_tab2\n  \n  awk -F\\[Cc][Gg] \'{print NF-1}\' ${fn}_tab2 > CG\n  \n  awk -F\\[Cc] \'{print NF-1}\' ${fn}_tab2 > C\n  \n  awk -F\\[Gg] \'{print NF-1}\' ${fn}_tab2 > G\n  \n  paste ${fn}_tab \\\n  CG \\\n  C \\\n  G \\\n  > comb\n  \n  awk \'{print $1, "\\t", (($4)/($5*$6))*(($3^2)/($3-1))}\' comb \\\n  > ID_CpG\ndone\n')


# #### Ugh. Turns out awk has some field limit. Solution: install and use ```gawk```.
# 
# Installed ```gawk``` and restarted notebook.

# In[2]:


get_ipython().run_cell_magic('bash', '', 'mkdir ${analyses_dir}\n\n# Create arrays of all FastA files\nfa_array=(${data_dir}/*GENE.fa)\n\n\ntime \\\nfor fa in "${fa_array[@]}"\ndo\n  # Change to proper directory\n  cd ${analyses_dir}\n  # Remove file path and extension from the FastA and save as variable\n  fn=$(basename ${fa} .fa)\n  \n  # Make subdirectory using filename\n  mkdir ${fn}_analysis\n  cd ${fn}_analysis\n  \n  # Use seqkit to convert FastA to tab-delimited and print sequence length \n  ${seqkit} fx2tab \\\n  --length \\\n  ${fa} \\\n  > ${fn}_tab\n  \n  # Print only sequences to new file\n  gawk \'{ print $2 }\' ${fn}_tab > ${fn}_tab2\n  \n  # Delimit sequences on CGs and print the number of fields minus 1 to get the number of CGs present.\n  gawk -F\\[Cc][Gg] \'{print NF-1}\' ${fn}_tab2 > CG\n  \n  # Delimit sequences on CGs and print the number of fields minus 1 to get the number of Cs present.\n  gawk -F\\[Cc] \'{print NF-1}\' ${fn}_tab2 > C\n  \n  # Delimit sequences on CGs and print the number of fields minus 1 to get the number of Gs present.\n  gawk -F\\[Gg] \'{print NF-1}\' ${fn}_tab2 > G\n  \n  # Paste these together to have file with the following fields:\n    # - FastA header\n    # - Sequence\n    # - Sequence length\n    # - Number of CGs\n    # - Number of Cs\n    # - Number of Gs\n  paste ${fn}_tab \\\n  CG \\\n  C \\\n  G \\\n  > comb\n  \n  # Do some math to calculate CpG O/E ratio (observed vs expected)\n  gawk \'{print $1, "\\t", (($4)/($5*$6))*(($3^2)/($3-1))}\' comb \\\n  > ID_CpG\ndone\n')


# In[3]:


get_ipython().run_cell_magic('bash', '', 'cd /home/sam/analyses/\n\nrsync \\\n--archive \\\n--relative \\\n./20190225_cpg_oe \\\ngannet:/volume1/web/Atumefaciens\n')


# In[ ]: