#!/usr/bin/env python
# coding: utf-8

# Notebook export (requires an IPython kernel for get_ipython()).
#
# Pipeline: download the Panopea generosa v074.a4 merged GFF3 + FastA index,
# rename scaffolds to Scaffold_01..Scaffold_18, split the merged GFF3 into
# per-feature GFF3s, compute per-feature length statistics, split the repeat
# GFF3 by repeat class, and report what fraction of the genome is repeats.
# Results are rsync'd back to the "gannet" server.

# In[1]:

# Log the run date plus OS, hostname, CPU, and memory specs of this machine.
get_ipython().run_cell_magic('bash', '', 'echo "TODAY\'S DATE:"\ndate\necho "------------"\necho ""\n#Display operating system info\nlsb_release -a\necho ""\necho "------------"\necho "HOSTNAME: "; hostname \necho ""\necho "------------"\necho "Computer Specs:"\necho ""\nlscpu\necho ""\necho "------------"\necho ""\necho "Memory Specs"\necho ""\nfree -mh\n')


# ### Set variables
#
# `%env` are best for bash

# In[10]:

# Set working directory (exported via %env so the bash cells see it, and
# mirrored as a Python variable for the `%cd` magic below).
get_ipython().run_line_magic('env', 'wd=/home/sam/analyses/20191203_pgen_v074.a4_genome_feature_counts')
wd = "/home/sam/analyses/20191203_pgen_v074.a4_genome_feature_counts"

# File download
get_ipython().run_line_magic('env', 'downloads_list=Panopea-generosa-vv0.74.a4-merged-2019-11-26-15-30-34.gff3 Pgenerosa_v074.fa.fai')

# Input/output files
get_ipython().run_line_magic('env', 'old_fa_index=Pgenerosa_v074.fa.fai')
get_ipython().run_line_magic('env', 'org_merged_gff=Panopea-generosa-vv0.74.a4-merged-2019-11-26-15-30-34.gff3')
get_ipython().run_line_magic('env', 'new_merged_gff=Panopea-generosa-v1.0.a4.gff3')
get_ipython().run_line_magic('env', 'new_file=Panopea-generosa-v1.0.a4')
get_ipython().run_line_magic('env', 'new_repeats_file=Panopea-generosa-v1.0.a4.repeats')
get_ipython().run_line_magic('env', 'repeats_gff=Panopea-generosa-v1.0.a4.repeat_region.gff3')

# Column header names for the 9 standard GFF3 columns.
gff_header = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']

# Genome size (942 Mbp), used as the denominator for repeat percentages.
GENOME_SIZE = 942000000


# ### Import Python modules

# In[3]:

import fnmatch
import os
import pandas


# In[4]:

def ind_repeats_percent(feature_length_sum):
    """Return the percentage of the genome occupied by a feature class.

    Args:
        feature_length_sum: summed length (bp) of all features of one class.

    Returns:
        float: percentage of GENOME_SIZE, rounded to 2 decimal places.
    """
    # Cast BEFORE dividing: the original cast the *result* of the division,
    # which cannot prevent integer truncation under integer-division semantics.
    return round(float(feature_length_sum) / GENOME_SIZE * 100, 2)


# In[5]:

# Create the working directory (no error if it already exists).
get_ipython().run_cell_magic('bash', '', 'mkdir --parents ${wd}\n')


# In[6]:

# Change IPython's working directory. The original export left the bare magic
# `cd {wd}` here, which is a Python syntax error in a .py script; this is the
# converted equivalent ({wd} is expanded from the Python variable above).
get_ipython().run_line_magic('cd', '{wd}')


# ### Download files

# In[11]:

# Pull the GFF3 and FastA index from the "owl" server (wget fallback kept
# commented out, as in the original notebook).
get_ipython().run_cell_magic('bash', '', 'mapfile -t downloads_array < <(echo ${downloads_list} | tr " " "\\n")\n\nfor file in "${downloads_array[@]}"\ndo\n rsync --archive --verbose \\\n owl:/volume1/web/halfshell/genomic-databank/${file} .\ndone\n\nls -lh\n\n## If need to download via wget, uncomment and run lines below:\n#for file in "${downloads_array[@]}"\n#do\n# wget --quiet --no-directories --no-check-certificate https://owl.fish.washington.edu/halfshell/genomic-databank/"${file}"\n#done\n')


# ### Rename scaffolds

# In[12]:

# Build a sed script mapping each original scaffold name (column 1 of the
# FastA index) to Scaffold_01..Scaffold_18, apply it to the merged GFF3, and
# print before/after scaffold-name counts as a sanity check.
get_ipython().run_cell_magic('bash', '', '# Array of old scaffold names\n# Uses first column of FastA index file to get scaffold names\nmapfile -t orig_scaffold_names < <(awk \'{print $1}\' "${old_fa_index}")\n\n# Array of new scaffold names\nmapfile -t new_scaffold_names < <(for number in {01..18}; do echo "Scaffold_${number}"; done)\n\n# sed substitution\n# creates sed script to find original scaffold names and replace them with new scafold names\n# and passes to sed via stdin\nfor index in "${!orig_scaffold_names[@]}"\ndo\n printf "s/%s/%s/\\n" "${orig_scaffold_names[index]}" "${new_scaffold_names[index]}"\ndone | sed --file - "${org_merged_gff}" \\\n>> "${new_merged_gff}"\n\n# Check that substituion worked\necho ""\necho "List of scaffold names in ${org_merged_gff}:"\necho "--------------------"\necho ""\nawk \'{print $1}\' "${org_merged_gff}" | sort | uniq -c\necho ""\necho "--------------------"\necho ""\necho "List of scaffold names in ${new_merged_gff}:"\necho "--------------------"\necho ""\nawk \'{print $1}\' "${new_merged_gff}" | sort | uniq -c\necho ""\n')


# ### Separate features

# In[13]:

# Split the renamed merged GFF3 into one GFF3 per feature type (column 3),
# each prefixed with the 3-line GFF3 header.
get_ipython().run_cell_magic('bash', '', '\necho "Here are the features and their counts in ${new_merged_gff}:"\nawk \'NR>3 {print $3}\' "${new_merged_gff}" | sort | uniq -c\necho ""\n\n# Create array for feature names\nmapfile -t features_array < <(awk \'NR>3 {print $3}\' "${new_merged_gff}" | sort | uniq)\n\n\n# Save features in array\nfor feature in "${features_array[@]}"\ndo\n output="${new_file}.${feature}.gff3"\n head -n 3 "${new_merged_gff}" >> "${output}"\n awk -v feature="$feature" \'$3 == feature {print}\' "${new_merged_gff}" \\\n >> "${output}"\n echo ""\n echo ""\n echo "${output}"\n echo "----------------------------------------------"\n head -n 5 "${output}"\ndone\n')


# ### Feature stats

# In[14]:

# Per-feature sequence-length statistics (mean/min/median/max) for each
# per-feature GFF3 produced above.
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, 'Panopea-generosa-v1.0.a4*.gff3'):
        print('\n' * 2)
        print(file)
        print("-------------------------")
        # Import GFF.
        # Skip first 3 rows (gff header lines); file is tab-separated.
        gff = pandas.read_csv(file, header=None, skiprows=3, sep="\t")
        # Rename columns
        gff.columns = gff_header
        # Length = end - start + 1 (GFF3 coordinates are 1-based inclusive,
        # so +1 keeps a single-base feature from having length zero).
        # Vectorized column arithmetic replaces the original row-wise
        # .apply(..., axis=1) — identical values, one C-level pass.
        gff['seqlength'] = gff['end'] - gff['start'] + 1
        # Apply functions in list to seqlength column
        gff_stats = gff['seqlength'].agg(['mean', 'min', 'median', 'max'])
        print(gff_stats)


# ### Separate repeats

# In[15]:

# Split the repeat_region GFF3 into one GFF3 per repeat class (parsed from
# the "class=" key in the attributes column).
get_ipython().run_cell_magic('bash', '', '\n# Initialize array\nfeatures_array=()\n\n# Identify unique features in GFF\n## Store as an array\n## Skip first three header lines, and then cut on two delimiters that are present\necho "Unique repeats features in ${repeats_gff}:"\nwhile IFS=\'\' read -r line \ndo\n features_array+=("$line")\ndone < <(awk -F"class=" \'NR >3 {print $2}\' "${repeats_gff}" \\\n | sort -u \\\n | cut -d \'%\' -f 1 \\\n | cut -d \';\' -f 1 \\\n | uniq)\n\n\n# Check array contents\nfor feature in "${features_array[@]}"\ndo\n echo "${feature}"\ndone\n\necho ""\necho "----------------------------------------------"\necho ""\n\n# Loop through array and create new GFFs from each feature\nfor feature in "${features_array[@]}"\ndo\n echo "Parsing ${feature} from ${repeats_gff}..."\n echo "Writing GFF3 header to ${new_repeats_file}.${feature}.gff3"\n # Write header to new file\n head -n 3 "${repeats_gff}" > "${new_repeats_file}"."${feature}".gff3\n echo "Parsing matching feature lines for ${feature} feature..."\n grep "repeat_class=${feature}" "${repeats_gff}" >> "${new_repeats_file}"."${feature}".gff3\n echo "Done with parsing ${feature} feature."\n # Count number of lines, excluding three line header (oddly, need to run tail --lines n+1, as +1 prints entire file)\n feature_count=$(tail --lines +4 "${new_repeats_file}"."${feature}".gff3 | wc -l)\n echo "Identified ${feature_count} ${feature} features."\n echo "Output file is: ${new_repeats_file}.${feature}.gff3"\n head -n 5 "${new_repeats_file}.${feature}.gff3"\n echo ""\n echo "----------------------------------------------"\ndone\n\necho "----------------------------------------------"\necho ""\nls -lh\n')


# ### Sequence length repeats stats

# In[17]:

# Sum up repeat lengths per class and accumulate the total percentage of the
# genome composed of repeats.
total_repeats_percent = 0
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, 'Panopea-generosa-v1.0.a4.repeats*.gff3'):
        print('\n' * 2)
        print(file)
        print("-------------------------")
        # Import GFF.
        # Skip first five rows; file is tab-separated.
        # NOTE(review): the per-class repeat GFF3s above are written with a
        # 3-line header (`head -n 3`), so skiprows=5 discards two data rows
        # unless the grep also matched two header lines — confirm intended.
        gff = pandas.read_csv(file, header=None, skiprows=5, sep="\t")
        # Rename columns
        gff.columns = gff_header
        # Length = end - start + 1 (1-based inclusive coordinates); vectorized,
        # same values as the original row-wise apply.
        gff['seqlength'] = gff['end'] - gff['start'] + 1
        gff_sum = gff['seqlength'].sum()
        total_repeats_percent += ind_repeats_percent(gff_sum)
        print("percent", ind_repeats_percent(gff_sum))
        # Apply functions in list to seqlength column
        gff_stats = gff['seqlength'].agg(['sum', 'mean', 'min', 'median', 'max'])
        print(gff_stats.round(2))

print('\n' * 2)
print("-------------------------")
print("Repeats composition of genome (percent):", total_repeats_percent)


# In[18]:

# Remove the downloaded input files.
# BUG FIX: the original cell body began with a literal '%%bash\n' line inside
# the run_cell_magic() string; bash would receive it as a command and fail
# with "%%bash: command not found". The stray line is removed here.
get_ipython().run_cell_magic('bash', '', 'mapfile -t downloads_array < <(echo ${downloads_list} | tr " " "\\n")\n\nfor file in "${downloads_array[@]}"\ndo\n rm "${file}"\ndone\n\nls -lh\n')


# In[19]:

# Push the analysis directory to the "gannet" web server.
get_ipython().run_cell_magic('bash', '', 'cd ..\nrsync --progress --archive --relative \\\n./20191203_pgen_v074.a4_genome_feature_counts \\\ngannet:/volume2/web/Atumefaciens\n')


# In[ ]: