#!/usr/bin/env python
# coding: utf-8

# Notebook export (requires an IPython kernel for get_ipython()).
#
# Pipeline: download the Panopea generosa v074.a4 merged GFF3 + FastA index,
# rename scaffolds to Scaffold_01..Scaffold_18, split the merged GFF3 into
# per-feature GFF3s, compute per-feature length statistics, split the repeat
# GFF3 by repeat class, and report what fraction of the genome is repeats.
# Results are rsync'd back to the "gannet" server.

# In[1]:

# Log the run date plus OS, hostname, CPU, and memory specs of this machine.
get_ipython().run_cell_magic('bash', '', 'echo "TODAY\'S DATE:"\ndate\necho "------------"\necho ""\n#Display operating system info\nlsb_release -a\necho ""\necho "------------"\necho "HOSTNAME: "; hostname \necho ""\necho "------------"\necho "Computer Specs:"\necho ""\nlscpu\necho ""\necho "------------"\necho ""\necho "Memory Specs"\necho ""\nfree -mh\n')


# ### Set variables
#
# `%env` are best for bash

# In[10]:

# Set working directory (exported via %env so the bash cells see it, and
# mirrored as a Python variable for the `%cd` magic below).
get_ipython().run_line_magic('env', 'wd=/home/sam/analyses/20191203_pgen_v074.a4_genome_feature_counts')
wd = "/home/sam/analyses/20191203_pgen_v074.a4_genome_feature_counts"

# File download
get_ipython().run_line_magic('env', 'downloads_list=Panopea-generosa-vv0.74.a4-merged-2019-11-26-15-30-34.gff3 Pgenerosa_v074.fa.fai')

# Input/output files
get_ipython().run_line_magic('env', 'old_fa_index=Pgenerosa_v074.fa.fai')
get_ipython().run_line_magic('env', 'org_merged_gff=Panopea-generosa-vv0.74.a4-merged-2019-11-26-15-30-34.gff3')
get_ipython().run_line_magic('env', 'new_merged_gff=Panopea-generosa-v1.0.a4.gff3')
get_ipython().run_line_magic('env', 'new_file=Panopea-generosa-v1.0.a4')
get_ipython().run_line_magic('env', 'new_repeats_file=Panopea-generosa-v1.0.a4.repeats')
get_ipython().run_line_magic('env', 'repeats_gff=Panopea-generosa-v1.0.a4.repeat_region.gff3')

# Column header names for the 9 standard GFF3 columns.
gff_header = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']

# Genome size (942 Mbp), used as the denominator for repeat percentages.
GENOME_SIZE = 942000000


# ### Import Python modules

# In[3]:

import fnmatch
import os
import pandas


# In[4]:

def ind_repeats_percent(feature_length_sum):
    """Return the percentage of the genome occupied by a feature class.

    Args:
        feature_length_sum: summed length (bp) of all features of one class.

    Returns:
        float: percentage of GENOME_SIZE, rounded to 2 decimal places.
    """
    # Cast BEFORE dividing: the original cast the *result* of the division,
    # which cannot prevent integer truncation under integer-division semantics.
    return round(float(feature_length_sum) / GENOME_SIZE * 100, 2)


# In[5]:

# Create the working directory (no error if it already exists).
get_ipython().run_cell_magic('bash', '', 'mkdir --parents ${wd}\n')


# In[6]:

# Change IPython's working directory. The original export left the bare magic
# `cd {wd}` here, which is a Python syntax error in a .py script; this is the
# converted equivalent ({wd} is expanded from the Python variable above).
get_ipython().run_line_magic('cd', '{wd}')


# ### Download files

# In[11]:

# Pull the GFF3 and FastA index from the "owl" server (wget fallback kept
# commented out, as in the original notebook).
get_ipython().run_cell_magic('bash', '', 'mapfile -t downloads_array < <(echo ${downloads_list} | tr " " "\\n")\n\nfor file in "${downloads_array[@]}"\ndo\n rsync --archive --verbose \\\n owl:/volume1/web/halfshell/genomic-databank/${file} .\ndone\n\nls -lh\n\n## If need to download via wget, uncomment and run lines below:\n#for file in "${downloads_array[@]}"\n#do\n# wget --quiet --no-directories --no-check-certificate https://owl.fish.washington.edu/halfshell/genomic-databank/"${file}"\n#done\n')


# ### Rename scaffolds

# In[12]:

# Build a sed script mapping each original scaffold name (column 1 of the
# FastA index) to Scaffold_01..Scaffold_18, apply it to the merged GFF3, and
# print before/after scaffold-name counts as a sanity check.
get_ipython().run_cell_magic('bash', '', '# Array of old scaffold names\n# Uses first column of FastA index file to get scaffold names\nmapfile -t orig_scaffold_names < <(awk \'{print $1}\' "${old_fa_index}")\n\n# Array of new scaffold names\nmapfile -t new_scaffold_names < <(for number in {01..18}; do echo "Scaffold_${number}"; done)\n\n# sed substitution\n# creates sed script to find original scaffold names and replace them with new scafold names\n# and passes to sed via stdin\nfor index in "${!orig_scaffold_names[@]}"\ndo\n printf "s/%s/%s/\\n" "${orig_scaffold_names[index]}" "${new_scaffold_names[index]}"\ndone | sed --file - "${org_merged_gff}" \\\n>> "${new_merged_gff}"\n\n# Check that substituion worked\necho ""\necho "List of scaffold names in ${org_merged_gff}:"\necho "--------------------"\necho ""\nawk \'{print $1}\' "${org_merged_gff}" | sort | uniq -c\necho ""\necho "--------------------"\necho ""\necho "List of scaffold names in ${new_merged_gff}:"\necho "--------------------"\necho ""\nawk \'{print $1}\' "${new_merged_gff}" | sort | uniq -c\necho ""\n')


# ### Separate features

# In[13]:

# Split the renamed merged GFF3 into one GFF3 per feature type (column 3),
# each prefixed with the 3-line GFF3 header.
get_ipython().run_cell_magic('bash', '', '\necho "Here are the features and their counts in ${new_merged_gff}:"\nawk \'NR>3 {print $3}\' "${new_merged_gff}" | sort | uniq -c\necho ""\n\n# Create array for feature names\nmapfile -t features_array < <(awk \'NR>3 {print $3}\' "${new_merged_gff}" | sort | uniq)\n\n\n# Save features in array\nfor feature in "${features_array[@]}"\ndo\n output="${new_file}.${feature}.gff3"\n head -n 3 "${new_merged_gff}" >> "${output}"\n awk -v feature="$feature" \'$3 == feature {print}\' "${new_merged_gff}" \\\n >> "${output}"\n echo ""\n echo ""\n echo "${output}"\n echo "----------------------------------------------"\n head -n 5 "${output}"\ndone\n')


# ### Feature stats

# In[14]:

# Per-feature sequence-length statistics (mean/min/median/max) for each
# per-feature GFF3 produced above.
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, 'Panopea-generosa-v1.0.a4*.gff3'):
        print('\n' * 2)
        print(file)
        print("-------------------------")
        # Import GFF.
        # Skip first 3 rows (gff header lines); file is tab-separated.
        gff = pandas.read_csv(file, header=None, skiprows=3, sep="\t")
        # Rename columns
        gff.columns = gff_header
        # Length = end - start + 1 (GFF3 coordinates are 1-based inclusive,
        # so +1 keeps a single-base feature from having length zero).
        # Vectorized column arithmetic replaces the original row-wise
        # .apply(..., axis=1) — identical values, one C-level pass.
        gff['seqlength'] = gff['end'] - gff['start'] + 1
        # Apply functions in list to seqlength column
        gff_stats = gff['seqlength'].agg(['mean', 'min', 'median', 'max'])
        print(gff_stats)


# ### Separate repeats

# In[15]:

# Split the repeat_region GFF3 into one GFF3 per repeat class (parsed from
# the "class=" key in the attributes column).
get_ipython().run_cell_magic('bash', '', '\n# Initialize array\nfeatures_array=()\n\n# Identify unique features in GFF\n## Store as an array\n## Skip first three header lines, and then cut on two delimiters that are present\necho "Unique repeats features in ${repeats_gff}:"\nwhile IFS=\'\' read -r line \ndo\n features_array+=("$line")\ndone < <(awk -F"class=" \'NR >3 {print $2}\' "${repeats_gff}" \\\n | sort -u \\\n | cut -d \'%\' -f 1 \\\n | cut -d \';\' -f 1 \\\n | uniq)\n\n\n# Check array contents\nfor feature in "${features_array[@]}"\ndo\n echo "${feature}"\ndone\n\necho ""\necho "----------------------------------------------"\necho ""\n\n# Loop through array and create new GFFs from each feature\nfor feature in "${features_array[@]}"\ndo\n echo "Parsing ${feature} from ${repeats_gff}..."\n echo "Writing GFF3 header to ${new_repeats_file}.${feature}.gff3"\n # Write header to new file\n head -n 3 "${repeats_gff}" > "${new_repeats_file}"."${feature}".gff3\n echo "Parsing matching feature lines for ${feature} feature..."\n grep "repeat_class=${feature}" "${repeats_gff}" >> "${new_repeats_file}"."${feature}".gff3\n echo "Done with parsing ${feature} feature."\n # Count number of lines, excluding three line header (oddly, need to run tail --lines n+1, as +1 prints entire file)\n feature_count=$(tail --lines +4 "${new_repeats_file}"."${feature}".gff3 | wc -l)\n echo "Identified ${feature_count} ${feature} features."\n echo "Output file is: ${new_repeats_file}.${feature}.gff3"\n head -n 5 "${new_repeats_file}.${feature}.gff3"\n echo ""\n echo "----------------------------------------------"\ndone\n\necho "----------------------------------------------"\necho ""\nls -lh\n')


# ### Sequence length repeats stats

# In[17]:

# Sum up repeat lengths per class and accumulate the total percentage of the
# genome composed of repeats.
total_repeats_percent = 0
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, 'Panopea-generosa-v1.0.a4.repeats*.gff3'):
        print('\n' * 2)
        print(file)
        print("-------------------------")
        # Import GFF.
        # Skip first five rows; file is tab-separated.
        # NOTE(review): the per-class repeat GFF3s above are written with a
        # 3-line header (`head -n 3`), so skiprows=5 discards two data rows
        # unless the grep also matched two header lines — confirm intended.
        gff = pandas.read_csv(file, header=None, skiprows=5, sep="\t")
        # Rename columns
        gff.columns = gff_header
        # Length = end - start + 1 (1-based inclusive coordinates); vectorized,
        # same values as the original row-wise apply.
        gff['seqlength'] = gff['end'] - gff['start'] + 1
        gff_sum = gff['seqlength'].sum()
        total_repeats_percent += ind_repeats_percent(gff_sum)
        print("percent", ind_repeats_percent(gff_sum))
        # Apply functions in list to seqlength column
        gff_stats = gff['seqlength'].agg(['sum', 'mean', 'min', 'median', 'max'])
        print(gff_stats.round(2))

print('\n' * 2)
print("-------------------------")
print("Repeats composition of genome (percent):", total_repeats_percent)


# In[18]:

# Remove the downloaded input files.
# BUG FIX: the original cell body began with a literal '%%bash\n' line inside
# the run_cell_magic() string; bash would receive it as a command and fail
# with "%%bash: command not found". The stray line is removed here.
get_ipython().run_cell_magic('bash', '', 'mapfile -t downloads_array < <(echo ${downloads_list} | tr " " "\\n")\n\nfor file in "${downloads_array[@]}"\ndo\n rm "${file}"\ndone\n\nls -lh\n')


# In[19]:

# Push the analysis directory to the "gannet" web server.
get_ipython().run_cell_magic('bash', '', 'cd ..\nrsync --progress --archive --relative \\\n./20191203_pgen_v074.a4_genome_feature_counts \\\ngannet:/volume2/web/Atumefaciens\n')


# In[ ]: