%%bash
echo "TODAY'S DATE:"
date
echo "------------"
echo ""
#Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
TODAY'S DATE: Tue Oct 29 08:51:12 PDT 2019 ------------ Distributor ID: Ubuntu Description: Ubuntu 16.04.6 LTS Release: 16.04 Codename: xenial ------------ HOSTNAME: swoose ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 24 On-line CPU(s) list: 0-23 Thread(s) per core: 2 Core(s) per socket: 6 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 44 Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz Stepping: 2 CPU MHz: 2925.971 BogoMIPS: 5851.97 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 12288K NUMA node0 CPU(s): 0-23 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d ------------ Memory Specs total used free shared buff/cache available Mem: 70G 16G 429M 392M 53G 52G Swap: 4.7G 444K 4.7G
No LSB modules are available.
%env
variables are good for passing to bash cells
%env wd=/home/sam/analyses/20191029_pgen_v074.a4_repeats_counts
wd="/home/sam/analyses/20191029_pgen_v074.a4_repeats_counts"
gff_header = ['seqid','source','type','start','end','score','strand','phase','attributes']
%env rysnc_owl=owl:/volume1/web/halfshell/genomic-databank/
%env gff=Panopea-generosa-vv0.74.a4.repeat_region.gff3
%env wget_gffs=--directory-prefix=${wd} --recursive --quiety --no-directories --no-check-certificate --no-parent --accept 'Panopea-generosa-vv0.74.a4.repeat_region.gff3' https://owl.fish.washington.edu/halfshell/genomic-databank/
%env new_gff=Panopea-generosa-vv0.74.a4.repeats
# Set genome size to 942Mbp
GENOME_SIZE = 942000000
env: wd=/home/sam/analyses/20191029_pgen_v074.a4_repeats_counts env: rysnc_owl=owl:/volume1/web/halfshell/genomic-databank/ env: gff=Panopea-generosa-vv0.74.a4.repeat_region.gff3 env: wget_gffs=--directory-prefix=$/home/sam/analyses/20191029_pgen_v074.a4_repeats_counts --recursive --quiety --no-directories --no-check-certificate --no-parent --accept 'Panopea-generosa-vv0.74.a4.repeat_region.gff3' https://owl.fish.washington.edu/halfshell/genomic-databank/ env: new_gff=Panopea-generosa-vv0.74.a4.repeats
# Calculate percentage of genome comprised of a given feature
def ind_repeats_percent(feature_length_sum):
return round(float(feature_length_sum / GENOME_SIZE * 100), 2)
import fnmatch
import os
import pandas
%%bash
mkdir --parents ${wd}
cd {wd}
/home/sam/analyses/20191029_pgen_v074.a4_repeats_counts
Info on GFFs is here: [https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3)
%%bash
rsync \
--archive \
--verbose \
--progress \
--include="${gff}" \
--exclude="*" \
"${rysnc_owl}" \
.
echo ""
echo ""
echo "----------------------------------------------------------"
ls -lh
receiving incremental file list ./ Panopea-generosa-vv0.74.a4.repeat_region.gff3 390,130,212 100% 28.74MB/s 0:00:12 (xfr#1, to-chk=0/2) sent 91 bytes received 390,177,991 bytes 25,172,779.48 bytes/sec total size is 390,130,212 speedup is 1.00 ---------------------------------------------------------- total 373M -rwx------ 1 sam users 373M Oct 14 10:13 Panopea-generosa-vv0.74.a4.repeat_region.gff3
# %%bash
# time \
# wget "${wget_gffs}"
# ls -lh ${wd}
%%bash
head ${gff}
##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 1 225 1646 + . ID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 910 1325 3459 + . ID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 1329 2039 5278 - . ID=19535.GS22252506.PGEN_.repeat00149393;Name=19535.GS22252506.PGEN_.repeat00149393;repeat_match=rnd-6_family-1529;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 3030 3608 3232 - . ID=19535.GS22252506.PGEN_.repeat00149394;Name=19535.GS22252506.PGEN_.repeat00149394;repeat_match=rnd-1_family-330;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 3604 3693 512 - . ID=19535.GS22252506.PGEN_.repeat00149395;Name=19535.GS22252506.PGEN_.repeat00149395;repeat_match=rnd-1_family-278;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 5038 5265 1614 + . ID=19535.GS22252506.PGEN_.repeat00149396;Name=19535.GS22252506.PGEN_.repeat00149396;repeat_match=rnd-5_family-2058;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 5224 5277 240 - . ID=19535.GS22252506.PGEN_.repeat00149397;Name=19535.GS22252506.PGEN_.repeat00149397;repeat_match=rnd-5_family-1533;repeat_class=LINE%2FL2;
%%bash
# Initialize array
features_array=()
# Identify unique features in GFF
## Store as an array
## Skip first three header lines, and then cut on two delimiters that are present
echo "Unique repeats features in ${gff}:"
while IFS='' read -r line
do
features_array+=("$line")
done < <(awk -F"class=" 'NR >3 {print $2}' ${gff} \
| sort -u \
| cut -d '%' -f 1 \
| cut -d ';' -f 1 \
| uniq)
# Check array contents
for feature in "${features_array[@]}"
do
echo "${feature}"
done
# Loop through array and create new GFFs from each feature
for feature in "${features_array[@]}"
do
echo "Parsing ${feature} from ${gff}..."
echo "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.${feature}.gff3"
head -n 5 ${gff} > ${new_gff}.${feature}.gff3
echo "Parsing matching feature lines for ${feature} feature..."
grep "${feature}" ${gff} >> ${new_gff}.${feature}.gff3
echo "Done with parsing ${feature} feature."
feature_count=$(tail --lines +6 ${new_gff}.${feature}.gff3 | wc -l)
echo "Identified ${feature_count} ${feature} features."
echo "Output file is: Panopea-generosa-vv0.74.a3.repeats.${feature}.gff3"
echo ""
done
echo "----------------------------------------------"
echo ""
ls -lh
Unique repeats features in Panopea-generosa-vv0.74.a4.repeat_region.gff3: DNA LINE LTR RC Simple_repeat SINE Unknown Parsing DNA from Panopea-generosa-vv0.74.a4.repeat_region.gff3... Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.DNA.gff3 Parsing matching feature lines for DNA feature... Done with parsing DNA feature. Identified 21094 DNA features. Output file is: Panopea-generosa-vv0.74.a3.repeats.DNA.gff3 Parsing LINE from Panopea-generosa-vv0.74.a4.repeat_region.gff3... Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.LINE.gff3 Parsing matching feature lines for LINE feature... Done with parsing LINE feature. Identified 69365 LINE features. Output file is: Panopea-generosa-vv0.74.a3.repeats.LINE.gff3 Parsing LTR from Panopea-generosa-vv0.74.a4.repeat_region.gff3... Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.LTR.gff3 Parsing matching feature lines for LTR feature... Done with parsing LTR feature. Identified 2890 LTR features. Output file is: Panopea-generosa-vv0.74.a3.repeats.LTR.gff3 Parsing RC from Panopea-generosa-vv0.74.a4.repeat_region.gff3... Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.RC.gff3 Parsing matching feature lines for RC feature... Done with parsing RC feature. Identified 546 RC features. Output file is: Panopea-generosa-vv0.74.a3.repeats.RC.gff3 Parsing Simple_repeat from Panopea-generosa-vv0.74.a4.repeat_region.gff3... Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.Simple_repeat.gff3 Parsing matching feature lines for Simple_repeat feature... Done with parsing Simple_repeat feature. Identified 18121 Simple_repeat features. Output file is: Panopea-generosa-vv0.74.a3.repeats.Simple_repeat.gff3 Parsing SINE from Panopea-generosa-vv0.74.a4.repeat_region.gff3... Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.SINE.gff3 Parsing matching feature lines for SINE feature... Done with parsing SINE feature. Identified 39397 SINE features. Output file is: Panopea-generosa-vv0.74.a3.repeats.SINE.gff3 Parsing Unknown from Panopea-generosa-vv0.74.a4.repeat_region.gff3... Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.Unknown.gff3 Parsing matching feature lines for Unknown feature... Done with parsing Unknown feature. Identified 1375742 Unknown features. Output file is: Panopea-generosa-vv0.74.a3.repeats.Unknown.gff3 ---------------------------------------------- total 745M -rwx------ 1 sam users 373M Oct 14 10:13 Panopea-generosa-vv0.74.a4.repeat_region.gff3 -rw-rw-r-- 1 sam users 5.4M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.DNA.gff3 -rw-rw-r-- 1 sam users 18M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.LINE.gff3 -rw-rw-r-- 1 sam users 735K Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.LTR.gff3 -rw-rw-r-- 1 sam users 140K Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.RC.gff3 -rw-rw-r-- 1 sam users 4.5M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.Simple_repeat.gff3 -rw-rw-r-- 1 sam users 11M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.SINE.gff3 -rw-rw-r-- 1 sam users 335M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.Unknown.gff3
%%bash
# Check the output files
for file in "${new_gff}"*.gff3
do
echo ""
echo ""
echo "${file}"
echo "----------------------------------------------"
head "${file}"
done
Panopea-generosa-vv0.74.a4.repeats.DNA.gff3 ---------------------------------------------- ##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 1 225 1646 + . ID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 910 1325 3459 + . ID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 75698 76199 3502 - . ID=19535.GS22252506.PGEN_.repeat00149552;Name=19535.GS22252506.PGEN_.repeat00149552;repeat_match=rnd-1_family-397;repeat_class=DNA%2FhAT-hAT5; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 189334 189466 440 - . ID=19535.GS22252506.PGEN_.repeat00149753;Name=19535.GS22252506.PGEN_.repeat00149753;repeat_match=rnd-1_family-580;repeat_class=DNA%2FTcMar-Tc1; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 217948 218189 795 - . ID=19535.GS22252506.PGEN_.repeat00149810;Name=19535.GS22252506.PGEN_.repeat00149810;repeat_match=rnd-6_family-5014;repeat_class=DNA%2FTcMar-Tc1; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 218368 218586 1139 + . ID=19535.GS22252506.PGEN_.repeat00149812;Name=19535.GS22252506.PGEN_.repeat00149812;repeat_match=rnd-6_family-5014;repeat_class=DNA%2FTcMar-Tc1; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 261172 261266 561 + . ID=19535.GS22252506.PGEN_.repeat00149912;Name=19535.GS22252506.PGEN_.repeat00149912;repeat_match=rnd-6_family-9;repeat_class=DNA%2FhAT-Tip100; Panopea-generosa-vv0.74.a4.repeats.LINE.gff3 ---------------------------------------------- ##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 1 225 1646 + . ID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 910 1325 3459 + . ID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 5224 5277 240 - . ID=19535.GS22252506.PGEN_.repeat00149397;Name=19535.GS22252506.PGEN_.repeat00149397;repeat_match=rnd-5_family-1533;repeat_class=LINE%2FL2; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 106383 106576 376 + . ID=19535.GS22252506.PGEN_.repeat00149599;Name=19535.GS22252506.PGEN_.repeat00149599;repeat_match=rnd-5_family-1227;repeat_class=LINE%2FCR1-Zenon; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 106637 108267 4210 + . ID=19535.GS22252506.PGEN_.repeat00149600;Name=19535.GS22252506.PGEN_.repeat00149600;repeat_match=rnd-1_family-321;repeat_class=LINE%2FCR1-Zenon; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 242610 242911 1612 - . ID=19535.GS22252506.PGEN_.repeat00149866;Name=19535.GS22252506.PGEN_.repeat00149866;repeat_match=rnd-1_family-435;repeat_class=LINE%2FRTE-X; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 242916 243551 1279 - . ID=19535.GS22252506.PGEN_.repeat00149867;Name=19535.GS22252506.PGEN_.repeat00149867;repeat_match=rnd-1_family-403;repeat_class=LINE%2FRTE-X; Panopea-generosa-vv0.74.a4.repeats.LTR.gff3 ---------------------------------------------- ##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 1 225 1646 + . ID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 910 1325 3459 + . ID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 3821636 3823037 10961 + . ID=19535.GS22252506.PGEN_.repeat00155489;Name=19535.GS22252506.PGEN_.repeat00155489;repeat_match=rnd-5_family-337;repeat_class=LTR%2FGypsy; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 3823036 3826380 26996 + . ID=19535.GS22252506.PGEN_.repeat00155490;Name=19535.GS22252506.PGEN_.repeat00155490;repeat_match=rnd-5_family-337;repeat_class=LTR%2FGypsy; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 3831155 3831246 660 + . ID=19535.GS22252506.PGEN_.repeat00155515;Name=19535.GS22252506.PGEN_.repeat00155515;repeat_match=rnd-5_family-337;repeat_class=LTR%2FGypsy; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 3831536 3832007 3730 + . ID=19535.GS22252506.PGEN_.repeat00155517;Name=19535.GS22252506.PGEN_.repeat00155517;repeat_match=rnd-5_family-337;repeat_class=LTR%2FGypsy; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 4665540 4666330 516 + . ID=19535.GS22252506.PGEN_.repeat00156951;Name=19535.GS22252506.PGEN_.repeat00156951;repeat_match=rnd-5_family-1106;repeat_class=LTR%2FGypsy; Panopea-generosa-vv0.74.a4.repeats.RC.gff3 ---------------------------------------------- ##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 1 225 1646 + . ID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 910 1325 3459 + . ID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 123062 123694 4812 + . ID=19535.GS22252506.PGEN_.repeat00149637;Name=19535.GS22252506.PGEN_.repeat00149637;repeat_match=rnd-1_family-384;repeat_class=RC%2FHelitron; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 2285720 2285809 738 + . ID=19535.GS22252506.PGEN_.repeat00153160;Name=19535.GS22252506.PGEN_.repeat00153160;repeat_match=rnd-1_family-384;repeat_class=RC%2FHelitron; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 2285873 2286021 1221 + . ID=19535.GS22252506.PGEN_.repeat00153162;Name=19535.GS22252506.PGEN_.repeat00153162;repeat_match=rnd-1_family-384;repeat_class=RC%2FHelitron; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 2286296 2286843 4324 + . ID=19535.GS22252506.PGEN_.repeat00153165;Name=19535.GS22252506.PGEN_.repeat00153165;repeat_match=rnd-1_family-384;repeat_class=RC%2FHelitron; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 2635022 2635479 3530 - . ID=19535.GS22252506.PGEN_.repeat00153957;Name=19535.GS22252506.PGEN_.repeat00153957;repeat_match=rnd-1_family-384;repeat_class=RC%2FHelitron; Panopea-generosa-vv0.74.a4.repeats.Simple_repeat.gff3 ---------------------------------------------- ##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 1 225 1646 + . ID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 910 1325 3459 + . ID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 301637 301892 970 + . ID=19535.GS22252506.PGEN_.repeat00149963;Name=19535.GS22252506.PGEN_.repeat00149963;repeat_match=rnd-4_family-288;repeat_class=Simple_repeat; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 306440 306805 460 + . ID=19535.GS22252506.PGEN_.repeat00149968;Name=19535.GS22252506.PGEN_.repeat00149968;repeat_match=rnd-4_family-288;repeat_class=Simple_repeat; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 484555 484765 1048 + . ID=19535.GS22252506.PGEN_.repeat00150121;Name=19535.GS22252506.PGEN_.repeat00150121;repeat_match=rnd-6_family-10;repeat_class=Simple_repeat; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 550953 551053 234 + . ID=19535.GS22252506.PGEN_.repeat00150239;Name=19535.GS22252506.PGEN_.repeat00150239;repeat_match=rnd-1_family-158;repeat_class=Simple_repeat; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 551620 551760 386 + . ID=19535.GS22252506.PGEN_.repeat00150243;Name=19535.GS22252506.PGEN_.repeat00150243;repeat_match=rnd-1_family-158;repeat_class=Simple_repeat; Panopea-generosa-vv0.74.a4.repeats.SINE.gff3 ---------------------------------------------- ##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 1 225 1646 + . ID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 910 1325 3459 + . ID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 6637 6844 1072 + . ID=19535.GS22252506.PGEN_.repeat00149404;Name=19535.GS22252506.PGEN_.repeat00149404;repeat_match=rnd-1_family-21;repeat_class=SINE%2FtRNA-Core-L2; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 22157 22354 1063 - . ID=19535.GS22252506.PGEN_.repeat00149446;Name=19535.GS22252506.PGEN_.repeat00149446;repeat_match=rnd-1_family-21;repeat_class=SINE%2FtRNA-Core-L2; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 39994 40185 1148 - . ID=19535.GS22252506.PGEN_.repeat00149474;Name=19535.GS22252506.PGEN_.repeat00149474;repeat_match=rnd-3_family-517;repeat_class=SINE%2FtRNA-Core-L2; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 58668 58862 729 + . ID=19535.GS22252506.PGEN_.repeat00149518;Name=19535.GS22252506.PGEN_.repeat00149518;repeat_match=rnd-3_family-517;repeat_class=SINE%2FtRNA-Core-L2; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 110510 110677 514 - . ID=19535.GS22252506.PGEN_.repeat00149613;Name=19535.GS22252506.PGEN_.repeat00149613;repeat_match=rnd-1_family-48;repeat_class=SINE%2FtRNA-Core-L2; Panopea-generosa-vv0.74.a4.repeats.Unknown.gff3 ---------------------------------------------- ##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 1 225 1646 + . ID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 910 1325 3459 + . ID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 1 225 1646 + . ID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 910 1325 3459 + . ID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 1329 2039 5278 - . ID=19535.GS22252506.PGEN_.repeat00149393;Name=19535.GS22252506.PGEN_.repeat00149393;repeat_match=rnd-6_family-1529;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 3030 3608 3232 - . ID=19535.GS22252506.PGEN_.repeat00149394;Name=19535.GS22252506.PGEN_.repeat00149394;repeat_match=rnd-1_family-330;repeat_class=Unknown; PGA_scaffold2__36_contigs__length_69596280 GenSAS_5d25089d78791-repeatmodeler repeat_region 3604 3693 512 - . ID=19535.GS22252506.PGEN_.repeat00149395;Name=19535.GS22252506.PGEN_.repeat00149395;repeat_match=rnd-1_family-278;repeat_class=Unknown;
total_repeats_percent = 0
for file in os.listdir('.'):
if fnmatch.fnmatch(file, 'Panopea-generosa-vv0.74.a4.repeats*.gff3'):
print('\n' * 2)
print(file)
print("-------------------------")
# Import GFF.
# Skip first five rows and file is tab-separated
gff=pandas.read_csv(file, header=None, skiprows=5, sep="\t")
# Rename columns
gff.columns = gff_header
# Subtract start value from end value.
# Have to add 1 so that sequence length can't equal zero
gff['seqlength'] = gff.apply(lambda position: position['end'] - position['start'] + 1, axis=1)
gff_sum = gff['seqlength'].sum()
total_repeats_percent += ind_repeats_percent(gff_sum)
print ("percent" , ind_repeats_percent(gff_sum))
# Apply functions in list to seqlength column
gff_stats = gff['seqlength'].agg(['sum', 'mean', 'min', 'median', 'max'])
print (gff_stats.round(2))
print('\n' * 2)
print("-------------------------")
print ("Repeats composition of genome (percent):" , total_repeats_percent)
Panopea-generosa-vv0.74.a4.repeats.LINE.gff3 ------------------------- percent 2.91 sum 27388849.00 mean 394.85 min 11.00 median 226.00 max 6604.00 Name: seqlength, dtype: float64 Panopea-generosa-vv0.74.a4.repeats.Simple_repeat.gff3 ------------------------- percent 0.5 sum 4733271.0 mean 261.2 min 6.0 median 125.0 max 5981.0 Name: seqlength, dtype: float64 Panopea-generosa-vv0.74.a4.repeats.Unknown.gff3 ------------------------- percent 29.09 sum 2.740281e+08 mean 1.991900e+02 min 1.100000e+01 median 1.440000e+02 max 6.574000e+03 Name: seqlength, dtype: float64 Panopea-generosa-vv0.74.a4.repeats.LTR.gff3 ------------------------- percent 0.22 sum 2060084.00 mean 712.83 min 11.00 median 316.00 max 6541.00 Name: seqlength, dtype: float64 Panopea-generosa-vv0.74.a4.repeats.RC.gff3 ------------------------- percent 0.02 sum 232303.00 mean 425.46 min 13.00 median 464.00 max 674.00 Name: seqlength, dtype: float64 Panopea-generosa-vv0.74.a4.repeats.SINE.gff3 ------------------------- percent 0.65 sum 6133778.00 mean 155.69 min 11.00 median 164.00 max 934.00 Name: seqlength, dtype: float64 Panopea-generosa-vv0.74.a4.repeats.DNA.gff3 ------------------------- percent 0.91 sum 8602532.00 mean 407.82 min 11.00 median 247.00 max 7012.00 Name: seqlength, dtype: float64 ------------------------- Repeats composition of genome (percent): 34.3
%%bash
rm ${gff}
ls -lh
total 373M -rw-rw-r-- 1 sam users 5.4M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.DNA.gff3 -rw-rw-r-- 1 sam users 18M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.LINE.gff3 -rw-rw-r-- 1 sam users 735K Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.LTR.gff3 -rw-rw-r-- 1 sam users 140K Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.RC.gff3 -rw-rw-r-- 1 sam users 4.5M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.Simple_repeat.gff3 -rw-rw-r-- 1 sam users 11M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.SINE.gff3 -rw-rw-r-- 1 sam users 335M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.Unknown.gff3