%%bash
# Record the run context for this notebook: date, OS release, hostname,
# CPU details and memory usage.
echo "TODAY'S DATE:"
date
echo "------------"
echo ""
# Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
# -h alone selects human-readable units; the extra -m unit flag had no effect
# (the recorded output was already scaled to G/M).
free -h
TODAY'S DATE: Tue Oct 29 09:39:44 PDT 2019 ------------ Distributor ID: Ubuntu Description: Ubuntu 16.04.6 LTS Release: 16.04 Codename: xenial ------------ HOSTNAME: swoose ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 24 On-line CPU(s) list: 0-23 Thread(s) per core: 2 Core(s) per socket: 6 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 44 Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz Stepping: 2 CPU MHz: 2925.971 BogoMIPS: 5851.97 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 12288K NUMA node0 CPU(s): 0-23 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d ------------ Memory Specs total used free shared buff/cache available Mem: 70G 17G 413M 410M 53G 52G Swap: 4.7G 7.0M 4.6G
No LSB modules are available.
%env wd=/home/sam/analyses/20191029_pgen_v074.a4_genome_feature_counts
wd="/home/sam/analyses/20191029_pgen_v074.a4_genome_feature_counts"
# Set list of column header names
gff_header = ['seqid','source','type','start','end','score','strand','phase','attributes']
%env rysnc_owl=owl:/volume1/web/halfshell/genomic-databank/
%env gffs=Panopea-generosa-vv0.74.a4.[Cegmrt]*.gff3
%env wget_gffs=--directory-prefix=${wd} --recursive --quiety --no-directories --no-check-certificate --no-parent --accept 'Panopea-generosa-vv0.74.a3.[Cegmrt]*.gff3' --reject 'Panopea-generosa-vv0.74.a4.repeat_region.gff3' https://owl.fish.washington.edu/halfshell/genomic-databank/
env: wd=/home/sam/analyses/20191029_pgen_v074.a4_genome_feature_counts env: rysnc_owl=owl:/volume1/web/halfshell/genomic-databank/ env: gffs=Panopea-generosa-vv0.74.a4.[Cegmrt]*.gff3 env: wget_gffs=--directory-prefix=$/home/sam/analyses/20191029_pgen_v074.a4_genome_feature_counts --recursive --quiety --no-directories --no-check-certificate --no-parent --accept 'Panopea-generosa-vv0.74.a3.[Cegmrt]*.gff3' --reject 'Panopea-generosa-vv0.74.a4.repeat_region.gff3' https://owl.fish.washington.edu/halfshell/genomic-databank/
import fnmatch
import os
import pandas
%%bash
# Create the analysis directory, including any missing parent directories.
# Quote the expansion so a path with spaces or glob characters stays one argument.
mkdir --parents -- "${wd}"
# Change into the analysis directory.
# NOTE(review): this looks like the IPython `cd` automagic (with {wd} expanded
# from the Python variable `wd`) rather than a bash command — confirm.
cd {wd}
/home/sam/analyses/20191029_pgen_v074.a4_genome_feature_counts
Info on GFFs is here: [https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3](https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3)
%%bash
# Copy the v0.74.a4 feature GFFs from the rsync source into the current
# directory, then list what is present.
#
# rsync acts on the first filter rule that matches a file, so the
# repeat_region exclude must come before the broader ${gffs} include;
# otherwise that file is downloaded only to be deleted again.
rsync \
  --archive \
  --verbose \
  --progress \
  --exclude="Panopea-generosa-vv0.74.a4.repeat_region.gff3" \
  --include="${gffs}" \
  --exclude="*" \
  "${rysnc_owl}" \
  .

# Remove unneeded repeats GFF if a copy is left over from an earlier sync.
# -f keeps this quiet when the file is absent.
rm -f -- Panopea-generosa-vv0.74.a4.repeat_region.gff3

echo ""
echo ""
echo "----------------------------------------------------------"
ls -lh
receiving incremental file list sent 65 bytes received 230 bytes 196.67 bytes/sec total size is 542,158,159 speedup is 1,837,824.27 ---------------------------------------------------------- total 146M -rwx------ 1 sam users 60M Oct 14 10:13 Panopea-generosa-vv0.74.a4.CDS.gff3 -rwx------ 1 sam users 62M Oct 14 10:13 Panopea-generosa-vv0.74.a4.exon.gff3 -rwx------ 1 sam users 11M Oct 14 10:13 Panopea-generosa-vv0.74.a4.gene.gff3 -rwx------ 1 sam users 11M Oct 14 10:13 Panopea-generosa-vv0.74.a4.mRNA.gff3 -rwx------ 1 sam users 1.5K Oct 14 10:13 Panopea-generosa-vv0.74.a4.rRNA.gff3 -rwx------ 1 sam users 2.8M Oct 14 10:13 Panopea-generosa-vv0.74.a4.tRNA.gff3
# %%bash
# time \
# wget "${wget_gffs}"
# ls -lh ${wd}
%%bash
# Preview the first 10 lines of the CDS GFF (header comment lines plus the first records).
head -n 10 Panopea-generosa-vv0.74.a4.CDS.gff3
##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish CDS 2 125 . + 0 ID=PGEN_.00g000010.m01.CDS01;Name=PGEN_.00g000010.m01.CDS01;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01 PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish CDS 1995 2095 . + 1 ID=PGEN_.00g000010.m01.CDS02;Name=PGEN_.00g000010.m01.CDS02;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01 PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish CDS 3325 3495 . + 0 ID=PGEN_.00g000010.m01.CDS03;Name=PGEN_.00g000010.m01.CDS03;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01 PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish CDS 4651 4719 . + 0 ID=PGEN_.00g000010.m01.CDS04;Name=PGEN_.00g000010.m01.CDS04;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01 PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish CDS 19808 19943 . - 2 ID=PGEN_.00g000020.m01.CDS01;Name=PGEN_.00g000020.m01.CDS01;Parent=PGEN_.00g000020.m01;original_ID=cds.21510-PGEN_.00g234150.m01;Alias=cds.21510-PGEN_.00g234150.m01 PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish CDS 21133 21362 . - 0 ID=PGEN_.00g000020.m01.CDS02;Name=PGEN_.00g000020.m01.CDS02;Parent=PGEN_.00g000020.m01;original_ID=cds.21510-PGEN_.00g234150.m01;Alias=cds.21510-PGEN_.00g234150.m01 PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish CDS 22487 22613 . - 2 ID=PGEN_.00g000020.m01.CDS03;Name=PGEN_.00g000020.m01.CDS03;Parent=PGEN_.00g000020.m01;original_ID=cds.21510-PGEN_.00g234150.m01;Alias=cds.21510-PGEN_.00g234150.m01
# Print feature-length statistics (mean, min, median, max) for every
# v0.74.a4 GFF3 file in the current directory.
# Relies on `gff_header` (the nine GFF column names) being defined earlier in
# the notebook. Files are visited in sorted order so the output is reproducible.
for file in sorted(os.listdir('.')):
    if not fnmatch.fnmatch(file, 'Panopea-generosa-vv0.74.a4*.gff3'):
        continue

    print('\n' * 2)
    print(file)
    print("-------------------------")

    # Import GFF.
    # Skip first 3 rows (gff header lines) and indicate file is tab-separated
    gff = pandas.read_csv(file, header=None, skiprows=3, sep="\t")

    # Rename columns
    gff.columns = gff_header

    # Subtract start value from end value.
    # Have to add 1 so that sequence length can't equal zero (i.e. adjust for 1-based counting system)
    # Column arithmetic gives the same values as a row-wise apply, without
    # calling a Python function once per row.
    gff['seqlength'] = gff['end'] - gff['start'] + 1

    # Apply functions in list to seqlength column
    gff_stats = gff['seqlength'].agg(['mean', 'min', 'median', 'max'])
    print(gff_stats)
Panopea-generosa-vv0.74.a4.mRNA.gff3 ------------------------- mean 12903.649559 min 166.000000 median 5453.000000 max 283066.000000 Name: seqlength, dtype: float64 Panopea-generosa-vv0.74.a4.gene.gff3 ------------------------- mean 10811.04461 min 166.00000 median 4464.00000 max 283066.00000 Name: seqlength, dtype: float64 Panopea-generosa-vv0.74.a4.CDS.gff3 ------------------------- mean 201.476988 min 3.000000 median 133.000000 max 13221.000000 Name: seqlength, dtype: float64 Panopea-generosa-vv0.74.a4.tRNA.gff3 ------------------------- mean 74.805659 min 53.000000 median 75.000000 max 314.000000 Name: seqlength, dtype: float64 Panopea-generosa-vv0.74.a4.rRNA.gff3 ------------------------- mean 118.428571 min 113.000000 median 115.000000 max 138.000000 Name: seqlength, dtype: float64 Panopea-generosa-vv0.74.a4.exon.gff3 ------------------------- mean 201.476988 min 3.000000 median 133.000000 max 13221.000000 Name: seqlength, dtype: float64