%%bash
# Print a snapshot of the machine this notebook ran on: current date, OS
# release, hostname, CPU details and memory usage, each under its own heading.
printf '%s\n' "TODAY'S DATE:"
date
printf '%s\n\n' "------------"

# Operating system info
lsb_release -a
printf '\n%s\n' "------------"

printf '%s\n' "HOSTNAME: "
hostname
printf '\n%s\n' "------------"

printf '%s\n\n' "Computer Specs:"
lscpu
printf '\n%s\n\n' "------------"

printf '%s\n\n' "Memory Specs"
# Memory and swap usage in human-readable units
free -mh
TODAY'S DATE: Mon Aug 26 14:25:46 PDT 2019 ------------ Distributor ID: Ubuntu Description: Ubuntu 16.04.6 LTS Release: 16.04 Codename: xenial ------------ HOSTNAME: swoose ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 24 On-line CPU(s) list: 0-23 Thread(s) per core: 2 Core(s) per socket: 6 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 44 Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz Stepping: 2 CPU MHz: 2925.993 BogoMIPS: 5851.93 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 12288K NUMA node0 CPU(s): 0-23 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp pti tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d ------------ Memory Specs total used free shared buff/cache available Mem: 70G 8.1G 42G 472M 19G 61G Swap: 4.7G 0B 4.7G
No LSB modules are available.
%env wd=/home/sam/analyses/20190826_pgen_genome_feature_counts
wd="/home/sam/analyses/20190826_pgen_genome_feature_counts"
%env gffs=owl:/volume1/web/halfshell/genomic-databank/Pgenerosa_v07[04]*.gff
%env wget_gffs=--directory-prefix=${wd} --recursive --quiety --no-directories --no-check-certificate --no-parent --accept 'Pgenerosa_v07[04]*.gff' https://owl.fish.washington.edu/halfshell/genomic-databank/
env: wd=/home/sam/analyses/20190826_pgen_genome_feature_counts env: gffs=owl:/volume1/web/halfshell/genomic-databank/Pgenerosa_v07[04]*.gff env: wget_gffs=--directory-prefix=$/home/sam/analyses/20190826_pgen_genome_feature_counts --recursive --quiety --no-directories --no-check-certificate --no-parent --accept 'Pgenerosa_v07[04]*.gff' https://owl.fish.washington.edu/halfshell/genomic-databank/
import fnmatch
import os
import pandas
%%bash
# Create the analysis working directory (and any missing parents).
# The expansion is quoted so a path containing spaces or glob characters
# is passed to mkdir as a single argument.
mkdir --parents "${wd}"
# NOTE(review): `{wd}` is IPython interpolation syntax rather than bash, and the
# recorded output echoes the directory path. This line looks like an IPython
# `cd` automagic from a separate cell that moves the kernel into the working
# directory - confirm against the original notebook.
cd {wd}
/home/sam/analyses/20190826_pgen_genome_feature_counts
Info on GFFs is here: [https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3](https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3)
%%bash
# Copy the GFF files selected by ${gffs} into the current directory.
# The variable is quoted so the local shell does not expand the glob itself.
rsync -av --progress "${gffs}" .

# Remove these two GFFs from the current directory once the transfer is done.
rm Pgenerosa_v074.fa.out.gff Pgenerosa_v074.CpG.gff

# Two blank lines and a rule, then list what is left in the directory.
printf '\n\n'
printf '%s\n' "----------------------------------------------------------"
ls -lh
receiving incremental file list Pgenerosa_v074.CpG.gff 2,623,644,046 100% 28.14MB/s 0:01:28 (xfr#1, to-chk=6/18) Pgenerosa_v074.fa.out.gff 62,754,222 100% 20.47MB/s 0:00:02 (xfr#2, to-chk=4/18) sent 49 bytes received 2,686,726,727 bytes 29,045,694.88 bytes/sec total size is 2,778,393,209 speedup is 1.03 ---------------------------------------------------------- total 88M -rw-rw-r-- 1 sam users 20M May 23 15:02 Pgenerosa_v070.CDS.gff -rw-rw-r-- 1 sam users 22M May 23 15:02 Pgenerosa_v070.exon.gff -rw-rw-r-- 1 sam users 12M May 23 15:03 Pgenerosa_v070.gene.gff -rw-rw-r-- 1 sam users 16M May 23 15:03 Pgenerosa_v070.mRNA.gff -rw-rw-r-- 1 sam users 5.0M Aug 26 09:53 Pgenerosa_v070_top18_scaffolds.CDS.gff -rw-rw-r-- 1 sam users 5.6M Aug 26 09:54 Pgenerosa_v070_top18_scaffolds.exon.gff -rw-rw-r-- 1 sam users 500K Aug 26 11:32 Pgenerosa_v070_top18_scaffolds.five_prime_UTR.gff -rw-r--r-- 1 sam users 2.0M Aug 21 09:03 Pgenerosa_v070_top18_scaffolds.gene.gff -rw-rw-r-- 1 sam users 2.7M Aug 26 09:54 Pgenerosa_v070_top18_scaffolds.mRNA.gff -rw-rw-r-- 1 sam users 522K Aug 26 11:32 Pgenerosa_v070_top18_scaffolds.three_prime_UTR.gff -rw-rw-r-- 1 sam users 1.3M Jul 10 08:00 Pgenerosa_v074.CDS.gff -rw-rw-r-- 1 sam users 1.4M Jul 10 08:01 Pgenerosa_v074.exon.gff -rw-rw-r-- 1 sam users 102K Aug 22 15:41 Pgenerosa_v074.five_prime_UTR.gff -rw-rw-r-- 1 sam users 577K Jul 10 08:01 Pgenerosa_v074.gene.gff -rw-rw-r-- 1 sam users 716K Jul 10 08:02 Pgenerosa_v074.mRNA.gff -rw-rw-r-- 1 sam users 52K Aug 22 15:41 Pgenerosa_v074.three_prime_UTR.gff
# %%bash
# time \
# wget "${wget_gffs}"
# ls -lh ${wd}
# Column names for the nine tab-separated GFF fields.
gff_header = ['seqid','source','type','start','end','score','strand','phase','attributes']


def load_gff_with_lengths(path, column_names):
    """Read a GFF file into a DataFrame and add a per-feature 'seqlength' column.

    Parameters
    ----------
    path : str
        Path of the tab-separated GFF file to read.
    column_names : list of str
        Names assigned to the file's columns; must include 'start' and 'end'.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file after the first, with the given column
        names plus 'seqlength'.

    Raises
    ------
    ValueError
        If the number of columns in the file differs from len(column_names).
    """
    # Skip first row (gff header line) and indicate file is tab-separated.
    frame = pandas.read_csv(path, header=None, skiprows=1, sep="\t")
    # Rename columns
    frame.columns = column_names
    # Subtract start value from end value, computed on whole columns at once
    # rather than row by row with DataFrame.apply.
    # Have to add 1 so that sequence length can't equal zero
    # (i.e. adjust for 1-based counting system).
    frame['seqlength'] = frame['end'] - frame['start'] + 1
    return frame


# Print length summary statistics for every matching GFF in the current directory.
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, 'Pgenerosa_v07[04]*.gff'):
        print('\n' * 2)
        print(file)
        print("-------------------------")
        # Import GFF and compute feature lengths.
        gff = load_gff_with_lengths(file, gff_header)
        # Apply functions in list to seqlength column
        gff_stats = gff['seqlength'].agg(['mean', 'min', 'median', 'max'])
        print(gff_stats)
Pgenerosa_v070.exon.gff ------------------------- mean 217.971132 min 3.000000 median 139.000000 max 16912.000000 Name: seqlength, dtype: float64 Pgenerosa_v070_top18_scaffolds.three_prime_UTR.gff ------------------------- mean 481.02338 min 1.00000 median 221.00000 max 8355.00000 Name: seqlength, dtype: float64 Pgenerosa_v070_top18_scaffolds.five_prime_UTR.gff ------------------------- mean 148.558072 min 1.000000 median 89.000000 max 16912.000000 Name: seqlength, dtype: float64 Pgenerosa_v070_top18_scaffolds.exon.gff ------------------------- mean 219.024085 min 3.000000 median 131.000000 max 16912.000000 Name: seqlength, dtype: float64 Pgenerosa_v074.five_prime_UTR.gff ------------------------- mean 62.226496 min 1.000000 median 41.000000 max 1259.000000 Name: seqlength, dtype: float64 Pgenerosa_v070_top18_scaffolds.mRNA.gff ------------------------- mean 8696.297953 min 180.000000 median 4621.000000 max 139826.000000 Name: seqlength, dtype: float64 Pgenerosa_v074.CDS.gff ------------------------- mean 193.248049 min 1.000000 median 126.000000 max 11772.000000 Name: seqlength, dtype: float64 Pgenerosa_v070_top18_scaffolds.gene.gff ------------------------- mean 8696.297953 min 180.000000 median 4621.000000 max 139826.000000 Name: seqlength, dtype: float64 Pgenerosa_v074.mRNA.gff ------------------------- mean 12170.897196 min 192.000000 median 5552.000000 max 175905.000000 Name: seqlength, dtype: float64 Pgenerosa_v074.three_prime_UTR.gff ------------------------- mean 516.604585 min 1.000000 median 251.000000 max 4080.000000 Name: seqlength, dtype: float64 Pgenerosa_v074.gene.gff ------------------------- mean 12170.897196 min 192.000000 median 5552.000000 max 175905.000000 Name: seqlength, dtype: float64 Pgenerosa_v070.CDS.gff ------------------------- mean 194.539909 min 1.000000 median 132.000000 max 11048.000000 Name: seqlength, dtype: float64 Pgenerosa_v070.gene.gff ------------------------- mean 3768.232394 min 177.000000 median 1474.000000 max 
139826.000000 Name: seqlength, dtype: float64 Pgenerosa_v070.mRNA.gff ------------------------- mean 3768.232394 min 177.000000 median 1474.000000 max 139826.000000 Name: seqlength, dtype: float64 Pgenerosa_v074.exon.gff ------------------------- mean 210.612431 min 3.000000 median 130.000000 max 11772.000000 Name: seqlength, dtype: float64 Pgenerosa_v070_top18_scaffolds.CDS.gff ------------------------- mean 177.728604 min 1.000000 median 123.000000 max 9851.000000 Name: seqlength, dtype: float64