%%bash
echo "TODAY'S DATE:"
date
echo "------------"
echo ""
#Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
TODAY'S DATE: Wed Nov 6 14:57:08 PST 2019 ------------ Distributor ID: Ubuntu Description: Ubuntu 16.04.6 LTS Release: 16.04 Codename: xenial ------------ HOSTNAME: swoose ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 24 On-line CPU(s) list: 0-23 Thread(s) per core: 2 Core(s) per socket: 6 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 44 Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz Stepping: 2 CPU MHz: 2925.971 BogoMIPS: 5851.97 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 12288K NUMA node0 CPU(s): 0-23 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d ------------ Memory Specs total used free shared buff/cache available Mem: 70G 31G 450M 523M 38G 38G Swap: 4.7G 879M 3.8G
No LSB modules are available.
%env
are best for bash
# Set workding directory
%env wd=/home/sam/analyses/20191106_swoose_pgen_feature_stats
wd="/home/sam/analyses/20191106_swoose_pgen_feature_stats"
# File download commands
%env rsync_gannet=gannet:/volume2/web/Atumefaciens/20191105_swoose_pgen_v074_renaming/
%env wget_command=--directory-prefix=${wd} --quiety --no-directories --no-check-certificate https://gannet.fish.washington.edu/Atumefaciens/20191105_swoose_pgen_v074_renaming/
# Input/output files
%env exon_gff=Panopea-generosa-v1.0.a4.exon.gff3
%env exon_bed=Panopea-generosa-v1.0.a4.exon.bed
%env gene_gff=Panopea-generosa-v1.0.a4.gene.gff3
%env gene_bed=Panopea-generosa-v1.0.a4.gene.bed
%env introns_bed=Panopea-generosa-v1.0.a4.introns.bed
%env intergenic_bed=Panopea-generosa-v1.0.a4.intergenic.bed
# Programs
%env intersectbed=/home/sam/programs/bedtools-2.28.0/bin/intersectBed
# Set list of column header names
bed_header = ['scaffold', 'start', 'end']
env: wd=/home/sam/analyses/20191106_swoose_pgen_feature_stats env: rsync_gannet=gannet:/volume2/web/Atumefaciens/20191105_swoose_pgen_v074_renaming/ env: wget_command=--directory-prefix=$/home/sam/analyses/20191106_swoose_pgen_feature_stats --quiety --no-directories --no-check-certificate https://gannet.fish.washington.edu/Atumefaciens/20191105_swoose_pgen_v074_renaming/ env: exon_gff=Panopea-generosa-v1.0.a4.exon.gff3 env: exon_bed=Panopea-generosa-v1.0.a4.exon.bed env: gene_gff=Panopea-generosa-v1.0.a4.gene.gff3 env: gene_bed=Panopea-generosa-v1.0.a4.gene.bed env: introns_bed=Panopea-generosa-v1.0.a4.introns.bed env: intergenic_bed=Panopea-generosa-v1.0.a4.intergenic.bed env: intersectbed=/home/sam/programs/bedtools-2.28.0/bin/intersectBed
import fnmatch
import os
import pandas
%%bash
mkdir --parents ${wd}
cd {wd}
/home/sam/analyses/20191106_swoose_pgen_feature_stats
%%bash
# Create array of files from list
files_array=(${exon_gff} ${gene_gff} ${intergenic_bed} ${introns_bed})
for file in "${files_array[@]}"
do
rsync \
--archive \
--progress \
--verbose \
"${rsync_gannet}${file}" \
.
done
ls -lh
# If need to download via wget, uncomment lines in the cell below
# for file in "${files_array[@]}"
# do
# "${wget_command}${file}"
# done
# ls -lh ${wd}
receiving incremental file list sent 11 bytes received 79 bytes 180.00 bytes/sec total size is 57,148,140 speedup is 634,979.33 receiving incremental file list Panopea-generosa-v1.0.a4.gene.gff3 9,888,459 100% 110.95MB/s 0:00:00 (xfr#1, to-chk=0/1) sent 30 bytes received 9,889,785 bytes 19,779,630.00 bytes/sec total size is 9,888,459 speedup is 1.00 receiving incremental file list sent 11 bytes received 83 bytes 62.67 bytes/sec total size is 1,019,566 speedup is 10,846.45 receiving incremental file list Panopea-generosa-v1.0.a4.introns.bed 4,579,012 100% 111.97MB/s 0:00:00 (xfr#1, to-chk=0/1) sent 30 bytes received 4,579,691 bytes 9,159,442.00 bytes/sec total size is 4,579,012 speedup is 1.00 total 76M -rw-rw-r-- 1 sam sam 6.7M Nov 6 15:21 Panopea-generosa-v1.0.a4.exon.bed -rw-rw-r-- 1 sam users 55M Nov 5 08:55 Panopea-generosa-v1.0.a4.exon.gff3 -rw-rw-r-- 1 sam users 9.5M Nov 5 08:55 Panopea-generosa-v1.0.a4.gene.gff3 -rw-rw-r-- 1 sam users 996K Nov 5 08:59 Panopea-generosa-v1.0.a4.intergenic.bed -rw-rw-r-- 1 sam users 4.4M Nov 5 08:59 Panopea-generosa-v1.0.a4.introns.bed
Too lazy to write for loop
%%bash
echo "Preview GFF: ${exon_gff}"
echo ""
head "${exon_gff}"
echo ""
echo "---------------------"
echo ""
# Create BED file
# Skip first three lines (header) of GFF
# Print scaffold name, start, and stop positions; tab-delimited
awk 'NR > 3 {print $1"\t"$4"\t"$5}' "${exon_gff}" \
> "${exon_bed}"
echo "Preview BED: ${exon_bed}"
echo ""
head "${exon_bed}"
echo ""
echo "---------------------"
echo ""
echo "Preview GFF: ${gene_gff}"
echo ""
head "${gene_gff}"
echo ""
echo "---------------------"
echo ""
# Create BED file
# Skip first three lines (header) of GFF
# Print scaffold name, start, and stop positions; tab-delimited
awk 'NR > 3 {print $1"\t"$4"\t"$5}' "${gene_gff}" \
> "${gene_bed}"
echo "Preview BED: ${gene_bed}"
echo ""
head "${gene_bed}"
echo ""
Preview GFF: Panopea-generosa-v1.0.a4.exon.gff3 ##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 Scaffold_01 GenSAS_5d9637f372b5d-publish exon 2 125 . + . ID=PGEN_.00g000010.m01.exon01;Name=PGEN_.00g000010.m01.exon01;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon1;Alias=21510-PGEN_.00g234140.m01.exon1 Scaffold_01 GenSAS_5d9637f372b5d-publish exon 1995 2095 . + . ID=PGEN_.00g000010.m01.exon02;Name=PGEN_.00g000010.m01.exon02;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon2;Alias=21510-PGEN_.00g234140.m01.exon2 Scaffold_01 GenSAS_5d9637f372b5d-publish exon 3325 3495 . + . ID=PGEN_.00g000010.m01.exon03;Name=PGEN_.00g000010.m01.exon03;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon3;Alias=21510-PGEN_.00g234140.m01.exon3 Scaffold_01 GenSAS_5d9637f372b5d-publish exon 4651 4719 . + . ID=PGEN_.00g000010.m01.exon04;Name=PGEN_.00g000010.m01.exon04;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon4;Alias=21510-PGEN_.00g234140.m01.exon4 Scaffold_01 GenSAS_5d9637f372b5d-publish exon 19808 19943 . - . ID=PGEN_.00g000020.m01.exon01;Name=PGEN_.00g000020.m01.exon01;Parent=PGEN_.00g000020.m01;original_ID=21510-PGEN_.00g234150.m01.exon10;Alias=21510-PGEN_.00g234150.m01.exon10 Scaffold_01 GenSAS_5d9637f372b5d-publish exon 21133 21362 . - . ID=PGEN_.00g000020.m01.exon02;Name=PGEN_.00g000020.m01.exon02;Parent=PGEN_.00g000020.m01;original_ID=21510-PGEN_.00g234150.m01.exon9;Alias=21510-PGEN_.00g234150.m01.exon9 Scaffold_01 GenSAS_5d9637f372b5d-publish exon 22487 22613 . - . ID=PGEN_.00g000020.m01.exon03;Name=PGEN_.00g000020.m01.exon03;Parent=PGEN_.00g000020.m01;original_ID=21510-PGEN_.00g234150.m01.exon8;Alias=21510-PGEN_.00g234150.m01.exon8 --------------------- Preview BED: Panopea-generosa-v1.0.a4.exon.bed Scaffold_01 2 125 Scaffold_01 1995 2095 Scaffold_01 3325 3495 Scaffold_01 4651 4719 Scaffold_01 19808 19943 Scaffold_01 21133 21362 Scaffold_01 22487 22613 Scaffold_01 24824 24959 Scaffold_01 25981 26126 Scaffold_01 27969 28019 --------------------- Preview GFF: Panopea-generosa-v1.0.a4.gene.gff3 ##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 Scaffold_01 GenSAS_5d9637f372b5d-publish gene 2 4719 . + . ID=PGEN_.00g000010;Name=PGEN_.00g000010;original_ID=21510-PGEN_.00g234140;Alias=21510-PGEN_.00g234140;original_name=21510-PGEN_.00g234140;Notes=sp|Q86IC9|CAMT1_DICDI [BLAST protein vs protein (blastp) 2.7.1],PF01596.12 [Pfam 1.6] Scaffold_01 GenSAS_5d9637f372b5d-publish gene 19808 36739 . - . ID=PGEN_.00g000020;Name=PGEN_.00g000020;original_ID=21510-PGEN_.00g234150;Alias=21510-PGEN_.00g234150;original_name=21510-PGEN_.00g234150;Notes=sp|P04177|TY3H_RAT [BLAST protein vs protein (blastp) 2.7.1],sp|P04177|TY3H_RAT [DIAMOND Functional 0.9.22],IPR036951 [InterProScan 5.29-68.0],PF00351.16 [Pfam 1.6] Scaffold_01 GenSAS_5d9637f372b5d-publish gene 49248 52578 . - . ID=PGEN_.00g000030;Name=PGEN_.00g000030;original_ID=21510-PGEN_.00g234160;Alias=21510-PGEN_.00g234160;original_name=21510-PGEN_.00g234160;Notes=PF08054.6 [Pfam 1.6] Scaffold_01 GenSAS_5d9637f372b5d-publish gene 55792 67546 . + . ID=PGEN_.00g000040;Name=PGEN_.00g000040;original_ID=21510-PGEN_.00g234170;Alias=21510-PGEN_.00g234170;original_name=21510-PGEN_.00g234170 Scaffold_01 GenSAS_5d9637f372b5d-publish gene 67586 69113 . - . ID=PGEN_.00g000050;Name=PGEN_.00g000050;original_ID=21510-PGEN_.00g234180;Alias=21510-PGEN_.00g234180;original_name=21510-PGEN_.00g234180;Notes=sp|Q8L840|RQL4A_ARATH [BLAST protein vs protein (blastp) 2.7.1],sp|Q8L840|RQL4A_ARATH [DIAMOND Functional 0.9.22],PF00270.24 [Pfam 1.6] Scaffold_01 GenSAS_5d9637f372b5d-publish gene 70713 81099 . + . ID=PGEN_.00g000060;Name=PGEN_.00g000060;original_ID=21510-PGEN_.00g234190;Alias=21510-PGEN_.00g234190;original_name=21510-PGEN_.00g234190;Notes=sp|Q61043|NIN_MOUSE [DIAMOND Functional 0.9.22],PF04443.7 [Pfam 1.6] Scaffold_01 GenSAS_5d9637f372b5d-publish gene 183686 186073 . + . ID=PGEN_.00g000070;Name=PGEN_.00g000070;original_ID=21510-PGEN_.00g234200;Alias=21510-PGEN_.00g234200;original_name=21510-PGEN_.00g234200;Notes=PF15364.1 [Pfam 1.6] --------------------- Preview BED: Panopea-generosa-v1.0.a4.gene.bed Scaffold_01 2 4719 Scaffold_01 19808 36739 Scaffold_01 49248 52578 Scaffold_01 55792 67546 Scaffold_01 67586 69113 Scaffold_01 70713 81099 Scaffold_01 183686 186073 Scaffold_01 187328 188353 Scaffold_01 189849 190460 Scaffold_01 191069 191410
%%bash
# Count the number of genes
num_genes=$(cat ${gene_bed} | wc -l)
echo "${num_genes} found in ${gene_bed}"
echo ""
echo "----------------------------"
echo ""
# Determine number of genes with no intron overlap (that's the -v option below)
num_genes_no_introns=$(${intersectbed} -v -a ${gene_bed} -b ${introns_bed} | wc -l)
echo "${num_genes_no_introns} genes with no introns"
echo ""
echo "----------------------------"
echo ""
# Determine percentage of genes with introns.
# Sets "scale" to get desired number of decimal places.
percent_genes_w_introns=$(echo "scale=4; 1 - (${num_genes_no_introns} / ${num_genes})" | bc)
echo "${percent_genes_w_introns} of genes contain introns"
34947 found in Panopea-generosa-v1.0.a4.gene.bed ---------------------------- 9772 genes with no introns ---------------------------- .7204 of genes contain introns
for file in os.listdir('.'):
if fnmatch.fnmatch(file, '*.bed'):
print('\n' * 2)
print(file)
print("-------------------------")
# Import GFF.
# Skip first 3 rows (gff header lines) and indicate file is tab-separated
bed=pandas.read_csv(file, header=None, sep="\t")
# Rename columns
bed.columns = bed_header
# Subtract start value from end value.
# Have to add 1 so that sequence length can't equal zero (i.e. adjust for 1-based counting system)
bed['seqlength'] = bed.apply(lambda position: position['end'] - position['start'] + 1, axis=1)
# Apply functions in list to seqlength column
bed_stats = bed['seqlength'].agg(['mean', 'min', 'median', 'max', 'sum'])
# Print table of calculation type and the result
for calc_type, calcs in bed_stats.iteritems():
print(calc_type, calcs, sep='\t')
Panopea-generosa-v1.0.a4.intergenic.bed ------------------------- mean 16393.607065170105 min 2.0 median 8114.0 max 370672.0 sum 565235178.0 Panopea-generosa-v1.0.a4.gene.bed ------------------------- mean 10811.04461041005 min 166.0 median 4464.0 max 283066.0 sum 377813576.0 Panopea-generosa-v1.0.a4.introns.bed ------------------------- mean 2184.529027548067 min 2.0 median 1199.0 max 93104.0 sum 338130141.0 Panopea-generosa-v1.0.a4.exon.bed ------------------------- mean 201.4769876772451 min 3.0 median 133.0 max 13221.0 sum 47741987.0
### Clean up
%%bash
# Cleanup
rm ${exon_bed} ${exon_gff} ${gene_bed} ${gene_gff} ${intergenic_bed} ${introns_bed}
ls -l
total 0
rm: cannot remove 'Panopea-generosa-v1.0.a4.exon.gff3': No such file or directory rm: cannot remove 'Panopea-generosa-v1.0.a4.gene.gff3': No such file or directory rm: cannot remove 'Panopea-generosa-v1.0.a4.intergenic.bed': No such file or directory rm: cannot remove 'Panopea-generosa-v1.0.a4.introns.bed': No such file or directory