# Working directory and source-file URLs, exported as environment variables.
# The bash cells in this notebook reference them as ${work_dir},
# ${pgen70_fasta_url} and ${pgen70_gff_url}.
%env work_dir = /home/sam/Downloads/20190523_pgen_Pgenerosa_v070_gffs
%env pgen70_fasta_url = http://owl.fish.washington.edu/halfshell/genomic-databank/Pgenerosa_v070.fa
%env pgen70_gff_url = http://gannet.fish.washington.edu/Atumefaciens/20190228_pgen_maker_v070_annotation/Pgenerosa_v070_genome_snap02.all.renamed.putative_function.gff
# Same path as the work_dir environment variable, held in a notebook-level variable.
# NOTE(review): presumably so the later `cd $work_dir` line can expand it -- confirm.
work_dir = "/home/sam/Downloads/20190523_pgen_Pgenerosa_v070_gffs"
env: work_dir=/home/sam/Downloads/20190523_pgen_Pgenerosa_v070_gffs env: pgen70_fasta_url=http://owl.fish.washington.edu/halfshell/genomic-databank/Pgenerosa_v070.fa env: pgen70_gff_url=http://gannet.fish.washington.edu/Atumefaciens/20190228_pgen_maker_v070_annotation/Pgenerosa_v070_genome_snap02.all.renamed.putative_function.gff
%%bash
# Create the working directory. -p makes this cell safe to re-run: it succeeds
# when the directory already exists and creates any missing parent directories.
# "--" stops option parsing in case the path ever starts with a dash.
mkdir -p -- "${work_dir}"
# Change into the working directory.
# NOTE(review): the captured output after this line is the directory path, which
# suggests it runs as IPython's cd magic rather than inside the %%bash cell -- confirm.
cd $work_dir
/home/sam/Downloads/20190523_pgen_Pgenerosa_v070_gffs
Download the genome FASTA and annotation GFF with rsync. An option is available to use wget instead.
%%bash
# Pull the MAKER annotation GFF and the v070 genome FASTA into the current
# directory, then list what arrived.
gff_remote="gannet:/volume2/web/Atumefaciens/20190228_pgen_maker_v070_annotation/Pgenerosa_v070_genome_snap02.all.renamed.putative_function.gff"
fasta_remote="owl:/volume1/web/halfshell/genomic-databank/Pgenerosa_v070.fa"

# GFF first, FASTA second (same order as the remotes are declared above).
for remote in "${gff_remote}" "${fasta_remote}"; do
  rsync -av "${remote}" .
done

# No rsync access to those hosts? Uncomment the two lines below to download
# over HTTP instead.
# wget "${pgen70_fasta_url}"
# wget "${pgen70_gff_url}"

ls -lh
receiving incremental file list Pgenerosa_v070_genome_snap02.all.renamed.putative_function.gff sent 30 bytes received 7,597,953,355 bytes 98,038,108.19 bytes/sec total size is 7,597,025,836 speedup is 1.00 receiving incremental file list Pgenerosa_v070.fa sent 30 bytes received 2,247,392,295 bytes 25,394,263.56 bytes/sec total size is 2,247,117,885 speedup is 1.00 total 9.2G -rw-r--r-- 1 sam users 2.1G Feb 11 12:13 Pgenerosa_v070.fa -rw-r--r-- 1 sam users 7.1G May 19 02:16 Pgenerosa_v070_genome_snap02.all.renamed.putative_function.gff
%%bash
# Split the MAKER annotation into one GFF3 file per feature type (CDS, exon,
# gene, mRNA), then checksum the results.
#
# The input GFF is ~7 GB, so it is read once and each record is routed to its
# output file, instead of running one full awk pass per feature type.

#######################################
# Write one GFF3 file per requested feature type.
# Arguments:
#   $1    - input GFF file
#   $2    - output prefix; files are written as <prefix>.<feature>.gff
#   $3... - feature types, matched exactly against column 3
# Outputs:
#   <prefix>.<feature>.gff for every requested feature, each starting with a
#   "##gff-version 3" header line followed by the matching records, in input
#   order. Diagnostics go to stderr.
# Returns:
#   0 on success; 2 on bad usage; 1 if the input is unreadable; otherwise
#   awk's exit status.
#######################################
split_gff_by_feature() {
  if (( $# < 3 )); then
    echo "usage: split_gff_by_feature <in.gff> <out_prefix> <feature>..." >&2
    return 2
  fi

  local in_gff=$1
  local out_prefix=$2
  shift 2

  # Check up front: awk's BEGIN block would otherwise create header-only
  # output files before discovering that the input cannot be opened.
  if [[ ! -r "${in_gff}" ]]; then
    echo "ERROR: cannot read ${in_gff}" >&2
    return 1
  fi

  # Join the feature names with single spaces regardless of the caller's IFS.
  local IFS=' '

  # GFF3 columns are tab-delimited, so split on tabs only; this keeps column 3
  # in place even if an earlier column contains a space.
  awk -F '\t' -v prefix="${out_prefix}" -v features="$*" '
    BEGIN {
      n = split(features, names, " ")
      for (i = 1; i <= n; i++) {
        out[names[i]] = prefix "." names[i] ".gff"
        print "##gff-version 3" > (out[names[i]])
      }
    }
    # Lines with fewer than three columns (comments, FASTA) have an empty $3
    # and never match.
    ($3 in out) { print > (out[$3]) }
  ' "${in_gff}"
}

# Only checksum when the split succeeded, so partial outputs are never reported.
split_gff_by_feature \
  Pgenerosa_v070_genome_snap02.all.renamed.putative_function.gff \
  Pgenerosa_v070 \
  CDS exon gene mRNA \
  && md5sum Pgenerosa_v070.*.gff
1c4e0bd30579798948c8e1e32c7052e3 Pgenerosa_v070.CDS.gff 6ea9360ea24ddfc4ed07486547c3319e Pgenerosa_v070.exon.gff cdb2fff32712dd4d9223dd8674b03a92 Pgenerosa_v070.gene.gff 1a6e0f4d9e515fa2b7d84c07f66cd8da Pgenerosa_v070.mRNA.gff
NOTE: Each GFF count below includes the "##gff-version 3" header line, so subtract 1 from it to get the actual feature count (I was too lazy to do the subtraction programmatically).
%%bash
# Report the number of contigs in the assembly and the line count of each
# per-feature GFF. Every section ends with a divider and a blank line.
divider="-------------------------"

# Contigs: count the lines in the FASTA that contain ">".
echo "GENOME ASSEMBLY CONTIGS:"
grep -c ">" Pgenerosa_v070.fa
echo "${divider}"
echo ""

# Each entry is "<feature name used in the file name>:<label used in the heading>".
for entry in "CDS:CODING" "exon:EXON" "gene:GENE" "mRNA:mRNA"; do
  feature=${entry%%:*}
  label=${entry#*:}
  echo "${label} SEQUENCE COUNTS:"
  # Raw line count of the file (wc -l also prints the file name).
  wc -l "Pgenerosa_v070.${feature}.gff"
  echo "${divider}"
  echo ""
done
GENOME ASSEMBLY CONTIGS: 313649 ------------------------- CODING SEQUENCE COUNTS: 169461 Pgenerosa_v070.CDS.gff ------------------------- EXON SEQUENCE COUNTS: 175007 Pgenerosa_v070.exon.gff ------------------------- GENE SEQUENCE COUNTS: 53036 Pgenerosa_v070.gene.gff ------------------------- mRNA SEQUENCE COUNTS: 53036 Pgenerosa_v070.mRNA.gff -------------------------
%%bash
# Copy the per-feature GFF files to the genomic-databank directory on owl.
databank_dest="owl:/volume1/web/halfshell/genomic-databank/"
rsync --archive --verbose Pgenerosa_v070.*.gff "${databank_dest}"
sending incremental file list Pgenerosa_v070.CDS.gff Pgenerosa_v070.exon.gff Pgenerosa_v070.gene.gff Pgenerosa_v070.mRNA.gff sent 70,886,696 bytes received 95 bytes 10,905,660.15 bytes/sec total size is 70,869,112 speedup is 1.00
%%bash
# Show the first 10 lines of the CDS file (header line plus the first records).
head -n 10 Pgenerosa_v070.CDS.gff
##gff-version 3 PGA_scaffold19103__1_contigs__length_6100 maker CDS 5114 5297 . - 0 ID=PGEN_00029322-RA:cds;Parent=PGEN_00029322-RA; PGA_scaffold19103__1_contigs__length_6100 maker CDS 3446 3906 . - 2 ID=PGEN_00029322-RA:cds;Parent=PGEN_00029322-RA; PGA_scaffold229666__1_contigs__length_7299 maker CDS 132 152 . + 0 ID=PGEN_00026601-RA:cds;Parent=PGEN_00026601-RA; PGA_scaffold229666__1_contigs__length_7299 maker CDS 3415 3567 . + 0 ID=PGEN_00026601-RA:cds;Parent=PGEN_00026601-RA; PGA_scaffold229666__1_contigs__length_7299 maker CDS 4891 5004 . + 0 ID=PGEN_00026601-RA:cds;Parent=PGEN_00026601-RA; PGA_scaffold285130__1_contigs__length_6957 maker CDS 4418 4774 . + 0 ID=PGEN_00027325-RA:cds;Parent=PGEN_00027325-RA; PGA_scaffold132901__1_contigs__length_1641 maker CDS 321 545 . + 0 ID=PGEN_00047624-RA:cds;Parent=PGEN_00047624-RA; PGA_scaffold132901__1_contigs__length_1641 maker CDS 867 1397 . + 0 ID=PGEN_00047624-RA:cds;Parent=PGEN_00047624-RA; PGA_scaffold132901__1_contigs__length_1641 maker CDS 912 1615 . - 0 ID=PGEN_00047623-RA:cds;Parent=PGEN_00047623-RA;