See this GitHub Issue
This notebook relies on GFFutils to be installed and available in your $PATH
.
%%bash
echo "TODAY'S DATE"
date
echo "------------"
echo ""
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "
hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
TODAY'S DATE Fri 25 Mar 2022 06:08:04 AM PDT ------------ Distributor ID: Ubuntu Description: Ubuntu 20.04.4 LTS Release: 20.04 Codename: focal ------------ HOSTNAME: computer ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian Address sizes: 45 bits physical, 48 bits virtual CPU(s): 2 On-line CPU(s) list: 0,1 Thread(s) per core: 1 Core(s) per socket: 1 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 165 Model name: Intel(R) Core(TM) i9-10885H CPU @ 2.40GHz Stepping: 2 CPU MHz: 2400.007 BogoMIPS: 4800.01 Hypervisor vendor: VMware Virtualization type: full L1d cache: 64 KiB L1i cache: 64 KiB L2 cache: 512 KiB L3 cache: 32 MiB NUMA node0 CPU(s): 0,1 Vulnerability Itlb multihit: KVM: Mitigation: VMX unsupported Vulnerability L1tf: Mitigation; PTE Inversion Vulnerability Mds: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown Vulnerability Meltdown: Mitigation; PTI Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization Vulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling Vulnerability Srbds: Not affected Vulnerability Tsx async abort: Not affected Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon nopl xtopology tsc_reliable nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 invpcid rdseed adx smap clflushopt xsaveopt xsavec xgetbv1 xsaves arat flush_l1d arch_capabilities ------------ Memory Specs total used free shared buff/cache available Mem: 54Gi 3.2Gi 47Gi 128Mi 4.2Gi 50Gi Swap: 2.0Gi 0B 2.0Gi
No LSB modules are available.
%env
indicates a bash variable%env
is Python variable# Set directories
%env data_dir=/home/sam/data/P_generosa/genomes
%env analysis_dir=/home/sam/analyses/20220324-pgen-gffs_to_fastas
analysis_dir="/home/samb/analyses/20220324-pgen-gffs_to_fastas"
# Input GFFs
%env gff_url=https://gannet.fish.washington.edu/Atumefaciens/20191105_swoose_pgen_v074_renaming/
%env gff_prefix=Panopea-generosa-v1.0.a4.
# Genome FastA
%env genome_fasta=Panopea-generosa-v1.0.fa
# Programs
%env bedtools=/home/sam/programs/bedtools-2.29.1/bin/bedtools
%env samtools=/home/sam/programs/samtools-1.12/samtools
# Formatting
%env line_break="----------------------------------------------------------------------------------------------"
env: data_dir=/home/sam/data/P_generosa/genomes env: analysis_dir=/home/sam/analyses/20220324-pgen-gffs_to_fastas env: gff_url=https://gannet.fish.washington.edu/Atumefaciens/20191105_swoose_pgen_v074_renaming/ env: gff_prefix=Panopea-generosa-v1.0.a4. env: genome_fasta=Panopea-generosa-v1.0.fa env: bedtools=/home/sam/programs/bedtools-2.29.1/bin/bedtools env: samtools=/home/sam/programs/samtools-1.12/samtools env: line_break="----------------------------------------------------------------------------------------------"
%%bash
mkdir --parents "${analysis_dir}" "${data_dir}"
If needing to download via wget
, be sure to include --no-check-certificate
option to avoid error.
%%bash
cd "${data_dir}"
# Array of GFF files.
gff_array=(Panopea-generosa-v1.0.a4.CDS.gff3 Panopea-generosa-v1.0.a4.mRNA.gff3 Panopea-generosa-v1.0.a4.gene.gff3)
# Download GFFs
for gff in "${gff_array[@]}"
do
wget \
--no-check-certificate \
--continue \
--quiet \
${gff_url}${gff}
done
# Download FastA
wget \
--no-check-certificate \
--continue \
--quiet \
${gff_url}${genome_fasta}
ls -lh
total 985M -rw-rw-r-- 1 sam sam 53M Nov 5 2019 Panopea-generosa-v1.0.a4.CDS.gff3 -rw-rw-r-- 1 sam sam 9.5M Nov 5 2019 Panopea-generosa-v1.0.a4.gene.gff3 -rw-rw-r-- 1 sam sam 9.1M Nov 5 2019 Panopea-generosa-v1.0.a4.mRNA.gff3 -rw-rw-r-- 1 sam sam 914M Nov 5 2019 Panopea-generosa-v1.0.fa
%%bash
cd "${data_dir}"
md5sum *
b38127f901cd5f5f076bb85e40fab2f6 Panopea-generosa-v1.0.a4.CDS.gff3 5bf1cfc3ae2b68d41c49d0f732ade723 Panopea-generosa-v1.0.a4.gene.gff3 3514ad8a4fba72b00403ec604e9e32e4 Panopea-generosa-v1.0.a4.mRNA.gff3 b7b64f0ce79499d79a865348658d2e49 Panopea-generosa-v1.0.fa
%%bash
cd "${data_dir}"
# Array of GFF files
gff_array=(Panopea-generosa-v1.0.a4.CDS.gff3 Panopea-generosa-v1.0.a4.mRNA.gff3 Panopea-generosa-v1.0.a4.gene.gff3)
# Make a list so subsequent head command lists filenames in output
gff_list=$(echo "${gff_array[@]}")
head ${gff_list}
==> Panopea-generosa-v1.0.a4.CDS.gff3 <== ##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 Scaffold_01 GenSAS_5d9637f372b5d-publish CDS 2 125 . + 0 ID=PGEN_.00g000010.m01.CDS01;Name=PGEN_.00g000010.m01.CDS01;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01 Scaffold_01 GenSAS_5d9637f372b5d-publish CDS 1995 2095 . + 1 ID=PGEN_.00g000010.m01.CDS02;Name=PGEN_.00g000010.m01.CDS02;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01 Scaffold_01 GenSAS_5d9637f372b5d-publish CDS 3325 3495 . + 0 ID=PGEN_.00g000010.m01.CDS03;Name=PGEN_.00g000010.m01.CDS03;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01 Scaffold_01 GenSAS_5d9637f372b5d-publish CDS 4651 4719 . + 0 ID=PGEN_.00g000010.m01.CDS04;Name=PGEN_.00g000010.m01.CDS04;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01 Scaffold_01 GenSAS_5d9637f372b5d-publish CDS 19808 19943 . - 2 ID=PGEN_.00g000020.m01.CDS01;Name=PGEN_.00g000020.m01.CDS01;Parent=PGEN_.00g000020.m01;original_ID=cds.21510-PGEN_.00g234150.m01;Alias=cds.21510-PGEN_.00g234150.m01 Scaffold_01 GenSAS_5d9637f372b5d-publish CDS 21133 21362 . - 0 ID=PGEN_.00g000020.m01.CDS02;Name=PGEN_.00g000020.m01.CDS02;Parent=PGEN_.00g000020.m01;original_ID=cds.21510-PGEN_.00g234150.m01;Alias=cds.21510-PGEN_.00g234150.m01 Scaffold_01 GenSAS_5d9637f372b5d-publish CDS 22487 22613 . - 2 ID=PGEN_.00g000020.m01.CDS03;Name=PGEN_.00g000020.m01.CDS03;Parent=PGEN_.00g000020.m01;original_ID=cds.21510-PGEN_.00g234150.m01;Alias=cds.21510-PGEN_.00g234150.m01 ==> Panopea-generosa-v1.0.a4.mRNA.gff3 <== ##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 Scaffold_01 GenSAS_5d9637f372b5d-publish mRNA 2 4719 . + . ID=PGEN_.00g000010.m01;Name=PGEN_.00g000010.m01;Parent=PGEN_.00g000010;original_ID=21510-PGEN_.00g234140.m01;Alias=21510-PGEN_.00g234140.m01;original_name=21510-PGEN_.00g234140 Scaffold_01 GenSAS_5d9637f372b5d-publish mRNA 19808 36739 . - . ID=PGEN_.00g000020.m01;Name=PGEN_.00g000020.m01;Parent=PGEN_.00g000020;original_ID=21510-PGEN_.00g234150.m01;Alias=21510-PGEN_.00g234150.m01;original_name=21510-PGEN_.00g234150 Scaffold_01 GenSAS_5d9637f372b5d-publish mRNA 49248 52578 . - . ID=PGEN_.00g000030.m01;Name=PGEN_.00g000030.m01;Parent=PGEN_.00g000030;original_ID=21510-PGEN_.00g234160.m02;Alias=21510-PGEN_.00g234160.m02;original_name=21510-PGEN_.00g234160 Scaffold_01 GenSAS_5d9637f372b5d-publish mRNA 49248 52023 . - . ID=PGEN_.00g000030.m02;Name=PGEN_.00g000030.m02;Parent=PGEN_.00g000030;original_ID=21510-PGEN_.00g234160.m01;Alias=21510-PGEN_.00g234160.m01;original_name=21510-PGEN_.00g234160 Scaffold_01 GenSAS_5d9637f372b5d-publish mRNA 55792 67546 . + . ID=PGEN_.00g000040.m01;Name=PGEN_.00g000040.m01;Parent=PGEN_.00g000040;original_ID=21510-PGEN_.00g234170.m01;Alias=21510-PGEN_.00g234170.m01;original_name=21510-PGEN_.00g234170 Scaffold_01 GenSAS_5d9637f372b5d-publish mRNA 67586 69113 . - . ID=PGEN_.00g000050.m01;Name=PGEN_.00g000050.m01;Parent=PGEN_.00g000050;original_ID=21510-PGEN_.00g234180.m01;Alias=21510-PGEN_.00g234180.m01;original_name=21510-PGEN_.00g234180 Scaffold_01 GenSAS_5d9637f372b5d-publish mRNA 70713 81099 . + . ID=PGEN_.00g000060.m01;Name=PGEN_.00g000060.m01;Parent=PGEN_.00g000060;original_ID=21510-PGEN_.00g234190.m01;Alias=21510-PGEN_.00g234190.m01;original_name=21510-PGEN_.00g234190 ==> Panopea-generosa-v1.0.a4.gene.gff3 <== ##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 Scaffold_01 GenSAS_5d9637f372b5d-publish gene 2 4719 . + . ID=PGEN_.00g000010;Name=PGEN_.00g000010;original_ID=21510-PGEN_.00g234140;Alias=21510-PGEN_.00g234140;original_name=21510-PGEN_.00g234140;Notes=sp|Q86IC9|CAMT1_DICDI [BLAST protein vs protein (blastp) 2.7.1],PF01596.12 [Pfam 1.6] Scaffold_01 GenSAS_5d9637f372b5d-publish gene 19808 36739 . - . ID=PGEN_.00g000020;Name=PGEN_.00g000020;original_ID=21510-PGEN_.00g234150;Alias=21510-PGEN_.00g234150;original_name=21510-PGEN_.00g234150;Notes=sp|P04177|TY3H_RAT [BLAST protein vs protein (blastp) 2.7.1],sp|P04177|TY3H_RAT [DIAMOND Functional 0.9.22],IPR036951 [InterProScan 5.29-68.0],PF00351.16 [Pfam 1.6] Scaffold_01 GenSAS_5d9637f372b5d-publish gene 49248 52578 . - . ID=PGEN_.00g000030;Name=PGEN_.00g000030;original_ID=21510-PGEN_.00g234160;Alias=21510-PGEN_.00g234160;original_name=21510-PGEN_.00g234160;Notes=PF08054.6 [Pfam 1.6] Scaffold_01 GenSAS_5d9637f372b5d-publish gene 55792 67546 . + . ID=PGEN_.00g000040;Name=PGEN_.00g000040;original_ID=21510-PGEN_.00g234170;Alias=21510-PGEN_.00g234170;original_name=21510-PGEN_.00g234170 Scaffold_01 GenSAS_5d9637f372b5d-publish gene 67586 69113 . - . ID=PGEN_.00g000050;Name=PGEN_.00g000050;original_ID=21510-PGEN_.00g234180;Alias=21510-PGEN_.00g234180;original_name=21510-PGEN_.00g234180;Notes=sp|Q8L840|RQL4A_ARATH [BLAST protein vs protein (blastp) 2.7.1],sp|Q8L840|RQL4A_ARATH [DIAMOND Functional 0.9.22],PF00270.24 [Pfam 1.6] Scaffold_01 GenSAS_5d9637f372b5d-publish gene 70713 81099 . + . ID=PGEN_.00g000060;Name=PGEN_.00g000060;original_ID=21510-PGEN_.00g234190;Alias=21510-PGEN_.00g234190;original_name=21510-PGEN_.00g234190;Notes=sp|Q61043|NIN_MOUSE [DIAMOND Functional 0.9.22],PF04443.7 [Pfam 1.6] Scaffold_01 GenSAS_5d9637f372b5d-publish gene 183686 186073 . + . ID=PGEN_.00g000070;Name=PGEN_.00g000070;original_ID=21510-PGEN_.00g234200;Alias=21510-PGEN_.00g234200;original_name=21510-PGEN_.00g234200;Notes=PF15364.1 [Pfam 1.6]
Formatted to use the "name" column in the BED format for use with bedtools
later...
geneID|parentID
or geneID
%%bash
cd "${data_dir}"
# Array of GFF files
gff_array=(Panopea-generosa-v1.0.a4.CDS.gff3 Panopea-generosa-v1.0.a4.mRNA.gff3 Panopea-generosa-v1.0.a4.gene.gff3)
for gff in "${gff_array[@]}"
do
# Trim of filename prefix
trimmed_name=${gff/Panopea-generosa-v1.0.a4./}
# Trim off filename suffix to get genome feature
feature=${trimmed_name/.gff3/}
if [[ "${feature}" = "mRNA" ]]
then
# Run gtf_extractor on GFF files
gtf_extract \
--gff \
--fields=chr,start,end,ID,Parent \
"${gff}" \
| awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "|" $5}' \
> ${analysis_dir}/"${feature}".bed.tmp
# Adds gene parent ID by rmoving mRNA ".m[0-9][0-9]" at end of line
elif [[ "${feature}" = "CDS" ]]
then
# Run gtf_extractor on GFF files
gtf_extract \
--gff \
--fields=chr,start,end,ID,Parent,Parent \
"${gff}" \
| awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "|" $5 "|" $6}' | sed 's/.m[0-9][0-9]$//' \
> ${analysis_dir}/"${feature}".bed.tmp
# Excludes Parent ID, since genes don't have a Parent ID
elif [[ "${feature}" = "gene" ]]
then
# Run gtf_extractor on GFF files
gtf_extract \
--gff \
--fields=chr,start,end,ID \
"${gff}" \
| awk '{print $1 "\t" $2 "\t" $3 "\t" $4}' \
> ${analysis_dir}/"${feature}".bed.tmp
fi
done
ls -lh ${analysis_dir}/*.bed.tmp
-rw-rw-r-- 1 sam sam 21M Mar 25 06:11 /home/sam/analyses/20220324-pgen-gffs_to_fastas/CDS.bed.tmp -rw-rw-r-- 1 sam sam 1.6M Mar 25 06:11 /home/sam/analyses/20220324-pgen-gffs_to_fastas/gene.bed.tmp -rw-rw-r-- 1 sam sam 2.4M Mar 25 06:11 /home/sam/analyses/20220324-pgen-gffs_to_fastas/mRNA.bed.tmp
%%bash
cd "${analysis_dir}"
for bed in *.bed.tmp
do
echo ""
echo "${bed}"
echo ""
head "${bed}"
echo ""
echo "${line_break}"
done
CDS.bed.tmp Scaffold_01 2 125 PGEN_.00g000010.m01.CDS01|PGEN_.00g000010.m01|PGEN_.00g000010 Scaffold_01 1995 2095 PGEN_.00g000010.m01.CDS02|PGEN_.00g000010.m01|PGEN_.00g000010 Scaffold_01 3325 3495 PGEN_.00g000010.m01.CDS03|PGEN_.00g000010.m01|PGEN_.00g000010 Scaffold_01 4651 4719 PGEN_.00g000010.m01.CDS04|PGEN_.00g000010.m01|PGEN_.00g000010 Scaffold_01 19808 19943 PGEN_.00g000020.m01.CDS01|PGEN_.00g000020.m01|PGEN_.00g000020 Scaffold_01 21133 21362 PGEN_.00g000020.m01.CDS02|PGEN_.00g000020.m01|PGEN_.00g000020 Scaffold_01 22487 22613 PGEN_.00g000020.m01.CDS03|PGEN_.00g000020.m01|PGEN_.00g000020 Scaffold_01 24824 24959 PGEN_.00g000020.m01.CDS04|PGEN_.00g000020.m01|PGEN_.00g000020 Scaffold_01 25981 26126 PGEN_.00g000020.m01.CDS05|PGEN_.00g000020.m01|PGEN_.00g000020 Scaffold_01 27969 28019 PGEN_.00g000020.m01.CDS06|PGEN_.00g000020.m01|PGEN_.00g000020 "----------------------------------------------------------------------------------------------" gene.bed.tmp Scaffold_01 2 4719 PGEN_.00g000010 Scaffold_01 19808 36739 PGEN_.00g000020 Scaffold_01 49248 52578 PGEN_.00g000030 Scaffold_01 55792 67546 PGEN_.00g000040 Scaffold_01 67586 69113 PGEN_.00g000050 Scaffold_01 70713 81099 PGEN_.00g000060 Scaffold_01 183686 186073 PGEN_.00g000070 Scaffold_01 187328 188353 PGEN_.00g000080 Scaffold_01 189849 190460 PGEN_.00g000090 Scaffold_01 191069 191410 PGEN_.00g000100 "----------------------------------------------------------------------------------------------" mRNA.bed.tmp Scaffold_01 2 4719 PGEN_.00g000010.m01|PGEN_.00g000010 Scaffold_01 19808 36739 PGEN_.00g000020.m01|PGEN_.00g000020 Scaffold_01 49248 52578 PGEN_.00g000030.m01|PGEN_.00g000030 Scaffold_01 49248 52023 PGEN_.00g000030.m02|PGEN_.00g000030 Scaffold_01 55792 67546 PGEN_.00g000040.m01|PGEN_.00g000040 Scaffold_01 67586 69113 PGEN_.00g000050.m01|PGEN_.00g000050 Scaffold_01 70713 81099 PGEN_.00g000060.m01|PGEN_.00g000060 Scaffold_01 183686 186073 PGEN_.00g000070.m01|PGEN_.00g000070 Scaffold_01 187328 188353 PGEN_.00g000080.m01|PGEN_.00g000080 Scaffold_01 189849 190460 PGEN_.00g000090.m01|PGEN_.00g000090 "----------------------------------------------------------------------------------------------"
%%bash
cd "${analysis_dir}"
for bed in *.bed.tmp
do
# Get feature by removing strings after first period
feature=${bed%%.*}
# Used BEDTOOLS getfasta to make FastAs from GFFs
${bedtools} getfasta \
-name \
-fi ${data_dir}/${genome_fasta} \
-bed ${bed} \
> ${gff_prefix}${feature}.fasta
# Remove tmp BED file
echo ""
echo "Removing ${bed}."
rm "${bed}"
done
ls -lh *.fasta
Removing CDS.bed.tmp. Removing gene.bed.tmp. Removing mRNA.bed.tmp. -rw-rw-r-- 1 sam sam 67M Mar 25 06:11 Panopea-generosa-v1.0.a4.CDS.fasta -rw-rw-r-- 1 sam sam 362M Mar 25 06:11 Panopea-generosa-v1.0.a4.gene.fasta -rw-rw-r-- 1 sam sam 475M Mar 25 06:11 Panopea-generosa-v1.0.a4.mRNA.fasta
index file /home/sam/data/P_generosa/genomes/Panopea-generosa-v1.0.fa.fai not found, generating...
%%bash
cd "${analysis_dir}"
for fasta in *.fasta
do
${samtools} faidx "${fasta}"
done
ls -ltrh
total 934M -rw-rw-r-- 1 sam sam 67M Mar 25 06:11 Panopea-generosa-v1.0.a4.CDS.fasta -rw-rw-r-- 1 sam sam 362M Mar 25 06:11 Panopea-generosa-v1.0.a4.gene.fasta -rw-rw-r-- 1 sam sam 475M Mar 25 06:11 Panopea-generosa-v1.0.a4.mRNA.fasta -rw-rw-r-- 1 sam sam 26M Mar 25 06:11 Panopea-generosa-v1.0.a4.CDS.fasta.fai -rw-rw-r-- 1 sam sam 2.4M Mar 25 06:11 Panopea-generosa-v1.0.a4.gene.fasta.fai -rw-rw-r-- 1 sam sam 3.4M Mar 25 06:11 Panopea-generosa-v1.0.a4.mRNA.fasta.fai
%%bash
cd "${analysis_dir}"
for fasta in *.fasta
do
grep --with-filename "^>" "${fasta}" | head
done
Panopea-generosa-v1.0.a4.CDS.fasta:>PGEN_.00g000010.m01.CDS01|PGEN_.00g000010.m01|PGEN_.00g000010::Scaffold_01:2-125 Panopea-generosa-v1.0.a4.CDS.fasta:>PGEN_.00g000010.m01.CDS02|PGEN_.00g000010.m01|PGEN_.00g000010::Scaffold_01:1995-2095 Panopea-generosa-v1.0.a4.CDS.fasta:>PGEN_.00g000010.m01.CDS03|PGEN_.00g000010.m01|PGEN_.00g000010::Scaffold_01:3325-3495 Panopea-generosa-v1.0.a4.CDS.fasta:>PGEN_.00g000010.m01.CDS04|PGEN_.00g000010.m01|PGEN_.00g000010::Scaffold_01:4651-4719 Panopea-generosa-v1.0.a4.CDS.fasta:>PGEN_.00g000020.m01.CDS01|PGEN_.00g000020.m01|PGEN_.00g000020::Scaffold_01:19808-19943 Panopea-generosa-v1.0.a4.CDS.fasta:>PGEN_.00g000020.m01.CDS02|PGEN_.00g000020.m01|PGEN_.00g000020::Scaffold_01:21133-21362 Panopea-generosa-v1.0.a4.CDS.fasta:>PGEN_.00g000020.m01.CDS03|PGEN_.00g000020.m01|PGEN_.00g000020::Scaffold_01:22487-22613 Panopea-generosa-v1.0.a4.CDS.fasta:>PGEN_.00g000020.m01.CDS04|PGEN_.00g000020.m01|PGEN_.00g000020::Scaffold_01:24824-24959 Panopea-generosa-v1.0.a4.CDS.fasta:>PGEN_.00g000020.m01.CDS05|PGEN_.00g000020.m01|PGEN_.00g000020::Scaffold_01:25981-26126 Panopea-generosa-v1.0.a4.CDS.fasta:>PGEN_.00g000020.m01.CDS06|PGEN_.00g000020.m01|PGEN_.00g000020::Scaffold_01:27969-28019 Panopea-generosa-v1.0.a4.gene.fasta:>PGEN_.00g000010::Scaffold_01:2-4719 Panopea-generosa-v1.0.a4.gene.fasta:>PGEN_.00g000020::Scaffold_01:19808-36739 Panopea-generosa-v1.0.a4.gene.fasta:>PGEN_.00g000030::Scaffold_01:49248-52578 Panopea-generosa-v1.0.a4.gene.fasta:>PGEN_.00g000040::Scaffold_01:55792-67546 Panopea-generosa-v1.0.a4.gene.fasta:>PGEN_.00g000050::Scaffold_01:67586-69113 Panopea-generosa-v1.0.a4.gene.fasta:>PGEN_.00g000060::Scaffold_01:70713-81099 Panopea-generosa-v1.0.a4.gene.fasta:>PGEN_.00g000070::Scaffold_01:183686-186073 Panopea-generosa-v1.0.a4.gene.fasta:>PGEN_.00g000080::Scaffold_01:187328-188353 Panopea-generosa-v1.0.a4.gene.fasta:>PGEN_.00g000090::Scaffold_01:189849-190460 Panopea-generosa-v1.0.a4.gene.fasta:>PGEN_.00g000100::Scaffold_01:191069-191410 Panopea-generosa-v1.0.a4.mRNA.fasta:>PGEN_.00g000010.m01|PGEN_.00g000010::Scaffold_01:2-4719 Panopea-generosa-v1.0.a4.mRNA.fasta:>PGEN_.00g000020.m01|PGEN_.00g000020::Scaffold_01:19808-36739 Panopea-generosa-v1.0.a4.mRNA.fasta:>PGEN_.00g000030.m01|PGEN_.00g000030::Scaffold_01:49248-52578 Panopea-generosa-v1.0.a4.mRNA.fasta:>PGEN_.00g000030.m02|PGEN_.00g000030::Scaffold_01:49248-52023 Panopea-generosa-v1.0.a4.mRNA.fasta:>PGEN_.00g000040.m01|PGEN_.00g000040::Scaffold_01:55792-67546 Panopea-generosa-v1.0.a4.mRNA.fasta:>PGEN_.00g000050.m01|PGEN_.00g000050::Scaffold_01:67586-69113 Panopea-generosa-v1.0.a4.mRNA.fasta:>PGEN_.00g000060.m01|PGEN_.00g000060::Scaffold_01:70713-81099 Panopea-generosa-v1.0.a4.mRNA.fasta:>PGEN_.00g000070.m01|PGEN_.00g000070::Scaffold_01:183686-186073 Panopea-generosa-v1.0.a4.mRNA.fasta:>PGEN_.00g000080.m01|PGEN_.00g000080::Scaffold_01:187328-188353 Panopea-generosa-v1.0.a4.mRNA.fasta:>PGEN_.00g000090.m01|PGEN_.00g000090::Scaffold_01:189849-190460
%%bash
cd "${analysis_dir}"
md5sum * | tee --append checksums.md5
fb192eab0aefd5d3ba5bebef2a012f15 Panopea-generosa-v1.0.a4.CDS.fasta f2266a449290ea0383d2eb98eb3ed426 Panopea-generosa-v1.0.a4.CDS.fasta.fai 7c956b1c27d14bd91959763403f81265 Panopea-generosa-v1.0.a4.gene.fasta 588d18f5fe0e4f2259a25586349fc244 Panopea-generosa-v1.0.a4.gene.fasta.fai 1823be75694cf70f0ea6f1abc072ba16 Panopea-generosa-v1.0.a4.mRNA.fasta e120b4c1d3bb0917868e72cd22507bbc Panopea-generosa-v1.0.a4.mRNA.fasta.fai
%%bash
cd "${data_dir}"
# Array of GFF files
gff_array=(Panopea-generosa-v1.0.a4.CDS.gff3 Panopea-generosa-v1.0.a4.mRNA.gff3 Panopea-generosa-v1.0.a4.gene.gff3)
# Remove genome FastA
echo "Removing ${genome_fasta}."
rm "${genome_fasta}"
# Remove GFFs
for gff in "${gff_array[@]}"
do
echo ""
echo "Removing ${gff}."
rm "${gff}"
done
ls -lh
Removing Panopea-generosa-v1.0.fa. Removing Panopea-generosa-v1.0.a4.CDS.gff3. Removing Panopea-generosa-v1.0.a4.mRNA.gff3. Removing Panopea-generosa-v1.0.a4.gene.gff3. total 4.0K -rw-rw-r-- 1 sam sam 658 Mar 25 06:11 Panopea-generosa-v1.0.fa.fai
%%bash
gtf_extract -h
echo ""
echo "${line_break}"
echo "${line_break}"
echo ""
${samtools} faidx -h
echo ""
echo "${line_break}"
echo "${line_break}"
echo ""
${bedtools} getfasta -h
usage: gtf_extract [-h] [-v] [-f FEATURE_TYPE] [--fields FIELD_LIST] [-o OUTFILE] [--gff] [-k] GTF_FILE Extract selected data items from a GTF file and output in tab-delimited format. The program can also operate on GFF files provided the --gff option is specified. positional arguments: GTF_FILE input GTF file to extract data items from optional arguments: -h, --help show this help message and exit -v, --version show program's version number and exit -f FEATURE_TYPE, --feature FEATURE_TYPE only extract data for lines where feature is FEATURE_TYPE --fields FIELD_LIST comma-separated list of fields to output in tab- delimited format for each line in the GTF, e.g. 'chrom,start,end'. Fields can either be a GTF field name (i.e. 'chrom', 'source', 'feature', 'start', 'end', 'score', 'strand' and 'frame') or the name of an attribute (e.g. 'gene_name', 'gene_id' etc). Data items are output in the order they appear in FIELD_LIST. If a field doesn't exist for a line then '.' will be output as the value. -o OUTFILE write output to OUTFILE (default is to write to stdout) --gff specify that the input file is GFF rather than GTF format -k, --keep-headers copy headers from input file to output "----------------------------------------------------------------------------------------------" "----------------------------------------------------------------------------------------------" Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]] Option: -o, --output FILE Write FASTA to file. -n, --length INT Length of FASTA sequence line. [60] -c, --continue Continue after trying to retrieve missing region. -r, --region-file FILE File of regions. Format is chr:from-to. One per line. -i, --reverse-complement Reverse complement sequences. --mark-strand TYPE Add strand indicator to sequence name TYPE = rc for /rc on negative strand (default) no for no strand indicator sign for (+) / (-) custom,<pos>,<neg> for custom indicator --fai-idx FILE name of the index file (default file.fa.fai). --gzi-idx FILE name of compressed file index (default file.fa.gz.gzi). -f, --fastq File and index in FASTQ format. -h, --help This message. "----------------------------------------------------------------------------------------------" "----------------------------------------------------------------------------------------------"
Tool: bedtools getfasta (aka fastaFromBed) Version: v2.29.1 Summary: Extract DNA sequences from a fasta file based on feature coordinates. Usage: bedtools getfasta [OPTIONS] -fi <fasta> -bed <bed/gff/vcf> Options: -fi Input FASTA file -fo Output file (opt., default is STDOUT -bed BED/GFF/VCF file of ranges to extract from -fi -name Use the name field and coordinates for the FASTA header -name+ (deprecated) Use the name field and coordinates for the FASTA header -nameOnly Use the name field for the FASTA header -split Given BED12 fmt., extract and concatenate the sequences from the BED "blocks" (e.g., exons) -tab Write output in TAB delimited format. - Default is FASTA format. -s Force strandedness. If the feature occupies the antisense, strand, the sequence will be reverse complemented. - By default, strand information is ignored. -fullHeader Use full fasta header. - By default, only the word before the first space or tab is used.
--------------------------------------------------------------------------- CalledProcessError Traceback (most recent call last) /tmp/ipykernel_124814/3531806193.py in <module> ----> 1 get_ipython().run_cell_magic('bash', '', '\ngtf_extract -h\n\necho ""\necho "${line_break}"\necho "${line_break}"\necho ""\n\n${samtools} faidx -h\n\necho ""\necho "${line_break}"\necho "${line_break}"\necho ""\n\n${bedtools} getfasta -h\n') ~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell) 2417 with self.builtin_trap: 2418 args = (magic_arg_s, cell) -> 2419 result = fn(*args, **kwargs) 2420 return result 2421 ~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/IPython/core/magics/script.py in named_script_magic(line, cell) 140 else: 141 line = script --> 142 return self.shebang(line, cell) 143 144 # write a basic docstring: ~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/decorator.py in fun(*args, **kw) 230 if not kwsyntax: 231 args, kw = fix(args, kw, sig) --> 232 return caller(func, *(extras + args), **kw) 233 fun.__name__ = func.__name__ 234 fun.__doc__ = func.__doc__ ~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k) 185 # but it's overkill for just that one bit of state. 186 def magic_deco(arg): --> 187 call = lambda f, *a, **k: f(*a, **k) 188 189 if callable(arg): ~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/IPython/core/magics/script.py in shebang(self, line, cell) 243 sys.stderr.flush() 244 if args.raise_error and p.returncode!=0: --> 245 raise CalledProcessError(p.returncode, cell, output=out, stderr=err) 246 247 def _run_script(self, p, cell, to_close): CalledProcessError: Command 'b'\ngtf_extract -h\n\necho ""\necho "${line_break}"\necho "${line_break}"\necho ""\n\n${samtools} faidx -h\n\necho ""\necho "${line_break}"\necho "${line_break}"\necho ""\n\n${bedtools} getfasta -h\n'' returned non-zero exit status 1.