%%bash
# Record when and where this notebook was run: date, OS release, hostname,
# CPU details and memory usage, with a divider line between sections.
divider="------------"

printf '%s\n' "TODAY'S DATE:"
date
printf '%s\n\n' "${divider}"

# Operating system / distribution details
lsb_release -a
printf '\n%s\n' "${divider}"

# The label is printed on its own line; hostname follows on the next line
printf '%s\n' "HOSTNAME: "
hostname
printf '\n%s\n' "${divider}"

# CPU details
printf '%s\n\n' "Computer Specs:"
lscpu
printf '\n%s\n\n' "${divider}"

# Memory and swap usage
printf '%s\n\n' "Memory Specs"
free -mh
TODAY'S DATE: Tue 13 Oct 2020 09:49:02 AM PDT ------------ Distributor ID: Ubuntu Description: Ubuntu 20.04.1 LTS Release: 20.04 Codename: focal ------------ HOSTNAME: mephisto ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian Address sizes: 36 bits physical, 48 bits virtual CPU(s): 4 On-line CPU(s) list: 0-3 Thread(s) per core: 2 Core(s) per socket: 2 Socket(s): 1 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 58 Model name: Intel(R) Core(TM) i7-3517U CPU @ 1.90GHz Stepping: 9 CPU MHz: 2917.625 CPU max MHz: 3000.0000 CPU min MHz: 800.0000 BogoMIPS: 4789.55 Virtualization: VT-x L1d cache: 64 KiB L1i cache: 64 KiB L2 cache: 512 KiB L3 cache: 4 MiB NUMA node0 CPU(s): 0-3 Vulnerability Itlb multihit: KVM: Mitigation: Split huge pages Vulnerability L1tf: Mitigation; PTE Inversion; VMX conditional cache flushes, SMT vulnerable Vulnerability Mds: Mitigation; Clear CPU buffers; SMT vulnerable Vulnerability Meltdown: Mitigation; PTI Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization Vulnerability Spectre v2: Mitigation; Full generic retpoline, IBPB conditional, IBRS_FW, STIBP conditional, RSB filling Vulnerability Srbds: Vulnerable: No microcode Vulnerability Tsx async abort: Not affected Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm cpuid_fault epb pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt dtherm ida arat pln pts md_clear flush_l1d ------------ Memory Specs total used free 
shared buff/cache available Mem: 7.5Gi 4.6Gi 665Mi 1.1Gi 2.3Gi 1.6Gi Swap: 21Gi 1.8Gi 20Gi
No LSB modules are available.
# Environment variables exported for the bash cells that follow.
# NOTE: keep comments on their own lines; text placed after a %env
# assignment on the same line would become part of the value.

# Set data directories
# Directories holding the per-taxon FastA index (*.fai) files for each sample;
# the extraction cell loops over both and derives the sample name from the
# directory name.
%env muscle_fasta_dir=/home/samb/analyses/20201007_cbai_megan-read-extractions_201002558-2729-Q7
%env hemo_fasta_dir=/home/samb/analyses/20201007_cbai_megan-read-extractions_6129-403-26-Q7
# Directory searched for the *.fastq files that reads are extracted from
%env fastq_dir=/home/samb/data/C_bairdi/DNAseq
# Parent directory in which the per-sample output directories are created
%env wd=/home/samb/analyses
# Programs
# Path to the seqtk executable used to subset reads by ID
%env seqtk=/home/samb/programs/seqtk_1.3-r115/seqtk
# File naming
# Suffix appended to the name of every extracted-reads output file
%env suffix=megan.fq
env: muscle_fasta_dir=/home/samb/analyses/20201007_cbai_megan-read-extractions_201002558-2729-Q7 env: hemo_fasta_dir=/home/samb/analyses/20201007_cbai_megan-read-extractions_6129-403-26-Q7 env: fastq_dir=/home/samb/data/C_bairdi/DNAseq env: wd=/home/samb/analyses env: seqtk=/home/samb/programs/seqtk_1.3-r115/seqtk env: suffix=megan.fq
FastAs:
https://gannet.fish.washington.edu/Atumefaciens/20201007_cbai_megan-read-extractions_6129-403-26-Q7/
FastQs:
Use the FastA IDs from the MEGAN6 taxonomic read-extraction FastAs to pull the corresponding reads for each taxon out of the FastQ files.
%%bash
# For every taxon-specific FastA index (*.fai) in each MEGAN read-extraction
# directory, build a list of read IDs and use seqtk to pull the matching reads
# out of the FastQ files.
#
# Environment variables read by this cell:
#   muscle_fasta_dir, hemo_fasta_dir - directories containing the *.fai files
#   fastq_dir - directory containing the *.fastq files to extract reads from
#   wd        - parent directory for the per-sample output directories
#   seqtk     - path to the seqtk executable
#   suffix    - suffix for the extracted-reads output file names
#
# Output (per sample, in ${wd}/<date>_<sample>_megan-reads):
#   <date>_<sample>_<species>_seqtk-read-id-list - unique read IDs
#   <date>_<sample>_<species>_${suffix}          - extracted reads

#######################################
# Print the sample name for a read-extraction directory.
# The sample name is the 4th "_"-delimited field of the directory's own name;
# parent directories are stripped first so underscores elsewhere in the path
# cannot shift the field position.
# Arguments: $1 - path to the directory (a trailing slash is tolerated)
# Outputs:   sample name on stdout
#######################################
get_sample_name() {
  local dir_name=${1%/}
  dir_name=${dir_name##*/}
  printf '%s\n' "${dir_name}" | cut -d '_' -f 4
}

#######################################
# Print the species/taxon encoded in a FastA index file name, e.g.
#   <sample>_summarized-reads-<species>.fasta.fai
# The "<sample>_" prefix is removed before splitting on "." and "-", so the
# result does not depend on how many hyphens the sample name contains.
# Arguments: $1 - sample name
#            $2 - path to the .fai file
# Outputs:   species on stdout
#######################################
get_species() {
  local sample_name=$1
  local fai_name=${2##*/}
  fai_name=${fai_name#"${sample_name}_"}
  # After prefix removal the fields are: summarized, reads, <species>, ...
  printf '%s\n' "${fai_name}" | awk -F '[.-]' '{print $3}'
}

timestamp=$(date +%Y%m%d)

for directory in "${muscle_fasta_dir}" "${hemo_fasta_dir}"; do
  # Skip unset or missing input directories instead of producing empty output
  if [[ ! -d "${directory}" ]]; then
    echo "Skipping missing FastA directory: '${directory}'" >&2
    continue
  fi

  # Get sample name
  sample=$(get_sample_name "${directory}")

  # Make the output directory and change into it; abort if either step fails
  out_dir="${wd:?wd is not set}/${timestamp}_${sample}_megan-reads"
  mkdir --parents "${out_dir}" || exit 1
  cd "${out_dir}" || exit 1

  ######################################################
  # Create FastA IDs list to use for sequence extraction
  ######################################################
  for fai in "${directory}"/*.fai; do
    # An unmatched glob stays literal; skip it
    [[ -e "${fai}" ]] || continue

    # Get species
    species=$(get_species "${sample}" "${fai}")

    # Set output FastQ filenames
    prefix=${timestamp}_${sample}_${species}

    # Set seqtk list filename
    seqtk_list=${prefix}_seqtk-read-id-list

    echo "Pulling FastA IDs from ${fai}"
    echo ""

    # Parse FastA IDs from FastA index file.
    # Overwrite (not append) so re-running on the same day cannot duplicate IDs.
    awk '{print $1}' "${fai}" | sort -u > "${seqtk_list}"

    out="${prefix}_${suffix}"

    # Start from an empty output file; reads from each FastQ are appended below
    : > "${out}"

    for fastq in "${fastq_dir}"/*.fastq; do
      [[ -e "${fastq}" ]] || continue

      # Reported inside the loop so the message names the file being processed
      echo "Extracting reads from ${fastq}."
      echo ""

      # Extract corresponding reads using seqtk FastA ID list
      "${seqtk}" subseq "${fastq}" "${seqtk_list}" >> "${out}" \
        || echo "seqtk failed on ${fastq}" >&2
    done

    echo "Writing reads to ${out}"
    echo ""
    echo ""
  done

  echo ""
  echo "Done with read extractions"
  echo ""
  echo "-------------------------------------"
  echo ""

  # Print working directory and list files
  pwd
  ls -ltrh

  echo ""
  echo "-------------------------------------"
  echo ""
done
Pulling FastA IDs from /home/samb/analyses/20201007_cbai_megan-read-extractions_201002558-2729-Q7/201002558-2729-Q7_summarized-reads-Aquifex_sp..fasta.fai Extracting reads from . Writing reads to 20201013_201002558-2729-Q7_Aquifex_sp_megan.fq Pulling FastA IDs from /home/samb/analyses/20201007_cbai_megan-read-extractions_201002558-2729-Q7/201002558-2729-Q7_summarized-reads-Arthropoda.fasta.fai Extracting reads from /home/samb/data/C_bairdi/DNAseq/20200928_cbai_nanopore_6129_403_26_quality-7.fastq. Writing reads to 20201013_201002558-2729-Q7_Arthropoda_megan.fq Pulling FastA IDs from /home/samb/analyses/20201007_cbai_megan-read-extractions_201002558-2729-Q7/201002558-2729-Q7_summarized-reads-Enterospora_canceri.fasta.fai Extracting reads from /home/samb/data/C_bairdi/DNAseq/20200928_cbai_nanopore_6129_403_26_quality-7.fastq. Writing reads to 20201013_201002558-2729-Q7_Enterospora_canceri_megan.fq Pulling FastA IDs from /home/samb/analyses/20201007_cbai_megan-read-extractions_201002558-2729-Q7/201002558-2729-Q7_summarized-reads-Sar.fasta.fai Extracting reads from /home/samb/data/C_bairdi/DNAseq/20200928_cbai_nanopore_6129_403_26_quality-7.fastq. 
Writing reads to 20201013_201002558-2729-Q7_Sar_megan.fq Done with read extractions ------------------------------------- /home/samb/analyses/20201013_201002558-2729-Q7_megan-reads total 13M -rw-rw-r-- 1 samb samb 11K Oct 13 10:23 20201013_201002558-2729-Q7_Aquifex_sp_seqtk-read-id-list -rw-rw-r-- 1 samb samb 914K Oct 13 10:23 20201013_201002558-2729-Q7_Aquifex_sp_megan.fq -rw-rw-r-- 1 samb samb 67K Oct 13 10:23 20201013_201002558-2729-Q7_Arthropoda_seqtk-read-id-list -rw-rw-r-- 1 samb samb 6.8M Oct 13 10:23 20201013_201002558-2729-Q7_Arthropoda_megan.fq -rw-rw-r-- 1 samb samb 57K Oct 13 10:23 20201013_201002558-2729-Q7_Enterospora_canceri_seqtk-read-id-list -rw-rw-r-- 1 samb samb 4.9M Oct 13 10:23 20201013_201002558-2729-Q7_Enterospora_canceri_megan.fq -rw-rw-r-- 1 samb samb 222 Oct 13 10:23 20201013_201002558-2729-Q7_Sar_seqtk-read-id-list -rw-rw-r-- 1 samb samb 30K Oct 13 10:23 20201013_201002558-2729-Q7_Sar_megan.fq ------------------------------------- Pulling FastA IDs from /home/samb/analyses/20201007_cbai_megan-read-extractions_6129-403-26-Q7/6129-403-26-Q7_summarized-reads-Alveolata.fasta.fai Extracting reads from /home/samb/data/C_bairdi/DNAseq/20200928_cbai_nanopore_6129_403_26_quality-7.fastq. Writing reads to 20201013_6129-403-26-Q7_Alveolata_megan.fq Pulling FastA IDs from /home/samb/analyses/20201007_cbai_megan-read-extractions_6129-403-26-Q7/6129-403-26-Q7_summarized-reads-Aquifex_sp..fasta.fai Extracting reads from /home/samb/data/C_bairdi/DNAseq/20200928_cbai_nanopore_6129_403_26_quality-7.fastq. Writing reads to 20201013_6129-403-26-Q7_Aquifex_sp_megan.fq Pulling FastA IDs from /home/samb/analyses/20201007_cbai_megan-read-extractions_6129-403-26-Q7/6129-403-26-Q7_summarized-reads-Arthropoda.fasta.fai Extracting reads from /home/samb/data/C_bairdi/DNAseq/20200928_cbai_nanopore_6129_403_26_quality-7.fastq. 
Writing reads to 20201013_6129-403-26-Q7_Arthropoda_megan.fq Pulling FastA IDs from /home/samb/analyses/20201007_cbai_megan-read-extractions_6129-403-26-Q7/6129-403-26-Q7_summarized-reads-Enterospora_canceri.fasta.fai Extracting reads from /home/samb/data/C_bairdi/DNAseq/20200928_cbai_nanopore_6129_403_26_quality-7.fastq. Writing reads to 20201013_6129-403-26-Q7_Enterospora_canceri_megan.fq Done with read extractions ------------------------------------- /home/samb/analyses/20201013_6129-403-26-Q7_megan-reads total 519M -rw-rw-r-- 1 samb samb 17K Oct 13 10:23 20201013_6129-403-26-Q7_Alveolata_seqtk-read-id-list -rw-rw-r-- 1 samb samb 3.5M Oct 13 10:24 20201013_6129-403-26-Q7_Alveolata_megan.fq -rw-rw-r-- 1 samb samb 152K Oct 13 10:24 20201013_6129-403-26-Q7_Aquifex_sp_seqtk-read-id-list -rw-rw-r-- 1 samb samb 41M Oct 13 10:24 20201013_6129-403-26-Q7_Aquifex_sp_megan.fq -rw-rw-r-- 1 samb samb 1.1M Oct 13 10:24 20201013_6129-403-26-Q7_Arthropoda_seqtk-read-id-list -rw-rw-r-- 1 samb samb 311M Oct 13 10:24 20201013_6129-403-26-Q7_Arthropoda_megan.fq -rw-rw-r-- 1 samb samb 655K Oct 13 10:24 20201013_6129-403-26-Q7_Enterospora_canceri_seqtk-read-id-list -rw-rw-r-- 1 samb samb 162M Oct 13 10:24 20201013_6129-403-26-Q7_Enterospora_canceri_megan.fq -------------------------------------