The simulations software simrrls is available at github.com/dereneaton/simrrls.
import os
import glob
import gzip
import itertools
import numpy as np
import simrrls
import ipyrad as ip
print "simrrls v.{}".format(simrrls.__version__)
print "ipyrad v.{}".format(ip.__version__)
simrrls v.0.0.13 ipyrad v.0.3.42
## the dir to write data to
DIR = "./ipsimdata/"
## if it doesn't exist make it
if not os.path.exists(DIR):
os.makedirs(DIR)
## if theres anything in it, remove it
for oldfile in glob.glob(os.path.join(DIR, "*")):
os.remove(oldfile)
%%bash -s $DIR
## simulate rad_example (includes indels)
simrrls -o $1/rad_example -f rad -dm 10 -ds 2 -I 0.05 -L 1000
## simulate larger rad data set
#simrrls -o $1/rad_large -f rad -dm 10 -ds 2 -I 0.01 -L 10000
## simulate pairddrad_example
simrrls -o $1/gbs_example -f gbs -dm 10 -ds 2 -I 0.05 -L 1000
## simulate pairddrad_example
simrrls -o $1/pairddrad_example -f pairddrad -dm 10 -ds 2 -L 1000
## simulate pairgbs_example
simrrls -o $1/pairgbs_example -f pairgbs -dm 10 -ds 2 -L 1000
## simulate pairddrad_wmerge_example
simrrls -o $1/pairddrad_wmerge_example -f pairddrad -dm 10 -ds 2 \
-i1 -50 -i2 100 -L 1000
## simulate pairgbs_wmerge_example (mixture of merged and non-merged reads)
simrrls -o $1/pairgbs_wmerge_example -f pairgbs -dm 10 -ds 2 \
-i1 -50 -i2 100 -L 1000
min insert size allows read overlaps/adapter sequences min insert size allows read overlaps/adapter sequences
import itertools
import gzip
import numpy as np
## Utility function
def revcomp(sequence):
"returns reverse complement of a string"
sequence = sequence[::-1].strip()\
.replace("A", "t")\
.replace("T", "a")\
.replace("C", "g")\
.replace("G", "c").upper()
return sequence
def RAD_to_genome(R1s, R2s, n_inserts, insert_low, insert_high, out_chr):
"""
Create a simulated genome file with RAD data interspersed.
"""
## start genome fasta sequence with 5000 random bp
genome = np.random.choice(list("ATGC"), 5000)
## read in the RAD data files
dat1 = gzip.open(R1s, 'r')
qiter1 = itertools.izip(*[iter(dat1)]*4)
if R2s:
dat2 = gzip.open(R2s, 'r')
qiter2 = itertools.izip(*[iter(dat2)]*4)
else:
qiter2 = itertools.izip(*[iter(str, 1)]*4)
## sample unique reads from rads
uniqs = []
locid = 0
while len(uniqs) < n_inserts:
## grab a read and get locus id
qrt1 = qiter1.next()
qrt2 = qiter2.next()
iloc = []
ilocid = int(qrt1[0].split("_")[1][5:])
## go until end of locus copies
while ilocid == locid:
iloc.append([qrt1[1].strip(), qrt2[1].strip()])
qrt1 = qiter1.next()
qrt2 = qiter2.next()
ilocid = int(qrt1[0].split("_")[1][5:])
## sample one read
uniqs.append(iloc[0])
locid += 1
## insert RADs into genome
for ins in range(n_inserts):
## get read, we leave the barcode on cuz it won't hurt
r1 = np.array(list(uniqs[ins][0]))
r2 = np.array(list(revcomp(uniqs[ins][1])))
## add R1 to genome
genome = np.concatenate((genome, r1), axis=0)
if qrt2[0]:
## add insert size
howlong = np.random.uniform(insert_low, insert_high)
chunk = np.random.choice(list("ATGC"), int(howlong))
#print howlong
genome = np.concatenate((genome, chunk), axis=0)
## add read2
genome = np.concatenate((genome, r2), axis=0)
## add spacer between loci
genome = np.concatenate((genome, np.random.choice(list("ATGC"), 5000)), axis=0)
print("input genome is {} bp".format(genome.shape[0]))
print("inserted {} loci into genome".format(n_inserts))
with open(out_chr, "w") as fasta:
header = ">MT dna:chromosome chromosome:test:MT:1:{}:1 REF\n"\
.format(genome.shape[0])
fasta.write(header)
outseq = list(genome)
for i in range(0, len(outseq), 70):
fasta.write("{}\n".format("".join(outseq[i:i+70])))
## Meta args
N_INSERTS = 500
INSERT_LOW = 250
INSERT_HIGH = 300
## SE RAD data
DATA_R1 = os.path.join(DIR, "rad_example_R1_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "rad_example_genome.fa")
big = RAD_to_genome(DATA_R1, 0, N_INSERTS,
INSERT_LOW, INSERT_HIGH,
OUTPUT_CHR)
input genome is 2555000 bp inserted 500 loci into genome
## SE GBS data
DATA_R1 = os.path.join(DIR, "gbs_example_R1_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "gbs_example_genome.fa")
RAD_to_genome(DATA_R1, 0, N_INSERTS,
INSERT_LOW, INSERT_HIGH,
OUTPUT_CHR)
input genome is 2555000 bp inserted 500 loci into genome
## PAIR ddRAD data
DATA_R1 = os.path.join(DIR, "pairddrad_example_R1_.fastq.gz")
DATA_R2 = os.path.join(DIR, "pairddrad_example_R2_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "pairddrad_example_genome.fa")
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS,
INSERT_LOW, INSERT_HIGH,
OUTPUT_CHR)
input genome is 2742124 bp inserted 500 loci into genome
## PAIR ddRAD data
DATA_R1 = os.path.join(DIR, "pairddrad_wmerge_example_R1_.fastq.gz")
DATA_R2 = os.path.join(DIR, "pairddrad_wmerge_example_R2_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "pairddrad_wmerge_example_genome.fa")
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS,
INSERT_LOW, INSERT_HIGH,
OUTPUT_CHR)
input genome is 2742116 bp inserted 500 loci into genome
## PAIR GBS data
DATA_R1 = os.path.join(DIR, "pairgbs_wmerge_example_R1_.fastq.gz")
DATA_R2 = os.path.join(DIR, "pairgbs_wmerge_example_R2_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "pairgbs_wmerge_example_genome.fa")
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS,
INSERT_LOW, INSERT_HIGH,
OUTPUT_CHR)
input genome is 2743007 bp inserted 500 loci into genome
## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests1")
data1.set_params(2, os.path.join(DIR, 'rad_example_R1_.fastq.gz'))
data1.set_params(3, os.path.join(DIR, 'rad_example_barcodes.txt'))
data1.set_params("max_alleles_consens", 4) ## raise this for now
## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, 'reference')
data2.set_params(6, os.path.join(DIR, 'rad_example_genome.fa'))
## branch into an assembly for denovo+reference
data3 = data2.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')
## branch into an assembly for reference
data4 = data2.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')
## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)
New Assembly: denovo Assembly: denovo [force] overwriting fastq files previously *created by ipyrad* in: /home/deren/Documents/ipyrad/tests/tests1/denovo_fastqs This does not affect your *original/raw data files* [####################] 100% chunking large files | 0:00:00 [####################] 100% sorting reads | 0:00:03 [####################] 100% writing/compressing | 0:00:00 [####################] 100% processing reads | 0:00:04 [####################] 100% dereplicating | 0:00:00 [####################] 100% clustering | 0:00:00 [####################] 100% building clusters | 0:00:00 [####################] 100% chunking | 0:00:00 [####################] 100% aligning | 0:00:44 [####################] 100% concatenating | 0:00:00 [####################] 100% inferring [H, E] | 0:00:55 [####################] 100% consensus calling | 0:00:26 [####################] 100% concat/shuffle input | 0:00:00 [####################] 100% clustering across | 0:00:01 [####################] 100% building clusters | 0:00:00 [####################] 100% aligning clusters | 0:00:08 [####################] 100% database indels | 0:00:00 [####################] 100% indexing clusters | 0:00:01 [####################] 100% building database | 0:00:00 [####################] 100% filtering loci | 0:00:01 [####################] 100% building loci/stats | 0:00:01 [####################] 100% building vcf file | 0:00:03 [####################] 100% writing vcf file | 0:00:01 [####################] 100% writing outfiles | 0:00:01 Outfiles written to: ~/Documents/ipyrad/tests/tests1/denovo_outfiles Assembly: reference [force] overwriting fastq files previously *created by ipyrad* in: /home/deren/Documents/ipyrad/tests/tests1/reference_fastqs This does not affect your *original/raw data files* [####################] 100% chunking large files | 0:00:00 [####################] 100% sorting reads | 0:00:03 [####################] 100% writing/compressing | 0:00:00 [####################] 100% processing reads | 0:00:04 [####################] 100% dereplicating | 0:00:00 [####################] 100% mapping | 0:00:45 [####################] 100% finalize mapping | 0:00:14 [####################] 100% chunking | 0:00:00 [####################] 100% aligning | 0:00:31 [####################] 100% concatenating | 0:00:00 [####################] 100% inferring [H, E] | 0:00:41 [####################] 100% consensus calling | 0:00:15 [####################] 100% concat/shuffle input | 0:00:00 [####################] 100% clustering across | 0:00:00 [####################] 100% building clusters | 0:00:00 [####################] 100% aligning clusters | 0:00:04 [####################] 100% database indels | 0:00:00 [####################] 100% indexing clusters | 0:00:00 [####################] 100% building database | 0:00:00 [####################] 100% filtering loci | 0:00:01 [####################] 100% building loci/stats | 0:00:01 [####################] 100% building vcf file | 0:00:02 [####################] 100% writing vcf file | 0:00:01 [####################] 100% writing outfiles | 0:00:01 Outfiles written to: ~/Documents/ipyrad/tests/tests1/reference_outfiles Assembly: denovo-plus [force] overwriting fastq files previously *created by ipyrad* in: /home/deren/Documents/ipyrad/tests/tests1/denovo-plus_fastqs This does not affect your *original/raw data files* [####################] 100% chunking large files | 0:00:00 [####################] 100% sorting reads | 0:00:03 [####################] 100% writing/compressing | 0:00:00 [####################] 100% processing reads | 0:00:04 Force reindexing of reference sequence [####################] 100% dereplicating | 0:00:00 [####################] 100% mapping | 0:00:46 [####################] 100% clustering | 0:00:00 [####################] 100% building clusters | 0:00:00 [####################] 100% finalize mapping | 0:00:15 [####################] 100% chunking | 0:00:00 [####################] 100% aligning | 0:00:55 [####################] 100% concatenating | 0:00:00 [####################] 100% inferring [H, E] | 0:00:59 [####################] 100% consensus calling | 0:00:29 [####################] 100% concat/shuffle input | 0:00:00 [####################] 100% clustering across | 0:00:01 [####################] 100% building clusters | 0:00:00 [####################] 100% aligning clusters | 0:00:10 [####################] 100% database indels | 0:00:00 [####################] 100% indexing clusters | 0:00:01 [####################] 100% building database | 0:00:00 [####################] 100% filtering loci | 0:00:01 [####################] 100% building loci/stats | 0:00:01 [####################] 100% building vcf file | 0:00:03 [####################] 100% writing vcf file | 0:00:01 [####################] 100% writing outfiles | 0:00:01 Outfiles written to: ~/Documents/ipyrad/tests/tests1/denovo-plus_outfiles Assembly: denovo-minus [force] overwriting fastq files previously *created by ipyrad* in: /home/deren/Documents/ipyrad/tests/tests1/denovo-minus_fastqs This does not affect your *original/raw data files* [####################] 100% chunking large files | 0:00:00 [####################] 100% sorting reads | 0:00:03 [####################] 100% writing/compressing | 0:00:00 [####################] 100% processing reads | 0:00:04 Force reindexing of reference sequence [####################] 100% dereplicating | 0:00:00 [####################] 100% mapping | 0:00:12 [####################] 100% clustering | 0:00:00 [####################] 100% building clusters | 0:00:00 [####################] 100% chunking | 0:00:00 [####################] 100% aligning | 0:00:26 [####################] 100% concatenating | 0:00:00 [####################] 100% inferring [H, E] | 0:00:42 [####################] 100% consensus calling | 0:00:16 [####################] 100% concat/shuffle input | 0:00:00 [####################] 100% clustering across | 0:00:00 [####################] 100% building clusters | 0:00:00 [####################] 100% aligning clusters | 0:00:05 [####################] 100% database indels | 0:00:00 [####################] 100% indexing clusters | 0:00:01 [####################] 100% building database | 0:00:00 [####################] 100% filtering loci | 0:00:01 [####################] 100% building loci/stats | 0:00:01 [####################] 100% building vcf file | 0:00:02 [####################] 100% writing vcf file | 0:00:01 [####################] 100% writing outfiles | 0:00:01 Outfiles written to: ~/Documents/ipyrad/tests/tests1/denovo-minus_outfiles
NLOCI = 1000
## check results
assert all(data1.stats.clusters_hidepth == NLOCI)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI
assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS
assert all(data3.stats.clusters_hidepth == NLOCI)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == NLOCI
assert all(data4.stats.clusters_hidepth == NLOCI-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == NLOCI-N_INSERTS
## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests2")
data1.set_params(2, os.path.join(DIR, "gbs_example_R1_.fastq.gz"))
data1.set_params(3, os.path.join(DIR, "gbs_example_barcodes.txt"))
data1.set_params("max_alleles_consens", 4) ## raise this for now
## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, "reference")
data2.set_params(6, os.path.join(DIR, "gbs_example_genome.fa"))
## branch into an assembly for denovo+reference
data3 = data2.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')
## branch into an assembly for reference
data4 = data2.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')
## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)
New Assembly: denovo Assembly: denovo [####################] 100% chunking large files | 0:00:00 [####################] 100% sorting reads | 0:00:03 [####################] 100% writing/compressing | 0:00:00 [####################] 100% processing reads | 0:00:33 [####################] 100% dereplicating | 0:00:00 [####################] 100% clustering | 0:00:00 [####################] 100% building clusters | 0:00:00 [####################] 100% chunking | 0:00:00 [####################] 100% aligning | 0:00:51 [####################] 100% concatenating | 0:00:00 [####################] 100% inferring [H, E] | 0:00:51 [####################] 100% consensus calling | 0:00:24 [####################] 100% concat/shuffle input | 0:00:00 [####################] 100% clustering across | 0:00:00 [####################] 100% building clusters | 0:00:00 [####################] 100% aligning clusters | 0:00:08 [####################] 100% indexing clusters | 0:00:03 [####################] 100% building database | 0:00:05 [####################] 100% filtering loci | 0:00:04 [####################] 100% building loci/stats | 0:00:01 [####################] 100% building vcf file | 0:00:14 [####################] 100% writing outfiles | 0:00:01 Outfiles written to: ~/Documents/ipyrad/tests/tests2/denovo_outfiles Assembly: reference [####################] 100% chunking large files | 0:00:00 [####################] 100% sorting reads | 0:00:03 [####################] 100% writing/compressing | 0:00:00 [####################] 100% processing reads | 0:00:32 [####################] 100% dereplicating | 0:00:00 [####################] 100% mapping | 0:00:55 [####################] 100% finalize mapping | 0:00:20 [####################] 100% chunking | 0:00:00 [####################] 100% aligning | 0:00:37 [####################] 100% concatenating | 0:00:00 [####################] 100% inferring [H, E] | 0:00:37 [####################] 100% consensus calling | 0:00:14 [####################] 100% concat/shuffle input | 0:00:00 [####################] 100% clustering across | 0:00:00 [####################] 100% building clusters | 0:00:00 [####################] 100% aligning clusters | 0:00:04 [####################] 100% indexing clusters | 0:00:03 [####################] 100% building database | 0:00:02 [####################] 100% filtering loci | 0:00:00 [####################] 100% building loci/stats | 0:00:01 [####################] 100% building vcf file | 0:00:07 [####################] 100% writing outfiles | 0:00:01 Outfiles written to: ~/Documents/ipyrad/tests/tests2/reference_outfiles Assembly: denovo-plus [####################] 100% chunking large files | 0:00:00 [####################] 100% sorting reads | 0:00:03 [####################] 100% writing/compressing | 0:00:00 [####################] 100% processing reads | 0:00:34 Force reindexing of reference sequence [####################] 100% dereplicating | 0:00:00 [####################] 100% mapping | 0:00:54 [####################] 100% clustering | 0:00:00 [####################] 100% building clusters | 0:00:00 [####################] 100% finalize mapping | 0:00:21 [####################] 100% chunking | 0:00:00 [####################] 100% aligning | 0:01:06 [####################] 100% concatenating | 0:00:00 [####################] 100% inferring [H, E] | 0:00:55 [####################] 100% consensus calling | 0:00:26 [####################] 100% concat/shuffle input | 0:00:00 [####################] 100% clustering across | 0:00:00 [####################] 100% building clusters | 0:00:00 [####################] 100% aligning clusters | 0:00:10 [####################] 100% indexing clusters | 0:00:03 [####################] 100% building database | 0:00:05 [####################] 100% filtering loci | 0:00:00 [####################] 100% building loci/stats | 0:00:01 [####################] 100% building vcf file | 0:00:14 [####################] 100% writing outfiles | 0:00:01 Outfiles written to: ~/Documents/ipyrad/tests/tests2/denovo-plus_outfiles Assembly: denovo-minus [####################] 100% chunking large files | 0:00:00 [####################] 100% sorting reads | 0:00:03 [####################] 100% writing/compressing | 0:00:00 [####################] 100% processing reads | 0:00:35 Force reindexing of reference sequence [####################] 100% dereplicating | 0:00:00 [####################] 100% mapping | 0:00:11 [####################] 100% clustering | 0:00:00 [####################] 100% building clusters | 0:00:00 [####################] 100% chunking | 0:00:00 [####################] 100% aligning | 0:00:27 [####################] 100% concatenating | 0:00:00 [####################] 100% inferring [H, E] | 0:00:37 [####################] 100% consensus calling | 0:00:15 [####################] 100% concat/shuffle input | 0:00:00 [####################] 100% clustering across | 0:00:00 [####################] 100% building clusters | 0:00:00 [####################] 100% aligning clusters | 0:00:04 [####################] 100% indexing clusters | 0:00:03 [####################] 100% building database | 0:00:02 [####################] 100% filtering loci | 0:00:04 [####################] 100% building loci/stats | 0:00:01 [####################] 100% building vcf file | 0:00:07 [####################] 100% writing outfiles | 0:00:01 Outfiles written to: ~/Documents/ipyrad/tests/tests2/denovo-minus_outfiles
## check results
assert all(data1.stats.clusters_hidepth == 1000)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000
assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS
assert all(data3.stats.clusters_hidepth == 1000)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000
assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS
## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests3")
data1.set_params(2, os.path.join(DIR, "pairddrad_example_*.fastq.gz"))
data1.set_params(3, os.path.join(DIR, "pairddrad_example_barcodes.txt"))
data1.set_params(7, "pairddrad")
data1.set_params(8, ("TGCAG", "CCGG"))
## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, "reference")
data2.set_params(6, os.path.join(DIR, "pairddrad_example_genome.fa"))
## branch into an assembly for denovo+reference
data3 = data2.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')
## branch into an assembly for reference
data4 = data2.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')
## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)
New Assembly: denovo Assembly: denovo [force] overwriting fastq files previously *created by ipyrad* in: /home/deren/Documents/ipyrad/tests/tests3/denovo_fastqs This does not affect your *original/raw data files* [####################] 100% chunking large files | 0:00:00 [####################] 100% sorting reads | 0:00:05 [####################] 100% writing/compressing | 0:00:01 [####################] 100% processing reads | 0:00:07 [####################] 100% dereplicating | 0:00:01 [####################] 100% clustering | 0:00:06 [####################] 100% building clusters | 0:00:00 [####################] 100% chunking | 0:00:00 [####################] 100% aligning | 0:02:00 [####################] 100% concatenating | 0:00:00 [####################] 100% inferring [H, E] | 0:01:14 [####################] 100% consensus calling | 0:00:51 [####################] 100% concat/shuffle input | 0:00:00 [####################] 100% clustering across | 0:00:02 [####################] 100% building clusters | 0:00:00 [####################] 100% aligning clusters | 0:00:16 [####################] 100% database indels | 0:00:00 [####################] 100% indexing clusters | 0:00:01 [####################] 100% building database | 0:00:00 [####################] 100% filtering loci | 0:00:01 [####################] 100% building loci/stats | 0:00:01 [####################] 100% building vcf file | 0:00:03 [####################] 100% writing vcf file | 0:00:01 [####################] 100% writing outfiles | 0:00:01 Outfiles written to: ~/Documents/ipyrad/tests/tests3/denovo_outfiles Assembly: reference [force] overwriting fastq files previously *created by ipyrad* in: /home/deren/Documents/ipyrad/tests/tests3/reference_fastqs This does not affect your *original/raw data files* [####################] 100% chunking large files | 0:00:00 [####################] 100% sorting reads | 0:00:05 [####################] 100% writing/compressing | 0:00:01 [####################] 100% processing reads | 0:00:06 Force reindexing of reference sequence [####################] 100% dereplicating | 0:00:01 [####################] 100% mapping | 0:02:04 [####################] 100% finalize mapping | 0:00:36 [####################] 100% chunking | 0:00:00 [####################] 100% aligning | 0:01:03 [####################] 100% concatenating | 0:00:00 [####################] 100% inferring [H, E] | 0:00:55 [####################] 100% consensus calling | 0:00:29 [####################] 100% concat/shuffle input | 0:00:00 [####################] 100% clustering across | 0:00:01 [####################] 100% building clusters | 0:00:00 [####################] 100% aligning clusters | 0:00:09 [####################] 100% database indels | 0:00:00 [####################] 100% indexing clusters | 0:00:01 [####################] 100% building database | 0:00:00 [####################] 100% filtering loci | 0:00:01 [####################] 100% building loci/stats | 0:00:01 [####################] 100% building vcf file | 0:00:02 [####################] 100% writing vcf file | 0:00:01 [####################] 100% writing outfiles | 0:00:01 Outfiles written to: ~/Documents/ipyrad/tests/tests3/reference_outfiles Assembly: denovo-plus [force] overwriting fastq files previously *created by ipyrad* in: /home/deren/Documents/ipyrad/tests/tests3/denovo-plus_fastqs This does not affect your *original/raw data files* [####################] 100% chunking large files | 0:00:00 [####################] 100% sorting reads | 0:00:06 [####################] 100% writing/compressing | 0:00:01 [####################] 100% processing reads | 0:00:07 Force reindexing of reference sequence [####################] 100% dereplicating | 0:00:00 [####################] 100% mapping | 0:02:14 [####################] 100% clustering | 0:00:00 [####################] 100% building clusters | 0:00:00 [####################] 100% finalize mapping | 0:00:37 [####################] 100% chunking | 0:00:00 [####################] 100% aligning | 0:02:15 [####################] 100% concatenating | 0:00:00 [####################] 100% inferring [H, E] | 0:01:32 [####################] 100% consensus calling | 0:00:55 [####################] 100% concat/shuffle input | 0:00:00 [####################] 100% clustering across | 0:00:02 [####################] 100% building clusters | 0:00:00 [####################] 100% aligning clusters | 0:00:17 [####################] 100% database indels | 0:00:00 [####################] 100% indexing clusters | 0:00:01 [####################] 100% building database | 0:00:00 [####################] 100% filtering loci | 0:00:01 [####################] 100% building loci/stats | 0:00:01 [####################] 100% building vcf file | 0:00:03 [####################] 100% writing vcf file | 0:00:01 [####################] 100% writing outfiles | 0:00:01 Outfiles written to: ~/Documents/ipyrad/tests/tests3/denovo-plus_outfiles Assembly: denovo-minus [force] overwriting fastq files previously *created by ipyrad* in: /home/deren/Documents/ipyrad/tests/tests3/denovo-minus_fastqs This does not affect your *original/raw data files* [####################] 100% chunking large files | 0:00:00 [####################] 100% sorting reads | 0:00:05 [####################] 100% writing/compressing | 0:00:01 [####################] 100% processing reads | 0:00:07 Force reindexing of reference sequence [####################] 100% dereplicating | 0:00:01 [####################] 100% mapping | 0:00:52 [####################] 100% clustering | 0:00:00 [####################] 100% building clusters | 0:00:00 [####################] 100% chunking | 0:00:00 [####################] 100% aligning | 0:01:03 [####################] 100% concatenating | 0:00:00 [####################] 100% inferring [H, E] | 0:01:02 [####################] 100% consensus calling | 0:00:30 [####################] 100% concat/shuffle input | 0:00:00 [####################] 100% clustering across | 0:00:01 [####################] 100% building clusters | 0:00:00 [####################] 100% aligning clusters | 0:00:09 [####################] 100% database indels | 0:00:00 [####################] 100% indexing clusters | 0:00:01 [####################] 100% building database | 0:00:00 [####################] 100% filtering loci | 0:00:01 [####################] 100% building loci/stats | 0:00:01 [####################] 100% building vcf file | 0:00:02 [####################] 100% writing vcf file | 0:00:01 [####################] 100% writing outfiles | 0:00:01 Outfiles written to: ~/Documents/ipyrad/tests/tests3/denovo-minus_outfiles
## check results
assert all(data1.stats.clusters_hidepth == 1000)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000
assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS
assert all(data3.stats.clusters_hidepth == 1000)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000
assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS
--------------------------------------------------------------------------- AssertionError Traceback (most recent call last) <ipython-input-54-cce3ab63dc2d> in <module>() 4 5 assert all(data2.stats.clusters_hidepth == N_INSERTS) ----> 6 assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS 7 8 assert all(data3.stats.clusters_hidepth == 1000) AssertionError:
data2.stats_dfs
s1 : reads_raw 1A_0 19835 1B_0 20071 1C_0 19969 1D_0 20082 2E_0 20004 2F_0 19899 2G_0 19928 2H_0 20110 3I_0 20078 3J_0 19965 3K_0 19846 3L_0 20025 s2 : reads_raw trim_adapter_bp_read1 trim_adapter_bp_read2 \ 1A_0 19835 0 0 1B_0 20071 0 0 1C_0 19969 0 0 1D_0 20082 0 0 2E_0 20004 0 0 2F_0 19899 0 0 2G_0 19928 0 0 2H_0 20110 0 0 3I_0 20078 0 0 3J_0 19965 0 0 3K_0 19846 0 0 3L_0 20025 0 0 trim_quality_bp_read1 trim_quality_bp_read2 reads_filtered_by_Ns \ 1A_0 0 0 0 1B_0 0 0 0 1C_0 0 0 0 1D_0 0 0 0 2E_0 0 0 0 2F_0 0 0 0 2G_0 0 0 0 2H_0 0 0 0 3I_0 0 0 0 3J_0 0 0 0 3K_0 0 0 0 3L_0 0 0 0 reads_filtered_by_minlen reads_passed_filter 1A_0 0 19835 1B_0 0 20071 1C_0 0 19969 1D_0 0 20082 2E_0 0 20004 2F_0 0 19899 2G_0 0 19928 2H_0 0 20110 3I_0 0 20078 3J_0 0 19965 3K_0 0 19846 3L_0 0 20025 s3 : merged_pairs clusters_total clusters_hidepth avg_depth_total \ 1A_0 19835 500 500 19.956 1B_0 20071 500 500 20.144 1C_0 19969 500 500 20.108 1D_0 20082 500 500 19.914 2E_0 20004 500 500 19.994 2F_0 19899 500 500 19.786 2G_0 19928 500 500 19.936 2H_0 20110 500 500 20.062 3I_0 20078 500 500 20.022 3J_0 19965 500 500 19.832 3K_0 19846 500 500 19.800 3L_0 20025 500 500 20.046 avg_depth_mj avg_depth_stat sd_depth_total sd_depth_mj \ 1A_0 19.956 19.956 2.694079 2.694079 1B_0 20.144 20.144 2.854341 2.854341 1C_0 20.108 20.108 2.746695 2.746695 1D_0 19.914 19.914 2.861923 2.861923 2E_0 19.994 19.994 2.828067 2.828067 2F_0 19.786 19.786 2.879619 2.879619 2G_0 19.936 19.936 2.994646 2.994646 2H_0 20.062 20.062 2.876483 2.876483 3I_0 20.022 20.022 2.789537 2.789537 3J_0 19.832 19.832 2.924342 2.924342 3K_0 19.800 19.800 2.802856 2.802856 3L_0 20.046 20.046 2.799979 2.799979 sd_depth_stat filtered_bad_align 1A_0 2.694079 0 1B_0 2.854341 0 1C_0 2.746695 0 1D_0 2.861923 0 2E_0 2.828067 0 2F_0 2.879619 0 2G_0 2.994646 0 2H_0 2.876483 0 3I_0 2.789537 0 3J_0 2.924342 0 3K_0 2.802856 0 3L_0 2.799979 0 s4 : hetero_est error_est 1A_0 0.001792 0.000785 1B_0 0.001959 0.000769 1C_0 0.001835 0.000715 1D_0 0.001540 0.000714 2E_0 0.002120 0.000742 2F_0 0.001826 0.000757 2G_0 0.002056 0.000727 2H_0 0.002176 0.000683 3I_0 0.001933 0.000706 3J_0 0.001724 0.000726 3K_0 0.001869 0.000755 3L_0 0.002099 0.000723 s5 : clusters_total filtered_by_depth filtered_by_maxH filtered_by_maxN \ 1A_0 500 0 0 0 1B_0 500 0 0 0 1C_0 500 0 0 0 1D_0 500 0 0 0 2E_0 500 0 0 0 2F_0 500 0 0 0 2G_0 500 0 0 0 2H_0 500 0 0 0 3I_0 500 0 0 0 3J_0 500 0 0 0 3K_0 500 0 0 0 3L_0 500 0 0 0 reads_consens nsites nhetero heterozygosity 1A_0 500 97500 165 0.001692 1B_0 500 97500 180 0.001846 1C_0 500 97500 168 0.001723 1D_0 500 97500 142 0.001456 2E_0 500 97500 196 0.002010 2F_0 500 97500 169 0.001733 2G_0 500 97500 188 0.001928 2H_0 500 97500 201 0.002062 3I_0 500 97499 178 0.001826 3J_0 500 97500 156 0.001600 3K_0 500 97500 175 0.001795 3L_0 500 97500 192 0.001969 s7_filters : total_filters applied_order retained_loci total_prefiltered_loci 500 0 500 filtered_by_rm_duplicates 0 0 500 filtered_by_max_indels 0 0 500 filtered_by_max_snps 0 0 500 filtered_by_max_shared_het 0 0 500 filtered_by_min_sample 0 0 500 filtered_by_max_alleles 2 2 498 total_filtered_loci 498 0 498 s7_loci : locus_coverage sum_coverage 1 0 0 2 0 0 3 0 0 4 0 0 5 0 0 6 0 0 7 0 0 8 0 0 9 0 0 10 0 0 11 0 0 12 498 498 s7_samples : sample_coverage 1A_0 498 1B_0 498 1C_0 498 1D_0 498 2E_0 498 2F_0 498 2G_0 498 2H_0 498 3I_0 498 3J_0 498 3K_0 498 3L_0 498 s7_snps : var sum_var pis sum_pis 0 0 0 55 0 1 0 0 132 132 2 1 2 122 376 3 15 47 88 640 4 12 95 55 860 5 32 255 25 985 .. ... ... ... ... 16 8 4322 0 1118 17 1 4339 0 1118 18 1 4357 0 1118 19 0 4357 0 1118 20 1 4377 0 1118 21 1 4398 0 1118 [22 rows x 4 columns]
## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests4")
data1.set_params(2, os.path.join(DIR, "pairgbs_wmerge_example_*.fastq.gz"))
data1.set_params(3, os.path.join(DIR, "pairgbs_wmerge_example_barcodes.txt"))
data1.set_params(7, "pairgbs")
data1.set_params(8, ("TGCAG", "TGCAG"))
##...
## branch into an assembly for denovo+reference
data3 = data1.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')
## branch into an assembly for reference
data4 = data1.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')
## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)
## check results
assert all(data1.stats.clusters_hidepth == 1000)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000
assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS
assert all(data3.stats.clusters_hidepth == 1000)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000
assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS
%%bash -s $DIR
## compressed dir/ w/ all data files
tar -zcvf ipsimdata.tar.gz $1
./ipsimdata/ ./ipsimdata/pairgbs_example_R2_.fastq.gz ./ipsimdata/pairgbs_wmerge_example_barcodes.txt ./ipsimdata/rad_example_genome.fa ./ipsimdata/pairddrad_example_genome.fa ./ipsimdata/pairgbs_example_R1_.fastq.gz ./ipsimdata/pairgbs_wmerge_example_R2_.fastq.gz ./ipsimdata/rad_example_genome.fa.fai ./ipsimdata/pairddrad_example_R2_.fastq.gz ./ipsimdata/pairddrad_example_genome.fa.sma ./ipsimdata/pairddrad_example_genome.fa.fai ./ipsimdata/pairgbs_wmerge_example_genome.fa ./ipsimdata/pairddrad_wmerge_example_genome.fa ./ipsimdata/pairddrad_example_genome.fa.smi ./ipsimdata/pairgbs_wmerge_example_R1_.fastq.gz ./ipsimdata/rad_example_genome.fa.smi ./ipsimdata/gbs_example_barcodes.txt ./ipsimdata/pairgbs_example_barcodes.txt ./ipsimdata/pairddrad_example_R1_.fastq.gz ./ipsimdata/pairddrad_wmerge_example_barcodes.txt ./ipsimdata/rad_example_barcodes.txt ./ipsimdata/pairddrad_wmerge_example_R1_.fastq.gz ./ipsimdata/pairddrad_wmerge_example_R2_.fastq.gz ./ipsimdata/gbs_example_R1_.fastq.gz ./ipsimdata/pairddrad_example_barcodes.txt ./ipsimdata/rad_example_genome.fa.sma ./ipsimdata/rad_example_R1_.fastq.gz ./ipsimdata/gbs_example_genome.fa