Notebook

Simulated RAD-seq data for ipyrad testing¶

The simulations software simrrls is available at github.com/dereneaton/simrrls.

In [1]:

import os
import glob
import gzip
import itertools
import numpy as np
import simrrls
import ipyrad as ip

print "simrrls v.{}".format(simrrls.__version__)
print "ipyrad v.{}".format(ip.__version__)

simrrls v.0.0.13
ipyrad v.0.3.42

Organization¶

In [31]:

## the dir to write data to
DIR = "./ipsimdata/"

## if it doesn't exist make it
if not os.path.exists(DIR):
    os.makedirs(DIR)
    
## if theres anything in it, remove it
for oldfile in glob.glob(os.path.join(DIR, "*")):
    os.remove(oldfile)
    

Simulate the data with simrrls¶

In [32]:

%%bash -s $DIR

## simulate rad_example (includes indels)
simrrls -o $1/rad_example -f rad -dm 10 -ds 2 -I 0.05 -L 1000

## simulate larger rad data set
#simrrls -o $1/rad_large -f rad -dm 10 -ds 2 -I 0.01 -L 10000

## simulate pairddrad_example
simrrls -o $1/gbs_example -f gbs -dm 10 -ds 2 -I 0.05 -L 1000

## simulate pairddrad_example
simrrls -o $1/pairddrad_example -f pairddrad -dm 10 -ds 2 -L 1000

## simulate pairgbs_example
simrrls -o $1/pairgbs_example -f pairgbs -dm 10 -ds 2 -L 1000

## simulate pairddrad_wmerge_example 
simrrls -o $1/pairddrad_wmerge_example -f pairddrad -dm 10 -ds 2 \
                                         -i1 -50 -i2 100 -L 1000

## simulate pairgbs_wmerge_example (mixture of merged and non-merged reads)
simrrls -o $1/pairgbs_wmerge_example -f pairgbs -dm 10 -ds 2 \
                                       -i1 -50 -i2 100 -L 1000

	min insert size allows read overlaps/adapter sequences 
	min insert size allows read overlaps/adapter sequences

Functions to insert reads into simulated genome¶

In [33]:

import itertools
import gzip
import numpy as np

In [34]:

## Utility function
def revcomp(sequence):
    "returns reverse complement of a string"
    sequence = sequence[::-1].strip()\
                             .replace("A", "t")\
                             .replace("T", "a")\
                             .replace("C", "g")\
                             .replace("G", "c").upper()
    return sequence

In [35]:

def RAD_to_genome(R1s, R2s, n_inserts, insert_low, insert_high, out_chr):
    """ 
    Create a simulated genome file with RAD data interspersed.
    """
    
    ## start genome fasta sequence with 5000 random bp
    genome = np.random.choice(list("ATGC"), 5000)
    
    ## read in the RAD data files
    dat1 = gzip.open(R1s, 'r')
    qiter1 = itertools.izip(*[iter(dat1)]*4)
    if R2s:
        dat2 = gzip.open(R2s, 'r')
        qiter2 = itertools.izip(*[iter(dat2)]*4)
    else:
        qiter2 = itertools.izip(*[iter(str, 1)]*4)
       
    ## sample unique reads from rads
    uniqs = []
    locid = 0
    while len(uniqs) < n_inserts:
        ## grab a read and get locus id
        qrt1 = qiter1.next()
        qrt2 = qiter2.next()
        iloc = []
        ilocid = int(qrt1[0].split("_")[1][5:])
        
        ## go until end of locus copies
        while ilocid == locid:
            iloc.append([qrt1[1].strip(), qrt2[1].strip()])
            qrt1 = qiter1.next()
            qrt2 = qiter2.next()
            ilocid = int(qrt1[0].split("_")[1][5:])
        
        ## sample one read
        uniqs.append(iloc[0])
        locid += 1
        
    ## insert RADs into genome
    for ins in range(n_inserts):   
        ## get read, we leave the barcode on cuz it won't hurt
        r1 = np.array(list(uniqs[ins][0]))
        r2 = np.array(list(revcomp(uniqs[ins][1])))
        
        ## add R1 to genome
        genome = np.concatenate((genome, r1), axis=0)
        if qrt2[0]:
            ## add insert size
            howlong = np.random.uniform(insert_low, insert_high)
            chunk = np.random.choice(list("ATGC"), int(howlong))
            #print howlong
            genome = np.concatenate((genome, chunk), axis=0)
            ## add read2
            genome = np.concatenate((genome, r2), axis=0)

        ## add spacer between loci
        genome = np.concatenate((genome, np.random.choice(list("ATGC"), 5000)), axis=0)

    print("input genome is {} bp".format(genome.shape[0]))
    print("inserted {} loci into genome".format(n_inserts))

    with open(out_chr, "w") as fasta:
        header = ">MT dna:chromosome chromosome:test:MT:1:{}:1 REF\n"\
                 .format(genome.shape[0])
        fasta.write(header)
        outseq = list(genome)
        for i in range(0, len(outseq), 70):
            fasta.write("{}\n".format("".join(outseq[i:i+70])))

Make 'genome' data files¶

In [36]:

## Meta args
N_INSERTS = 500
INSERT_LOW = 250
INSERT_HIGH = 300

In [37]:

## SE RAD data
DATA_R1 = os.path.join(DIR, "rad_example_R1_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "rad_example_genome.fa")
big = RAD_to_genome(DATA_R1, 0, N_INSERTS, 
                    INSERT_LOW, INSERT_HIGH,
                    OUTPUT_CHR)

input genome is 2555000 bp
inserted 500 loci into genome

In [38]:

## SE GBS data
DATA_R1 = os.path.join(DIR, "gbs_example_R1_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "gbs_example_genome.fa")
RAD_to_genome(DATA_R1, 0, N_INSERTS,
              INSERT_LOW, INSERT_HIGH, 
              OUTPUT_CHR)

input genome is 2555000 bp
inserted 500 loci into genome

In [39]:

## PAIR ddRAD data 
DATA_R1 = os.path.join(DIR, "pairddrad_example_R1_.fastq.gz")
DATA_R2 = os.path.join(DIR, "pairddrad_example_R2_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "pairddrad_example_genome.fa")
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, 
              INSERT_LOW, INSERT_HIGH,
              OUTPUT_CHR)

input genome is 2742124 bp
inserted 500 loci into genome

In [40]:

## PAIR ddRAD data 
DATA_R1 = os.path.join(DIR, "pairddrad_wmerge_example_R1_.fastq.gz")
DATA_R2 = os.path.join(DIR, "pairddrad_wmerge_example_R2_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "pairddrad_wmerge_example_genome.fa")
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, 
              INSERT_LOW, INSERT_HIGH,  
              OUTPUT_CHR)

input genome is 2742116 bp
inserted 500 loci into genome

In [41]:

## PAIR GBS data
DATA_R1 = os.path.join(DIR, "pairgbs_wmerge_example_R1_.fastq.gz")
DATA_R2 = os.path.join(DIR, "pairgbs_wmerge_example_R2_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "pairgbs_wmerge_example_genome.fa")
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, 
              INSERT_LOW, INSERT_HIGH, 
              OUTPUT_CHR)

input genome is 2743007 bp
inserted 500 loci into genome

Run tests in ipyrad¶

rad_example¶

In [43]:

## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests1")
data1.set_params(2, os.path.join(DIR, 'rad_example_R1_.fastq.gz'))
data1.set_params(3, os.path.join(DIR, 'rad_example_barcodes.txt'))
data1.set_params("max_alleles_consens", 4) ## raise this for now 

## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, 'reference')
data2.set_params(6, os.path.join(DIR, 'rad_example_genome.fa'))

## branch into an assembly for denovo+reference
data3 = data2.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')

## branch into an assembly for reference
data4 = data2.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')

## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)

  New Assembly: denovo

  Assembly: denovo

    [force] overwriting fastq files previously *created by ipyrad* in:
    /home/deren/Documents/ipyrad/tests/tests1/denovo_fastqs
    This does not affect your *original/raw data files*
  [####################] 100%  chunking large files  | 0:00:00 
  [####################] 100%  sorting reads         | 0:00:03 
  [####################] 100%  writing/compressing   | 0:00:00 

  [####################] 100%  processing reads      | 0:00:04 

  [####################] 100%  dereplicating         | 0:00:00 
  [####################] 100%  clustering            | 0:00:00 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  chunking              | 0:00:00 
  [####################] 100%  aligning              | 0:00:44 
  [####################] 100%  concatenating         | 0:00:00 

  [####################] 100%  inferring [H, E]      | 0:00:55 

  [####################] 100%  consensus calling     | 0:00:26 
  [####################] 100%  concat/shuffle input  | 0:00:00 
  [####################] 100%  clustering across     | 0:00:01 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  aligning clusters     | 0:00:08 
  [####################] 100%  database indels       | 0:00:00 
  [####################] 100%  indexing clusters     | 0:00:01 
  [####################] 100%  building database     | 0:00:00 
  [####################] 100%  filtering loci        | 0:00:01 
  [####################] 100%  building loci/stats   | 0:00:01 
  [####################] 100%  building vcf file     | 0:00:03 
  [####################] 100%  writing vcf file      | 0:00:01 
  [####################] 100%  writing outfiles      | 0:00:01 
  Outfiles written to: ~/Documents/ipyrad/tests/tests1/denovo_outfiles

  Assembly: reference

    [force] overwriting fastq files previously *created by ipyrad* in:
    /home/deren/Documents/ipyrad/tests/tests1/reference_fastqs
    This does not affect your *original/raw data files*
  [####################] 100%  chunking large files  | 0:00:00 
  [####################] 100%  sorting reads         | 0:00:03 
  [####################] 100%  writing/compressing   | 0:00:00 

  [####################] 100%  processing reads      | 0:00:04 

  [####################] 100%  dereplicating         | 0:00:00 
  [####################] 100%  mapping               | 0:00:45 
  [####################] 100%  finalize mapping      | 0:00:14 
  [####################] 100%  chunking              | 0:00:00 
  [####################] 100%  aligning              | 0:00:31 
  [####################] 100%  concatenating         | 0:00:00 

  [####################] 100%  inferring [H, E]      | 0:00:41 

  [####################] 100%  consensus calling     | 0:00:15 
  [####################] 100%  concat/shuffle input  | 0:00:00 
  [####################] 100%  clustering across     | 0:00:00 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  aligning clusters     | 0:00:04 
  [####################] 100%  database indels       | 0:00:00 
  [####################] 100%  indexing clusters     | 0:00:00 
  [####################] 100%  building database     | 0:00:00 
  [####################] 100%  filtering loci        | 0:00:01 
  [####################] 100%  building loci/stats   | 0:00:01 
  [####################] 100%  building vcf file     | 0:00:02 
  [####################] 100%  writing vcf file      | 0:00:01 
  [####################] 100%  writing outfiles      | 0:00:01 
  Outfiles written to: ~/Documents/ipyrad/tests/tests1/reference_outfiles

  Assembly: denovo-plus

    [force] overwriting fastq files previously *created by ipyrad* in:
    /home/deren/Documents/ipyrad/tests/tests1/denovo-plus_fastqs
    This does not affect your *original/raw data files*
  [####################] 100%  chunking large files  | 0:00:00 
  [####################] 100%  sorting reads         | 0:00:03 
  [####################] 100%  writing/compressing   | 0:00:00 

  [####################] 100%  processing reads      | 0:00:04 

    Force reindexing of reference sequence
  [####################] 100%  dereplicating         | 0:00:00 
  [####################] 100%  mapping               | 0:00:46 
  [####################] 100%  clustering            | 0:00:00 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  finalize mapping      | 0:00:15 
  [####################] 100%  chunking              | 0:00:00 
  [####################] 100%  aligning              | 0:00:55 
  [####################] 100%  concatenating         | 0:00:00 

  [####################] 100%  inferring [H, E]      | 0:00:59 

  [####################] 100%  consensus calling     | 0:00:29 
  [####################] 100%  concat/shuffle input  | 0:00:00 
  [####################] 100%  clustering across     | 0:00:01 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  aligning clusters     | 0:00:10 
  [####################] 100%  database indels       | 0:00:00 
  [####################] 100%  indexing clusters     | 0:00:01 
  [####################] 100%  building database     | 0:00:00 
  [####################] 100%  filtering loci        | 0:00:01 
  [####################] 100%  building loci/stats   | 0:00:01 
  [####################] 100%  building vcf file     | 0:00:03 
  [####################] 100%  writing vcf file      | 0:00:01 
  [####################] 100%  writing outfiles      | 0:00:01 
  Outfiles written to: ~/Documents/ipyrad/tests/tests1/denovo-plus_outfiles

  Assembly: denovo-minus

    [force] overwriting fastq files previously *created by ipyrad* in:
    /home/deren/Documents/ipyrad/tests/tests1/denovo-minus_fastqs
    This does not affect your *original/raw data files*
  [####################] 100%  chunking large files  | 0:00:00 
  [####################] 100%  sorting reads         | 0:00:03 
  [####################] 100%  writing/compressing   | 0:00:00 

  [####################] 100%  processing reads      | 0:00:04 

    Force reindexing of reference sequence
  [####################] 100%  dereplicating         | 0:00:00 
  [####################] 100%  mapping               | 0:00:12 
  [####################] 100%  clustering            | 0:00:00 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  chunking              | 0:00:00 
  [####################] 100%  aligning              | 0:00:26 
  [####################] 100%  concatenating         | 0:00:00 

  [####################] 100%  inferring [H, E]      | 0:00:42 

  [####################] 100%  consensus calling     | 0:00:16 
  [####################] 100%  concat/shuffle input  | 0:00:00 
  [####################] 100%  clustering across     | 0:00:00 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  aligning clusters     | 0:00:05 
  [####################] 100%  database indels       | 0:00:00 
  [####################] 100%  indexing clusters     | 0:00:01 
  [####################] 100%  building database     | 0:00:00 
  [####################] 100%  filtering loci        | 0:00:01 
  [####################] 100%  building loci/stats   | 0:00:01 
  [####################] 100%  building vcf file     | 0:00:02 
  [####################] 100%  writing vcf file      | 0:00:01 
  [####################] 100%  writing outfiles      | 0:00:01 
  Outfiles written to: ~/Documents/ipyrad/tests/tests1/denovo-minus_outfiles

In [45]:

NLOCI = 1000

## check results
assert all(data1.stats.clusters_hidepth == NLOCI)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI

assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS

assert all(data3.stats.clusters_hidepth == NLOCI)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == NLOCI

assert all(data4.stats.clusters_hidepth == NLOCI-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == NLOCI-N_INSERTS

gbs example¶

In [38]:

## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests2")
data1.set_params(2, os.path.join(DIR, "gbs_example_R1_.fastq.gz"))
data1.set_params(3, os.path.join(DIR, "gbs_example_barcodes.txt"))
data1.set_params("max_alleles_consens", 4) ## raise this for now 

## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, "reference")
data2.set_params(6, os.path.join(DIR, "gbs_example_genome.fa"))

## branch into an assembly for denovo+reference
data3 = data2.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')

## branch into an assembly for reference
data4 = data2.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')

## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)

  New Assembly: denovo

  Assembly: denovo

  [####################] 100%  chunking large files  | 0:00:00 
  [####################] 100%  sorting reads         | 0:00:03 
  [####################] 100%  writing/compressing   | 0:00:00 

  [####################] 100%  processing reads      | 0:00:33 

  [####################] 100%  dereplicating         | 0:00:00 
  [####################] 100%  clustering            | 0:00:00 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  chunking              | 0:00:00 
  [####################] 100%  aligning              | 0:00:51 
  [####################] 100%  concatenating         | 0:00:00 

  [####################] 100%  inferring [H, E]      | 0:00:51 

  [####################] 100%  consensus calling     | 0:00:24 
  [####################] 100%  concat/shuffle input  | 0:00:00 
  [####################] 100%  clustering across     | 0:00:00 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  aligning clusters     | 0:00:08 
  [####################] 100%  indexing clusters     | 0:00:03 
  [####################] 100%  building database     | 0:00:05 
  [####################] 100%  filtering loci        | 0:00:04 
  [####################] 100%  building loci/stats   | 0:00:01 
  [####################] 100%  building vcf file     | 0:00:14 
  [####################] 100%  writing outfiles      | 0:00:01 
  Outfiles written to: ~/Documents/ipyrad/tests/tests2/denovo_outfiles

  Assembly: reference

  [####################] 100%  chunking large files  | 0:00:00 
  [####################] 100%  sorting reads         | 0:00:03 
  [####################] 100%  writing/compressing   | 0:00:00 

  [####################] 100%  processing reads      | 0:00:32 

  [####################] 100%  dereplicating         | 0:00:00 
  [####################] 100%  mapping               | 0:00:55 
  [####################] 100%  finalize mapping      | 0:00:20 
  [####################] 100%  chunking              | 0:00:00 
  [####################] 100%  aligning              | 0:00:37 
  [####################] 100%  concatenating         | 0:00:00 

  [####################] 100%  inferring [H, E]      | 0:00:37 

  [####################] 100%  consensus calling     | 0:00:14 
  [####################] 100%  concat/shuffle input  | 0:00:00 
  [####################] 100%  clustering across     | 0:00:00 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  aligning clusters     | 0:00:04 
  [####################] 100%  indexing clusters     | 0:00:03 
  [####################] 100%  building database     | 0:00:02 
  [####################] 100%  filtering loci        | 0:00:00 
  [####################] 100%  building loci/stats   | 0:00:01 
  [####################] 100%  building vcf file     | 0:00:07 
  [####################] 100%  writing outfiles      | 0:00:01 
  Outfiles written to: ~/Documents/ipyrad/tests/tests2/reference_outfiles

  Assembly: denovo-plus

  [####################] 100%  chunking large files  | 0:00:00 
  [####################] 100%  sorting reads         | 0:00:03 
  [####################] 100%  writing/compressing   | 0:00:00 

  [####################] 100%  processing reads      | 0:00:34 

    Force reindexing of reference sequence
  [####################] 100%  dereplicating         | 0:00:00 
  [####################] 100%  mapping               | 0:00:54 
  [####################] 100%  clustering            | 0:00:00 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  finalize mapping      | 0:00:21 
  [####################] 100%  chunking              | 0:00:00 
  [####################] 100%  aligning              | 0:01:06 
  [####################] 100%  concatenating         | 0:00:00 

  [####################] 100%  inferring [H, E]      | 0:00:55 

  [####################] 100%  consensus calling     | 0:00:26 
  [####################] 100%  concat/shuffle input  | 0:00:00 
  [####################] 100%  clustering across     | 0:00:00 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  aligning clusters     | 0:00:10 
  [####################] 100%  indexing clusters     | 0:00:03 
  [####################] 100%  building database     | 0:00:05 
  [####################] 100%  filtering loci        | 0:00:00 
  [####################] 100%  building loci/stats   | 0:00:01 
  [####################] 100%  building vcf file     | 0:00:14 
  [####################] 100%  writing outfiles      | 0:00:01 
  Outfiles written to: ~/Documents/ipyrad/tests/tests2/denovo-plus_outfiles

  Assembly: denovo-minus

  [####################] 100%  chunking large files  | 0:00:00 
  [####################] 100%  sorting reads         | 0:00:03 
  [####################] 100%  writing/compressing   | 0:00:00 

  [####################] 100%  processing reads      | 0:00:35 

    Force reindexing of reference sequence
  [####################] 100%  dereplicating         | 0:00:00 
  [####################] 100%  mapping               | 0:00:11 
  [####################] 100%  clustering            | 0:00:00 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  chunking              | 0:00:00 
  [####################] 100%  aligning              | 0:00:27 
  [####################] 100%  concatenating         | 0:00:00 

  [####################] 100%  inferring [H, E]      | 0:00:37 

  [####################] 100%  consensus calling     | 0:00:15 
  [####################] 100%  concat/shuffle input  | 0:00:00 
  [####################] 100%  clustering across     | 0:00:00 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  aligning clusters     | 0:00:04 
  [####################] 100%  indexing clusters     | 0:00:03 
  [####################] 100%  building database     | 0:00:02 
  [####################] 100%  filtering loci        | 0:00:04 
  [####################] 100%  building loci/stats   | 0:00:01 
  [####################] 100%  building vcf file     | 0:00:07 
  [####################] 100%  writing outfiles      | 0:00:01 
  Outfiles written to: ~/Documents/ipyrad/tests/tests2/denovo-minus_outfiles

In [39]:

## check results
assert all(data1.stats.clusters_hidepth == 1000)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000

assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS

assert all(data3.stats.clusters_hidepth == 1000)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000

assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS

pairddrad without merged reads example¶

In [52]:

## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests3")
data1.set_params(2, os.path.join(DIR, "pairddrad_example_*.fastq.gz"))
data1.set_params(3, os.path.join(DIR, "pairddrad_example_barcodes.txt"))
data1.set_params(7, "pairddrad")
data1.set_params(8, ("TGCAG", "CCGG"))

## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, "reference")
data2.set_params(6, os.path.join(DIR, "pairddrad_example_genome.fa"))

## branch into an assembly for denovo+reference
data3 = data2.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')

## branch into an assembly for reference
data4 = data2.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')

## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)

  New Assembly: denovo

  Assembly: denovo

    [force] overwriting fastq files previously *created by ipyrad* in:
    /home/deren/Documents/ipyrad/tests/tests3/denovo_fastqs
    This does not affect your *original/raw data files*
  [####################] 100%  chunking large files  | 0:00:00 
  [####################] 100%  sorting reads         | 0:00:05 
  [####################] 100%  writing/compressing   | 0:00:01 

  [####################] 100%  processing reads      | 0:00:07 

  [####################] 100%  dereplicating         | 0:00:01 
  [####################] 100%  clustering            | 0:00:06 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  chunking              | 0:00:00 
  [####################] 100%  aligning              | 0:02:00 
  [####################] 100%  concatenating         | 0:00:00 

  [####################] 100%  inferring [H, E]      | 0:01:14 

  [####################] 100%  consensus calling     | 0:00:51 
  [####################] 100%  concat/shuffle input  | 0:00:00 
  [####################] 100%  clustering across     | 0:00:02 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  aligning clusters     | 0:00:16 
  [####################] 100%  database indels       | 0:00:00 
  [####################] 100%  indexing clusters     | 0:00:01 
  [####################] 100%  building database     | 0:00:00 
  [####################] 100%  filtering loci        | 0:00:01 
  [####################] 100%  building loci/stats   | 0:00:01 
  [####################] 100%  building vcf file     | 0:00:03 
  [####################] 100%  writing vcf file      | 0:00:01 
  [####################] 100%  writing outfiles      | 0:00:01 
  Outfiles written to: ~/Documents/ipyrad/tests/tests3/denovo_outfiles

  Assembly: reference

    [force] overwriting fastq files previously *created by ipyrad* in:
    /home/deren/Documents/ipyrad/tests/tests3/reference_fastqs
    This does not affect your *original/raw data files*
  [####################] 100%  chunking large files  | 0:00:00 
  [####################] 100%  sorting reads         | 0:00:05 
  [####################] 100%  writing/compressing   | 0:00:01 

  [####################] 100%  processing reads      | 0:00:06 

    Force reindexing of reference sequence
  [####################] 100%  dereplicating         | 0:00:01 
  [####################] 100%  mapping               | 0:02:04 
  [####################] 100%  finalize mapping      | 0:00:36 
  [####################] 100%  chunking              | 0:00:00 
  [####################] 100%  aligning              | 0:01:03 
  [####################] 100%  concatenating         | 0:00:00 

  [####################] 100%  inferring [H, E]      | 0:00:55 

  [####################] 100%  consensus calling     | 0:00:29 
  [####################] 100%  concat/shuffle input  | 0:00:00 
  [####################] 100%  clustering across     | 0:00:01 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  aligning clusters     | 0:00:09 
  [####################] 100%  database indels       | 0:00:00 
  [####################] 100%  indexing clusters     | 0:00:01 
  [####################] 100%  building database     | 0:00:00 
  [####################] 100%  filtering loci        | 0:00:01 
  [####################] 100%  building loci/stats   | 0:00:01 
  [####################] 100%  building vcf file     | 0:00:02 
  [####################] 100%  writing vcf file      | 0:00:01 
  [####################] 100%  writing outfiles      | 0:00:01 
  Outfiles written to: ~/Documents/ipyrad/tests/tests3/reference_outfiles

  Assembly: denovo-plus

    [force] overwriting fastq files previously *created by ipyrad* in:
    /home/deren/Documents/ipyrad/tests/tests3/denovo-plus_fastqs
    This does not affect your *original/raw data files*
  [####################] 100%  chunking large files  | 0:00:00 
  [####################] 100%  sorting reads         | 0:00:06 
  [####################] 100%  writing/compressing   | 0:00:01 

  [####################] 100%  processing reads      | 0:00:07 

    Force reindexing of reference sequence
  [####################] 100%  dereplicating         | 0:00:00 
  [####################] 100%  mapping               | 0:02:14 
  [####################] 100%  clustering            | 0:00:00 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  finalize mapping      | 0:00:37 
  [####################] 100%  chunking              | 0:00:00 
  [####################] 100%  aligning              | 0:02:15 
  [####################] 100%  concatenating         | 0:00:00 

  [####################] 100%  inferring [H, E]      | 0:01:32 

  [####################] 100%  consensus calling     | 0:00:55 
  [####################] 100%  concat/shuffle input  | 0:00:00 
  [####################] 100%  clustering across     | 0:00:02 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  aligning clusters     | 0:00:17 
  [####################] 100%  database indels       | 0:00:00 
  [####################] 100%  indexing clusters     | 0:00:01 
  [####################] 100%  building database     | 0:00:00 
  [####################] 100%  filtering loci        | 0:00:01 
  [####################] 100%  building loci/stats   | 0:00:01 
  [####################] 100%  building vcf file     | 0:00:03 
  [####################] 100%  writing vcf file      | 0:00:01 
  [####################] 100%  writing outfiles      | 0:00:01 
  Outfiles written to: ~/Documents/ipyrad/tests/tests3/denovo-plus_outfiles

  Assembly: denovo-minus

    [force] overwriting fastq files previously *created by ipyrad* in:
    /home/deren/Documents/ipyrad/tests/tests3/denovo-minus_fastqs
    This does not affect your *original/raw data files*
  [####################] 100%  chunking large files  | 0:00:00 
  [####################] 100%  sorting reads         | 0:00:05 
  [####################] 100%  writing/compressing   | 0:00:01 

  [####################] 100%  processing reads      | 0:00:07 

    Force reindexing of reference sequence
  [####################] 100%  dereplicating         | 0:00:01 
  [####################] 100%  mapping               | 0:00:52 
  [####################] 100%  clustering            | 0:00:00 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  chunking              | 0:00:00 
  [####################] 100%  aligning              | 0:01:03 
  [####################] 100%  concatenating         | 0:00:00 

  [####################] 100%  inferring [H, E]      | 0:01:02 

  [####################] 100%  consensus calling     | 0:00:30 
  [####################] 100%  concat/shuffle input  | 0:00:00 
  [####################] 100%  clustering across     | 0:00:01 
  [####################] 100%  building clusters     | 0:00:00 
  [####################] 100%  aligning clusters     | 0:00:09 
  [####################] 100%  database indels       | 0:00:00 
  [####################] 100%  indexing clusters     | 0:00:01 
  [####################] 100%  building database     | 0:00:00 
  [####################] 100%  filtering loci        | 0:00:01 
  [####################] 100%  building loci/stats   | 0:00:01 
  [####################] 100%  building vcf file     | 0:00:02 
  [####################] 100%  writing vcf file      | 0:00:01 
  [####################] 100%  writing outfiles      | 0:00:01 
  Outfiles written to: ~/Documents/ipyrad/tests/tests3/denovo-minus_outfiles

In [54]:

## check results
assert all(data1.stats.clusters_hidepth == 1000)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000

assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS

assert all(data3.stats.clusters_hidepth == 1000)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000

assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-54-cce3ab63dc2d> in <module>()
      4 
      5 assert all(data2.stats.clusters_hidepth == N_INSERTS)
----> 6 assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS
      7 
      8 assert all(data3.stats.clusters_hidepth == 1000)

AssertionError:

In [62]:

data2.stats_dfs

Out[62]:

s1 :       reads_raw
1A_0      19835
1B_0      20071
1C_0      19969
1D_0      20082
2E_0      20004
2F_0      19899
2G_0      19928
2H_0      20110
3I_0      20078
3J_0      19965
3K_0      19846
3L_0      20025
s2 :       reads_raw  trim_adapter_bp_read1  trim_adapter_bp_read2  \
1A_0      19835                      0                      0   
1B_0      20071                      0                      0   
1C_0      19969                      0                      0   
1D_0      20082                      0                      0   
2E_0      20004                      0                      0   
2F_0      19899                      0                      0   
2G_0      19928                      0                      0   
2H_0      20110                      0                      0   
3I_0      20078                      0                      0   
3J_0      19965                      0                      0   
3K_0      19846                      0                      0   
3L_0      20025                      0                      0   

      trim_quality_bp_read1  trim_quality_bp_read2  reads_filtered_by_Ns  \
1A_0                      0                      0                     0   
1B_0                      0                      0                     0   
1C_0                      0                      0                     0   
1D_0                      0                      0                     0   
2E_0                      0                      0                     0   
2F_0                      0                      0                     0   
2G_0                      0                      0                     0   
2H_0                      0                      0                     0   
3I_0                      0                      0                     0   
3J_0                      0                      0                     0   
3K_0                      0                      0                     0   
3L_0                      0                      0                     0   

      reads_filtered_by_minlen  reads_passed_filter  
1A_0                         0                19835  
1B_0                         0                20071  
1C_0                         0                19969  
1D_0                         0                20082  
2E_0                         0                20004  
2F_0                         0                19899  
2G_0                         0                19928  
2H_0                         0                20110  
3I_0                         0                20078  
3J_0                         0                19965  
3K_0                         0                19846  
3L_0                         0                20025  
s3 :       merged_pairs  clusters_total  clusters_hidepth  avg_depth_total  \
1A_0         19835             500               500           19.956   
1B_0         20071             500               500           20.144   
1C_0         19969             500               500           20.108   
1D_0         20082             500               500           19.914   
2E_0         20004             500               500           19.994   
2F_0         19899             500               500           19.786   
2G_0         19928             500               500           19.936   
2H_0         20110             500               500           20.062   
3I_0         20078             500               500           20.022   
3J_0         19965             500               500           19.832   
3K_0         19846             500               500           19.800   
3L_0         20025             500               500           20.046   

      avg_depth_mj  avg_depth_stat  sd_depth_total  sd_depth_mj  \
1A_0        19.956          19.956        2.694079     2.694079   
1B_0        20.144          20.144        2.854341     2.854341   
1C_0        20.108          20.108        2.746695     2.746695   
1D_0        19.914          19.914        2.861923     2.861923   
2E_0        19.994          19.994        2.828067     2.828067   
2F_0        19.786          19.786        2.879619     2.879619   
2G_0        19.936          19.936        2.994646     2.994646   
2H_0        20.062          20.062        2.876483     2.876483   
3I_0        20.022          20.022        2.789537     2.789537   
3J_0        19.832          19.832        2.924342     2.924342   
3K_0        19.800          19.800        2.802856     2.802856   
3L_0        20.046          20.046        2.799979     2.799979   

      sd_depth_stat  filtered_bad_align  
1A_0       2.694079                   0  
1B_0       2.854341                   0  
1C_0       2.746695                   0  
1D_0       2.861923                   0  
2E_0       2.828067                   0  
2F_0       2.879619                   0  
2G_0       2.994646                   0  
2H_0       2.876483                   0  
3I_0       2.789537                   0  
3J_0       2.924342                   0  
3K_0       2.802856                   0  
3L_0       2.799979                   0  
s4 :       hetero_est  error_est
1A_0    0.001792   0.000785
1B_0    0.001959   0.000769
1C_0    0.001835   0.000715
1D_0    0.001540   0.000714
2E_0    0.002120   0.000742
2F_0    0.001826   0.000757
2G_0    0.002056   0.000727
2H_0    0.002176   0.000683
3I_0    0.001933   0.000706
3J_0    0.001724   0.000726
3K_0    0.001869   0.000755
3L_0    0.002099   0.000723
s5 :       clusters_total  filtered_by_depth  filtered_by_maxH  filtered_by_maxN  \
1A_0             500                  0                 0                 0   
1B_0             500                  0                 0                 0   
1C_0             500                  0                 0                 0   
1D_0             500                  0                 0                 0   
2E_0             500                  0                 0                 0   
2F_0             500                  0                 0                 0   
2G_0             500                  0                 0                 0   
2H_0             500                  0                 0                 0   
3I_0             500                  0                 0                 0   
3J_0             500                  0                 0                 0   
3K_0             500                  0                 0                 0   
3L_0             500                  0                 0                 0   

      reads_consens  nsites  nhetero  heterozygosity  
1A_0            500   97500      165        0.001692  
1B_0            500   97500      180        0.001846  
1C_0            500   97500      168        0.001723  
1D_0            500   97500      142        0.001456  
2E_0            500   97500      196        0.002010  
2F_0            500   97500      169        0.001733  
2G_0            500   97500      188        0.001928  
2H_0            500   97500      201        0.002062  
3I_0            500   97499      178        0.001826  
3J_0            500   97500      156        0.001600  
3K_0            500   97500      175        0.001795  
3L_0            500   97500      192        0.001969  
s7_filters :                             total_filters  applied_order  retained_loci
total_prefiltered_loci                500              0            500
filtered_by_rm_duplicates               0              0            500
filtered_by_max_indels                  0              0            500
filtered_by_max_snps                    0              0            500
filtered_by_max_shared_het              0              0            500
filtered_by_min_sample                  0              0            500
filtered_by_max_alleles                 2              2            498
total_filtered_loci                   498              0            498
s7_loci :     locus_coverage  sum_coverage
1                0             0
2                0             0
3                0             0
4                0             0
5                0             0
6                0             0
7                0             0
8                0             0
9                0             0
10               0             0
11               0             0
12             498           498
s7_samples :       sample_coverage
1A_0              498
1B_0              498
1C_0              498
1D_0              498
2E_0              498
2F_0              498
2G_0              498
2H_0              498
3I_0              498
3J_0              498
3K_0              498
3L_0              498
s7_snps :     var  sum_var  pis  sum_pis
0     0        0   55        0
1     0        0  132      132
2     1        2  122      376
3    15       47   88      640
4    12       95   55      860
5    32      255   25      985
..  ...      ...  ...      ...
16    8     4322    0     1118
17    1     4339    0     1118
18    1     4357    0     1118
19    0     4357    0     1118
20    1     4377    0     1118
21    1     4398    0     1118

[22 rows x 4 columns]

pairgbs w/ merged reads example¶

In [ ]:

## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests4")
data1.set_params(2, os.path.join(DIR, "pairgbs_wmerge_example_*.fastq.gz"))
data1.set_params(3, os.path.join(DIR, "pairgbs_wmerge_example_barcodes.txt"))
data1.set_params(7, "pairgbs")
data1.set_params(8, ("TGCAG", "TGCAG"))

##...


## branch into an assembly for denovo+reference
data3 = data1.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')

## branch into an assembly for reference
data4 = data1.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')

## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)

In [ ]:

## check results
assert all(data1.stats.clusters_hidepth == 1000)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000

assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS

assert all(data3.stats.clusters_hidepth == 1000)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000

assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS

Create zipped archive of sim data¶

In [63]:

%%bash -s $DIR
## compressed dir/ w/ all data files
tar -zcvf ipsimdata.tar.gz $1

./ipsimdata/
./ipsimdata/pairgbs_example_R2_.fastq.gz
./ipsimdata/pairgbs_wmerge_example_barcodes.txt
./ipsimdata/rad_example_genome.fa
./ipsimdata/pairddrad_example_genome.fa
./ipsimdata/pairgbs_example_R1_.fastq.gz
./ipsimdata/pairgbs_wmerge_example_R2_.fastq.gz
./ipsimdata/rad_example_genome.fa.fai
./ipsimdata/pairddrad_example_R2_.fastq.gz
./ipsimdata/pairddrad_example_genome.fa.sma
./ipsimdata/pairddrad_example_genome.fa.fai
./ipsimdata/pairgbs_wmerge_example_genome.fa
./ipsimdata/pairddrad_wmerge_example_genome.fa
./ipsimdata/pairddrad_example_genome.fa.smi
./ipsimdata/pairgbs_wmerge_example_R1_.fastq.gz
./ipsimdata/rad_example_genome.fa.smi
./ipsimdata/gbs_example_barcodes.txt
./ipsimdata/pairgbs_example_barcodes.txt
./ipsimdata/pairddrad_example_R1_.fastq.gz
./ipsimdata/pairddrad_wmerge_example_barcodes.txt
./ipsimdata/rad_example_barcodes.txt
./ipsimdata/pairddrad_wmerge_example_R1_.fastq.gz
./ipsimdata/pairddrad_wmerge_example_R2_.fastq.gz
./ipsimdata/gbs_example_R1_.fastq.gz
./ipsimdata/pairddrad_example_barcodes.txt
./ipsimdata/rad_example_genome.fa.sma
./ipsimdata/rad_example_R1_.fastq.gz
./ipsimdata/gbs_example_genome.fa