#!/usr/bin/env python # coding: utf-8 # ## Handy function for manufacturing a psuedo-genome with guaranteed hits from simulated data # # These functions take simulated rad-data and a "large" input genome (really it could just be a randomly generated fasta), and randomly inserts a handful of simulated rad tags into the genome. This guarantees that reference mapping will actually do something. # # For PE simulated data R2 reads are reversed before they're inserted, because smalt is using the `-l pe` flag, which looks for reads in this orientation --> <--. # # Also, for PE inner mate distance is fixed at 50. If you wanna get ambitious you could draw this value from a distribution, but seems like more effort than it's worth. # # This wants to be run from ipyrad/tests/data, but you can run it from anywhere if you update the paths. # # In[1]: ## Utility functions. def revcomp(sequence): "returns reverse complement of a string" sequence = sequence[::-1].strip()\ .replace("A", "t")\ .replace("T", "a")\ .replace("C", "g")\ .replace("G", "c").upper() return sequence def comp(seq): """ returns a seq with small complement""" return seq.replace("A", 't')\ .replace('T', 'a')\ .replace('C', 'g')\ .replace('G', 'c')\ .replace('n', 'Z')\ .upper()\ .replace("Z", "n")\ .replace("S", "s") # # Insert SE and PE reads into simulated genome # In[44]: import itertools import gzip import random from Bio import SeqIO RAD_DATA = "./sim_rad_test_R1_.fastq.gz" RAD_DATA_R1 = "./sim_pairddradmerge_R1_.fastq.gz" RAD_DATA_R2 = "./sim_pairddradmerge_R2_.fastq.gz" INPUT_CHR = "/Volumes/WorkDrive/ipyrad/refhacking/MusMT.fa" OUTPUT_CHR = "./sim_mt_genome.fa" N_INSERTS = 25 INSERT_SIZE = 50 ## Get the SE reads with gzip.open( RAD_DATA, 'rb' ) as infile: seqs = [] for i, quatro in enumerate( itertools.izip_longest(*[iter(infile)]*4)): seqs.append(quatro[1]) SE_samp = random.sample(seqs, N_INSERTS) ## Get the PE reads with gzip.open( RAD_DATA_R1, 'rb' ) as in_R1, gzip.open( RAD_DATA_R2, 'rb' ) as in_R2: ## create iterators to sample 4 lines at a time quart1 = itertools.izip(*[iter(in_R1)]*4) ## pair second read files, quarts samples both quart2 = itertools.izip(*[iter(in_R2)]*4) quarts = itertools.izip(quart1, quart2) seqs = [] while True: try: quart = quarts.next() except StopIteration: break seqs.append( [quart[0][1].strip(), quart[1][1].strip()] ) PE_samp = random.sample(seqs, N_INSERTS) ## Get the input fasta file (Mouse mtdna) record = SeqIO.read(INPUT_CHR, "fasta") record.seq = record.seq + record.seq lenchr = len(record) print(lenchr) ## Get the positions to insert the hits at, they will be non-overlapping ## divide the total length of the chromosome by the length of the total of ## all the reads to insert # of SE reads + PE reads 2*Readlength + innermate dist # "working" locs = [x*(lenchr/(N_INSERTS+(N_INSERTS*2+INSERT_SIZE))) for x in range(N_INSERTS*2)] locs = [x*((lenchr-(INSERT_SIZE*N_INSERTS))/(N_INSERTS+(N_INSERTS*2))) for x in range(N_INSERTS*2)] ## Insert SE for i in range(N_INSERTS): loc = locs[i] record.seq = record.seq[:loc]+SE_samp[i].strip("\n")+record.seq[loc:] print(locs) print(len(record.seq)) samp = PE_samp # Insert PE for i in range(N_INSERTS): loc = locs[i+N_INSERTS] # print("do loc - {}".format(loc)) # For each insertion point get all sequence before it, then insert the first read, # then insert INSERT_SIZE more bases from the original sequence, then paste the second # read, then the rest of the sequence. # For Read 2 [::-1] gives the reverse record.seq = record.seq[:loc]+PE_samp[i][0]+record.seq[loc:loc+INSERT_SIZE]+\ revcomp(PE_samp[i][1])+record.seq[loc+INSERT_SIZE:] # revcomp ## Methods for generating other types of reads besides illumina pe #PE_samp[i][1][::1]+record.seq[loc+INSERT_SIZE:] # rev #comp(PE_samp[i][1])+record.seq[loc+INSERT_SIZE:] # comp print(len(record.seq)) output_handle = open(OUTPUT_CHR, "w") SeqIO.write(record, output_handle, "fasta") output_handle.close() # # Just do SE # In[13]: ## Make SE psuedo-genome import itertools import gzip import random from Bio import SeqIO RAD_DATA = "./sim_rad_test_R1_.fastq.gz" INPUT_CHR = "/Volumes/WorkDrive/ipyrad/refhacking/MusMT.fa" OUTPUT_CHR = "./sim_mt_genome.fa" N_INSERTS = 25 with gzip.open( RAD_DATA, 'rb' ) as infile: seqs = [] for i, quatro in enumerate( itertools.izip_longest(*[iter(infile)]*4)): seqs.append(quatro[1]) samp = random.sample(seqs, N_INSERTS) record = SeqIO.read(INPUT_CHR, "fasta") lenchr = len(record) print(lenchr) ## Evenly space the hits so they don't overlap locs = [x*(lenchr/N_INSERTS) for x in range(N_INSERTS)] print(locs) ## The old way where they could sometimes overlap #locs = random.sample(range(lenchr), N_INSERTS) for i, loc in enumerate(locs): record.seq = record.seq[:loc]+samp[i].strip("\n")+record.seq[loc:] print(len(record.seq)) output_handle = open(OUTPUT_CHR, "w") SeqIO.write(record, output_handle, "fasta") output_handle.close() # ## Insert PE reads into the simulated genome # In[1]: import itertools import gzip import random from Bio import SeqIO RAD_DATA_R1 = "./sim_pairddradmerge_R1_.fastq.gz" RAD_DATA_R2 = "./sim_pairddradmerge_R2_.fastq.gz" INPUT_CHR = "./sim_mt_genome.fa" OUTPUT_CHR = "./sim_mt_genome_SE+PE.fa" N_INSERTS = 25 INSERT_SIZE = 50 with gzip.open( RAD_DATA_R1, 'rb' ) as in_R1, gzip.open( RAD_DATA_R2, 'rb' ) as in_R2: ## create iterators to sample 4 lines at a time quart1 = itertools.izip(*[iter(in_R1)]*4) ## pair second read files, quarts samples both quart2 = itertools.izip(*[iter(in_R2)]*4) quarts = itertools.izip(quart1, quart2) seqs = [] while True: try: quart = quarts.next() except StopIteration: break seqs.append( [quart[0][1].strip(), quart[1][1].strip()] ) samp = random.sample(seqs, N_INSERTS) record = SeqIO.read(INPUT_CHR, "fasta") lenchr = len(record) print(lenchr) locs = random.sample(range(lenchr), N_INSERTS) locs.sort() # Get a random sample of base positions to insert at from across the sim genome for i, loc in enumerate(locs): print("do loc - {}".format(loc)) print(samp[i][0], revcomp(samp[i][1])) # For each insertion point get all sequence before it, then insert the first read, # then insert INSERT_SIZE more bases from the original sequence, then paste the second # read, then the rest of the sequence. # For Read 2 [::-1] gives the reverse record.seq = record.seq[:loc]+samp[i][0]+record.seq[loc:loc+INSERT_SIZE]+\ revcomp(samp[i][1])+record.seq[loc+INSERT_SIZE:] #samp[i][1].strip("\n")[::-1]+record.seq[loc+INSERT_SIZE:] print(len(record.seq)) output_handle = open(OUTPUT_CHR, "w") SeqIO.write(record, output_handle, "fasta") output_handle.close() # In[7]: import gzip import shutil outfiles = ["/private/tmp/ipyrad-test-pair/test-refseq-pair_edits/3L0_R1_.fastq.gz", "/private/tmp/ipyrad-test-pair/test-refseq-pair_edits/3L0_R2_.fastq.gz"] for f in outfiles: print(f) with open(f, 'r') as f_in, gzip.open(f, 'wb') as f_out: shutil.copyfileobj(f_in, f_out) # In[17]: sq = "AAATGCGCCGGGCGCGAA" print("AAATGCGCCGGGCGCGAA"[::-1]) print(revcomp(sq)) sq[::-1].strip().replace("T","a") # In[25]: print((PE_samp[1][1])[::-1]) print(PE_samp[1][1]) # In[43]: record = SeqIO.read(INPUT_CHR, "fasta") record.seq = record.seq + record.seq print record.seq