#!/usr/bin/env python
# coding: utf-8

# ## Handy function for manufacturing a psuedo-genome with guaranteed hits from simulated data
# 
# These functions take simulated rad-data and a "large" input genome (really it could just be a randomly generated fasta), and randomly inserts a handful of simulated rad tags into the genome. This guarantees that reference mapping will actually do something.
# 
# For PE simulated data R2 reads are reversed before they're inserted, because smalt is using the `-l pe` flag, which looks for reads in this orientation --> <--.
# 
# Also, for PE inner mate distance is fixed at 50. If you wanna get ambitious you could draw this value from a distribution, but seems like more effort than it's worth.
# 
# This wants to be run from ipyrad/tests/data, but you can run it from anywhere if you update the paths.
# 

# In[1]:


## Utility functions.
def revcomp(sequence):
    "returns reverse complement of a string"
    sequence = sequence[::-1].strip()\
                             .replace("A", "t")\
                             .replace("T", "a")\
                             .replace("C", "g")\
                             .replace("G", "c").upper()
    return sequence
               
    
def comp(seq):
    """ returns a seq with small complement"""
    return seq.replace("A", 't')\
           .replace('T', 'a')\
           .replace('C', 'g')\
           .replace('G', 'c')\
           .replace('n', 'Z')\
           .upper()\
           .replace("Z", "n")\
           .replace("S", "s")


# # Insert SE and PE reads into simulated genome

# In[44]:


import itertools
import gzip
import random
from Bio import SeqIO

RAD_DATA = "./sim_rad_test_R1_.fastq.gz"
RAD_DATA_R1 = "./sim_pairddradmerge_R1_.fastq.gz"
RAD_DATA_R2 = "./sim_pairddradmerge_R2_.fastq.gz"
INPUT_CHR = "/Volumes/WorkDrive/ipyrad/refhacking/MusMT.fa"
OUTPUT_CHR = "./sim_mt_genome.fa"
N_INSERTS = 25
INSERT_SIZE = 50

## Get the SE reads
with gzip.open( RAD_DATA, 'rb' ) as infile:
    seqs = []
    for i, quatro in enumerate( itertools.izip_longest(*[iter(infile)]*4)):
        seqs.append(quatro[1])
    SE_samp = random.sample(seqs, N_INSERTS)

## Get the PE reads
with gzip.open( RAD_DATA_R1, 'rb' ) as in_R1, gzip.open( RAD_DATA_R2, 'rb' ) as in_R2:
    ## create iterators to sample 4 lines at a time 
    quart1 = itertools.izip(*[iter(in_R1)]*4)
    ## pair second read files, quarts samples both
    quart2 = itertools.izip(*[iter(in_R2)]*4)
    quarts = itertools.izip(quart1, quart2)
    
    seqs = []
    while True:
        try:
            quart = quarts.next()
        except StopIteration:
            break
        seqs.append( [quart[0][1].strip(), quart[1][1].strip()] )
            
    PE_samp = random.sample(seqs, N_INSERTS)

## Get the input fasta file (Mouse mtdna)
record = SeqIO.read(INPUT_CHR, "fasta")
record.seq = record.seq + record.seq
lenchr = len(record)
print(lenchr)

## Get the positions to insert the hits at, they will be non-overlapping
## divide the total length of the chromosome by the length of the total of
## all the reads to insert # of SE reads + PE reads 2*Readlength + innermate dist
# "working" locs = [x*(lenchr/(N_INSERTS+(N_INSERTS*2+INSERT_SIZE))) for x in range(N_INSERTS*2)]
locs = [x*((lenchr-(INSERT_SIZE*N_INSERTS))/(N_INSERTS+(N_INSERTS*2))) for x in range(N_INSERTS*2)]
## Insert SE
for i in range(N_INSERTS):
    loc = locs[i]
    record.seq = record.seq[:loc]+SE_samp[i].strip("\n")+record.seq[loc:]
print(locs)
print(len(record.seq))

samp = PE_samp
# Insert PE
for i in range(N_INSERTS):
    loc = locs[i+N_INSERTS]
#    print("do loc - {}".format(loc))
    # For each insertion point get all sequence before it, then insert the first read,
    # then insert INSERT_SIZE more bases from the original sequence, then paste the second
    # read, then the rest of the sequence.
    # For Read 2 [::-1] gives the reverse
    record.seq = record.seq[:loc]+PE_samp[i][0]+record.seq[loc:loc+INSERT_SIZE]+\
                    revcomp(PE_samp[i][1])+record.seq[loc+INSERT_SIZE:] # revcomp

                    ## Methods for generating other types of reads besides illumina pe
                    #PE_samp[i][1][::1]+record.seq[loc+INSERT_SIZE:] # rev
                    #comp(PE_samp[i][1])+record.seq[loc+INSERT_SIZE:] # comp

print(len(record.seq))
output_handle = open(OUTPUT_CHR, "w")
SeqIO.write(record, output_handle, "fasta")
output_handle.close()


# # Just do SE

# In[13]:


## Make SE psuedo-genome
import itertools
import gzip
import random
from Bio import SeqIO

RAD_DATA = "./sim_rad_test_R1_.fastq.gz"
INPUT_CHR = "/Volumes/WorkDrive/ipyrad/refhacking/MusMT.fa"
OUTPUT_CHR = "./sim_mt_genome.fa"
N_INSERTS = 25

with gzip.open( RAD_DATA, 'rb' ) as infile:
    seqs = []
    for i, quatro in enumerate( itertools.izip_longest(*[iter(infile)]*4)):
        seqs.append(quatro[1])
    samp = random.sample(seqs, N_INSERTS)

record = SeqIO.read(INPUT_CHR, "fasta")
lenchr = len(record)
print(lenchr)

## Evenly space the hits so they don't overlap
locs = [x*(lenchr/N_INSERTS) for x in range(N_INSERTS)]
print(locs)
## The old way where they could sometimes overlap
#locs = random.sample(range(lenchr), N_INSERTS)
for i, loc in enumerate(locs):
    record.seq = record.seq[:loc]+samp[i].strip("\n")+record.seq[loc:]
print(len(record.seq))
output_handle = open(OUTPUT_CHR, "w")
SeqIO.write(record, output_handle, "fasta")
output_handle.close()


# ## Insert PE reads into the simulated genome

# In[1]:


import itertools
import gzip
import random
from Bio import SeqIO

RAD_DATA_R1 = "./sim_pairddradmerge_R1_.fastq.gz"
RAD_DATA_R2 = "./sim_pairddradmerge_R2_.fastq.gz"
INPUT_CHR = "./sim_mt_genome.fa"
OUTPUT_CHR = "./sim_mt_genome_SE+PE.fa"
N_INSERTS = 25
INSERT_SIZE = 50

with gzip.open( RAD_DATA_R1, 'rb' ) as in_R1, gzip.open( RAD_DATA_R2, 'rb' ) as in_R2:
    ## create iterators to sample 4 lines at a time 
    quart1 = itertools.izip(*[iter(in_R1)]*4)
    ## pair second read files, quarts samples both
    quart2 = itertools.izip(*[iter(in_R2)]*4)
    quarts = itertools.izip(quart1, quart2)
    
    seqs = []
    while True:
        try:
            quart = quarts.next()
        except StopIteration:
            break
        seqs.append( [quart[0][1].strip(), quart[1][1].strip()] )
            
    samp = random.sample(seqs, N_INSERTS)

record = SeqIO.read(INPUT_CHR, "fasta")
lenchr = len(record)
print(lenchr)
locs = random.sample(range(lenchr), N_INSERTS)
locs.sort()
# Get a random sample of base positions to insert at from across the sim genome
for i, loc in enumerate(locs):
    print("do loc - {}".format(loc))
    print(samp[i][0], revcomp(samp[i][1]))
    # For each insertion point get all sequence before it, then insert the first read,
    # then insert INSERT_SIZE more bases from the original sequence, then paste the second
    # read, then the rest of the sequence.
    # For Read 2 [::-1] gives the reverse
    record.seq = record.seq[:loc]+samp[i][0]+record.seq[loc:loc+INSERT_SIZE]+\
                    revcomp(samp[i][1])+record.seq[loc+INSERT_SIZE:]
                    #samp[i][1].strip("\n")[::-1]+record.seq[loc+INSERT_SIZE:]

print(len(record.seq))
output_handle = open(OUTPUT_CHR, "w")
SeqIO.write(record, output_handle, "fasta")
output_handle.close()


# In[7]:


import gzip
import shutil
outfiles = ["/private/tmp/ipyrad-test-pair/test-refseq-pair_edits/3L0_R1_.fastq.gz", "/private/tmp/ipyrad-test-pair/test-refseq-pair_edits/3L0_R2_.fastq.gz"]

for f in outfiles:
    print(f)
    with open(f, 'r') as f_in, gzip.open(f, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)


# In[17]:


sq = "AAATGCGCCGGGCGCGAA"
print("AAATGCGCCGGGCGCGAA"[::-1])
print(revcomp(sq))
sq[::-1].strip().replace("T","a")


# In[25]:


print((PE_samp[1][1])[::-1])
print(PE_samp[1][1])


# In[43]:


record = SeqIO.read(INPUT_CHR, "fasta")
record.seq = record.seq + record.seq
print record.seq