#!/usr/bin/env python
# coding: utf-8

# In[1]:

import igraph as ig
from igraph import Graph
from Bio import SeqIO
import logging
from logging import debug, info, warning, error

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


class DBGraph(Graph):

    newid = 0          # allow for a fixed id per node
    k = 4              # we need to know the k-mer size (note: the length of the sequence per node is k-1)
    #igraph = None
    name = ""
    compacted = False
    contigs = []

    def __init__(self, name="", k=4):
        self.name = name
        self.k = k
        super().__init__(directed=True)

    # utility function to quickly retrieve the edge between two igraph nodes
    # this function should exist in igraph itself
    def edge(self, n1, n2):
        gl = self
        return gl.es[gl.get_eid(n1.index, n2.index)]

    """
    Utility function: adds a node sequence to the graph.
    It initializes a fresh node if the sequence is not already there.
    """
    def find_or_fresh_node(self, s):
        new_node = None
        g = self
        try:
            new_node = g.vs.find(sequence=s)
        except Exception:
            # the sequence is not in the graph yet: create a fresh node
            if not new_node:
                new_node = g.add_vertex(name=s)
                new_node['id'] = self.newid  # we need a fixed id to reference nodes even if other nodes are deleted
                new_node['sequence'] = s
                new_node['label'] = str(new_node.index) + ':' + s
                new_node['visited'] = False
                new_node['compacted'] = False
                new_node['coverage'] = 1
                new_node['sum'] = 0
                new_node['dist'] = 0
                assert new_node, 'Failed to create new node!'
                self.newid += 1
        return new_node
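    ## Worked example for add()/add_kmer() below (k = 4): add_kmer('abcd') creates the
    ## (k-1)-mer nodes 'abc' and 'bcd' (unless they already exist) and a directed edge
    ## 'abc' -> 'bcd'; adding the same k-mer again only increments that edge's coverage.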
    def add(self, s1, s2):
        if self.compacted:
            raise Exception("New nodes cannot be added to a compacted graph.")
        g = self
        n1 = self.find_or_fresh_node(s1)
        n2 = self.find_or_fresh_node(s2)
        i1 = n1.index
        i2 = n2.index
        debug("adding edge " + str(i1) + " --> " + str(i2))
        eid = None
        try:
            # the edge could be there already
            eid = g.get_eid(i1, i2)
            g.es[eid]["coverage"] += 1  ## in a true De Bruijn graph the edges carry the multiplicity!
        except Exception:
            # the edge is new
            g.add_edges([(i1, i2)])
            eid = g.get_eid(i1, i2)
            g.es[eid]["coverage"] = 1

    def add_kmer(self, kmer):
        assert len(kmer) == self.k, "kmer sequence must have length k"
        self.add(kmer[0:self.k-1], kmer[1:self.k])

    def add_from_file(self, file):
        for record in SeqIO.parse(file, 'fasta'):
            i = 0
            while i < (len(record.seq) - self.k + 1):
                self.add_kmer(record.seq[i:i+self.k])
                i += 1

    def get_node0(self):
        return Node(graph=self, id=0)
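    ## Worked example for compact() below (k = 4): the linear chain
    ##   123 -> 234 -> 345 -> 456
    ## (built in the 'hey joe' cells further down) collapses into a single node with
    ## sequence '123456'; during compaction the surviving node accumulates 'sum' (total
    ## edge coverage) and 'dist' (number of merged edges), and the final sweep derives
    ## the average coverage as sum/dist.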
    """
    Compact the graph by compressing linear stretches of nodes into a single node
    and deleting the redundant nodes.
    Conserves coverage by computing the average coverage of the compacted nodes
    and storing it in the remaining node.
    This leads to a much more compact representation and reduces the effort of
    subsequent filtering and contig generation.
    Note that no more k-mers can be added after compaction, because the original
    k-mers can no longer be found by dictionary lookup.
    """
    def compact(self):
        if self.compacted:
            raise Exception("Compaction can only be run once per graph")
        self.compacted = True

        ## look for simple start nodes first; only nodes with outdegree 1 can be compacted
        # for ni in nset0: print(ni["name"])
        i = 0
        # update the set of not yet compacted nodes after each iteration
        while True:
            # this needs to be done this way, otherwise the set is not updated correctly
            nset0 = self.vs.select(_outdegree_eq=1, compacted=False)
            debug(str(i) + ': len(nset0) ' + str(len(nset0)))
            if len(nset0) < 1:
                info('no more compactable stretches')
                break
            n = nset0[0]
            i += 1

            ### walk to the left-most position before engaging with the compaction
            n['visited'] = True
            while (n.indegree() == 1 and n.predecessors()[0].outdegree() == 1
                   and not n.predecessors()[0]['visited']):
                # move left
                n = n.predecessors()[0]
                n['visited'] = True  # avoid an infinite loop
                debug('seeking left ' + n['sequence'])
            for noob in self.vs:
                noob['visited'] = False  # set it back

            debug("Start compacting: " + str(n) + str(n.outdegree()))
            assert n.outdegree() == 1 and (not n['compacted']), "We hit an already compacted node!"
            # this should never happen because of the select statement

            first_time = True  # don't take this node again
            # we are going to simply delete the neighbors, there is only one direct neighbor,
            # but we also need to check if the next node is balanced
            n['compacted'] = True  # mark this node as compacted, so it won't come up again
            n['sum'] = 0           # compacted nodes store their coverage themselves, as a sum of coverages
            n['dist'] = 0          # distance forward to the next "junction" ahead

            ## control id, keeping the originally assigned ids in the nodes
            cid = n["id"]

            ## This is the actual compaction for node n
            while True:
                ## when deleting nodes, weird things can happen, so make sure we got the same node as before
                n = self.vs.select(id=cid)[0]
                assert n, "node must stay the same under one iteration"
                ## exit condition: we don't have exactly 1 successor, or the next node has
                ## multiple incoming nodes
                if (n.outdegree() != 1 or n.successors()[0].indegree() != 1):
                    debug('-------------- finished with node ' + str(n))
                    break
                succ = n.successors()[0]  # the node is linear, so we are sure
                ### don't follow a self-reference:
                if n == succ:
                    debug('------------------ hit a self reference, finished')
                    break
                if succ['visited']:
                    debug('------------------ hit a loop here')
                    break
                if n in n.successors() or n in succ.successors():
                    debug('------------------- we hit a loop, finished')
                    break

                ## Only the very first node in our path keeps the full k-mer, i.e. a node with indegree 0.
                ## All others need to be clipped now.
                if n.indegree() > 0 and first_time:
                    # otherwise we would artificially prolong the sequence
                    # indegree 0 nodes always keep their sequence
                    n['sequence'] = n['sequence'][self.k-2:]
                    first_time = False
                else:
                    first_time = False  # but only once, otherwise we'd chop off the whole sequence

                debug("really compacting " + n['sequence'] + ' --> ' + succ['sequence'] + str(n))
                n['visited'] = True
                eid = self.get_eid(n.index, succ.index)  # get the edge id

                ## add edges from this node to all of succ's successors
                for s in succ.successors():
                    self.add_edges([(n.index, s.index)])      ## add a new edge,
                    new_eid = self.get_eid(n.index, s.index)  ## and get the index of the edge we just made
                    ## we need to copy the coverage attribute over,
                    ## so get the edge connecting the successor to its next successor
                    succ_eid = self.get_eid(succ.index, s.index)
                    self.es[new_eid]['coverage'] = self.es[succ_eid]['coverage']  # copy coverage

                ## get the successor's single character and add it to the node sequence
                if succ["compacted"]:
                    ## add the sequence of the compacted node, it is already compacted
                    n["sequence"] += succ["sequence"]
                    n["sum"] += succ["sum"]
                    n['dist'] += succ['dist']
                else:
                    n["sequence"] += succ["sequence"][-1:]

                if n['sum'] is None:
                    error('this should never happen: ' + str(n))
                assert self.es[eid]['coverage'], "Every edge should have a coverage attribute"
                assert n['sum'] is not None, "we set the coverage attribute already"
                n['sum'] += self.es[eid]['coverage']  # here the coverage is in the edge
                n['dist'] += 1                        # now we have made only a single step
                n["name"] = n["sequence"]
                n['label'] = '[' + str(n.index) + '] ' + n['sequence']

                ## now delete the successor and the edge
                eid = self.get_eid(n.index, succ.index)
                self.delete_edges(eid)
                debug("deleting: " + succ['name'])
                self.delete_vertices(succ.index)

        ## Now the compaction is complete, but we still have sum and dist per node, not coverage,
        ## so we need to update the compacted nodes in a last sweep over all compacted nodes
        for n in self.vs:
            n['visited'] = False
        for n in self.vs.select(compacted=True):
            if (n['dist'] == 0):
                n['coverage'] = 1
                n['compacted'] = False  ## these nodes were not really compacted, they were just marked
            else:
                n['coverage'] = n['sum'] / n['dist']

    def viz(self):
        g = self
        layout = g.layout("kk")
        color_dict = {True: "lightgreen", False: "tomato"}
        colors = [color_dict[c] for c in g.vs["compacted"]]
        shapes = []
        visual_style = {}
        visual_style["edge_width"] = [0.5 * int(coverage) for coverage in g.es["coverage"]]
        visual_style["vertex_size"] = [20 * int(coverage) for coverage in g.vs["coverage"]]
        for s in g.vs.indegree():
            if s == 0:
                shapes.append("rectangle")
            else:
                shapes.append("circle")
        g.vs['label'] = list(map(lambda x: str(x.index) + ':' + x['sequence'], g.vs))
        return ig.plot(g, bbox=(0, 0, 400, 400), vertex_color=colors, vertex_shape=shapes,
                       vertex_label_dist=-0.5, layout=layout, **visual_style)

    ## utility function to get the best next node; strategy:
    ## ignore self references
    ## 0. if there is no further node: return None
    ## 1. if outdegree == 1, take that node
    ## 2. if outdegree > 1, prioritize:
    ##    1) take the not yet visited node with the highest coverage
    ## the caller is responsible for marking the node as visited if it is used
    ## we don't want to run into an infinite loop, but if a visited node has an unvisited next node
    ## it could still be part of an Eulerian walk
    """
    TODO: This function has much room for improvement. It may break up a perfectly
    Eulerian walk through the graph. Find an example of such an Eulerian graph topology.
    How could this be improved?
    """
    def get_next_node(self, n):
        if (n.outdegree() < 1):
            return None
        if (n.outdegree() == 1):
            nn = n.successors()[0]
            ## there is only one, return it, but only if it is not self-referential
            if n == nn:  # or nn['visited']: # self reference or already visited
                ### this is sub-optimal: we cannot properly solve the graph named "double loop" below
                ### like this, but at least it protects against an infinite loop;
                ### in order to resolve double loops and alternative paths, we would have to return
                ### nn in case it has an edge that we haven't visited yet
                return None
            if not all(list(map(lambda y: y['visited'] or nn == y, nn.successors()))):
                # checks whether one of the successor's successors has not been visited yet
                # slightly better, but still not optimal; one would want to define a
                # coverage threshold at this point. However, such a threshold should be
                # adaptive, depending on global or local coverage.
                # Idea: calculate the local coverage in the environment;
                # to use a node twice, each edge to traverse should have at least similar (e.g. 1/2?) coverage
                # compared to the local environment
                return nn
            else:
                return None
        nset = n.successors()
        tmp = list(filter(lambda x: not x['visited'], nset))
        if len(tmp) > 0:
            nn = sorted(tmp, key=lambda node: (n.outdegree(), node['coverage'],
                                               self.edge(n, node)['coverage'],
                                               len(node['sequence'])), reverse=True)[0]
            if n == nn:  # self reference
                return None
            else:
                return nn
        else:
            tmp = nset
            tmp = list(filter(lambda x: (not all(list(map(lambda y: y['visited'] or x == y, x.successors())))), tmp))
            if len(tmp) < 1:
                return None
            else:
                debug('has unvisited next: ' + str(tmp))
                tmp = sorted(tmp, key=lambda node: (node['coverage'],
                                                    self.edge(n, node)['coverage'],
                                                    len(node['sequence'])), reverse=True)
                nn = tmp[0]
                if n == nn:  # self reference
                    return None
                else:
                    return nn

    """
    make a walk through the graph, given the start node
    """
    def walk(self, n):
        sequence = n['sequence']
        n['visited'] = True
        n1 = n
        debug('walk: ' + n1['name'] + n1['sequence'])
        while self.get_next_node(n1):
            n1 = self.get_next_node(n1)
            debug('walk: ' + n1['name'])
            n1['visited'] = True
            if n1['compacted']:
                sequence += n1['sequence']
                debug('seq: ' + n1['sequence'])
            else:
                sequence += n1['sequence'][self.k-2]
                debug('seq: ' + n1['sequence'][self.k-2])
        info('Done')
        return sequence

    ### The greedy assembly is the simplest possible heuristic and works on the compacted graph.
    ### With greedy bubble correction, at every junction follow the path of highest coverage.
    ### No variants will be detected this way.
    ### Dangling short ends will be cut off if they are shorter than min_contig_len and there is another
    ### path > 2*k ahead, even if they have higher coverage.
    ### If a repeat (a node that has already been visited) is hit at a distance < min_contig_len, the contig will be dropped.
    # - get all start nodes, easy ones first, until all nodes have been visited
    def assemble(self):
        contigs = []
        min_contig_len = self.k  ## very lenient, could be increased, but ok for toy examples
        # get optimal start nodes first: optimal start nodes are those with indegree 0 and not visited
        counter = 0
        nset0 = self.vs.select(visited=False, _indegree=0)
        while counter <= len(nset0):  # protect against an infinite loop
            counter += 1
            nset = self.vs.select(visited=False, _indegree=0)
            # we could also sort the node set to refine our strategy further, but we don't need that now
            if len(nset) < 1:
                break
            contig = self.walk(nset[0])
            if len(contig) >= min_contig_len:
                contigs.append(contig)
        # do the same for the remaining unvisited nodes
        counter = 0
        nset0 = self.vs.select(visited=False)
        while counter <= len(nset0):  # protect against an infinite loop
            counter += 1
            nset = self.vs.select(visited=False)
            # we could also sort the node set to refine our strategy further;
            # as we might end up somewhere in a loop, we could as well start with the longest node
            nset = sorted(nset, key=lambda x: len(x['sequence']), reverse=True)
            if len(nset) < 1:
                break
            contig = self.walk(nset[0])
            if len(contig) >= min_contig_len:
                contigs.append(contig)
        self.contigs = contigs
        return contigs

    """
    utility function to calculate the coverage along a path in the graph;
    assumes there is a real path between the nodes
    """
    def get_path_coverage(self, path):
        pathsum = 0
        pathlen = 0
        for i in range(len(path)):
            n = self.vs[path[i]]
            if (n['compacted']):
                pathsum += n['sum']
                pathlen += n['dist']
            else:
                if i < len(path) - 1:
                    pathsum += self.edge(self.vs[path[i]], self.vs[path[i+1]])['coverage']
                    pathlen += 1
        if (pathlen > 0):
            return pathsum / pathlen
        else:
            return 0
    ## simple bubble removal strategy:
    ## a bubble starts at a node with outdegree > 1
    ## and it ends where the paths join again, if the paths join at all
    ## if there are alternative paths to the confluent node, AND all nodes in a path have outdegree == 1,
    ## then we have a real bubble: remove the nodes in the path(s) with the lowest coverage
    ## however, we ignore for now the case that there could be divergent paths emerging from a
    ## lower-coverage bubble
    def remove_bubbles(self):
        max_look_ahead = 100  # maximum number of nodes to look ahead for a junction
        # first check if there is a bubble at all
        # we heavily use graph and set operations here
        nset0 = self.vs.select(_outdegree_gt=1)
        # the maximum number of nodes to visit
        counter = 0
        while (counter < len(nset0)):  # maximum number of iterations possible, because we only delete nodes
            counter = counter + 1
            ## we have to update our nodes each time, because nodes could have been deleted already
            nset = self.vs.select(_outdegree_gt=1)
            if len(nset) < 1:
                break
            n = nset[0]
            nhoods = self.neighborhood(n.successors(), order=max_look_ahead, mode="out")  ## get the node set in each neighborhood
            for i in range(len(nhoods)):
                for j in range(i+1, len(nhoods)):
                    ## if the intersection is non-empty, there is a confluent node
                    confluent_node = isect(nhoods[i], nhoods[j])
                    if (confluent_node):
                        info(str(counter) + ': bubble detected: ' + n['name'] + ' ---> ' + self.vs[confluent_node]['name'])
                        paths = list(map(lambda x: self.get_shortest_paths(x, confluent_node)[0], n.successors()))
                        paths = sorted(paths, key=lambda x: (self.get_path_coverage(x), len(x)))  # order paths by coverage, then by length
                        debug(str(paths))
                        keep_this = paths.pop()
                        for path in paths:
                            if n.index in path:
                                continue  ## this is a loop, we cannot deal with this case yet
                            del_me = set(path) - set(keep_this)  # get a set with the shared nodes removed
                            # this also guarantees that we do not accidentally delete nodes that are still needed
                            # one could decide to only delete linear nodes
                            del_me = list(filter(lambda x: self.vs[x].outdegree() == 1, del_me))
                            ## if the graph was compacted already, this would delete only compacted nodes
                            # del_me = list(filter(lambda x: self.vs[x].indegree() == 1 , del_me))
                            info("deleting the following nodes: " + str(del_me))
                            self.delete_vertices(del_me)  # that will also clean up the edges


def isect(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    if len(lst3) < 1:
        return None
    else:
        return lst3[0]


class Node:

    id = 0
    graph = None
    inode = None

    def __init__(self, graph, id=None, sequence=None):
        self.graph = graph
        if sequence:
            self.inode = graph.vs.find(sequence=sequence)
        else:
            if id is not None:
                self.inode = graph.vs[id]
        self.id = self.inode['id']

    def indegree(self):
        return self.inode.indegree()

    def outdegree(self):
        return self.inode.outdegree()


# In[2]:

g = DBGraph('hey joe')
#g.add_kmer('tttt')
## bubble graph
g.add('abb','bbb')


# In[3]:

g = DBGraph('hey joe')
#g.add_kmer('tttt')
## bubble graph
g.add('abb','bbb')
g.add('bbb','bba')
g.add('bba','bas')
g.add('bas','asu')
g.add('bas','ast')
g.add('ast','stu')
g.add('stu','tup')
g.add('stu','tus')
g.add('tus','usu')
g.add('usu','suu')
g.add('asu','suu')
g.add('bbb','bbA')
g.add('bbb','bbA')
g.add('bbb','bbA')
g.add('bbA','bAs')
g.add('bbA','bAs')
g.add('bAs','Asu')
g.add('bAs','Asu')
g.add('Asu','suu')
g.add('suu','uuu')

## alternative start and end nodes
# g.add("NAT","ATG")
# g.add("NAT","ATG")
# g.add("NAT","ATG")
# g.add("XAT","ATG")
# g.add("ATG", "TGA")
# g.add("ATG", "TGA")
# g.add("ATG", "TGA")
# g.add("ATG", "TGA")
# g.add("ATG", "TGA")

g.viz()


# In[4]:

g.compact()
g.viz()
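# In[ ]:

## A quick, illustrative inspection of the compacted graph: compacted nodes carry an
## average coverage ('sum' of merged edge coverages divided by 'dist', the number of
## merged edges), while uncompacted nodes keep the default coverage of 1.
for v in g.vs:
    print(v.index, v['sequence'], 'coverage:', v['coverage'])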
# In[5]:

g = DBGraph("cycle")  # cyclic
g.add("ZAP","APZ")
g.add("APZ","PZA")
g.add("PZA","ZAP")
## this must not give in to an infinite loop
g.compact()
g.assemble()
# output: ['ZAPZA']


# In[6]:

# test construction from file
g = DBGraph('file')
g.add_from_file('test.fasta')
g.viz()


# In[7]:

g.compact()
g.viz()


# In[8]:

g.remove_bubbles()
g.assemble()
## [Seq('12345678'), Seq('abcdefghijklmnopqrstuvwXYZ')]
# output: [Seq('abcdefghijklmn'),
#          Seq('abdde'),
#          Seq('quick_brown_fox_jumps_over_the_lazy_dog')]
# no bubble will be deleted because it is indeed a loop


# In[9]:

g = DBGraph('test more stuff')
g.add("TGA","GAA")
g.add("GAA","AAT")
g.add("GAA","AAT")
g.add("GAA","AAT")
g.add("GAA","AAT")
g.add("AAT","ATT")
g.add("AAT","ATC")

# short linear graph
g.add("POP","OPA")
g.add('POP','OPA')
g.add("OPA","PAM")

## self repeat + cyclic graph
g.add("XXX","XXX")
g.add("XXX","XXX")
g.add("XXX","XXX")
g.add("XXX","XXX")
g.add("XXX","XXX")
g.add("XXX","XXY")
g.add("XXX","XXY")
g.add("XXY","XYX")
g.add("XYX","YXX")
g.add("YXX","XXX")
g.viz()


# In[10]:

g.compact()
g.assemble()
# should give: ['TGAATT', 'POPAM', 'XXXYXX']


# In[11]:

g = DBGraph('hey joe')
g.add('123','234')
g.add('234','345')
g.add('345','456')
g.viz()


# In[12]:

g.assemble()
# simple graphs can be assembled directly: 123456


# In[13]:

g.compact()
g.viz()


# In[14]:

g = DBGraph('double loop')
g.add('0ab','abc')
g.add('abc','bcd')
g.add('bcd','cde')
g.add('bcd','cdE')
g.add('cde','dea')
g.add('cdE','dEa')
g.add('dea','eab')
g.add('dEa','Eab')
g.add('eab','abc')
g.add('Eab','abc')
g.viz()


# In[15]:

g.compact()
g.viz()


# In[16]:

g.remove_bubbles()
g.viz()


# In[17]:

g.assemble()
# should give: ['0abcdeabcdEab']


# In[ ]:


# In[ ]:
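# In[ ]:

## A possible follow-up step (sketch): write the assembled contigs to a FASTA file with
## Biopython. The output filename 'contigs.fasta' and the record ids are arbitrary examples.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

records = [SeqRecord(Seq(str(c)), id='contig_%d' % i, description='')
           for i, c in enumerate(g.contigs)]
SeqIO.write(records, 'contigs.fasta', 'fasta')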