Notebook

In [2]:

#simple scoring method in which matches count +1 and both the mismatch and indel penalties are 1
import numpy as np
alphabet = ['A', 'C', 'G', 'T', '-']
score = np.array([[1, -2, -2, -2, -2],[-2, 1, -2, -2, -2],[-2, -2, 1, -2, -2],[-2, -2, -2, 1, -2],[-2, -2, -2, -2, -2]], 
                 dtype = 'int')
score

Out[2]:

array([[ 1, -2, -2, -2, -2],
       [-2,  1, -2, -2, -2],
       [-2, -2,  1, -2, -2],
       [-2, -2, -2,  1, -2],
       [-2, -2, -2, -2, -2]])

In [15]:

def overlapAlignment(x, y, score):
    """Calculate the score of an optimal overlap alignment of suffix of x and prefix of y. 
    The maximum score for overlap alignment"""
    
    D = np.zeros((len(x)+1, len(y)+1), dtype=int)
    for j in range(1, len(y) + 1):
        D[0,j] = D[0, j-1] + score[alphabet.index('-'), alphabet.index(y[j-1])]
    for i in xrange(1, len(x)+1):
        for j in xrange(1, len(y)+1):
            horz = D[i, j-1] + score[alphabet.index('-'), alphabet.index(y[j-1])]
            vert = D[i-1, j] + score[alphabet.index(x[i-1]), alphabet.index('-')]
            diag = D[i-1, j-1] + score[alphabet.index(x[i-1]), alphabet.index(y[j-1])]
            D[i,j] = max(horz, vert, diag)
    max_score = D[-1,].max()
    return D, max_score

In [16]:

D, max_score = overlapAlignment('AGACGT', 'AGTCCG', score)
print (D)
print (max_score)

[[  0  -2  -4  -6  -8 -10 -12]
 [  0   1  -1  -3  -5  -7  -9]
 [  0  -1   2   0  -2  -4  -6]
 [  0   1   0   0  -2  -4  -6]
 [  0  -1  -1  -2   1  -1  -3]
 [  0  -2   0  -2  -1  -1   0]
 [  0  -2  -2   1  -1  -3  -2]]
1

In [25]:

def traceback(D, x, y, score):
    ''' Find and return the alignment of a substring of t to p by 
        trace back from given cell in global-alignment matrix D .  
        A highest-scoring fitting alignment between v and w. 
        If multiple alignments tie for best, we report the leftmost. '''
    # get i, j for maximal cell
    max_score = D[-1,].max()
    for index in reversed(range(len(y))):
        if D[-1,index] == max_score:
            max_index = index
    i, j = len(x), max_index
    alp, alt = [], []
    while j > 0:
        diag, horz, vert = -float('inf'), -float('inf'), -float('inf')
        if i > 0 and j > 0:
            diag = D[i-1, j-1] + score[alphabet.index(x[i-1]), alphabet.index(y[j-1])]
        if i > 0:
            vert = D[i-1, j] + score[alphabet.index(x[i-1]), alphabet.index('-')]
        if j > 0:
            horz = D[i, j-1] + score[alphabet.index('-'), alphabet.index(y[j-1])]
        if diag >= vert and diag >= horz:
            alp.append(x[i-1]); alt.append(y[j-1])
            i -= 1; j -= 1 
        elif vert >= horz:
            alp.append(x[i-1]); alt.append('-')
            i -= 1
        else:
            alp.append('-'); alt.append(y[j-1])
            j -= 1
    alignment = map(lambda x: ''.join(x), [alp[::-1], alt[::-1]])
    return alignment

In [26]:

D, max_score = overlapAlignment('AGACGT', 'AGTCCG', score)
print (D)
print (max_score)
algn = traceback(D, 'AGACGT', 'AGTCCG', score)
print '\n'.join(algn)

[[  0  -2  -4  -6  -8 -10 -12]
 [  0   1  -1  -3  -5  -7  -9]
 [  0  -1   2   0  -2  -4  -6]
 [  0   1   0   0  -2  -4  -6]
 [  0  -1  -1  -2   1  -1  -3]
 [  0  -2   0  -2  -1  -1   0]
 [  0  -2  -2   1  -1  -3  -2]]
1
ACGT
A-GT

In [28]:

x, y = [i.strip() for i in open('input/dataset_248_7.txt', 'r')]
D, max_score = overlapAlignment(x, y, score)
print (max_score)
algn = traceback(D, x, y, score)
print '\n'.join(algn)

58
AAGTAGGCAAATTCA-G-TTCTTGTCACGAGCT-GATTG-CCGGAAG-AAAATGAACTAGATCGGGCGAAGTC-CTGTCCGAGGGATACGGAG-AAT-GCTGGAACAAATCAG---AGCAGCTGGAG-TGGGACAGCCGCCGC-CTC-CTTTCCAAGCCAATGC-CGGCAGGCGTACGCCGGCT-TTCGCCGACCCAGAGGTCGGTTCGCTG-AGCAGAGCAG-TTTGTTACG-ATTTGACAGGA-TCCACGAAGATAC-GTACGTGACAACT-GTGTGCGATCGGCCC
AAGTTGGCAAATTTATGCTTCTTGTCACGAGCTCTATTGACC-GAAGTACAATGAATTAGACCGGGTTAAGTCTCCGTCCGA-GGTTAAGGAGAAATACCTGGAATAAATCAGCCTA-CTGCTGGCGTTCATAC-G-CG-CGCTC-CGCTTT-TAA-CCAATGCTCCGTAGG-ATACGCCAG-TATTCGCCG-TCC-G-GGCCGG-TCGCAGAAGCAGAGCGGATTTGATA-GAATTTGACAGGACTGCA-AAAGTTACGGTAC-TGACAACTAATGT-ATAT-TGCCC

In [30]:

alphabet = ['A', 'C', 'G', 'T', '-']
score = np.array([[1, 0, 0, 0, -2],[0, 1, 0, 0, -2],[0, 0, 1, 0, -2],[0, 0, 0, 1, -2],[-2, -2, -2, -2, -2]], 
                 dtype = 'int')
score

Out[30]:

array([[ 1,  0,  0,  0, -2],
       [ 0,  1,  0,  0, -2],
       [ 0,  0,  1,  0, -2],
       [ 0,  0,  0,  1, -2],
       [-2, -2, -2, -2, -2]])

In [31]:

x, y = ['AGTACATCAGAGGAGTT-ACATACTAACG', 'AGTTCACAGGCTA-CGTACAGATATTACGACAGGCAGA']
D, max_score = overlapAlignment(x, y, score)
print (max_score)
algn = traceback(D, x, y, score)
print '\n'.join(algn)

10
AGTACATCAGAGGA-GTT-ACATA-CTAACG
AGTTCA-CAGGCTA-CGT-ACAGATATTACG

In [ ]: