#simple scoring method in which matches count +1 and both the mismatch and indel penalties are 1
import numpy as np
alphabet = ['A', 'C', 'G', 'T', '-']
score = np.array([[1, -1, -1, -1, -1],[-1, 1, -1, -1, -1],[-1, -1, 1, -1, -1],[-1, -1, -1, 1, -1],[-1, -1, -1, -1, -1]],
dtype = 'int')
score
array([[ 1, -1, -1, -1, -1], [-1, 1, -1, -1, -1], [-1, -1, 1, -1, -1], [-1, -1, -1, 1, -1], [-1, -1, -1, -1, -1]])
score[alphabet.index('A'), alphabet.index('C')]
-1
score[alphabet.index('T'), alphabet.index('-')]
-1
# Using an edit-distance-like dynamic programming formulation, we can
# look for approximate occurrences of p in t.
def fittingAlignment(p, t, score):
""" Calculate global alignment value of sequences x and y using
dynamic programming. Return global alignment value. """
D = np.zeros((len(p)+1, len(t)+1), dtype=int)
# Note: First row gets zeros. First column initialized as usual.
for i in range(1, len(p) + 1):
D[i,0] = D[i-1,0] + score[alphabet.index(p[i-1]), alphabet.index('-')]
for i in xrange(1, len(p)+1):
for j in xrange(1, len(t)+1):
horz = D[i, j-1] + score[alphabet.index('-'), alphabet.index(t[j-1])]
vert = D[i-1, j] + score[alphabet.index(p[i-1]), alphabet.index('-')]
diag = D[i-1, j-1] + score[alphabet.index(p[i-1]), alphabet.index(t[j-1])]
D[i,j] = max(horz, vert, diag)
max_score = D[-1,].max()
return D, max_score
D, max_score = fittingAlignment('TAGATA','GTAGGCTTAAGGTTA', score)
print (D)
print (max_score)
[[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [-1 -1 1 0 -1 -1 -1 1 1 0 -1 -1 -1 1 1 0] [-2 -2 0 2 1 0 -1 0 0 2 1 0 -1 0 0 2] [-3 -1 -1 1 3 2 1 0 -1 1 1 2 1 0 -1 1] [-4 -2 -2 0 2 2 1 0 -1 0 2 1 1 0 -1 0] [-5 -3 -1 -1 1 1 1 2 1 0 1 1 0 2 1 0] [-6 -4 -2 0 0 0 0 1 1 2 1 0 0 1 1 2]] 2
D, max_score = fittingAlignment('GCGTATGC','TATTGGCTATACGGTT', score)
print (D)
print (max_score)
[[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [-1 -1 -1 -1 -1 1 1 0 -1 -1 -1 -1 -1 1 1 0 -1] [-2 -2 -2 -2 -2 0 0 2 1 0 -1 -2 0 0 0 0 -1] [-3 -3 -3 -3 -3 -1 1 1 1 0 -1 -2 -1 1 1 0 -1] [-4 -2 -3 -2 -2 -2 0 0 2 1 1 0 -1 0 0 2 1] [-5 -3 -1 -2 -3 -3 -1 -1 1 3 2 2 1 0 -1 1 1] [-6 -4 -2 0 -1 -2 -2 -2 0 2 4 3 2 1 0 0 2] [-7 -5 -3 -1 -1 0 -1 -2 -1 1 3 3 2 3 2 1 1] [-8 -6 -4 -2 -2 -1 -1 0 -1 0 2 2 4 3 2 1 0]] 4
def traceback(D, p, t, score):
''' Find and return the alignment of a substring of t to p by
trace back from given cell in global-alignment matrix D .
A highest-scoring fitting alignment between v and w.
If multiple alignments tie for best, we report the leftmost. '''
# get i, j for maximal cell
for index in range(len(t)):
if D[-1,index] == D[-1,].max():
max_index = index
i, j = len(p), max_index
alp, alt = [], []
while i > 0:
diag, horz, vert = -float('inf'), -float('inf'), -float('inf')
if i > 0 and j > 0:
diag = D[i-1, j-1] + score[alphabet.index(p[i-1]), alphabet.index(t[j-1])]
if i > 0:
vert = D[i-1, j] + score[alphabet.index(p[i-1]), alphabet.index('-')]
if j > 0:
horz = D[i, j-1] + score[alphabet.index('-'), alphabet.index(t[j-1])]
if diag >= vert and diag >= horz:
alp.append(p[i-1]); alt.append(t[j-1])
i -= 1; j -= 1
elif vert >= horz:
alp.append(p[i-1]); alt.append('-')
i -= 1
else:
alp.append('-'); alt.append(t[j-1])
j -= 1
alignment = map(lambda x: ''.join(x), [alt[::-1], alp[::-1]])
return alignment
D, max_score = fittingAlignment('GCGTATGC','TATTGGCTATACGGTT', score)
print (max_score)
algn = traceback(D,'GCGTATGC','TATTGGCTATACGGTT', score)
print '\n'.join(algn)
4 GC-TATAC GCGTATGC
t,p = [i.strip() for i in open('input/rosalind_ba5h.txt', 'r')]
D, max_score = fittingAlignment(p, t, score)
print (max_score)
algn = traceback(D, p, t, score)
print '\n'.join(algn)
128 GATGTC-T-C-TACAATATTAT-G-C-CCTA-C-AC-CTCCCAATCTAGGGTAGAGCCGT--TCGCACAGCGGCCAG-ACGTACAACAAAGTG-GTA-GTCGTAAT-CC---G-ACGGA-GCTAC-CACCGGGAATCTGA-ATCG-T-CATTCT-T--G-TCCGTACCCAAC-TCTATACC---CCA--AGC-GAAAAGATTATAATACGGTCA-AGCCTGAATTTTATATGTCGTGGGACCGCCGAAGTGC-CCTCTT-GTTCAGCAGCCTGCG---AAAGTACGCGGGCTCGTTCACCAGGATAATCAT-GGGA-TTTCGACCT-GGTGGGT-TGTTTCGCTAGCGA-ACTGGCGC-GGCACCATGTTCGGGAGCATCCG-GT-TATAGAACCTA-TCCGTAAGTG-CAGGGTGGAATACAGTGTCGGCGCTACCCGTACCCGCATCAATGCAGCG-AAGCTAAAGCCATGTA--AT-C-CCGCAAGTTAGTCACAAAATGTCTGGTAGTCTACACTTAATTATGGCT-TCTGTAAGGACTAACCACACT-ATCTTCGCCGGCA-ACCG-AGATAGGCGTACTAT-AAAGG-ATG-GT-TTGAG-CGATGCGGACGACTTCA-CCTCTGCCATCCAACACTCCATCTCA-AACGGGTCCAGCGAGAGAATCGAGATCTTGAGTAGCACAGCTCCGGATCCGTACCCACGACAGGATATGCGAGACCGAGCGCTACGCCACATGAGTCTAACCACGTCTAGCGCGAGATTACCAAGTCTTGGGCTGCGACGGTATAGGCAGGCGAATCAAAGTCAGGTACAGCCGTGCCATACCCACTCGCCC-G-GC-GTG-GCAACCAACCGCGC-AGCATCACGCACCTGTATC-TCCCT--ACATGGTCAT-C-ACTAGT-TGGG-TAAGCG-AATTCAGTTGCGTGGTTAGCGATTCGGTACTGCACAGGAGTACTTAATCA--AATGCAGAT-G-GACTTCCT-CGA-T GA-GTCGTGCTTTGAATATTATCGACGAATACCAACACT-TCAGT-TA---TAGA-CCTTAGTGGC-C-GC-GCTAGAACGTGC-AGAGA-TGCG-ACAAC-TAATCCCTAGGTATGGAGGC--CTCACC---AATCGGACA-CGCTCCA-TCTGTAGGCTGCGT--CCAGCTTCTA-GCCGGTTGAGTAGCGGTCGGGA-TA-AAT-C-CTCACAGCC----CCCGA-ACGT-GT-GGA--ACTG-ATTTCTCCT-TTAGTT-TGCA--CTGCGCTTGAA-TTTG-GAG-T-ATACA-CTGG-T-AT--TGGGGAGAGTAGA--TAGGTCCATCCATTT-GC--G-GATCCT--CGCTGG----ATGTTC-CG-GCCTTCGAGTCGCTATAA-C-AGTCC--AAATGTAAGGGCGG-A-A-AGT-ACGGAGC-ATGGGT--CTG-TTCGAGGCA-CGTAAGCT--CGCCGTG-ACCATGCACC-C-A--T-G-AACAAAATG---GCTAG-C--CTCTTAGTTTTGGCTAAC---AA-G--T--TCTCTGTGGTCTTCG--TAAATACAGTAGCGGGGCG-CCCATGTAAGGCA-GAATATT-AGAAGAT--TGACG-C--GAGCC-C-GCC--CC-ACGCTCGGTCTTATAA---AT-CAGC-----AA-CG-GA-C---AAAAGGATACCTAC---TCC-TACCGATGA-A--A-A---GAG--TG-G-GC-A--CC-CGT-A-TCT-CCCAC-TC--G-GC-----TAACAA-TCTAGTGCT-C-A-GG----CGCAGCCG---C-ATGCCA--T--AGCCGTGCC---CCCGGTC-CCCAGTTCACTGTTTAACCAACGGC-CTA-C-TTA-GTA--TGTGGCGTGACTGAACAAAGT--TGCTA-TGGTCTGGGATAACCGCAATTTAGTTGCAAAG--AACCTTTATGTA--G-AAAGGA-T-TTTAAACATGAAT-TAGATCGTGCCCACCTGCCAGT
array([[ 1, 0, 0, 0, -2], [ 0, 1, 0, 0, -2], [ 0, 0, 1, 0, -2], [ 0, 0, 0, 1, -2], [-2, -2, -2, -2, -2]])
D, max_score = fittingAlignment('GATACACT','ACGACCACAGATACCGCTATTCACTATATCGTT', score)
print (max_score)
algn = traceback(D,'GATACACT','ACGACCACAGATACCGCTATTCACTATATCGTT', score)
print '\n'.join(algn)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-5-8cb4ae2ec1f7> in <module>() ----> 1 D, max_score = fittingAlignment('GATACACT','ACGACCACAGATACCGCTATTCACTATATCGTT', score) 2 print (max_score) 3 algn = traceback(D,'GATACACT','ACGACCACAGATACCGCTATTCACTATATCGTT', score) 4 print '\n'.join(algn) NameError: name 'fittingAlignment' is not defined