#Reading the scoring function from txt file
import numpy as np
f = open('input/PAM250_1.txt', 'r')
score = [line.strip().split() for line in f]
alphabet = score[0]
PAM250 = np.array([i[1:] for i in score[1:]], dtype = 'int')
print alphabet
print PAM250
['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '-'] [[ 2 -2 0 0 -3 1 -1 -1 -1 -2 -1 0 1 0 -2 1 1 0 -6 -3 -5] [-2 12 -5 -5 -4 -3 -3 -2 -5 -6 -5 -4 -3 -5 -4 0 -2 -2 -8 0 -5] [ 0 -5 4 3 -6 1 1 -2 0 -4 -3 2 -1 2 -1 0 0 -2 -7 -4 -5] [ 0 -5 3 4 -5 0 1 -2 0 -3 -2 1 -1 2 -1 0 0 -2 -7 -4 -5] [-3 -4 -6 -5 9 -5 -2 1 -5 2 0 -3 -5 -5 -4 -3 -3 -1 0 7 -5] [ 1 -3 1 0 -5 5 -2 -3 -2 -4 -3 0 0 -1 -3 1 0 -1 -7 -5 -5] [-1 -3 1 1 -2 -2 6 -2 0 -2 -2 2 0 3 2 -1 -1 -2 -3 0 -5] [-1 -2 -2 -2 1 -3 -2 5 -2 2 2 -2 -2 -2 -2 -1 0 4 -5 -1 -5] [-1 -5 0 0 -5 -2 0 -2 5 -3 0 1 -1 1 3 0 0 -2 -3 -4 -5] [-2 -6 -4 -3 2 -4 -2 2 -3 6 4 -3 -3 -2 -3 -3 -2 2 -2 -1 -5] [-1 -5 -3 -2 0 -3 -2 2 0 4 6 -2 -2 -1 0 -2 -1 2 -4 -2 -5] [ 0 -4 2 1 -3 0 2 -2 1 -3 -2 2 0 1 0 1 0 -2 -4 -2 -5] [ 1 -3 -1 -1 -5 0 0 -2 -1 -3 -2 0 6 0 0 1 0 -1 -6 -5 -5] [ 0 -5 2 2 -5 -1 3 -2 1 -2 -1 1 0 4 1 -1 -1 -2 -5 -4 -5] [-2 -4 -1 -1 -4 -3 2 -2 3 -3 0 0 0 1 6 0 -1 -2 2 -4 -5] [ 1 0 0 0 -3 1 -1 -1 0 -3 -2 1 1 -1 0 2 1 -1 -2 -3 -5] [ 1 -2 0 0 -3 0 -1 0 0 -2 -1 0 0 -1 -1 1 3 0 -5 -3 -5] [ 0 -2 -2 -2 -1 -1 -2 4 -2 2 2 -2 -1 -2 -2 -1 0 4 -6 -2 -5] [-6 -8 -7 -7 0 -7 -3 -5 -3 -2 -4 -4 -6 -5 2 -2 -5 -6 17 0 -5] [-3 0 -4 -4 7 -5 0 -1 -4 -1 -2 -2 -5 -4 -4 -3 -3 -2 0 10 -5] [-5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5]]
def smithWaterman(x, y, score):
""" Calculate local alignment value of sequences x and y using
dynamic programming. Return local alignment value. """
D = np.zeros((len(x)+1, len(y)+1), dtype = 'int')
for i in range(1, len(x)+1):
for j in range(1, len(y)+1):
horz = D[i, j-1] + score[alphabet.index('-'), alphabet.index(y[j-1])]
vert = D[i-1, j] + score[alphabet.index(x[i-1]), alphabet.index('-')]
diag = D[i-1, j-1] + score[alphabet.index(x[i-1]), alphabet.index(y[j-1])]
D[i,j] = max(horz, vert, diag, 0)
argmax = np.where(D == D.max())
return D, int(D[argmax])
D, best = smithWaterman('MEANLY','PENALTY', PAM250)
print(D)
print("Best score=%d, in cell %s" % (best, np.unravel_index(np.argmax(D), D.shape)))
[[ 0 0 0 0 0 0 0 0] [ 0 0 0 0 0 4 0 0] [ 0 0 4 1 0 0 4 0] [ 0 1 0 4 3 0 1 1] [ 0 0 2 2 4 0 0 0] [ 0 0 0 0 0 10 5 0] [ 0 0 0 0 0 5 7 15]] Best score=15, in cell (6, 7)
np.argmax(D)
55
def traceback(D, x, y, score):
""" Trace back from given cell in local-alignment matrix D """
# get i, j for maximal cell
i, j = np.unravel_index(np.argmax(D), D.shape)
alx, aly = [], []
while (i > 0 or j > 0) and D[i,j] != 0:
diag, horz, vert = 0, 0, 0
if i > 0 and j > 0:
diag = D[i-1, j-1] + score[alphabet.index(x[i-1]), alphabet.index(y[j-1])]
if i > 0:
vert = D[i-1, j] + score[alphabet.index(x[i-1]), alphabet.index('-')]
if j > 0:
horz = D[i, j-1] + score[alphabet.index('-'), alphabet.index(y[j-1])]
if diag >= vert and diag >= horz:
alx.append(x[i-1]); aly.append(y[j-1])
i -= 1; j -= 1
elif vert >= horz:
alx.append(x[i-1]); aly.append('-')
i -= 1
else:
alx.append('-'); aly.append(y[j-1])
j -= 1
alignment = map(lambda x: ''.join(x), [alx[::-1], aly[::-1]])
return alignment
x = 'MEANLY'
y = 'PENALTY'
algn = traceback(D, x, y, PAM250)
print '\n'.join(algn)
EANL-Y ENALTY
x, y = [i.strip() for i in open('input/rosalind_ba5f.txt', 'r')]
D, best = smithWaterman(x, y, PAM250)
algn = traceback(D, x, y, PAM250)
print best
print '\n'.join(algn)
3682 NP-CPPA--GEC-LAVNMDPEYGAEDTPSKMMWHEVIFLFYGFHSGNSSISLRSFRMEVQYHSADKKKLWFTT-G--WIFQTMLFSQWHDRVD-RIQSAF-QICYMSALEKYPAMMDASRWNPQGTLWMATYMEWVDGMWSTAERHYPWN-LDHNEDFGFRRNDMYVLWCFVDHFFKDDSTLSGIFCLITFP-PGWFW-FQSIPDMKKFGLMHFFGGTWPAGSQHFANSKFTINL-MRPL-D-DWK-THHQFTMMAFQTGLKRFKGIQAAYM-HERRADKNSM-RRYWNRNNVPFYIFYDVHV-KWRLPQFVCACMEPPMVVTMS-ELGDMQ-SEFFHLNNQADCKNEFCARS-L-K--GYLFYLRVHCGCRSHPKNENWADFAWQKAHGVSIGKAQHAITEGFYSHGNNEPMF-CN--LHPWEPKPHMVIIQDF--ICKLFDEWQTMQWNLSNHLE-L-IN-IPC-GMM-YPG-AVEGGAWSAD-S-KQTGQLAVQLEFIVYKNMEYGQRDVC-DAIHNGKCLEYCYHG-WDYDRGLCFLIIPVENLYLC-MTTW-DNCTPLFLNPTIPARKPFPDIWQAMGSLVSLGTELTHEQVMGGPRRFTNF--Q-YHVGKSPETRQSGNIEKSIKHFHKLQGQF-RNWCMVQYLYQGIDDPDKSMGRPQCHCFNHGDEPPQSDMW-FCNGPVYETDMSMLRWQFRCKWPDMGYDFIHAHY-FVDFMTWG-ALEWMITSF---I-F-F-IGIEKGNSMV-SILY-VVGIMTYMFNMLTFMGEEVQCYSDIDNTATLDQLQ-VLRPSM-VLNNVPRQTRGHFRSDQTHARTCAHMHYLSIIG-GFFLYMQDGDEFIFCPQGAKLRMTTTYPPMIEKSATFDQNDNVELVLCLMHTVEHHSFATKDEHILQVPPFDGYHGPIEEKWWVQKIFKMTSMVQVWHVEGEKAFKKNYPVS-MIDDKGLCSHDHGSFQ-Q-LSYWMIKQHYHAIC--MAHKDCR---G--WGHASAIWTKPYQCDMEFSHMHLGKHCLYTD-K-NFHDSGAVHSKCFDNGNDSCWNIQNNHQFQEMKEGGYDDVIESCLAIQVGQQFKFEENSSMCPCRHCCHICQMMNWSPKLHE-IWSF-GCVLDN--QSS-MGQF-W-EETKWREKSWGN-E--HISEASSTNIHNDTYKTPDLHAYHERDDQPGEWWMQNLRCVNVFRYHDKKQGMGGPSYPVKTVTYLACYQTDIQFWMHQAWSMRINWPQKRIPETFHPFQYCYFQHRGPTKHGLKYWKMSFAKLQISLKWPHWYYKDWR-RDCQGGCVTMDRISNHIYIHPCSVRGDREYACCFKLTGTDHYKTRNTENKEPLWLEVPINVMEQTWHSLHVVFCFAC-LGFHHGKRFRL---NTS----VE-DLAWM-E--QKRTATVEFELVIPRAR------R-TRPILHCQCWCIHVFVIAERDPGYV--GTV--FNKETCSTRSWEGLKILSSNGHLMATMDKIEFPLCWSEHTDNFGLAHCEDYFDSWHWPLWYCV-Q--QEWVMK------FGE--Q--KRNEDWNHIDHEWNEIEPFASHAIGIVHIPAHCDYIV--VDYNVSGGYPPC---KGSTNVNMISCNWNVGKRAPCAPHTRKDDCKLSVT-A---ETLLKKDNFITVFVMLGWYDKQNVKIQPEYGLTCTPKWTATNEYGTTDDLMPIPCGEQNQYSFVWKHNMRDLMYTTKALIQVPRCYI--P-DQ----P--GARY--VEL--PWMNAGAHYQKVPNF---S----HWQVRMRYKRMMMSTDMHCIQRYVPYRGPQFWINGFPLNGGEYVGFSMNSCNNFNEIEHIREENLLGNFLMNNCFQMEQY-VFKARNYFIECDEY---S-SI--S-RPYGMPFY--AQKALMIVIAFVDTIPGDFVSLKQ-SYFR-NMLAF-GIML-ILRQAV-QWYKAHHHHYAVDWFTRSYIR-RIFYHNHFDMVVCPDWHICPDGPDYREVIWSFPDHLSQKGMAKNVYQVIYIEYPEPQKIFPCHKFHSEDISFKWMYIK--RDY---YTWVNDFLRDFGEQDITCMRNKFAHRHEQMNINPACDGKIPAVINCAYGFRQKQIKQIIHWPKENDFRCAKYALLIWQKNCREL-ENDVYHHKESVEQVWAGFTEKQRISWTHICNETLMCSIRCFADPENCPAETWTRHEWIQQEGDFDLDEHATSPTLMPERCSMQIGNDAHMNDAAHGDMHWLL-W-NYLNYKLGFVTKPVTMNCEFAVTWML-VK-WFQGIIDSF--Q-IAMMMIIGIMDF-SEN-RIHSVED-NRLYFHPKLWQQSAFMNYRGTINHRW--PQRANYSRREFVLATRPMVRTFKFVFMMVCMLNTYDTQLKAEWF--DPL-IK-GMNTWYMVAMW-AHSDQQHMDPC--LD--GMQHDAR-HYWCHDH-WMRGHF--Y-PMMP----E-H--W--SRQRRDAIV-NR-G-FMGNR-EGVGPE-FISMHPIVDDFHTLTKPSNETIAWRQSKENPARDKMH-PFYGLKDRGQIMGQVM-LD-QIR-VHWIYKEL-S-PVPRR-LPCNRSTSDQAWTLHTLHIRPWHV-EQEEVVRCR-LKN--APFN-D-PNFDFMI-LFTVVYNYMMRKF--EFVIKN-GDKW-RFTWTCMDMIQLVWRYVLPFGKKQVTQTFCKKDYVVRKNELSKFMSS-QSMFPVRR-WMQRINEGK-HQHGYS-NKQAGRASQTPQQDKAMRRHEKEPLRYNKDSLEHNNESSGVGFYGRIKKIVVCLLAVICITAHNGTTGQYGAFNAAR-NFCIMYRDI-KAFQGDH-HR-KCESKDHDEQLDNSRMPAEGQRAYDMLRK-MWFD-FDNNFGIMSH----R-REY-LHRCAPGC-WKWQCFLDLY--SWIFDTQIYGVPRCKFHK-PTQSNVAQC--C-IIMEPT SPFCAPGRCGNCVFAI-MAHHW----SRSKLVSYG-YFGSYGCEVPQNA-PVRPF-M---FKPTNWFCARFEACSKQWAFPVSRHRTRHKDTEKKLGMSYCSPCTASQCGPYHA-DEAGK--PMGYNY-GHYGNCPKAV-ATNVGG-PAVCMMRDIDLAY-CAHL-AM-G-ISH-LGIVHPMWPNY-TNRYKCPTW-WKYATLARIQPWNSENANCKLWSRLCIQ-PMCVL-VNLWISDVYQWLWMFVWPRWKMLE-Q-G-DEVSGWQHAYGCCGEANIIDCKEEDLWEPEAL-F-E--GVHLYERR--QYK-H-I-GWIVVARCFKVGDIKYSAELEVTVEGDC-ETYENWGDLTKIVDYMTFFAMQKSCYYQP-----C--VWA-ARAIQV-HTCHQL--GFVELVKSMRVTECKARLGNFDGHG-LNEGPSFLHIC--F-QW-T-EWCLTQRDDGMPMRCVSCFAQMENQEYAIVPG-WRISYRWRVTGGI-IKAIFPT-K-IYY-LSHMCVDFMWQMK--DW-YDGPRP-NKSE-YMQYAQNEGY-CGFAHMIDACSPWCME-SILCFSCIHIRWKC-EN-I-MES-FTHETSLGCP-AFVTYNPHLYTTHSSPPAWMTG-IG-EYM-YCEYE-KYPQPW--IH-G-DSIGASGVIMCLGQ-A-YVWGHE--Q-DLWGFIHDCVAICCPPKEHYEL-CNWRDWIFAF-RSSFSFMGRMPMQHAYVWSLPNFQQQVHFKFWIGSAQ-HYMRWANQYQHKGALSF-YQWLHPPGQRI--WPDIHAGIVMVPWQKAVSETLKAMGHDECTMWIGIDNNSYTGKICA---YAEVRGIKHYIRRHAHHPGIIC--FSQL----TFNPVCQDKFYYVH-GTV-YIKG-NHS---WCYA-KMEPI--IPPFDGFHGPIEESWWVEKI-K-TSMVQVWHV------KKIYPVSTKIDD--LFSHDHGSFQYQCLSYWMIKQHYHAKCTETAHKDCQISVSVYWGHASAIWTKIYQCVMEFSHMHLGKHCFYNNFSVAWCDSGPVHSKCFKIMKCVC-MIQNNHQ-EQYGEYGYDDVIESFLIYPI--QVKFEEVSSW-PPIE-VHICQMMVWSPKLHEIIW-FRGCVLDRMVPEKFVKQYRRCPETKWREKSWGNADCWTIYRESSTNIHNDLYKTPDL----------GN--I--M-VMKFFRYHDKKQGMGGPG-DV--LT-MK-I-TKKHFWM--SR--KSDR--KRIPETFHPVQYCYFQHR---------W---FAKLQISLKWPHWYYKDWRIRDCM--Y-TQDRISNHI-IH------DK-YHIC-SVRGWREYAICNKLATEN-W-E-PLNDMENTWHSLLVVFCFACNL-FHHGKRFRQEGGNTSNHWNNNIDLAWMAESFQKRTATVEFELVIPRIRMAGLANHCTRPILHCQCWCIHVFVISERDPGYVFTEPMGGFNKETCSTRSYSA----AW-S--MATMDKIQLEIP-GPAFLLFMLAHCEDRFDVWHWPLWYCVLKGLQEWVMQWVTFCDFGEVRDKIKRNEDTNHIDHELNEIEPFASDAIGIVHEMLH--FMACPAHCAIMNR-MRCEVFNSSTNVNMIM-NWNVGKRAPCAPHTRKDDCKHSVTGPLYCETLLKKDNFITV-VMLGWYDKQNVKFQPSYG---TPKWTAWNEYGRSGDLMPIP---QHN-P-MW-----DL----KALIQVPRCYIGNPSQQAEMKPCEGARFGCFSLRSSWMNA-AHYQKVPNFLRKSMGWLHWQVRMRYKWMMMSTKMHCIQRYVPYRGPQ---NG-G-E-G--VGWNMN--EAF--C-N--AQ-LFANFLMNNCFEV-VFWRMEVPEYW-QCSVFRDAPHAICYNQRPYGMPFYSSTQKALMI-V-FC-TV-GPYTVDKHYRHIPGDTVSLKSSKADILRQAMFRWYNS---TYAVDWFTRSYIRTHDFANQNMDMVVHPDWH---DGPDYREVIISF---WNP-ANAKNVYQVIYIEYRRPQKIFPCHKFHSEDISMKWMYIRVPRFYTRDYTWVNDFLRDF---DITCMRNKFAARHEVMNINPACDGKIPAVINCAYGFRQKQIKQIQHWPKENDFRCAKYAVHE-HRICSHIWQKDVYHHKECVE-SRPS-GCKPR-WW-RA-KKYWVCHFWHY-KAE-YVMEQ-IMLA-F-DEMQYEMWRHYHAWDFI-G-CRRWM--DILM--AWQIPRCWIFSREKY-GFHDEFMRDP---QCMLHY-WLVSFSAW--CITTTFCWRCVNKDMFQQHAKFYADNTDICKEHGFDQNY-GPIL-IGSMF-S-MCTFEPWWFKGIKGMYA-FQFLMEHKATDWCFTAAVIPFKIVNV-GAAQATNWFQSESITINCGPGG-CQSLFRCDNSPTQFYSKCTLLKFWG-DKTPRMGPWCATHDW-KGPWTTYESKLAFHFGDKKGFWFDEPASINSVTKSWFSCYTGVRWMGIGPECRIG-NAIDEPWWEMEEDNLDYTQSKLSTNMHGWCYRHVRFFP--D-SQVQPQAMYLQWHRKSAQESFHHLISRRDERKEFA-KRET-HITW--HTPDVIPWDISDQMELFKFWGWKDCMIKFQTECVKTNTSTRFFKGI-HW-ARAFIRPF-SKPFANPWYKLS-KCVERC-MHWDWAQMIS-K-GWEDF--IT-LLRWGSMF-FTECVQAYFMLSPEYAQHYTTPDVFEWVFDFHNEMVQVSGNPDHHQVTCSPTNN--QC-TGRM-LPF-SPG-GFHPR-SK----LMQKR-I-PW-PRTEQ-KTFQRLRQNFRSLEPGAPRGEEDDHIYMAHYLGETHNYMIQNCTMNM-G-YIYYIIPACV-FVMYTIQFQLMSEGLFGSFEQFEIHIC---CHWN-DCVAKQWEKRWAFVMMLPGIRKQTWKDFTMHNEFISCNWCQEMLNAT