### CODE CHALLENGE: Solve the String Composition Problem.
def kmer_composition(text,k):
"""Input: An integer k and a string Text.
Output: Compositionk(Text) (the k-mers can be provided in any order)."""
composition = []
for i in range(len(text)-k+1):
composition.append(text[i:i+k])
return composition
text = 'CAATCCAAC'
x = kmer_composition(text,5)
print x
' '.join(str(i) for i in x)
['CAATC', 'AATCC', 'ATCCA', 'TCCAA', 'CCAAC']
'CAATC AATCC ATCCA TCCAA CCAAC'
###String Spelled by a Genome Path Problem. Reconstruct a string from its genome path.
def string_from_genome_path(kmers):
return kmers[0] + ''.join(map(lambda x: x[-1], kmers[1:]))
kmers = ['ACCGA', 'CCGAA', 'CGAAG', 'GAAGC', 'AAGCT']
string_from_genome_path(kmers)
'ACCGAAGCT'
kmers = [line.strip() for line in open('dataset_198_3.txt', 'r')]
string_from_genome_path(kmers)
'AGACTGAAAGTCAATGCAGGTTGTCAAGAGCTGGTCGGAGGAGCCATTATTGTTAACGGGTCCCGGTAATCCGTCATGTGTATCCTACATGTCAATAGAACGACACAAAGGCTAAAATGATCTGTCGATGATACACATGTCCTGCAAGACAACTGACGCTAAGGTTTGAACTGCAGAGGGTTTGTCCAACACACATAATCTCGGCACATCTTATTAAGCGCCCCGAATCGCTTTGTTGGATTAACACGCCTCGACGCCTGAATAGGAACCACATCCCCGTAACTGAGCTCGTGCACAGAGCTTCATCGCATAGGTCACTCAAGACTTGTGGCTTGTTCATCGTAGGCCGCATTACAAGGCAATAGCATGTCGTGGTGTGTACAGCAGAATCCCCGTAGACCAAAAGAGAGAAGTGCCCCGGCGTCCTAACTCTAGCGCTTCATTACTAAGGGACGCAAACATTGAACGTCCTGTAATTTAAAGCATGGAGCTTTATGGTAGAATATAAACCCTGTGGCTTTCAGCTTAAAAATTGTCCCAATAGCCGGTTTACTTATCGGCCGCGTGTCGCCAACTGTGGCTTGCGTAGGTCGCGTTGGTAAGGGGCACAGCGTTTTGGTCCTCAATCCGTGCGGTCTGTCTTAGTACCAAGCAAAGCAGTGCCTTGCAGATCTCCGTGCACACAAAGCGAGTGTAACGCAACTGCCAATCCCCGAGTCCATAGGAGGTTAGACTTCCCAAAAAAACTACCAATGTGGATATGTCTCCGTTTGACCCTGATCAATTCGCACTTGCTTGTCATTCCCACGACCTCCATTTTTAGCCATAGACAGCGTACTGTTCCCGATGATAAGCGTTGCCACGATGGCTAGACTAATCCACCGCTAAAGTGTGAAAAAGCGAGCTGAAGGTGATATGATAGTCCAGTACAGTGTGCTACTACGGGGTAGAGGATCCAATAGGTTTCAGCCGAATGAAGAATAATCATCGCAGCTGGCTTGATCCGGGCGCGTAACTGCTGCGAGAGAAGTCAACTGGTGGTGACCTCGCGGACTTTCCCTCAGATAGACGCATACGTTCACTAGACAGCAGCTGAACACGTATTGTTGATGGGGATTTAGGCGGTTTATCGTACGCCGCATGGAATGCAGCAACCGACAATGAAAGACGAGCACGAGCCGATAGCTCGCTCATAATTTCCGTGGTGTTGATAGAATGTGCTTAAGCTACCAGGCGTCTTTTACATCCTATTCCGTTCACTTTTATTACACTTACTCCGGCGTATCCCAGGGAGCATACCACAGCTTGGATCATAGAGATCTGTAGCCGGCTTGCATGGGCAGAAGCTATGAGGGAAAGAAAGGTGACGAAACATACGTACAGTAAAGGCACAAGCTCTAGCCTTCTAGCATGTTGCACGACAGTCTGCGTGTACTTCCACACCACACTCGTTTGGTTTCATCTCACGTAAAGCGTACAACATCCGTTTCGGCACGGGCGTTTACGTTAAAACGCGCCGCGTAACCTTAATACAGTCGGTTCAGGTGCATTGCGACCGACACATTAACTCAAGGTGATGGCTAGAAGGGGCTCAGTTCGGGCCTAACGCCTACTGAACCTACTGCGGTCGACTCAGAGGGTATATTGATTATGCCATCGTGGACATGTCACAATTTCGGACGAGGCGACCCCAAGGGCTGCGGTCCAATGCCTTGACTTGAGCACACCTGGAGCATTCGCTGTCCTCTACCCTAGAGTTCAAGTCGTGGGACGGTACTCGACCTTCCGCCTCCGCTTGGTGGCAGACTCTGATCGCCCTCACATCCTACATGGTACCTTAGGATTAAGCTTCCGAAAGGGGATATACTCACCCGTATCCAACCTCTATTTATAGGTGCAACTCGCGGCATGAATCGCCATCAAGAGTCTGCGACACTATACCCTACGCCCTCGAGGAAGCAACTCAGAAGCATACTGTGCTAGTGAGATATTAATGGAGTAACCATCCTACGGGTCATTCCAATTACTCTGTACGAATATGTGCTCTAAGCTTCTAGCGGGTCCACAAGAATACCGTCGGCCTCGTCCTGGTCCTGACGCCCCAGCTCGCGGATAATTGCCCATTAGCGGACGAGGTTCAGTCCCACCGACCCCACAGATTTTCCAGACCATCGGGGCCCGATCGCTCGCTGAGCACCTAGTCTCGGGGGATCTGTACCGATGGAAAACCCGACACAGGCAGAGGTCCCGGCGACAGCCGCACTTAATGTACCTCTCTCCAAGTTCATGAATCTGACGGGACGATGCAGACCTAATAACCAAGCCGAGCTTCTATTAGGAAAGAGGATCCGCCTTGGGCACCTGGATGACTCACAGAAAATTATGGGTTCTATGGACGCGTCTTGTATACAGATAGTCGGAAGACACCATGGTGGACAGCGGCGAACGAGGTACGATTTTTCGACATCCACGCGTTCCTCAGGGGGGCTTGGTAGGCCATGATTGAAGGACACATCAGGGGAGTCAACCAATGGTGCAGGTGGGACAAGCGTTGGGCAGGCTTGGACAGCAGGGAATCGCCACCAGGTAATACGTAGCGTAACAGCTTAAGAGGTTGGCCGTAGAACCGGCATTCTTAACTATGAGCATATACCAACCCCCCCTCCATCCGAGAGGAGCCTATCCGATCCGTCGTATCACATATGGGCGCATGCATGGACTACGGGACGGCTAGATATCTCCAATGGTAGGGGCGTGATGGTAAGTCTCTGATAGAGCTTTTCTTTTTATCGAGTACACGTCTAAGTAGCAGTGGCAGACCCATGGAGTCGCCGGTCATAAAGCCATTACCCGTCGTGCATGTGTGAGCATTTTGGAGCCTCTCGGATGATTATTAGTATCCGCCAGCATGATACAAAAGGCTAGTAGAGTGGCAGATACTAAGCGGTATGGAGCAAACTATACCTACTGAACCATAGGATTGATTTCTTCGTGTCAAGGCCGGTATGGCTTCAGAATTCTTTTGTTATAGTATGACTCGATGTATGATGGTCCGCGCCCTACCCCCCGGAGTGTACCTAAAAGAAATCTGCAGCGCGCCGAGTTATGACCTAGGTGAACCTACAATGCTGGCCGCGTGACTCGGAAGCAACTACCGTTAAATAATCATTCTCTAATGGAGCTAAAAGGCCATGCTCTAGGACTCGGTCGCGGCAAAGTCCCGGCTAAACTTATCCCCCATAGTGCGCCCCAACTAGACCTGCCAATCGGACGTTTTGAGGAGCACCGTATCATTGTCACGACTGCCGTGGGCGCTAAACCTAACTAGTCACACATGGATGTACACTTGAGGTGGCGTTACTCTGCACGGCCTATTGAAAGCGCGCGCCTGCTAAGTGCCCTTAGCGCTCTTAATTTTGGCGCGAGCATGTGTAGGTATCAATCCAGCGCCAACGCTTATTCGGCTATTGCGGAAAGTTAAGCTAACTACACTCGCTATCACCCAGACCATTGATTAGGATGATGTTACTAGCAACCGATCGGCAACCCTGCCGTCATGGTTTGTGTGGGTTTGGCTAGTTCCAAAACCTATTGTAACTTGAACTGGTGAGTGACTGGCCTGCATGAGGCTGTGCTTCTGGCAGAAAGGAGCGATCCAGGATATGAAAATAAACGAATCGCCACGTGAGCTCACTAAGTCTTCACCCATGTAGCACGAGTCTGGCATTTAATACCTACACGTTAGAGTCCACGATCCTATAACATGGGTAGCTGCCGTGAGAAGTCAATCATGGCACCACCATGCTTTAGCAAGCAGGGTCCAAATGAGAAGACTCCACTTACAGATATCAAAACCGAGCCTTAAGCACCTGAACGCGCCTTTCTTGCGTATTCCACATCCCACTTGGCTGCTTGAGAGAGTCGATATCGAAGTTAGGTCCGATTGTAATTCGCACGGGGTAGAGTGCCCAAGGCCAAAACTGATTAGTTTCCATAGAATCGGTGCTAGTAGTAGACAAGGGTGCGGCGGAGTCATCGTTTATGACTCTCTCCCGCAACATTGAAGCCGGCAAAGTAGATAAGGCGTTTATAAGACTGAGACTACGTGGTCCCTTACTATTTTTGCTGGAGGAGGACGCCGCACGCAGCATGCTTATAAGCACGTTGATGGATAGTGCCCACCCTTCAGTGGAAGAGTGTACGATTCGCCAGAGCGGAGGAACAGATTTCTCCGTGTCTGTATACTTTAGCGCCGTCACCCCACTGACTCATCGGTTAGTACAGCACGAATGGGGCCTTGGGACGCCGTAGTACCAGACGCATTTTTGAGAGTCTCGCTATATCGGCCCTCTGATACTCAGAGTGTCGTTAATCGTAGCTACTTATGAAAAGCTATTACGATCCTCAAATATTCGGACTTGAAGATCAGTTAAGCCTCAGTCGCCGGTCGTTAAAACTAACTCCTGGCTCTTAGGACGGAGAGTAGCAATATACGTGACATTGGCTTCACACTCTGTATTAGCTACACGTGCAGCTGTACAATTTTAGCCTTACAGAGTTTATGCGAAGTTTCACCCTTGTTGCCGTACAAGTAGGTAGTAAGTTTGCCACGTACTCAGAAGGCCTAAGCCTCAACACTGTTCGCTGCCTCCATGGGGGGGAGGCTTTGTTCCAGGCTCTGCTTTGTTTCCAACTAATTCTGAATGCTGAGCTAGGCATATTCGTGTATCCGGGGCAAACTCTTACGGTCTAAGTATATAGAACAGTGTCGTGCCAACCGGCCAGGTGTTCCCTTTGTCCGGTGGTGTATATTTAGCTCCAACAGACATCTGCAGCGGAATTTAAAGTCGTTATTTATTTCTTCGCGTAAGCCCATAATATTGGCATACCGGCCCAGTAATCGCGGCAAGTGATAGTGGCAACGCACAGACAGGGTTTAGCTCAAACTGACCGCCCACACGTAGATGGCTGTTATAGAAGGTTTTACGATTCAGACCTTC'
def overlap(a, b, min_length=3):
""" Return length of longest suffix of 'a' matching
a prefix of 'b' that is at least 'min_length'
characters long. If no such overlap exists,
return 0. """
start = 0 # start all the way at the left
while True:
start = a.find(b[:min_length], start) # look for b's prefix in a
if start == -1: # no more occurrences to right
return 0
# found occurrence; check for full suffix/prefix match
if b.startswith(a[start:]):
return len(a)-start
start += 1 # move just past previous match
from itertools import permutations
list(permutations([1,2,3],3))
[(1, 2, 3), (1, 3, 2), (2, 1, 3), (2, 3, 1), (3, 1, 2), (3, 2, 1)]
reads = ['ACGGATC', 'GATCAAGT', 'TTCACGGA']
list(permutations(reads, 2))
[('ACGGATC', 'GATCAAGT'), ('ACGGATC', 'TTCACGGA'), ('GATCAAGT', 'ACGGATC'), ('GATCAAGT', 'TTCACGGA'), ('TTCACGGA', 'ACGGATC'), ('TTCACGGA', 'GATCAAGT')]
#Solve the Overlap Graph Problem (restated below).
from itertools import permutations
from collections import defaultdict
def overlap_graph(reads):
'''Input: A collection Patterns of k-mers.
Output: The overlap graph Overlap(Patterns), in the form of an adjacency list. (You may return the edges in any order.)'''
olaps = defaultdict(list)
for a, b in permutations(reads, 2):
if a[1:] == b[:-1]:
olaps[a].append(b)
return olaps
reads = ['ATGCG','GCATG','CATGC','AGGCA','GGCAT']
graph = (overlap_graph(reads))
for x,y in graph.iteritems():
print x,'->', ' '.join(y)
AGGCA -> GGCAT GGCAT -> GCATG GCATG -> CATGC CATGC -> ATGCG
d = {}
d['a'] = 'b'
d['b'] = 'c'
d
for x,y in d.iteritems():
print x,'->',y
a -> b b -> c
#Solve the De Bruijn Graph from a String Problem.
from collections import defaultdict
def debruijnize(st, k):
""" Return a adjacency list, for each k-mer, its left
k-1-mer and its right k-1-mer in a pair """
edges = defaultdict(list)
for i in range(len(st) - k + 1):
kmer = st[i:i+k]
edges[kmer[:-1]].append(kmer[1:])
return edges
edges = debruijnize("ACGCGTCG", 3)
for x,y in edges.iteritems():
print x,'->', ','.join(y)
GC -> CG AC -> CG GT -> TC CG -> GC,GT TC -> CG
def de_bruijn_ize(st, k):
""" Return a list holding, for each k-mer, its left
k-1-mer and its right k-1-mer in a pair """
edges = []
nodes = set()
for i in range(len(st) -k + 1):
edges.append((st[i:i+k-1], st[i+1:i+k]))
nodes.add(st[i:i+k-1])
nodes.add(st[i+1:i+k])
return nodes, edges
def visualize_de_bruijn(st, k):
""" Visualize a directed multigraph using graphviz """
nodes, edges = de_bruijn_ize(st, k)
dot_str = 'digraph "DeBruijn graph" {\n'
for node in nodes:
dot_str += ' %s [label="%s"] ;\n' % (node, node)
for src, dst in edges:
dot_str += ' %s -> %s ;\n' % (src, dst)
return dot_str + '}\n'
%load_ext gvmagic
The gvmagic extension is already loaded. To reload it, use: %reload_ext gvmagic
%dotstr visualize_de_bruijn("TAATGCCATGGGATGTT", 2)
%dotstr visualize_de_bruijn("TAATGCCATGGGATGTT", 3)
%dotstr visualize_de_bruijn("TAATGCCATGGGATGTT", 4)
%dotstr visualize_de_bruijn("TAATGGGATGCCATGTT", 3)
from collections import defaultdict
def debruijn_from_kmer(kmers):
'''Input: A collection of k-mers Patterns.
Output: The adjacency list of the de Bruijn graph DeBruijn(Patterns).'''
edges = defaultdict(list)
for kmer in kmers:
edges[kmer[:-1]].append(kmer[1:])
return edges
kmers = ['GAGG','CAGG','GGGG','GGGA','CAGG','AGGG','GGAG']
for x,y in sorted(debruijn_from_kmer(kmers).iteritems()):
print x,'->', ','.join(y)
AGG -> GGG CAG -> AGG,AGG GAG -> AGG GGA -> GAG GGG -> GGG,GGA
def generate_edges(graph):
edges = []
for node in graph:
for neighbour in graph[node]:
edges.append((node, neighbour))
return edges
graph = {0:[3],1:[0],2:[1,6],3:[2], 4:[2], 5:[4], 6:[5,8], 7:[9], 8:[7], 9:[6]}
edges = generate_edges(graph)
print edges
[(0, 3), (1, 0), (2, 1), (2, 6), (3, 2), (4, 2), (5, 4), (6, 5), (6, 8), (7, 9), (8, 7), (9, 6)]
def nodes_degrees(edges):
''' This returns a dictionary of nodes and their degrees. The degree of a vertex is the number of edges connecting
it, i.e. the number of adjacent vertices. Loops are counted double, i.e. every occurence of vertex in the list
of adjacent vertices. '''
degrees = dict()
for a, b in edges:
degrees[a] = degrees.get(a, 0) + 1
degrees[b] = degrees.get(b, 0) + 1
return degrees
edges = generate_edges(graph)
nodes_degrees(edges)
{0: 2, 1: 2, 2: 4, 3: 2, 4: 2, 5: 2, 6: 4, 7: 2, 8: 2, 9: 2}
def highest_degree_node(edges):
degrees = nodes_degrees(edges)
return max(degrees, key=degrees.get)
highest_degree_node(edges)
2
g = {0:[3],1:[0],2:[1,6],3:[2], 4:[2], 5:[4], 6:[5,8], 7:[9], 8:[7], 9:[6]}
def eulerianCycle(g):
""" Input: The adjacency list of an Eulerian directed graph.
Output: An Eulerian cycle in this graph."""
tour = []
src = g.iterkeys().next() # pick arbitrary starting node
def visit(n):
while len(g[n]) > 0:
dst = g[n].pop()
visit(dst)
tour.append(n)
visit(src)
tour = tour[::-1] # reverse and then take all but last node
return tour
tour = eulerianCycle(g)
'->'.join(str(i) for i in tour)
'0->3->2->6->8->7->9->6->5->4->2->1->0'
def read_input(filename):
f = open(filename, 'r')
g = dict()
for line in f:
line = line.strip()
line = line.split(' -> ')
g[int(line[0])] = [int(x) for x in line[1].split(',')]
return g
g = read_input('rosalind_ba3f.txt')
def eulerianCycle(g):
""" Input: The adjacency list of an Eulerian directed graph.
Output: An Eulerian cycle in this graph."""
tour = []
src = g.iterkeys().next() # pick arbitrary starting node
def visit(n):
while len(g[n]) > 0:
dst = g[n].pop()
visit(dst)
tour.append(n)
visit(src)
tour = tour[::-1] # reverse and then take all but last node
return tour
tour = eulerianCycle(g)
'->'.join(str(i) for i in tour)
'0->2->6->541->1215->1214->1213->541->542->543->6->31->32->771->1355->2441->2442->2440->1355->1356->1354->771->770->769->32->33->156->911->910->1203->1201->1202->1674->2189->2190->2188->1674->1672->1673->1202->910->912->1232->2903->2902->2904->1232->1873->1874->1875->1232->1231->1233->912->156->154->553->555->554->932->931->933->554->2488->2489->2490->554->2201->2200->2202->554->2088->2086->2087->554->1184->1183->1185->2103->2102->2101->2396->2395->2397->2101->1185->554->154->155->33->6->4->8->83->231->1723->1725->1800->1799->1798->1725->1724->231->229->230->83->82->84->8->39->48->62->688->1088->1586->1585->1587->1088->1089->1087->688->690->967->1250->1249->1251->967->968->969->690->1486->1487->1488->690->689->62->63->61->48->1365->1363->2030->2031->2029->1363->1364->48->47->514->516->627->625->626->516->515->876->875->874->515->2323->2325->2324->515->47->483->482->481->530->1438->1440->1439->530->529->680->1860->1858->1859->2041->2042->2956->2957->2958->2042->2043->1859->680->679->681->529->531->926->927->1111->1113->1112->927->925->531->481->47->46->39->37->215->2057->2056->2058->215->216->1661->1660->1662->216->1015->1971->1970->1969->1015->1017->1016->216->214->2558->2559->2652->2651->2650->2559->2571->2570->2569->2559->2557->214->37->38->8->1621->1622->2452->2454->2453->1622->1623->2433->2432->2431->1623->8->126->124->478->480->962->1128->1167->1165->1166->1171->1930->1931->1932->1171->1173->1172->1166->1128->1127->1126->962->961->1037->1038->1036->1581->1579->1580->1639->1640->1641->1580->1036->961->963->480->1257->1256->2513->2512->2514->1256->1255->480->479->1749->1748->1747->479->124->125->238->240->1278->1276->1277->240->239->2018->2938->2940->2939->2018->2536->2537->2538->2018->2017->2019->239->125->8->7->9->24->462->949->950->2684->2683->2685->950->951->462->1655->1656->1654->462->460->1446->1444->1445->2337->2335->2336->1445->460->461->24->1839->1837->1838->24->22->96->1146->1145->1144->96->95->114->261->323->322->324->465->511->2920->2921->2922->511->512->513->465->1216->1217->1497->1495->1496->1217->1218->465->464->520->865->867->866->986->985->987->1681->1683->1682->987->866->520->1510->1511->1512->520->522->521->464->463->324->261->260->259->1319->1318->1320->259->114->113->268->878->879->877->268->1180->1389->2241->2240->2239->2386->2387->2388->2239->1389->1387->1388->1180->1182->1181->268->270->1626->1625->1624->270->269->363->361->362->269->113->218->894->893->892->218->217->2290->2326->2328->2882->2881->2883->2328->2327->2290->2291->2292->217->1631->1632->1630->217->219->113->112->330->329->728->727->1018->1019->1020->727->729->1062->1061->1060->2230->2232->2889->2888->2887->2232->2231->1060->729->329->328->525->2690->2691->2689->525->1407->1405->1406->1771->1977->1975->1976->1771->1773->1772->1775->1774->1776->1772->1406->525->523->524->1191->1705->1854->1853->1852->1705->1706->1707->1191->1190->1189->1649->1648->1650->2141->2140->2142->1650->1189->524->328->2553->2551->2552->328->112->213->2149->2151->2150->213->211->351->349->2343->2835->2833->2834->2343->2342->2341->349->350->2678->2677->2679->350->211->212->341->564->897->896->895->1043->1118->1117->1119->1043->1044->1179->1994->1995->1993->1179->1177->1178->1044->1042->895->564->562->1334->2839->2841->2840->1334->1909->1911->1910->1334->1335->1333->1419->1417->1418->2524->2525->2526->1418->1333->562->1285->1287->1286->562->1109->1108->1110->1761->1759->1760->1110->562->563->341->366->364->365->341->1281->1280->1279->341->340->452->451->798->797->2457->2455->2456->797->796->2159->2768->2769->2767->2159->2158->2160->796->451->545->602->755->754->756->602->603->601->545->544->546->585->2620->2622->2898->2896->2897->2622->2621->585->584->1695->1693->1905->2752->2754->2753->1905->1904->1903->1693->1694->584->583->978->977->976->583->546->451->453->616->676->1964->2894->2893->2895->1964->1965->2705->2706->2704->2947->2948->2949->2704->1965->1963->676->678->677->616->618->617->453->1528->2067->2065->2066->1528->1529->1530->453->340->342->376->377->2658->2656->2657->377->378->342->1030->1032->1135->1137->1136->1032->1031->1767->1765->2530->2531->2532->1765->1766->1031->342->212->112->95->94->22->78->264->263->262->78->1539->1537->1538->78->137->971->972->2519->2520->2518->972->970->137->606->604->685->686->687->1242->1240->1241->687->604->605->137->136->586->587->588->136->138->650->651->649->668->669->667->649->1021->1022->2345->2344->2346->1022->1023->2257->2259->2258->1023->649->138->1507->1508->1509->138->1393->1395->1394->138->78->77->76->367->662->663->661->367->369->2219->2379->2378->2377->2219->2218->2220->369->368->2306->2305->2307->368->76->22->432->2535->2533->2534->432->1844->1845->1843->432->1543->1545->1544->432->430->431->22->23->311->1050->2419->2421->2581->2582->2583->2421->2420->1050->1049->1048->311->310->312->1519->1520->2877->2876->2875->1520->1829->1828->2715->2714->2713->1828->1830->1520->1521->312->1459->1461->1460->312->23->101->256->842->841->843->2125->2127->2126->843->256->1195->1196->1197->256->258->298->299->300->2450->2449->2451->300->1569->1567->1568->300->1494->1492->1493->300->258->257->845->2015->2014->2016->845->844->1134->1966->1967->1968->1134->1132->1133->844->846->257->101->100->936->934->935->100->102->23->9->4->2900->2901->2899->4->27->129->285->296->568->570->1592->1688->1689->1687->1592->1593->2546->2545->2547->1593->1591->2161->2163->2162->1591->570->569->713->1534->1536->2199->2198->2322->2659->2660->2661->2988->2986->2987->2661->2322->2321->2320->2332->2333->2334->2352->2351->2350->2334->2320->2198->2197->1536->1535->713->712->2562->2560->2561->712->714->569->1096->2301->2299->2300->1096->1098->1097->569->296->537->536->1399->1401->1400->2365->2367->2366->1400->536->535->1746->1744->1745->535->296->1736->1735->1737->2252->2253->2251->1737->296->295->297->285->284->283->1652->1653->1651->283->129->127->1299->1298->1297->127->128->2522->2521->2523->128->2007->2006->2005->128->27->26->49->50->53->201->1156->1158->1157->1936->1938->1937->1157->201->200->2004->2003->2002->200->199->53->52->406->407->2264->2265->2263->407->408->52->337->338->339->52->2310->2308->2309->52->54->209->208->558->2280->2422->2448->2447->2446->2422->2424->2423->2280->2279->2278->558->556->557->208->232->233->748->750->749->233->254->1710->1708->1709->254->253->255->916->918->917->1404->1832->1833->1831->1404->1402->2668->2670->2669->1402->1877->1876->1878->1402->1403->917->255->267->266->265->487->890->1244->1823->2575->2577->2576->1823->1824->1822->2382->2380->2381->1822->1244->1243->1505->1504->1551->1550->1549->1504->1506->1243->1245->890->889->1541->1722->1721->1720->2206->2207->2208->1720->1541->1542->1540->889->891->487->2630->2629->2631->487->2080->2082->2081->487->1861->1863->1862->487->488->620->621->2625->2624->2623->621->619->488->489->265->255->1872->1870->2463->2462->2461->1870->1871->255->233->234->208->210->699->2154->2153->2152->699->697->698->210->657->655->656->210->54->1160->1161->1159->1435->1437->1436->1159->54->50->1347->1345->1346->1703->1702->1704->1346->50->51->1357->1358->1359->2808->2807->2806->1359->51->26->224->223->225->26->25->35->309->307->1001->1002->2097->2095->2096->1002->1000->1381->1382->1383->1000->307->308->35->34->220->947->948->946->1373->1374->1680->2786->2787->2785->1680->1679->1678->1374->1372->946->220->2664->2662->2663->220->221->726->725->724->221->571->572->1139->1138->1140->572->573->221->2675->2674->2676->221->222->34->153->1142->1143->1141->153->151->186->185->538->2460->2458->2459->538->540->539->185->184->151->152->34->36->25->244->371->825->823->824->1812->1810->1811->824->371->370->372->2711->2712->2710->372->2177->2579->2580->2578->2177->2178->2176->372->2137->2139->2271->2270->2269->2139->2138->372->244->1124->1306->1307->1308->2037->2036->2035->2916->2915->2914->2035->1308->1124->1125->1235->1325->1326->1324->2077->2079->2078->1324->1728->1727->1726->1324->1235->1312->1313->1314->1235->1236->1247->1248->2789->2788->2790->1248->1457->2166->2164->2165->1457->1456->1458->1664->1663->3000->2999->2998->1663->1665->1815->1813->1814->1665->1458->1248->1246->1236->1234->1941->1940->1939->2617->2626->2628->2627->2617->2618->2619->1939->1234->1627->1628->2871->2869->2870->1628->1803->1802->1801->1628->1629->1234->1125->1123->1634->1635->1633->1123->244->246->245->25->4->18->16->67->559->561->560->1091->1452->1450->1451->1091->1092->1090->560->67->317->318->404->785->784->786->404->405->486->485->703->705->704->1294->1296->2497->2498->2963->2962->2990->2991->2989->2962->2964->2498->2499->2501->2500->2502->2499->1296->1295->704->485->484->1066->1068->1067->484->405->403->318->316->684->2246->2245->2247->684->683->1427->1426->2931->2929->2930->1426->1428->683->682->1398->2133->2131->2132->1398->1397->2740->2742->2741->1397->1396->682->1148->1789->1791->1790->1148->1147->1149->682->316->67->179->781->1645->2146->2147->2148->1645->1646->1647->781->782->783->179->644->741->739->2196->2195->2194->2634->2859->2857->2858->2634->2632->2633->2194->739->740->1523->1524->1522->740->644->643->645->737->736->738->645->179->417->415->416->518->2071->2073->2072->518->519->1410->1409->1408->1503->2971->2972->2973->1503->1576->1577->2038->2040->2039->2854->2855->2856->2960->2961->2959->2856->2039->1577->1578->1503->1501->1502->1408->519->517->416->1471->2968->2970->2969->1471->1472->1473->1950->1948->2107->2108->2109->1948->1949->1473->416->179->178->180->67->119->332->2475->2474->2473->332->333->331->959->960->958->331->810->808->809->331->119->118->792->790->791->118->120->67->69->157->158->2401->2403->2942->2943->2941->2403->2402->158->159->804->803->802->159->69->68->574->576->575->68->16->17->646->647->648->17->4->165->164->1366->2010->2008->2009->1366->1368->1367->1697->1698->1696->1367->164->163->386->387->385->2979->2978->2977->385->2316->2315->2318->2317->2319->2315->2314->385->163->4->5->2745->2743->2744->5->2->3->98->97->501->500->499->97->1658->1659->1657->97->1155->1154->1925->1924->1926->2340->2339->2338->1926->1154->1153->97->111->109->2047->2912->2911->2913->2047->2049->2048->109->110->829->831->1265->1264->2797->2798->2799->1264->1266->831->830->2391->2390->2389->830->110->450->1227->2732->2731->2733->1227->1226->1225->1732->1733->1734->1225->450->449->448->110->97->99->1376->1377->1935->1933->1934->1377->1375->99->3->11->248->249->634->635->636->249->247->11->2168->2167->2700->2699->2698->2167->2169->2601->2599->2600->2169->11->10->965->966->964->1685->1684->1686->964->1353->1351->1352->964->10->66->65->64->271->272->468->533->534->592->594->593->734->733->735->902->2076->2398->2399->2400->2076->2074->2075->902->903->2011->2013->2012->2906->2905->2907->2012->903->901->735->593->534->532->468->466->2113->2115->2114->466->1115->1114->1116->466->467->2709->2708->2707->467->272->313->746->837->835->1315->1466->1465->1467->1315->1316->1317->835->1078->1464->1463->1462->1078->1080->1079->2593->2594->2595->1079->835->836->746->747->745->313->2639->2640->2638->313->314->436->447->598->707->708->706->598->2329->2331->2330->598->600->1856->1855->1857->600->599->731->1448->1447->1449->731->732->730->599->447->446->445->1590->1589->1588->2354->2355->2353->1588->445->436->437->869->868->870->1176->1175->1174->2268->2267->2266->1174->870->437->776->777->775->437->438->314->315->1490->1489->1491->315->272->1546->1548->1547->272->273->1821->1820->1819->2647->2648->2649->1819->273->64->10->60->59->58->72->133->2680->2681->2682->133->134->188->457->1293->1292->1291->457->1083->1350->1348->1349->2815->2816->2817->1349->1083->1082->1081->457->458->1527->1525->1526->458->459->759->1258->2735->2734->2736->1258->2567->2568->2566->1258->1599->1598->1597->1258->1259->2665->2666->2667->1259->1882->1884->2411->2412->2410->1884->1883->1259->1260->759->757->758->459->188->374->702->701->819->923->924->922->2507->2506->2508->922->1670->1669->1671->922->819->1453->1455->1454->819->817->850->852->851->817->818->1985->1984->1986->818->701->720->719->718->701->700->1616->1615->1617->700->374->373->409->410->580->581->582->410->411->473->2757->2756->2755->473->474->508->510->863->864->862->1895->1894->1896->862->510->1336->1338->1337->2950->2952->2951->1337->510->1102->1103->1104->510->509->787->789->788->509->474->472->872->1230->1951->1953->2437->2439->2438->1953->1952->1230->1229->1228->872->871->1607->1608->1606->871->1073->1074->1979->1980->1978->1074->1072->871->873->472->411->373->375->780->1596->2027->2028->2026->2814->2812->2813->2026->1596->1594->1595->780->778->779->1868->1867->1869->779->375->591->2759->2758->2760->591->589->590->375->1211->1302->1301->1300->2179->2180->2784->2783->2782->2180->2181->1300->1211->1210->1212->1788->1787->1786->2486->2554->2555->2556->2486->2485->2487->1786->1212->375->1003->1004->1005->375->188->189->884->883->1267->1268->1269->883->885->189->187->134->135->659->658->660->135->72->70->71->81->475->477->1392->1391->1390->477->476->847->848->849->476->1150->2694->2692->2909->2910->2908->2692->2693->1150->1152->1151->476->1077->1076->1075->476->81->1432->1780->1991->1992->1990->1780->1782->1781->1432->1434->1433->81->79->282->711->812->811->1558->1560->1559->811->1554->1553->1552->811->813->711->709->710->2821->2822->2823->710->1846->1847->1848->710->1095->1093->1094->710->282->281->280->2436->2434->2435->2739->2737->2738->2435->280->79->1414->1415->1416->79->80->71->58->10->14->88->335->768->767->766->335->336->334->2848->2850->2849->334->1223->1224->1222->334->88->90->89->527->528->1441->1442->1443->2598->2596->2597->1443->528->526->89->14->15->13->654->1069->2244->2242->2243->1069->1071->1070->654->652->905->988->989->2313->2311->2312->989->990->905->904->906->652->1923->1921->1922->652->653->1208->1209->1956->1955->1954->1209->1207->1794->1792->2295->2294->2293->1792->1912->1913->1914->1792->1793->1207->653->13->30->86->87->2796->2795->2794->87->104->252->980->2481->2479->2480->980->981->979->2288->2287->2289->979->252->505->506->2803->2805->2804->506->507->252->279->1054->1055->1056->2867->2868->2866->1056->279->278->624->622->623->1412->1411->1413->623->278->277->252->1273->1275->1274->252->251->2262->2935->2937->2936->2262->2261->2260->2476->2477->2478->2260->251->250->104->1385->1386->2886->2884->2885->1386->1816->1817->1818->1386->1384->104->105->183->181->182->379->381->380->182->2932->2934->2984->2985->2983->2934->2933->182->105->103->886->1479->1478->1477->1636->1637->1638->1477->886->888->887->103->1850->1880->1879->1881->2529->2528->2527->1881->1850->1849->2362->2363->2364->1849->1851->2730->2728->2729->1851->103->87->85->142->143->938->939->937->143->288->815->816->814->288->722->881->880->882->722->1220->1221->1219->722->723->721->288->353->2750->2830->2832->2831->2750->2749->2751->353->352->354->288->2653->2655->2654->288->287->286->143->144->206->412->414->420->1750->2203->2205->2204->1750->1751->1752->420->418->717->920->1311->1310->1309->920->921->942->2565->2563->2564->942->941->2586->2585->2584->941->940->2098->2100->2099->940->921->919->717->1676->1677->1809->1808->1807->1677->1675->2089->2090->2091->2482->2484->2483->2091->2186->2185->2187->2091->1675->717->716->1340->1341->1339->716->715->2643->2642->2641->715->418->2541->2539->2540->418->419->414->2717->2716->2718->414->413->206->207->2645->2646->2644->207->205->359->360->642->744->2120->2119->2121->744->743->742->642->2064->2063->2062->2111->2112->2110->2062->642->640->1289->1290->1288->640->641->360->1361->1362->1360->360->358->1982->1981->1983->358->205->290->289->306->305->394->395->396->610->928->929->930->2892->2890->2891->930->610->612->611->396->305->304->289->2809->2811->2810->289->2573->2572->2574->289->291->205->275->276->274->205->144->85->30->826->828->1106->1107->1989->2000->1999->2001->1989->1988->1987->1107->1105->828->827->30->2405->2406->2404->30->28->1199->1200->1198->28->29->141->760->762->1304->1601->2851->2853->2852->1601->1600->1602->2636->2637->2635->1602->1887->2224->2721->2719->2720->2224->2226->2225->1887->1885->1886->1897->2695->2696->2697->1897->1898->1899->1886->1602->1304->1303->1305->762->761->1051->2214->2349->2347->2348->2214->2212->2213->1051->1053->1052->761->141->139->140->29->13->293->292->294->13->176->595->596->597->2763->2761->2762->597->176->1205->1666->1667->2606->2607->2605->1667->1915->1916->1917->1667->1668->1205->1206->2590->2591->2592->1206->1204->2837->2836->2838->1204->2800->2802->2801->1204->2277->2276->2842->2844->2843->2276->2775->2773->2774->2276->2275->1204->176->175->177->13->10->12->117->116->632->1742->1743->2702->2703->2701->1743->1741->632->631->2722->2723->2724->631->633->116->115->171->169->204->202->639->1100->1101->1099->639->638->637->957->956->955->1763->1762->1764->955->637->2393->2392->2394->637->202->203->169->170->1785->1929->1928->1927->1785->1783->1784->170->115->145->443->490->491->492->1574->1575->1604->1605->1943->1944->1942->1605->1603->1575->1573->492->443->1865->1866->1864->443->1162->1164->1163->443->442->444->984->1779->1777->1778->984->982->983->1739->1738->1795->1796->1797->1738->1740->983->444->615->613->1840->1841->1842->613->614->444->548->549->547->444->145->2144->2371->2373->2372->2144->2143->2145->145->147->243->2510->2509->2511->243->242->383->384->382->242->1692->1890->1888->2183->2184->2613->2612->2611->2184->2182->1888->1889->1692->1691->1690->242->241->147->146->115->130->996->1007->1423->1425->1424->1007->1006->1008->2608->2609->2610->1008->996->994->995->2384->2383->2385->995->130->857->1187->1332->1330->1974->1972->1973->1330->1331->1187->1186->1614->1612->1613->1186->1188->2829->2827->2828->1188->857->858->2283->2878->2879->2880->2283->2282->2281->858->856->1284->1282->1283->856->130->132->131->1431->1430->1429->131->115->12->3->1->497->496->498->1->2982->2980->2981->1->21->493->494->1122->1121->1120->494->495->1962->1960->2106->2216->2370->2368->2369->2216->2217->2215->2106->2105->2104->1960->1961->2069->2068->2070->1961->495->1718->1719->2726->2725->2727->1719->1717->495->1371->1369->1370->495->21->44->191->192->190->1920->1919->1918->190->44->45->1327->1328->1329->45->43->441->1755->1753->1754->441->440->1514->1513->1515->440->439->43->235->236->578->855->2992->2993->2994->855->853->854->1768->1770->1769->854->1714->2032->2034->2055->2672->2671->2673->2055->2054->2053->2034->2033->1714->1715->1716->854->578->579->1902->1900->1901->579->577->1013->1012->1014->577->236->321->392->840->1996->1998->2864->2865->2863->1998->1997->840->838->839->392->391->552->550->1805->2927->2928->2926->1805->1806->1827->1825->1826->1806->1804->550->1480->1481->1482->550->551->2124->2123->2686->2687->2688->2123->2249->2250->2248->2123->2122->551->391->393->859->2543->2542->2589->2746->2747->2748->2589->2588->2587->2542->2544->859->861->860->1033->2223->2221->2361->2359->2360->2221->2222->1033->1034->1323->1321->1322->2792->2791->2793->1322->1034->1035->860->393->321->320->1644->1643->1642->320->1063->2296->2297->2298->1063->1064->1065->2492->2493->2491->1065->1609->1610->1611->1065->320->319->993->992->2358->2356->2357->992->991->319->236->237->1046->1045->1047->237->43->21->42->40->2085->2285->2286->2284->2085->2083->2084->40->41->607->608->953->954->1583->1582->1584->954->952->608->609->1620->1619->2272->2274->2273->1619->1618->609->41->56->73->75->1252->1253->2060->2059->2117->2118->2116->2059->2061->1253->1254->75->74->502->503->504->74->2427->2425->2426->2430->2429->2428->2426->74->56->55->92->167->168->753->751->752->799->1169->1168->1170->2135->2136->2134->1170->799->800->1484->1729->1730->1731->1484->1485->1483->800->801->752->168->166->92->91->93->691->2504->2503->2505->691->692->693->93->55->343->344->425->426->424->2945->2946->2944->424->344->345->975->1379->1957->1958->2236->2237->2238->1958->1959->2415->2414->2413->1959->1379->1469->1470->1468->2495->2494->2496->1468->1379->1380->1378->975->973->974->345->55->2818->2819->2820->55->194->195->193->2771->2770->2772->193->55->122->123->303->302->346->2470->2471->2472->346->348->356->357->421->422->423->357->1908->1906->1907->357->355->794->900->899->898->2974->2976->2975->898->1517->1518->2092->2093->2094->2766->2765->2764->2094->1518->1516->898->794->2044->2860->2861->2862->2044->2045->2046->794->793->1836->2022->2021->2020->1836->1835->1834->793->1498->1500->2516->2517->2515->1500->1499->793->795->355->2443->2445->2444->355->348->2845->2847->2846->348->347->302->301->435->471->470->469->435->434->2824->2826->2825->434->433->2023->2024->2025->433->301->123->121->55->108->326->325->327->108->1028->1027->1029->1533->1532->1531->1029->108->107->807->805->806->2374->2376->2375->806->107->399->694->833->834->832->1474->1700->1699->1701->1474->1475->1476->832->694->822->820->1010->1009->1011->820->821->944->945->943->821->694->696->2614->2616->2615->696->1421->2234->2233->2995->2997->2996->2233->2235->1421->1420->1557->1556->1555->1420->1422->696->695->399->398->397->2549->2550->2548->397->107->106->427->2228->2229->2227->427->429->566->1129->1131->1130->566->565->567->429->1563->2781->2779->2780->1563->1562->1561->429->428->1239->1271->2211->2209->2210->1271->1272->1270->2917->2918->2919->1270->1239->1237->1238->428->106->162->160->773->774->772->1565->1566->1564->772->160->2050->2466->2464->2465->2050->2051->2193->2192->2191->2051->2052->160->161->2408->2407->2409->161->106->55->57->41->401->402->2965->2967->2966->402->400->1085->1084->1086->2925->2923->2924->1086->400->41->21->174->1261->1262->1263->174->172->671->1756->1757->1758->671->1192->1194->1193->2304->2302->2303->1193->671->672->670->1947->2256->2255->2254->1947->1946->1945->670->1713->1711->1712->670->1570->1571->1572->670->172->173->1040->1041->1039->173->21->20->389->1892->2874->2872->2873->1892->1891->2129->2173->2174->2175->2129->2130->2128->2778->2777->2776->2128->1891->1893->389->390->2954->2953->2955->390->388->20->226->628->629->907->908->909->1059->1057->1058->909->1024->1026->1025->909->629->630->666->665->664->630->226->2170->2171->2172->226->227->2417->2416->2418->227->228->20->197->196->764->763->765->914->913->999->2469->2467->2468->999->998->997->913->915->765->2157->2156->2155->765->196->198->2603->2604->2602->198->20->19->1->0->149->148->150->456->455->675->673->674->1344->1342->1343->674->455->454->150->0'
def eulerianCycle(graph): # init cycle with any node cycle = [graph.keys()[0]] ##### move all edges from graph to cycle while len(graph) > 0: # if cycle is closed shift to a node with remaining targets if cycle[0] == cycle[-1]: while not cycle[0] in graph: cycle.pop(0) cycle.append(cycle[0]) ##### source is the last node of cycle source = cycle[-1] ##### move one target from graph to cycle cycle.append(graph[source].pop()) ##### clean sources without targets if len(graph[source]) == 0: del graph[source] return cycle
g = {0:[2], 1:[3], 2:[1], 3:[0,4], 6:[3,7], 7:[8], 8:[9], 9:[6]}
nodes = g.keys()
nodes
[0, 1, 2, 3, 6, 7, 8, 9]
def outdegree(g,node):
"""Returns the number of outgoing edges from a node"""
try:
return len(g[node])
except:
return 0
outdegree(g,4)
0
def indegree(g,node):
"""Returns the number of incoming edges from a node"""
incoming = [k for k,v in g.iteritems() if node in v]
return len(incoming)
indegree(g,6)
0
def isSemiBalanced(g,node):
return abs(indegree(g,node) - outdegree(g,node)) == 1
def isBalanced(g,node):
return indegree(g,node) == outdegree(g,node)
def head(g):
"""Returns the head of a graph in case of Eulerian Walk"""
nodes = g.keys()
head_node = None
for node in nodes:
if indegree(g,node) == outdegree(g,node) - 1:
head_node = node
break
return head_node
def tail(g):
"""Returns the tail of a graph in case of Eulerian Walk"""
neighbors = [v for k in g for v in g[k]]
tail_node = None
for node in neighbors:
if indegree(g,node) == outdegree(g,node) + 1:
tail_node = node
break
return tail_node
g = {0:[2], 1:[3], 2:[1], 3:[0,4], 6:[3,7], 7:[8], 8:[9], 9:[6]}
#kmers = ['CTTA', 'ACCA', 'TACC', 'GGCT', 'GCTT', 'TTAC']
#kmers = [line.strip() for line in open('dataset_203_6.txt', 'r')]
#g = debruijn_from_kmer(kmers)
#setdefault is useful for setting defaults while or after filling the dict.
initialNode = head(g)
print initialNode
terminalNode = tail(g)
print terminalNode
g.setdefault(terminalNode, []).append(initialNode)
print g
# graph g has an Eulerian cycle
tour = []
src = g.iterkeys().next() # pick arbitrary starting node
print "src", src
def visit(n):
while len(g[n]) > 0:
dst = g[n].pop()
visit(dst)
tour.append(n)
visit(src)
print "tour", tour
tour = tour[::-1][:-1] # reverse and then take all but last node
print "tour", tour
sti = tour.index(initialNode)
print "sti", sti
tour = tour[sti:] + tour[:sti]
print "tour", tour
6 4 {0: [2], 1: [3], 2: [1], 3: [0, 4], 4: [6], 6: [3, 7], 7: [8], 8: [9], 9: [6]} src 0 tour [0, 3, 6, 9, 8, 7, 6, 4, 3, 1, 2, 0] tour [0, 2, 1, 3, 4, 6, 7, 8, 9, 6, 3] sti 5 tour [6, 7, 8, 9, 6, 3, 0, 2, 1, 3, 4]
def eulerianWalk(g):
initialNode = head(g)
terminalNode = tail(g)
#setdefault is useful for setting defaults while or after filling the dict.
g.setdefault(terminalNode, []).append(initialNode)
# graph g has an Eulerian cycle
tour = []
src = g.iterkeys().next() # pick arbitrary starting node
def visit(n):
while len(g[n]) > 0:
dst = g[n].pop()
visit(dst)
tour.append(n)
visit(src)
tour = tour[::-1][:-1] # reverse and then take all but last node
sti = tour.index(initialNode)
tour = tour[sti:] + tour[:sti]
return tour
g = {0:[2], 1:[3], 2:[1], 3:[0,4], 6:[3,7], 7:[8], 8:[9], 9:[6]}
#g = read_input('dataset_203_5.txt')
'->'.join(str(i) for i in eulerianWalk(g))
'6->7->8->9->6->3->0->2->1->3->4'
#kmers = [line.strip() for line in open('dataset_198_3.txt', 'r')]
kmers = ['CTTA', 'ACCA', 'TACC', 'GGCT', 'GCTT', 'TTAC']
g = debruijn_from_kmer(kmers)
print g
eulerianWalk(g)
defaultdict(<type 'list'>, {'CTT': ['TTA'], 'ACC': ['CCA'], 'GCT': ['CTT'], 'GGC': ['GCT'], 'TAC': ['ACC'], 'TTA': ['TAC']})
['GGC', 'GCT', 'CTT', 'TTA', 'TAC', 'ACC', 'CCA']
def stringReconstruction(kmers):
"""Input: An integer k followed by a list of k-mers Patterns.
Output: A string Text with k-mer composition equal to Patterns. (If multiple answers exist, you may
return any one.)"""
g = debruijn_from_kmer(kmers)
walk = eulerianWalk(g)
return walk[0] + ''.join(map(lambda x: x[-1], walk[1:]))
stringReconstruction(kmers)
'GGCTTACCA'
kmers = ['AAAT','AATG','ACCC','ACGC','ATAC','ATCA','ATGC','CAAA','CACC','CATA','CATC','CCAG','CCCA','CGCT','CTCA','GCAT',
'GCTC','TACG','TCAC','TCAT','TGCA']
stringReconstruction(kmers)
'CAAATGCATCATACGCTCACCCAG'
#map() will apply its lambda function to the elements of the argument lists, i.e. it first applies to the elements with the 0th index, then to the elements with the 1st index until the n-th index is reached
walk = ['GGC', 'GCT', 'CTT', 'TTA', 'TAC', 'ACC', 'CCA']
walk[0] + ''.join(map(lambda x: x[-1], walk[1:]))
'GGCTTACCA'
import sys
sys.setrecursionlimit(2500)
kmers = [line.strip() for line in open('dataset_203_6.txt', 'r')]
stringReconstruction(kmers)
'AATAAAAGTCAGTATCAGGTGTTCACCGTTATCTCCTGCCGGGCACGGGGGAGGTACCGTGCGTATCTAGGGAGAAGCCCGTCGAGTTCAGGGCCCAATGGTAGGTACTCCCGGTCCACTAATGACCCAACTAGACCACAAGGGGACTACGTACCCTACTCGCGAGCCTAGCTAACTAACTGCCCTGGCTTACGCATGACATACTGCGGTTTGCAGGGTGATGCTAGTATACATGGATGTGATAGCAATCTTTCAACCAAGCATTAGTACCATCGCACTGGAGCGATTTAGGTCGAAACTCATATTCGCTTTATGTGTCCTGGACGTGTATACAAGTCCTTCCTTACTTTACACAATAGCTTGGTACAGTGTCTCGACTCAACTTAAGGTCTTCACCATTTGATCTGTGCTCGTGTGCAGTTAGAGGAGTAGCCCCCACTTTCGTTTCGACGAGTAGACCCACAGCGTTGCGACGTCCACCACCTTTCATCACCACAGTCTCTTCTGGACTTAACAGCATTTATAATACCAGAGGCGCGGTATCACTGGATCTTCAACTGCGAATGTGAGGAGGTCCTGGATTATAGTGTCCTGCAACATGGATGCACGACCCGGAAACACCGGGATTGATCCAGACGAGATTGACTATTTAACGAGGAAGGTTCGTAATAGCATCGGGAGAGGATATGGCTCACCAGTCTTCGGTATATGGTTGACTGCTGACAGACTGTCGAGCAGCGATCGTTATAGAAGCTCCCTTTTAGCGCTCCGTGCCCCAGCTGATTGGTCGGTATATAATGCCTAGCGGGCTGCTTTACTCACGTTCCGCTTGTGGCGCCACAGACCCACGGGCAAAAGAGGGGGTCACCGCAAGTCTCCGTAGGCGCGAGTAACCTTATATTCACTTAACTTTTTTTCAAGGAAATCACAGCTTGGGACTACGAGCGACGACGCAATTGCATTGACGGCTAGTTGTGATATGCCCATTACACTGCGAGATAAGGGGAAACACATCACGGAATCCAAACCTCTACATTCCTTATAGGGCAAGTGATGTTCGGATCCCAAAGTATCGTGCATGAGTAGAAGCCGTGAGGGTATTCGAATCAGTGAATTACGGCGCGCGGAGGCAACCATACATATGTCAGACTATGCCCCGATGGATGAGGCCCATCAGGCTAAGCGATCATTCGGACGGGCCCCGATTCCTAACGCAGCCTTGTATAGCTGATGAAAGTGAGAGACGCACTCGATGCACATGTCCTGTCGTGTTATGGACCCAACGCGAGAAGAAGCACGTGAAACGTACAAGGGGCCGGACTGCAGCAAGGTGTTCGGTGTATCAAACACGTGTCAGCCCAGAGAAGTAGCCAACCGAGAGTATGAAACATCAGTAACAAGCTTACTCGGTTGAAGCGATGATCCCTATCAAGGGTCATCGGCACCACCTTTGAAGTAGCGGTAAGACCCCTCAGTTAAGAGTGGCGTTAGGTGGGACGGCATATGATCCTATTATTAGGTACACAGGGACAACAGAGTAGACTCATGAGTCATGCCAAACCGCTTCCGCTATGGTTTATTACAAAAAGACGCGGCCCTAAATTAAGCCTATGTGGAGACCATGGACCAATGACAAGATGTGAGTTGAGGCGTGGGTAGAGTGCTATGAATCAACGTTTAACTCGGGTAATGGCTCTGACACAACCGACTAATATCCCCGTTAGACAGACTGCCACGACGAGGACCTCCACCGATGGGGTGGAGTGCGACCTCTGGCGGGCAATGCGTAAGCTACAATTGATCTGCCTTAATCTGTCACCGGCAGTTATCCTCATATGAAGCGCTGCGGCAATGCCGCCTGATGAAGGTGCCGGATGACTGCGCTCTATACGAGGTTCTACGGACACGTTCCGTTACTCCAGACTACCCGTCAAACCTCCCGTCCGTCCACGCCATAATAGGTCCACTGAATGTGGATATTGCTAAATTGGAACAGATGTCGGTTCTAACAATATATAAAGAGGTTAAGTTACTTTGTAGTGCGAATTAAGTCCGGCATATGAGATACATCTTCAGCCATTGCCAGTTGTGCGTTATGCCCAACTCGGTTAGCTGTTATCCTTTTGATTACCAGTCTTCGGACCCCCCAGTATCGTAGCCTTTAGATAGATGCGGTCCACCTGGGTGGTTCCTCTGTGTAAATCGAAAAGAGTATGGCGACGCCGGGTTTCACTTCAGAA'
import itertools
[''.join(binary) for binary in itertools.product('01', repeat = 3)]
['000', '001', '010', '011', '100', '101', '110', '111']
#CODE CHALLENGE: Solve the k-Universal Circular String Problem.
import itertools
def k_universal_circular_string(k):
"""Input: An integer k.
Output: A k-universal circular string."""
binary_kmers = [''.join(binary) for binary in itertools.product('01', repeat = k)]
g = debruijn_from_kmer(binary_kmers)
cycle = eulerianCycle(g)
return cycle[0] + ''.join(map(lambda x: x[-1], cycle[1:-(k-1)]))
k_universal_circular_string(4)
'0101111010011000'
cycle = ['11', '11', '10', '01', '10', '00', '00', '01', '11']
cycle[0] + ''.join(map(lambda x: x[-1], cycle[1:-2]))
'11101000'
k_universal_circular_string(8)
'0001011111111011111010111101101111001111110010111011101010111001101110001111100010110110101101001111010010110011101100101011000110110000111100001010101001101010001110100010100100111001001010000110100000111000001001100110001001000110010000100010000001100000'
cycle = ['11', '11', '10', '01', '10', '00', '00', '01', '11']
cycle[1:-2]
['11', '10', '01', '10', '00', '00']
CODE CHALLENGE: Implement StringSpelledByGappedPatterns. Input: Integers k and d followed by a sequence of (k, d)-mers (a1|b1), … , (an|bn) such that Suffix(ai|bi) = Prefix(ai+1|bi+1) for 1 ≤ i ≤ n-1. Output: A string Text of length k + d + k + n - 1 such that the i-th (k, d)-mer in Text is equal to (ai|bi) for 1 ≤ i ≤ n (if such a string exists). Sample Input: 4 2 GACC|GCGC ACCG|CGCC CCGA|GCCG CGAG|CCGG GAGC|CGGA Sample Output: GACCGAGCGCCGGA
def gappedPatterns(f):
firstPatterns = []
secondPatterns = []
for lines in f:
lines = lines.strip()
lines = lines.split('|')
firstPatterns.append(lines[0])
secondPatterns.append(lines[1])
return firstPatterns, secondPatterns
def stringSpelledByGappedPatterns(firstPatterns, secondPatterns, k, d):
prefixString = string_from_genome_path(firstPatterns)
suffixString = string_from_genome_path(secondPatterns)
for i in range(k+d, len(prefixString)):
if prefixString[i] != suffixString[i-k-d]:
return "there is no string spelled by the gapped patterns"
return prefixString + suffixString[-(k+d):]
f = open('StringSpelledByGappedPatterns.txt', 'r')
firstPatterns, secondPatterns = gappedPatterns(f)
stringSpelledByGappedPatterns(firstPatterns, secondPatterns, 4, 2)
'GACCGAGCGCCGGA'
f = open('dataset_204_14.txt', 'r')
firstPatterns, secondPatterns = gappedPatterns(f)
stringSpelledByGappedPatterns(firstPatterns, secondPatterns, 50, 200)
'there is no string spelled by the gapped patterns'
read_pairs = [('TAA','GCC'), ('AAT','CCA'), ('ATG','CAT'), ('TGC','ATG'), ('GCC','TGG'), ('CCA','GGG'), ('CAT','GGA'), ('ATG','GAT'), ('TGG','ATG'), ('GGG','TGT'), ('GGA','GTT')]
#read_pairs = [('GACC','GCGC'),('ACCG','CGCC'),('CCGA','GCCG'),('CGAG','CCGG'),('GAGC','CGGA')]
from collections import defaultdict
def debruijn_from_readpairs(read_pairs):
'''Input: A collection of k-mers Patterns.
Output: The adjacency list of the de Bruijn graph DeBruijn(Patterns).'''
edges = defaultdict(list)
for pair in read_pairs:
edges[(pair[0][:-1], pair[1][:-1])].append((pair[0][1:], pair[1][1:]))
return edges
g = debruijn_from_readpairs(read_pairs)
g
defaultdict(list, {('AA', 'CC'): [('AT', 'CA')], ('AT', 'CA'): [('TG', 'AT')], ('AT', 'GA'): [('TG', 'AT')], ('CA', 'GG'): [('AT', 'GA')], ('CC', 'GG'): [('CA', 'GG')], ('GC', 'TG'): [('CC', 'GG')], ('GG', 'GT'): [('GA', 'TT')], ('GG', 'TG'): [('GG', 'GT')], ('TA', 'GC'): [('AA', 'CC')], ('TG', 'AT'): [('GC', 'TG'), ('GG', 'TG')]})
walk = eulerianWalk(g)
walk
[('TA', 'GC'), ('AA', 'CC'), ('AT', 'CA'), ('TG', 'AT'), ('GC', 'TG'), ('CC', 'GG'), ('CA', 'GG'), ('AT', 'GA'), ('TG', 'AT'), ('GG', 'TG'), ('GG', 'GT'), ('GA', 'TT')]
firstPatterns = [a for (a,b) in walk]
secondPatterns = [b for (a,b) in walk]
print firstPatterns
print secondPatterns
['TA', 'AA', 'AT', 'TG', 'GC', 'CC', 'CA', 'AT', 'TG', 'GG', 'GG', 'GA'] ['GC', 'CC', 'CA', 'AT', 'TG', 'GG', 'GG', 'GA', 'AT', 'TG', 'GT', 'TT']
stringSpelledByGappedPatterns(firstPatterns, secondPatterns, 3, 1)
'TAATGCCATGGGATGTT'
read_pairs = [('ACC','ATA'),('ACT','ATT'),('ATA','TGA'),('ATT','TGA'),('CAC','GAT'),('CCG','TAC'),('CGA','ACT'),
('CTG','AGC'),('CTG','TTC'),('GAA','CTT'),('GAT','CTG'),('GAT','CTG'),('TAC','GAT'),('TCT','AAG'),
('TGA','GCT'),('TGA','TCT'),('TTC','GAA')]
g = debruijn_from_readpairs(read_pairs)
g
defaultdict(list, {('AC', 'AT'): [('CC', 'TA'), ('CT', 'TT')], ('AT', 'TG'): [('TA', 'GA'), ('TT', 'GA')], ('CA', 'GA'): [('AC', 'AT')], ('CC', 'TA'): [('CG', 'AC')], ('CG', 'AC'): [('GA', 'CT')], ('CT', 'AG'): [('TG', 'GC')], ('CT', 'TT'): [('TG', 'TC')], ('GA', 'CT'): [('AA', 'TT'), ('AT', 'TG'), ('AT', 'TG')], ('TA', 'GA'): [('AC', 'AT')], ('TC', 'AA'): [('CT', 'AG')], ('TG', 'GC'): [('GA', 'CT')], ('TG', 'TC'): [('GA', 'CT')], ('TT', 'GA'): [('TC', 'AA')]})
walk = eulerianWalk(g)
walk
[('AT', 'TG'), ('TT', 'GA'), ('TC', 'AA'), ('CT', 'AG'), ('TG', 'GC'), ('GA', 'CT'), ('AT', 'TG'), ('TA', 'GA'), ('AC', 'AT'), ('CC', 'TA'), ('CG', 'AC'), ('GA', 'CT'), ('CT', 'TT'), ('TG', 'TC'), ('GA', 'CT'), ('AA', 'TT'), ('AT', 'TG')]
x = [a for (a,b) in walk]
y = [b for (a,b) in walk]
print x
print y
['AT', 'TT', 'TC', 'CT', 'TG', 'GA', 'AT', 'TA', 'AC', 'CC', 'CG', 'GA', 'CT', 'TG', 'GA', 'AA', 'AT'] ['TG', 'GA', 'AA', 'AG', 'GC', 'CT', 'TG', 'GA', 'AT', 'TA', 'AC', 'CT', 'TT', 'TC', 'CT', 'TT', 'TG']
#ANSWER SHOULD BE: CACCGATACTGATTCTGAAGCTT
stringSpelledByGappedPatterns(x, y, 3, 1)
'there is no string spelled by the gapped patterns'
#Process the readpairs into list of tuples
f = open('dataset_204_14.txt', 'r')
read_pairs = []
for lines in f:
lines = lines.strip()
lines = lines.split('|')
read_pairs.append((lines[0],lines[1]))
import sys
sys.setrecursionlimit(10000) #To avoid maximum recursion depth exceeded
g = debruijn_from_readpairs(read_pairs) # Constructs prefix and suffix for each pair in the read. De bruijnize the read pairs.
walk = eulerianWalk(g)
firstPatterns = [a for (a,b) in walk]
secondPatterns = [b for (a,b) in walk]
stringSpelledByGappedPatterns(firstPatterns, secondPatterns, 50, 200) #Constructs the string from paired eulerian walk.
'CTCTACGAGCATTGGCATAACCGTAGCACCTCAAGCGTATTCTCGCTCGCCGATTGTCAGTGCCGCCAGGAGGCCCTTGACAAACGTACAACTAATGAGGCTTCCTAATCCCTCGTTTGGTAATGAGGAGTATCATGTTGATATCGGCCGCCTGCCGACTAACAGGTACTTTGGCTGCTGTACGAGGTGAAAATAGCGGCTGACATTGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCGATATAAATTGCATTACGCTACTAGGACCGATTGACTCGGGGGGTGCCAGTTGGTGACCCACTACAATCTACGCCCTTGCGACTTTGCCTAACATACTCGTCAGGGCAAAGTGTTGGGTACATGTGTTTCACCACAATGCTCTTGGGTCTTGACAACAGGCAAAGGTCTTTAATGCGTTCCCTTGGCGAGTGTGGCGCGGTAATCGTCGAATCGCATGAGACGAGCAGGGAAAAACTTGGACGTTACATCCGGCAATAGATTCCCACGATCTGCCCTTGCCCATATGCACAGATGATATTACAGCTAGGATCCCCAGGAGCGGCATGTGTATCACTAGTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCAAGGCTTGACCGCACCTCATAGACTCTCGTAACGTAGTAAATCTGTATTAACAGGAGCTATTCAAACTGCTCCTTGGGTGCTTCCTTGCACGAGCCGAAATCGCCTTCTAGTCAATGAAAACTAGGAGAGACTCCTAGCATCTGCGATACCGCCCCTTGATTTACCAACGAGGTAAGATGTCGAATTTCTTTTCTGTGGGCTACCCCGGAGTAGCAAGAGGTGAAGAGTATTCTTTGATCTAGGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGTCGATTTAGACAGTTAACTGATGGATCTTAAGGGTTAGAGCCATTCCCAGATTCATGTCGGGACTTTCTGTAGGTCACGGCCCCAACGGACAACCGGTCGAACTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGCTCTAAATCCAGGTGGCTATTGAGGTAAGCGGAGCTTTGATCTCGGCACGTCGACGGGAACCACATATATCTTCCGCCGGTACCTTCGGTAGCGCTGACCCGGAGGATCGGCGTGGAATCTCAGCTTAAGCCGTGTCTGAGTGGTTAGGCCTTTTAACCTGGCCACTCCCTCGACGGAAATTTTTGGACGCTAGCAAGAGTCAGACTAACGTACCCTAGACAGGGCAAAGTCGGAATCGACTCTGAGGTCTTTGTATCATATTGAGGGTGCAACACAAAGCGTCTTAGTCACTGACAGTTGGTTGCACTGTTTACCCCCAATCTCGTGCGGACTTGGAATTCGTAGGCACAGTTCTTGGCAGCCGGGAAAAAGAGCCACTTTCATCGTCGAAACTCTCGATTGGTCAGCCCCGGTTTGAACCACGCCGACGTCGCCCTCCCAACAGAACACCCCACCCGCTGAGGCAAGTGGACCGTTTGCGCTTCCGCACGCCGCTATCGCGTACGCAGTTGAGAGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCACCAATAAAAAGCACCGACTTTATGCGAGTACGAACCAGAGGAACGGGGATGATCGATCGTGTACTGGGGTACGTCAAAAGGAGGTAACGGCCGTTACTGACCTGGTTGCAATGGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGAGGGCGCAGGTGAGCGCGTCCCGTTGTCTCTATGATACCAACAAAGCACCCAACCTCTGGTTTTTTCCCTGTTATCGACCTCGGCATTGGAGTCTTTTAAGTTGAATAGCTACACAAAACCAAAAGAATTATGCCGCCGTAGTGCGGGTGGCCCGCCCGCACCGAAGCGGACGCGTTTTCATGAGTGCGGAGAGGCGAGCGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGCACTTTTCCCTTCATTGTACCTTTCATCTCCGCATAGCTATTTGAGATCAGTATGTCGTACTGATTTTAAAGATCACCAAAGCATCGGAGGACACATTATGAGTGTCTCAGGAGCTATAGATATGCTAGGCAGGGTCGGTCAGTCTCTGTCCGACGGAGCACAAGGCACCTCACCCCCAAGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGTGTTTCTCGATTGAATAAACCTGGCCCGTCAATTACTGAAGTAGCTTAAACGTTACGACGGCAGTCCTCATCACCATCCTGGCTCGAACTACCGAGCAGCGACGAGTTGCGCTCAACACCTTTTTTTGATACATCCAGATAGTCTACAGAATGCACGGATTGAGAATTCGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGAGTGAGACTGTTCTACACATAAGCTTAAACACCATAAAGTTGCTTGAAGGGTCCTTTCTGCGCGAGTTCGGAAAATATGCGCCCAGGTAGCGTTGGCTTTTCATATAGCGGAGCCCGACTAACTGTTCCACCCCCGTCGAAACGCCATTGAAGCGGACGCAAGTTCATCCCTGCCAAATTGAGACAGGTTTGTACGTCCTAAGAAGGGGTTTGTACCTTTGAGATTGTTTATGATCTATCTTGGCAGCCCGAGCAGAGTATCTTGTTTTAATATACGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGCTGGCACACGGTACAATGTGCCAGGAGTTATTCTCATCGATGCTTCGACATTGACTTCGCTCTTTGAGTTAACAACTCGCGGAGGCACCTTCTCAGTTGCATGACTTGATGTCGTCATGAAAATGCTCGGTGGCTTCTCACGGGGACTCACCCAACTAAAACCCCGACGAACGGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGACCGCCAGCGACAAATATAATTACGGATGTCGGGGGCTGCTTCGCTGTTAAACGATGAAGAATCTGAGATTTGACAAGGCAAGTCCATAATCGTATGGTTCCTCCTGAGGCGGGGATCATCAGGGACTTTGGTCCTGCCTTAGGGACGTATTAAATTAAATTATCTCGGCCGCGGCTCAGGTTCATTAAAAAAGCGGGGCTTGCTATTGTCCCCCAACCCTTCGTGTTGAGAAACGAGAGAATAGGGACGAGCGCACGACGACACATAGAGATTGTGTGCCACCAGTGCTCCGAGTACCCCTTATTCGTCTCACTTTGAGTTTAATCCTACTATGTTTGGTACAACAGATGTCATAGGTTACGTTGAACCCGATAGCTAGTTCCAGCACGAACCCGCTAAATGAGACACCCAAGAATCGATCCCCTAGTTCGCGCCATGTAGGCTTTAGTAAGTCGAGGTCATACTTTACTAAAAAAAATTTTAGGGTACTACTTCGAGCATGCCCCTTGTCCCTTACCCTTATACGTCAGGATCTTGATTGGTCAGTGGGCCACAAGGATGGCTAATCTTAAGAGCGTCGCTCTATTTCCGGAACGGCAACAAATTCAAACCGGTACGCCTTGTCTACATATCTACAAGCGATAAGGAATAGAAGCCGGTTCAGTGTGTCCAGTTCCCCACAGCGTCATCCGCTTGAGTCGGCGCAAGCCCGGGTCACAACTATGAGAAATCCGATATAACCTCTCATTTGTATACCTCAATCAATACGGCCGAATTTGTTTTGGCTCCCGAAACGCCACACACACGCGTCACCCAGCCAATTACTCCGTTGGGGGTAGATCGGGGCGTGATACCAGTAAAGTTCAACTGCAAGGGTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGAGCCAATCTAAGGTGGGTTAGGTGAGCCCATTTGATGCCTCGCAAAGGATTATTTGCAAGATTGATCCTCGGGTTAACTTGTAATCTCACCGGTTGAAAGTAAACCTAGATACGCAGTATCACTTACTCCTCTACCGTTCCTAGAGGTAAGTTTTTGAACCAGTAACAACAATTAGACAATCACATAAAATTCTATCAATTTCTTAACGCTTAGATCCTTCTTCTACCTTATAACTGTTCCCACGCTACCGTGCTTGTTCGGTCGGCCACGCGAAACCTCTTGCGGCTAACCAGTATATCCCGGGCTTCGAGCGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGGCATTTAGCGCGCGTTACCGCTCCACTTCACTCTCCGTGGGGATCTCTCATGAGCTGGCGGCTCCCAGATTGCCCTCCACAGTCAGAATGGGTGCGCAGAGCATGACGACAAAACTCGAACTGCTACACCTCTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGGGATTTTCCCCTCTTTTGGTCATGCTGCCGCCATAATATAAGCCCAGTGCGATCTCATTTGCCATTTCGTTTTTCGCTTAGCAACCTATCTGAGTGCTCGTGTAGCAAATGACGCCGAATGTTATGAGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCGGTTATCGTCTCGAGGATGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGTCGGTCGATTATTGACTGACAGTTGATTCCCTGCATATGGCGGGTATAGGACACCCTCAAGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGCCATACCGTATAACGGGTGGTAGAGAGCATCTCTTCTATGCTTGGGGCCGATTAAGGTAGACTGAGGAGTCAATCCGATCCTAGTATATAAGGGCGAGCTCGGCTACTGTGACGACCTAGTTACATAATACTCGCTCTTCTATTATTCAGTAACTTCATTATAGTTCAGACTTGTTGCACGCTTGCTATCTACGAAAAGTAGCCTCCTATCGCTAGGAGCACGTTTCCAGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAAGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCCTGTACATCTGATGGCATTGTCTGAGTCCCGCCGAAACACTTCTGAAAAGGTTGCCGCTATCTTTAATTCCGTACCCGACGCTATTGTGTGAAATTGGATTTCAAAAGATAAACTGTCGCTCAACTGAACTTTCTTGAACTCGGTACCGGCGTACTATAGTAGGACTTTCTAGGCTGATACGTGACCGACCTATGCAATTACTATAAGTTCGCTGACGCGTCCGCCCGTGACCTTATCTGTGCCTGCCACGGTCGTCTGATGTGAGCGTTGTGGAAGGGAGAAATCGGGCACTCCTGTAACCCAAGTAAAGCCTTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGGCAGTTTACGCGATTCTAACTTGACTGGAGATTGCTCGGGACTCCACGAACCAAGTGTCACTCTCGCAGTGTTGCCTCGGCATGATGATGCGTATCGTCCCTTTTTTCTCCCGAAACCGCCGTTACCGCAACCCGCCAACCTTCGGAGTAGGGTCAACGCATTCGATACGTGGATAGAACTCATACTAATGTTTAGCGCCGCAATTGGCGGGTTCTCATGAATCAGCTCGCAATGTTGAACCTTTTTGCTTCTTGACATGTTTTATCCGCTGAGTAGAAGATCGCAATCCTAAAGAGACAGATGACATTCCGAGAATTTAGCGATAGCAGTGTCGTCGCCGGGAGAAGTTACACTAGGTTACGCTTCGACCACCTCCGGGCAGACGCGTTGG'