# Let's make a class that implements an inverted (substring) index where the map data
# structure is a sorted list of (substring, offset) pairs. Query w/ binary search.
import sys
import bisect # for binary search
class IndexSorted(object):
def __init__(self, t, ln, ival):
""" Create index, extracting substrings of length 'ln' """
self.t = t
self.ln = ln
self.ival = ival
self.index = []
for i in range(len(t)-ln+1):
self.index.append((t[i:i+ln], i)) # add <substr, offset> pair
self.index.sort() # sort pairs
def query(self, p):
""" Return candidate alignments for p """
st = bisect.bisect_left(self.index, (p[:self.ln], -1)) # binary search
en = bisect.bisect_right(self.index, (p[:self.ln], sys.maxsize)) # binary search
hits = self.index[st:en] # this range of elements corresponds to the hits
return [ h[1] for h in hits ] # return just the offsets
index = IndexSorted('CGTGCCTACTTACTTACAT', 5, 4)
index.query('CTTACTTA') # index: give us hints on where to look for this pattern
[8, 12]
index.query('TTTTTTTT') # index: give us hints on where to look for this pattern
[]
# Now let's make a similar class but using a Python dictionary instead of a sorted
# list. A Python dictionary is basically a hash table.
class IndexHash(object):
def __init__(self, t, ln, ival):
""" Create index, extracting substrings of length 'ln' """
self.t = t
self.ln = ln
self.ival = ival
self.index = {}
for i in range(len(t)-ln+1):
substr = t[i:i+ln]
if substr in self.index:
self.index[substr].append(i) # substring already in dictionary
else:
self.index[substr] = [i] # add to dictionary
def query(self, p):
""" Return candidate alignments for p """
return self.index.get(p[:self.ln], [])[:]
index2 = IndexHash('CGTGCCTACTTACTTACAT', 5, 4)
index2.query('CTTACTTA') # index: give us hints on where to look for this pattern
[8, 12]
index2.query('TTTTTTTT') # index: give us hints on where to look for this pattern
[]