#' '.join(str(i) for i in array)
!wget https://stepic.org/media/attachments/lessons/2/Vibrio_cholerae.txt
--2015-09-20 11:39:23-- https://stepic.org/media/attachments/lessons/2/Vibrio_cholerae.txt Resolving stepic.org... 54.194.188.166 Connecting to stepic.org|54.194.188.166|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 1108251 (1.1M) [text/plain] Saving to: `Vibrio_cholerae.txt' 100%[======================================>] 1,108,251 184K/s in 6.8s 2015-09-20 11:39:32 (160 KB/s) - `Vibrio_cholerae.txt' saved [1108251/1108251]
def PatternCount(text, pattern):
"""Given: {DNA strings}} Text and Pattern. Return: Count(Text, Pattern)."""
count = 0
for i in range(len(text) - len(pattern) + 1):
match = True
for j in range(len(pattern)):
if text[i+j] != pattern[j]:
match = False
if match:
count += 1
return count
PatternCount('CGCGATACGTTACATACATGATAGACCGCGCGCGATCATATCGCGATTATC', 'CGCG')
5
PatternCount('GCGCG', 'GCG')
2
text = 'AATTCTTAGTGTATTCTTACATTCTTAATTCTTAAGCCGGCATTCTTATTATTCTTAATTCTTACAATTCTTACGTATTCTTATGATTCTTAATTCTTAAATTCTTAATGACGTCGATTCTTAGCTATTCTTAAGATTCTTAATTCTTAGATTCTTAGAAGTGCATTAACGCCATTCTTAATTCTTAGCGATTCTTATTCAATTCTTACCAATTCTTACAATTCTTATGATTCTTAAGGAATTCTTATATTCTTACGATTCTTATCAGATTCTTATCAATTCTTAATTCTTACCCGATTCTTAACAGCGTCCATTCTTAGCAAGCATATTCTTAGATTCTTAATTCTTAATTCTTAATTCTTAATTCTTAAGGGATTCTTATTGTGTATTCTTAACTCGGCGATTCTTATATTCTTACCAACCGACATTCTTATAGATTCTTAATTCTTACTATTCTTAAAACCCATTCTTATATTCTTAATTCTTAATTCTTATCATTCTTATTATTCTTAGGTCAGGGCTTATTCTTAATTCTTAATTCTTAAAGAACCATTCTTACATTCTTATATTCTTATTTATTCTTAATTCTTATGAATTCTTAGGCCGTAATTCTTAATTCTTATACCGAATTCTTACGGCAGATTCTTAATTCTTATTATTCTTAATTCTTACAGTCCACATTCTTATAATCATAGGATTCTTAATTCTTAGATTCTTATGTGGGTATTCTTAGGATTGGGTATTCTTATGAGAAATTCTTATATTCTTAATTCTTACTTGTCTGCATTCTTACAATTCTTAATTCTTAAGCATTCTTAATTCAGCCACTACAAAAATTCTTAATCATTCTTACCTATTCTTAATTCTTAGGGGCTAATTCTTAAGGAATTCTTAAATTCTTATTAACCCACTCTAAGACATTAGCATTCTTAATTCTTA'
pattern = 'ATTCTTAAT'
PatternCount(text, pattern)
27
text = 'GAACTCTGTTAAACTCTGCCAACTCTGAGGAACTCTGGTCCCCGAACTCTGCAGGTCTAAACTCTGTAACTCTGTCTGAACTCTGCGGAACTCTGCCCAACTCTGAAACTCTGGTGAAACTCTGAACTCTGCTGAACTCTGTTACTAACTCTGCACTCCCTATGCAGCGAACTCTGCGGTGTAGGAACTCTGGGAAACTCTGGCACAACTCTGTCAACTCTGGTGAAACTCTGAACTCTGAACTCTGAACTCTGTGTAACTCTGTGCAAAACTCTGGGTGAACTCTGCATTGGCCGAATCAATCTAACTCTGCAACTCTGAACTCTGCCAAACTCTGCCAAAAACTCTGGAAACTCTGAAACTCTGGCAACTCTGTTGACTTGGAACTCTGAAACTCTGTTGCCAAACTCTGAACTCTGTGCCAACTCTGCGCTAAACTCTGAACTCTGATGCCAACTCTGAACTCTGTCGGAACTCTGGAGAACTCTGTGTTCCCAAACTCTGCTAACTCTGGTAACTCTGAACTCTGCGGACAAGTCAACTCTGCAACTAACTCTGCAACTCTGAACTCTGAACTCTGCCTAACTCTGCAACTCTGAACTCTGTATAAACTCTGAACTCTGTTTGGCCTTCGCCCCAAACTCTGGAACTCTGATATTATAGCCCCTGGTTAACTCTGAAACTCTGAACTCTGAGAGAACTCTGGAGAACTCTGAACTCTGCCAACTCTGAACTCTGTCACCAACTCTGTGGTCGAACTCTGAACTCTGGACAACTCTGCCAACTCCCAACTCTGAAAACTCTGAACTCTGTACAGAACTCTGACCGTGGAACTCTGAGCCCAGTAACTCTGGCCAGTAACTCTGAAACTCTGATTAACTCTGAAACTCTGAACTCTGAACTCTGGTTAACTCTGAAACTCTGTAGAACTCTGAAACTCTGATGGAAACTCTGCGAACTCTGCAACTCTGGAACTCTGAACTCTGGGAACTCTGGGCAACTCTG'
pattern = 'AACTCTGAA'
PatternCount(text, pattern)
30
text = 'CCAGCGTTTTGCAATAGCGTTTAGCGTTTAGCGTTTGCCAAGCGTTTACCAGCGTTTAAGCGTTTTAAAAGCGTTTCGCAGTAGCGTTTAGCGTTTAGCGTTTGTTAGCGTTTACCTGAGCGTTTATGAGCGTTTTCAGCGTTTGGCGGAGCGTTTAGCGTTTAAGCGTTTGGTGGGATAGCGTTTAGCGTTTATGAAGCGTTTCGTAGAGCGTTTGAGCGTTTGAGCGTTTACAGCGTTTAAGCGTTTAGCGTTTAGCGTTTATAGCGTTTGAGCGTTTGAGAGTAGCGTTTGGTAAGCGTTTAGCGTTTCCATGAGCGTTTGGTAGGCCCAGCGTTTAGCGTTTAGCGTTTAGCGTTTCTCCCTAGCGTTTATAGCGTTTAGCGTTTTTGAATAGCGTTTCAGCGTTTGCCTATCAGAGCGTTTAGCGTTTGGTAGCGTTTAAAGCGTTTTCAGCGTTTAGCGTTTAGCGTTTAAGACGACCAGCGTTTCTTAGAAGCGTTTCACTAGCGTTTAGCGTTTAGTCTGGGTTAGCGTTTAGCGTTTAGCGTTTCAGCGTTTAGCGTTTGTTAGCGTTTAGCGTTTAGCGTTTATGTGAAGCGTTTAAGAGCGTTTAGCGTTTAGCGTTTGAAATGGGCTAAGCGTTTCAGCGTTTCAGCGTTTGGAGCGTTTAGCGTTTCCAGCGTTTGCCCCGGAATATTGTCCAGCGTTTGAGCGTTTAGCGTTTAAGCGTTTAAAGCGTTTCAGCGTTTAACGAGCGTTTAGCGTTTAGCGTTTAGCGTTTGTTCAAGCGTTTCTAGCGTTTTAGCGTTTATGAAGCGTTTAAGCGTTTAGCGTTTGCCAGCGTTTAGATCGCCCGAGCGTTTGGAGCGTTTAGCGTTTTCAGCGTTTAGCGTTTAGCGTTTAGCGTTTAGCGTTTAGCGTTTGGCTGCGAGCGTTTGTATGCAGCGTTTCAGCGTTT'
pattern = 'AGCGTTTAG'
PatternCount(text, pattern)
38
def FrequentWords(text, k):
""" Input: A string Text and an integer k.
Output: All most frequent k-mers in Text."""
frequentPatterns = []
max_count = None
for i in range(len(text) - k + 1):
pattern = text[i:k+i]
count = text.count(pattern)
if max_count is None:
frequentPatterns.append(pattern)
max_count = count
elif max_count == count:
frequentPatterns.append(pattern)
elif max_count < count:
frequentPatterns = []
frequentPatterns.append(pattern)
max_count = count
return set(frequentPatterns)
text = 'CGAAAGGCCGGCGTGGTTAAACGTCCTTCCTCCCTAAACGTCTAAACGTCCTGCCTTCCGGCGTGGTCGAAAGGCTAAACGTCCGGCGTGGTCTGCCTTCTAAACGTCCTGCCTTCCGAAAGGCCTGCCTTCCTTCCTCCCCGGCGTGGTTAAACGTCCTTCCTCCCCGGCGTGGTCGAAAGGCCTTCCTCCCCTTCCTCCCCGAAAGGCCTGCCTTCCGAAAGGCCGGCGTGGTCTTCCTCCCCGGCGTGGTTAAACGTCCTTCCTCCCCGAAAGGCCTTCCTCCCCGGCGTGGTCGGCGTGGTCGAAAGGCTAAACGTCTAAACGTCCGGCGTGGTCTTCCTCCCCTTCCTCCCCGGCGTGGTCTTCCTCCCCTGCCTTCCTGCCTTCCTTCCTCCCCGGCGTGGTCGAAAGGCCTGCCTTCCTGCCTTCCTTCCTCCCTAAACGTCCTGCCTTCTAAACGTCCGGCGTGGTCTTCCTCCCCGAAAGGCCGGCGTGGTCGGCGTGGTCGAAAGGCTAAACGTCCTTCCTCCCCTGCCTTCTAAACGTCCGAAAGGCTAAACGTCCTTCCTCCCCGGCGTGGTCGGCGTGGTCTTCCTCCCTAAACGTCTAAACGTCCGGCGTGGTCGGCGTGGTCTTCCTCCCCTGCCTTCTAAACGTCCGGCGTGGTCTTCCTCCCCTTCCTCCCCTGCCTTCCTGCCTTCCGGCGTGGTCTGCCTTCTAAACGTCTAAACGTCCGGCGTGGTTAAACGTCCTGCCTTCCGGCGTGGTCTTCCTCCCCGGCGTGGTCGGCGTGGTCTGCCTTCCTTCCTCCCCGAAAGGCCGAAAGGCCGAAAGGCCGGCGTGGTCGGCGTGGTCGAAAGGCCTGCCTTCCGGCGTGGTTAAACGTCCGAAAGGCCGGCGTGGTCTTCCTCCCTAAACGTCCGAAAGGCCGGCGTGGTCTTCCTCCCCTTCCTCCCTAAACGTCCGAAAGGC'
k = 13
print FrequentWords(text, k)
set(['TGGTCTTCCTCCC', 'GCGTGGTCTTCCT', 'CGTGGTCTTCCTC', 'GTGGTCTTCCTCC', 'GGCGTGGTCTTCC', 'CGGCGTGGTCTTC'])
FrequentWords('CGGAGGACTCTAGGTAACGCTTATCAGGTCCATAGGACATTCA', 3)
{'AGG'}
%%time
text = 'CTGCCCGGTAGATGCATTCAGCCAGATAGATGCATTGCATTTAGCTGCCCGGCTGCCCGGGCATTTAGCTGCCCGGAACATCGCTAAACATCGCTACAGCCAGATAGATGCATTCAGCCAGAGCATTTAGCTGCCCGGCAGCCAGATAGATGCATTAACATCGCTAGCATTTAGAACATCGCTACTGCCCGGCTGCCCGGCAGCCAGACTGCCCGGCAGCCAGAAACATCGCTATAGATGCATTCAGCCAGAGCATTTAGCAGCCAGACAGCCAGACAGCCAGAAACATCGCTAGCATTTAGTAGATGCATTTAGATGCATTGCATTTAGCAGCCAGAAACATCGCTAGCATTTAGAACATCGCTACAGCCAGATAGATGCATTGCATTTAGTAGATGCATTGCATTTAGTAGATGCATTTAGATGCATTGCATTTAGCTGCCCGGGCATTTAGCTGCCCGGTAGATGCATTCTGCCCGGCTGCCCGGAACATCGCTAAACATCGCTAGCATTTAGCAGCCAGAGCATTTAGAACATCGCTAAACATCGCTAAACATCGCTACAGCCAGACAGCCAGACAGCCAGAGCATTTAGCTGCCCGGCTGCCCGGCAGCCAGAGCATTTAGTAGATGCATTGCATTTAGGCATTTAGCAGCCAGACAGCCAGACTGCCCGGTAGATGCATTCAGCCAGATAGATGCATTCAGCCAGATAGATGCATTTAGATGCATTGCATTTAGGCATTTAGAACATCGCTACAGCCAGAAACATCGCTAGCATTTAGTAGATGCATTCAGCCAGACTGCCCGGGCATTTAGAACATCGCTATAGATGCATTCAGCCAGACAGCCAGATAGATGCATTGCATTTAGCAGCCAGAGCATTTAGAACATCGCTATAGATGCATTCAGCCAGAAACATCGCTATAGATGCATTAACATCGCTATAGATGCATTCAGCCAGACAGCCAGACTGCCCGG'
k = 11
FrequentWords(text, k)
CPU times: user 3.77 ms, sys: 191 µs, total: 3.96 ms Wall time: 3.8 ms
def patternToNumber(Pattern):
symbolToNumber = {'A':0, 'C':1, 'G':2, 'T':3}
n = len(Pattern)
if n == 0:
return 0
elif n == 1:
return symbolToNumber[Pattern]
else:
return 4*patternToNumber(Pattern[:-1]) + symbolToNumber[Pattern[-1]]
patternToNumber('ATGCAA')
912
patternToNumber('ACGGCCTCACCTTAGA')
442308552
patternToNumber('AAGCGGTAA')
9904
patternToNumber('TTAGCTGAAGGCGTACAACTAATCA')
1066391637859380
def numberToPattern (number, length):
letter_value = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}
sequence = ''
for i in range(length):
## take the remainder after dividing by 4, convert it to a base and prepend to sequence
sequence = letter_value[number % 4] + sequence
## now do a floor division to get ready for the next base
number = number // 4
return sequence
def numberToPattern(n, kmer): numberToSymbol = {0:'A', 1:'C', 2:'G', 3:'T'} pattern = '' while n > 0: remainder = n%4 pattern = numberToSymbol[remainder] + pattern n = n/4 if kmer - len(pattern) == 0: return pattern else: return (kmer - len(pattern))*'A' + pattern
numberToPattern(912, 6)
'ATGCAA'
numberToPattern(6254, 9)
'AACGACGTG'
numberToPattern(0, 1)
'A'
numberToPattern(5437, 7)
'CCCATTC'
numberToPattern(5437, 8)
'ACCCATTC'
patternToNumber('ACGCGGCTCTGAAA')
26900352
def computingFrequencies(text , k):
array =[0]*((4**k))
for i in range(len(text) - k + 1):
pattern = text[i:i+k]
j = patternToNumber(pattern)
array[j] = array[j]+1
return array
computingFrequencies('ACGCGGCTCTGAAA', 2)
#2 1 0 0 0 0 2 2 1 2 1 0 0 1 1 0
[2, 1, 0, 0, 0, 0, 2, 2, 1, 2, 1, 0, 0, 1, 1, 0]
max(ComputingFrequencies('ACGCGGCTCTGAAA', 2))
2
def fasterFrequentWords(text , k):
frequentPatterns = set()
frequencyArray = computingFrequencies(text , k)
maxCount = max(frequencyArray)
for i in range(0,4**k-1):
if frequencyArray[i] == maxCount:
pattern = numberToPattern(i, k)
frequentPatterns.add(pattern)
return frequentPatterns
%%time
text = 'CGAAAGGCCGGCGTGGTTAAACGTCCTTCCTCCCTAAACGTCTAAACGTCCTGCCTTCCGGCGTGGTCGAAAGGCTAAACGTCCGGCGTGGTCTGCCTTCTAAACGTCCTGCCTTCCGAAAGGCCTGCCTTCCTTCCTCCCCGGCGTGGTTAAACGTCCTTCCTCCCCGGCGTGGTCGAAAGGCCTTCCTCCCCTTCCTCCCCGAAAGGCCTGCCTTCCGAAAGGCCGGCGTGGTCTTCCTCCCCGGCGTGGTTAAACGTCCTTCCTCCCCGAAAGGCCTTCCTCCCCGGCGTGGTCGGCGTGGTCGAAAGGCTAAACGTCTAAACGTCCGGCGTGGTCTTCCTCCCCTTCCTCCCCGGCGTGGTCTTCCTCCCCTGCCTTCCTGCCTTCCTTCCTCCCCGGCGTGGTCGAAAGGCCTGCCTTCCTGCCTTCCTTCCTCCCTAAACGTCCTGCCTTCTAAACGTCCGGCGTGGTCTTCCTCCCCGAAAGGCCGGCGTGGTCGGCGTGGTCGAAAGGCTAAACGTCCTTCCTCCCCTGCCTTCTAAACGTCCGAAAGGCTAAACGTCCTTCCTCCCCGGCGTGGTCGGCGTGGTCTTCCTCCCTAAACGTCTAAACGTCCGGCGTGGTCGGCGTGGTCTTCCTCCCCTGCCTTCTAAACGTCCGGCGTGGTCTTCCTCCCCTTCCTCCCCTGCCTTCCTGCCTTCCGGCGTGGTCTGCCTTCTAAACGTCTAAACGTCCGGCGTGGTTAAACGTCCTGCCTTCCGGCGTGGTCTTCCTCCCCGGCGTGGTCGGCGTGGTCTGCCTTCCTTCCTCCCCGAAAGGCCGAAAGGCCGAAAGGCCGGCGTGGTCGGCGTGGTCGAAAGGCCTGCCTTCCGGCGTGGTTAAACGTCCGAAAGGCCGGCGTGGTCTTCCTCCCTAAACGTCCGAAAGGCCGGCGTGGTCTTCCTCCCCTTCCTCCCTAAACGTCCGAAAGGC'
k = 13
print fasterFrequentWords(text, k)
set(['TGGTCTTCCTCCC', 'GCGTGGTCTTCCT', 'CGTGGTCTTCCTC', 'GTGGTCTTCCTCC', 'GGCGTGGTCTTCC', 'CGGCGTGGTCTTC']) CPU times: user 8.98 s, sys: 5.95 s, total: 14.9 s Wall time: 16.1 s
def findingFrequentWordsBySorting(t,k):
frequentPatterns = set()
index = [0] * (len(t) - k + 1)
count = [0] * (len(t) - k + 1)
for i in range(len(t) -k + 1):
pattern = t[i:i+k]
index[i] = patternToNumber(pattern)
count[i] += 1
index.sort()
for i in range(1,len(t) -k + 1):
if index[i] == index[i-1]:
count[i] = count[i-1] + 1
maxCount = max(count)
for i in range(len(t) -k + 1):
if count[i] == maxCount:
pattern = numberToPattern(index[i],k)
frequentPatterns.add(pattern)
return frequentPatterns
%%time
text = 'CGAAAGGCCGGCGTGGTTAAACGTCCTTCCTCCCTAAACGTCTAAACGTCCTGCCTTCCGGCGTGGTCGAAAGGCTAAACGTCCGGCGTGGTCTGCCTTCTAAACGTCCTGCCTTCCGAAAGGCCTGCCTTCCTTCCTCCCCGGCGTGGTTAAACGTCCTTCCTCCCCGGCGTGGTCGAAAGGCCTTCCTCCCCTTCCTCCCCGAAAGGCCTGCCTTCCGAAAGGCCGGCGTGGTCTTCCTCCCCGGCGTGGTTAAACGTCCTTCCTCCCCGAAAGGCCTTCCTCCCCGGCGTGGTCGGCGTGGTCGAAAGGCTAAACGTCTAAACGTCCGGCGTGGTCTTCCTCCCCTTCCTCCCCGGCGTGGTCTTCCTCCCCTGCCTTCCTGCCTTCCTTCCTCCCCGGCGTGGTCGAAAGGCCTGCCTTCCTGCCTTCCTTCCTCCCTAAACGTCCTGCCTTCTAAACGTCCGGCGTGGTCTTCCTCCCCGAAAGGCCGGCGTGGTCGGCGTGGTCGAAAGGCTAAACGTCCTTCCTCCCCTGCCTTCTAAACGTCCGAAAGGCTAAACGTCCTTCCTCCCCGGCGTGGTCGGCGTGGTCTTCCTCCCTAAACGTCTAAACGTCCGGCGTGGTCGGCGTGGTCTTCCTCCCCTGCCTTCTAAACGTCCGGCGTGGTCTTCCTCCCCTTCCTCCCCTGCCTTCCTGCCTTCCGGCGTGGTCTGCCTTCTAAACGTCTAAACGTCCGGCGTGGTTAAACGTCCTGCCTTCCGGCGTGGTCTTCCTCCCCGGCGTGGTCGGCGTGGTCTGCCTTCCTTCCTCCCCGAAAGGCCGAAAGGCCGAAAGGCCGGCGTGGTCGGCGTGGTCGAAAGGCCTGCCTTCCGGCGTGGTTAAACGTCCGAAAGGCCGGCGTGGTCTTCCTCCCTAAACGTCCGAAAGGCCGGCGTGGTCTTCCTCCCCTTCCTCCCTAAACGTCCGAAAGGC'
k = 13
print findingFrequentWordsBySorting(text, k)
set(['TGGTCTTCCTCCC', 'GCGTGGTCTTCCT', 'CGTGGTCTTCCTC', 'GTGGTCTTCCTCC', 'GGCGTGGTCTTCC', 'CGGCGTGGTCTTC']) CPU times: user 12.3 ms, sys: 7.11 ms, total: 19.4 ms Wall time: 14 ms
def reverseComplement(s):
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N':'N'}
t = ''
for base in s:
t = complement[base] + t
return t
reverseComplement('GATTACA')
'TGTAATC'
#Naive exact match algorithm
def naive(p, t):
occurences = []
for i in range(len(t) - len(p) + 1): #loop through every position in t where p could start
match = True
for j in range(len(p)): #Loop over characters
if t[i+j] != p[j]: #Compare characters
match = False #mismatch; reject alignments
break
if match:
occurences.append(i) #all characters matched record
return occurences
t = 'TGAACGTCCTGAACGTTGCGGTGAACGTAAAGGTGAACGTCTTTCGGGATGAGTGAACGTATGAACGTATTTGAACGTCCTGAACGTTGAACGTAGGTGAACGTTGAACGTTTTGAACGTATGAACGTTGAACGTGCCCGGATGAACGTTTGAACGTTGAACGTACTTGAACGTACTGAACGTTGAACGTTTGAACGTCCATGAACGTTGAACGTTGAACGTGAGTGAGTGAACGTTTTCTGAACGTCGCGGTGAACGTTGAACGTTGAACGTTGAACGTGGGAGTGAACGTCATGAACGTGTGAACGTTGAACGTTGAACGTCATTTGAACGTGAATGAACGTTGAACGTATCTGAACGTTCAACCTGAACGTCACTAACATGAACGTTGAACGTAATTGAACGTTGAACGTCTTGAACGTTGAACGTTGAACGTTGAACGTGTCTGAACGTGTGAACGTCACGGATGACGTGAACGTTGAACGTGTGAACGTATGAACGTTACTGAACGTTAGAAGGAGCGGTGAACGTTGAACGTTGAACGTTTGAACGTATGAACGTTGAACGTCTGAACGTTCCTGAACGTTAGTGAACGTCTGAACGTCTTTGAACGTGCTGAACGTATGAACGTTGAACGTTGAACGTCGGCAACGTGAACGTAATGAACGTTGAACGTATGGCATGAACGTGGTCGTGAACGTGTGAACGTCTGAACGTCGTATGAACGTGGCGTTGAACGTCTGATCGTGAACGTTGAACGTTGAACGTTGGCATACGCTATTGAACGTCTCTGAACGTATGAACGTTGATGAACGTTTGAACGTGTGTCTTGTGAACGTTTTGAACGTGTGTGAACGTTGAACGTAGGATGAACGTTCGGTGAACGTGTGAACGTTGTGAACGTTATGAACGTCACCCACTGAACGTGTGCTGAACGTCGGCTGGATGAACGTTGAACGTTGAACGTTTGAACGTTGAACGTTGTCCTGAACGTCTCTCTGAACGTTGAACGTATGAACGTTGAACGTTGAACGTGTTCACTGAACGTGTGAACGTAACATGAACGTAGACATGTGAACGTAAGGAATGAACGTTGAACGTTCCATGAACGTTGAACGTGAGAATGAACGTTGAACGTACATGAACGTCGTGAACGTCTGAACGTTGAACGTTGAACGTTGAACGTATGAACGTTGAACGTTACGTGATGAACGTTTGAACGTCCATGAACGTTGAACGTGCGTGAACGTAGATTTCTACTGAACGTTGAACGTCCTGAACGTTGAACGTTAGTGAACGTGAGGTTGAACGTATGAACGTCCATGAACGTTTGAACGTTGAACGTTGAACGTATGAAGGCTTGAACGTCATGTTGAACGTGCTGAACGTGTGAACGTAAATTGAACGTTGAACGTCAGTCTGAACGTGCTGAACGTGTGAACGTGACATTGAACGTGTACTTTCTGAACGTTTATTGAACGTTGAACGTTGAACGTCTACCATGAACGTCGTGAACGTCCTGAACGTTGAACGTTGAACGTGTGAACGTGGTTGAACGTGCATGCCAATATGAACGTGTGAACGTCGGAACGTGAACGTACGTTTGAACGTCTGAACGTCGAATGAACGTTTTCTGAACGTATGAACGTCGCAGGTGAACGTTATTCCTGAACGTTATGAACGTAATGGTGAACGTTTACTCGACGTGGGCTGAACGTTGAATGAACGTTGAACGTTGAACGTCTACGTCTGAACGTGGTGAACGTAGTGAACGTTGAACGTGATTGAACGTTTTATGAACGTACGTTGAACGTCTCAGAATTGAACGTACTCATCTTGAACGTTGAACGTGAGTGAACGTTGAACGTTGAACGTTGAACGTGGTGAACGTGCCGCAGTGAACGTGTCCCATGAACGTTGAACGTCCTGAACGTTGAACGTTTGAACGTATGAACGTGTGAACGTAGTGAACGTTGAACGTTATGAACGTTGAACGTTGAACGTCATATGGCCGTGTCCGTGAACGTCGTGAACGTGCTGAACGTGGTGAACGTCAAGCCTGAACGTGACTGCTGAACGTTCGGGAGTAATTTCGTGTGAACGTTGCACATGAACGTGGTGAACGTCACATGAACGTGTCTGAACGTTTCTGAACGTTTTAGGTGAACGTGCTGAACGTATGAACGTTGAACGTCAACTGAACGTTAGTGAACGTCCGTGAACGTTTTGAACGTTGAACGTTGGCGTGAACGTTGTGTTGAACGTTGAACGTAGGTCCCTGAACGTACATGAACGTTGAACGTTGAACGTTTGAACGTACCCATGAACGTTGAACGTGCTTATGAACGTTGAACGTATACTGAACGTGTGAACGTAATGAACGTCTCTGAACGTCCTGAACGTGTGAACGTGTGAACGTATTGAACGTTCTGAACGTAGGGACCAAGCATGAACGTTGAACGTCTGAACGTTCAATGAACGTACGCTGAACGTCTGAACGTGGACTGAACGTGGCTGAACGTTCCTTCTGAACGTATGAACGTTGAACGTCAATGAACGTATGAACGTTGAACGTACATTGAACGTTGAACGTGCCTGAACGTTACGATGAACGTAATGCTGAACGTCTGAACGTATGAACGTTGAACGTTGAACGTTGAACGTTGAACGTGTGAACGTCAAGTTGACTGAACGTCGGTTGAACGTATGAACGTGTGAACGTGTTTGAACGTAGCGAGTTTGAACGTTGAACGTTGTTGAACGTGTGATGAACGTTGAACGTTGAACGTCTTTTTGAACGTTCATGAACGTCTGAACGTTGAACGTTGAACGTGTAGCTGAACGTATGAACGTCTGAACGTTGAACGTCACCTGAACGTATCCCTGAACGTCATGAACGTATTGAACGTACCCTCGTCTGAACGTTGAACGTATGAACGTTCGACTGAACGTCCACCTGAACGTGTGAACGTTGAACGTTGAACGTTTAACTGAACGTGAAATGTTGAACGTTGTGAACGTGTTGAACGTTGAACGTATGAACGTACGGTGAACGTAGTTTGAACGTCTAGTGAACGTGTGAACGTCGTGAACGTCCGGAATGAACGTCTTTACGTTGAACGTATGAACGTACATTGGATTGAACGTCTGAACGTATAGAGTGAACGTGTGAACGTTTGAACGTAGTTTTGAACGTCGAGCTGAACGTCGGTCTGAACGTACGAGTAACCATCATTTGAACGTTTGAACGTTGAACGTTGAACGTATGAACGTTGAACGTTGAACGTCGTGTGAACGTTATCTTTGAACGTTATGAACGTTGAACGTTGGGTGAACGTGTGAACGTGATTGAACGTGTGAACGTCGTCGGCAGCGTGAACGTTGAACGTCTGAACGTACTAAACTGAACGTTTGATGTGAACGTGCCTGAACGTTACTGAACGTTGAGGTTGAACGTGACCCAGTGAACGTTTGAACGTAGGCGATGCTGAACGTCGTGAACGTGTTTTGAACGTCGGTTGCGCTGAACGTCCCTGATCCTGAATGAACGTCTGAACGTTTGAACGTCTGAACGTTGAACGTATGAACGTGGGGGTGAACGTACTGAACGTTGAACGTCGTGAACGTGCGTGAACGTTGAACGTCCCTGAACGTTGAACGTTGAACGTGATTGAACGTGTGAACGTATAGTGAACGTGACTTGAACGTTGAACGTAGATGAACGTTGAACGTACCTGAACGTTAAAGTGAACGTTGCCACCTGAACGTTGAACGTTGAACGTTATTGAACGTTTGAACGTTGAACGTTGAACGTGGATGAACGTTGAACGTGCTGAACGTTGAACGTACTGGAGGGCTGAACGTACTGAACGTCGGTGAACGTGTGAACGTTGAACGTTGAACGTGTCTCTCGATGAACGTCATGAACGTCTGAACGTTGAACGTTCATTTTGAACGTTGAACGTCCATGAACGTGCTTGAACGTTGAACGTTCTAATTGAACGTGTGAACGTTGAACGTCGTCTCACTGAACGTGTGAACGTTTTGAACGTTGAACGTTCTGAACGTGCTGAACGTTGAACGTATGAACGTCGCAAGTTGAACGTATGAACGTCGGTGAACGTGTGAACGTTTGAGTGTGAACGTGAGCATTACATTGAACGTTTGAACGTGCTGAACGTCTGTGAACGTAACGCTGAACGTTTGAACGTCATAGTGAACGTTACATGAACGTTGAACGTCTTGAACGTACCGCCCTGAACGTCAGTGAACGTTATGAACGTTGAACGTATGATGAACGTGAATGACTGAACGTTTGAACGTACTAAGGAAACTGAACGTTGAACGTTGAACGTGCTGAACGTATTCACTGAACGTTCTGAACGTTGAACGTGTGAACGTGTGAACGTTGAACGTTATGAACGTTGAACGTTGAACGTACATGAACGTTGAACGTTGAACGTAGTGTCTTCTGAACGTTTATGAACGTTGAACGTTGAACGTAACGATGAACGTTGAACGTATTGAACGTCCGACGTGAACGTTTGTTGAACGTATTGAACGTGCTTGAACGTATTATGAACGTATGAACGTCTTGAACGTTGAACGTCAGGCTGAACGTTACATGAACGTGTAATGAACGTAATGAACGTTGAACGTGGGATGAACGTTTGAACGTAATTGAACGTTCTGGTTGTGAACGTCTGAACGTTGATTGAACGTTGAACGTCTGAACGTGAAATTAATTGAACGTTTGAACGTGGAGTGAACGTTGAACGTTGAACGTGGAGAAGATATCATGAACGTTGAACGTCGGACCTTGAACGTTATGAACGTTCATGAACGTAAAGTCTGAACGTCTATGAACGTATCAGTGAACGTCGGTGTGAACGTGGTGAACGTTGAACGTTGAACGTAGTGAACGTATTGAACGTTGTGAACGTTTGAACGTAGCTGTGAACGTTAGTGAACGTTGTGTTTAGTGAACGTTGAACGTACATGAACGTACCGCTGTGAACGTTTATGAACGTTTTGAACGTTGAACGTTGAACGTGTGAACGTAGTGAACGTGTGAACGTTGAACGTGCAGGCCCTGAACGTTGAACGTCGTGAACGTTTAGTGAACGTCCGGATGAACGTTCTGAACGTGTAGCTGAACGTGGCATGAACGTACGTTGAACGTAGTGAACGTATATGAACGTTGCGCTGAACGTATTGAACGTCTCTCATTGAACGTGTGAACGTCTGAACGTTGAACGTTGAACGTTTGAACGTCTGAACGTCCCGCGGCACTGAACGTTGAACGTCTAACTGAACGTTCTGAACGTTCGCTGAACGTGTGAACGTTGAACGTGAAGTATGTGAACGTATACTGAACGTCTGAACGTCTGAACGTTTGTGAACGTGATGAACGTCCGTGAACGTCATTGAACGTTTTGAACGTCTGAACGTAGGGTTGCTGAACGTGTGAACGTTACTGAACGTTGAACGTCGCTGAACGTTTGAACGTTGAACGTCTGAACGTCATGAACGTTGAACGTACTGAACGTGACTGAACGTATGAACGTGTATCTGAACGTCCTGAACGTAACTGAACGTTGAACGTTGAACGTTGAACGTATGAACGTGGACTGAACGTTGAACGTCACTTGAACGTTGAACGTTGAACGTTTTAACTGAACGTCTGAACGTGCTGTGAACGTCTCGCTTGAACGTAACATGAACGTTCATGAACGTTGAACGTGTTGTGAGTGAACGTTGAACGTTGAACGTATTGAACGTTTGAACGTGTGAACGTCGGCTGAACGTAAGATGAACGTTGAACGTGCTGAACGTTGGGTGAACGTGTTGAACGTCTATAGTGAACGTCACTTGAACGTTGAACGTAAGTGAACGTCATGTGAACGTCACGTGGTGAACGTTGACGTGAACGTTTCTGAACGTAGTTGAACGTCAAGTGAACGTGTGAACGTCGGAAGATCTGAACGTACGCCGTTCATTGAACGTATGAACGTGTGAACGTCAGATGAACGTGTCCAGCTGAACGTCTGAACGTCGGGGCATCAATCTGAACGTACCTGGGATGAACGTTGAACGTTGAACGTTTTGAACGTTCTTGAACGTTGAACGTTGAACGTCTGAACGTCAAGTGAACGTTGAACGTGTGAACGTTGAACGTGGCTATTGTGAACGTTGAACGTCTGAACGTCATGAACGTTTGAACGTTGAACGTGAGGCGGAATGAACGTGTATGAACGTCGCAGCTGAACGTTGAACGTCTTGAACGTATGAACGTTCCTGGCAGTGAACGTGTTGACGGCTATGAACGTATGAACGTTGAACGTTGAACGTTTCCGGGGCAGTGAACGTTGAACGTGGTGAACGTCGTCTGAACGTTGTGAACGTGTGAACGTTGTTGAACGTGTGAACGTATGAACGTCTGAACGTTCTGGCCAGCTATGAACGTGTGAACGTTGAACGTGTGAACGTATGAACGTTGAACGTTATCTGAACGTCTGAGTGAAGTGAACGTGCGGCTTTTGAACGTCACATTTGAACGTTTGAACGTATGAACGTTCTGAACGTGTTGAACGTATGAACGTCGTTGAACGTGTTGAACGTTGAACGTCTGAACGTTGAACGTGTTGAACGTCTTGAACGTCGCGGTGAACGTTTTGAACGTTGAACGTCGTTGATGAACGTATGAACGTTTAGCGGTCTCTGTGAACGTTTAGGTAATTGAACGTGTGATGAACGTTGAACGTTGAACGTTGAACGTTCTGAACGTACCGGCTTTGAACGTCCATGAACGTGTGAACGTAGACGTGAACGTGTGAACGTTGAGTGAACGTATTGAACGTAATTTGAACGTTGAACGTTGAACGTGTGAACGTGCGGGAGTGAACGTTGAACGTCGTGAACGTTGGTGAACGTGGTGAACGTTGGTGAACGTTTGAACGTTCGTGGCGCATGAACGTCTGAACGTAATGAACGTCTCCTTCTTGAACGTCGTGAACGTTGAACGTCGTGAACGTTGAACGTTGCGCGTGTGAACGTATGAACGTATGAACGTGTGAACGTGTGAACGTCTGAACGTAGTGAACGTTGATTGAACGTACACGAGGACGTGTGAACGTAGTGAACGTGATGAACGTTGAACGTTGAACGTCATGAACGTTGAACGTCCTGAACGTGTAATTTGAACGTGTGAACGTCGCTGAACGTGAACATGAACGTATGAACGTATTTGAACGTTGAACGTTTTGACCTGAACGTAGTCTGAACGTGATGGATGAACGTCGTGAACGTAATGAACGTTGAACGTTATGAACGTCTGTTGAACGTAATGAACGTAGTGAACGTTGAACGTGATCTGAACGTTATGAACGTATGAACGTTTGAACGTTGAACGTCTGAACGTTGTGAACGTCTGAACGTGTGAACGTCTGAACGTTGGTGACTAACAACCGGATGCTTGAACGTCTGAACGTTTATGAACGTCTGAACGTATGTGTGAACGTTGGTGCCTGGGAGGCTGAACGTTGAACGTCGTGGCTGAACGTTGAACGTACTGAACGTCAATAATATCTGTATGTGAACGTATCCTCGCTGAACGTGGAACGCCGATTTGAACGTACAGTGAACGTATTCTTGAACGTACCTAGAAAGTGAACGTTCAGCGTGAACGTTGAACGTCCTGAACGTGGGGTGAACGTTACGTGAACGTCTCGTGAACGTGTGATGAACGTTGAACGTTGAACGTGTGAACGTTGAACGTACTACTGAACGTCTCTGAACGTTGAACGTGTGAACGTACTTGAACGTTGAACGTTGAACGTGTGAACGTATGAACGTTATGAACGTTGAACGTTAACTGTGAACGTTCATACTATATGAACGTTGCGAGGCAAGTGAACGTGTGAACGTTGAACGTTTGAACGTAATGAACGTCGTTGAACGTTGAACGTGCTGCGATGGCCCGCTGCAAGTGAACGTGATGTGAACGTAATGAACGTTGTGAACGTTGAACGTCTATGAACGTATGTCTGTGAACGTTTGAACGTACTGGATGAACGTACATGAACGTATGAACGTCAGGTGAACGTCGTGAACGTCAGTCTGAACGTGCTGAACGTGGTGAACGTGCGTAGGGCTGAACGTTGAACGTTCATGAACGTTGAACGTTTGAACGTGGGTGAACGTTAATGAACGTGATGAACGTTTGAACGTATATGAACGTACGCTGAACGTAGTGAACGTTGAACGTCGTGAACGTAGTGAACGTTGAACGTGGCGGAAGTGAACGTTCAAGTGAACGTTTGAACGTTGAACGTCAGTGAACGTGGAATTATTCCTTGATGAACGTATGAACGTTTCTGAACGTTGAACGTGTGAACGTTGAACGTATGATGAACGTCGCATGAACGTGGTGTTACCAACAGCTGTTGTGAACGTTGAACGTAGTGAACGTCTGAACGTCTGAACGTATTGCCAATGAACGTCCTGAACGTCTGGCTGAACGTTGAACGTTATGAACGTTGAACGTTGAACGTTGAACGTCTGAACGTATGAACGTCTGAACGTTGAACGTTGAACGTCGCTGAACGTTGAACGTTTGAACGTCGTCGTTGCCTCGTGAACGTTCTTGAACGTTTGAACGTTGAACGTATTCCGCCCAATGAACGTTGAACGTCTGAACGTGGGCCATTGTTAATGAACGTCATCATGTGAACGTATGAACGTAGGGTGAACGTCTCTCAAATTGAACGTGATTAATCCTGAACGTTGAACGTAATGAACGTGGTTGTGAACGTCTTGTGAACGTTATGAACGTTGAACGTTGAACGTCATGCAGACCGTGAACGTTATGAACGTGGTGAACGTTGAACGTGTGAACGTCCGTTGAACGTACAAATTGAACGTGCTGAACGTTTGAACGTCTGAACGTTGAACGTTGACTGAACGTACTTTTGAACGTTGAACGTTGAACGTTGCGGTTGAACGTCCTTGAACGGATTGAACGTTCCTGAACGTGTGAACGTGCTGAACGTGTGTGAACGTATGAACGTTGAACGTTGAACGTTGAACGTGCGCTGAACGTTGAACGTCCTGAACGTCTGAACGTCTGAACGTTGAACGTTGAACGTGACAAACAGACCTGGTGTGAACGTACAGATGAACGTCGTGAACGTTGAACGTACTCACTGAACGTCTGAACGTTGAACGTGGATGCTGAACGTGTGAACGTACTGTGAACGTTGAACGTGGAAATGAACGTCCAAGGTGAACGTCTATCTGAACGTCTTGAACGTTGGCCCACGATTTTGTGAACGTCCCAATTTGAACGTATGAACGTTGAACGTTGAACGTGATGAACGTTGAACGTTGAACGTTGAACGTCATGAACGTTGAACGTTGAACGTATAAGTGAACGTATGAACGTGACTTGAACGTTGAACGTGTGAACGTTGAACGTACTAATCCCGGATTGAACGTAGTCAATTGAACGTATGAACGTAAC'
p = 'TGAACGTTG'
x = naive(p,t)
' '.join(str(i) for i in x)
'9 80 97 121 150 176 201 208 252 259 266 302 309 337 382 399 415 422 429 472 524 531 554 624 631 662 747 754 761 799 851 888 946 953 968 975 999 1014 1021 1087 1105 1124 1158 1165 1172 1187 1226 1260 1276 1331 1338 1401 1475 1482 1520 1527 1711 1722 1729 1768 1837 1854 1861 1868 1911 1927 1967 1983 1990 2097 2180 2227 2234 2246 2258 2289 2296 2323 2342 2449 2536 2561 2579 2636 2643 2650 2657 2740 2747 2768 2775 2812 2819 2854 2918 2965 2972 3005 3023 3244 3251 3266 3273 3313 3320 3375 3437 3568 3604 3630 3647 3654 3701 3718 3747 3761 3768 3793 3800 3817 3833 3884 3891 3931 3951 3978 4006 4045 4070 4227 4276 4334 4341 4379 4402 4418 4425 4442 4449 4482 4489 4508 4595 4645 4704 4715 4765 4772 4799 4895 4902 4927 4966 4982 5032 5039 5071 5093 5194 5245 5252 5292 5339 5477 5502 5526 5591 5598 5605 5631 5649 5656 5729 5751 5758 5812 5828 5872 5914 6083 6090 6116 6123 6149 6164 6186 6218 6264 6330 6337 6362 6389 6406 6467 6490 6614 6629 6675 6750 6757 6764 6833 6864 6871 6900 6916 6935 7011 7027 7034 7098 7147 7154 7170 7247 7310 7355 7398 7413 7446 7513 7535 7555 7681 7741 7748 7763 7792 7817 7824 7856 7893 7919 7953 8008 8017 8153 8170 8252 8277 8319 8376 8391 8447 8515 8531 8538 8545 8576 8583 8600 8653 8678 8778 8826 8833 8876 8940 8947 8970 8977 8984 9061 9068 9075 9093 9125 9132 9184 9212 9251 9304 9348 9355 9371 9378 9385 9401 9408 9446 9461'
import bisect
#Book index analogy
class Index(object):
def __init__(self, t, k):
''' Create index from all substrings of size 'length' '''
self.k = k # k-mer length (k)
self.index = []
for i in range(len(t) - k + 1): # for each k-mer
self.index.append((t[i:i+k], i)) # add (k-mer, offset) pair
self.index.sort() # alphabetize by k-mer
def query(self, p):
''' Return index hits for first k-mer of P '''
kmer = p[:self.k] # query with first k-mer
i = bisect.bisect_left(self.index, (kmer, -1)) # binary search
hits = []
while i < len(self.index): # collect matching index entries
if self.index[i][0] != kmer:
break
hits.append(self.index[i][1])
i += 1
return hits
def queryIndex(p, t, index):
k = index.k
offsets = []
for i in index.query(p):
if p[k:] == t[i+k:i+len(p)]: # verify that rest of P matches
offsets.append(i)
return offsets
cholera_genome = ''
f = open('Vibrio_cholerae.txt')
for line in f:
cholera_genome += line.rstrip()
cholera_genome[:100]
'ACAATGAGGTCACTATGTTCGAGCTCTTCAAACCGGCTGCGCATACGCAGCGGCTGCCATCCGATAAGGTGGACAGCGTCTATTCACGCCTTCGTTGGCA'
p = 'CTTGATCAT'
t = cholera_genome
index = Index(t, 4)
print(queryIndex(p, t, index))
[60039, 98409, 129189, 152283, 152354, 152411, 163207, 197028, 200160, 357976, 376771, 392723, 532935, 600085, 622755, 1065555]
Our plan is to slide a window of fixed length L along the genome, looking for a region where a k-mer appears several times in short succession. The parameter value L = 500 reflects the typical length of oriC in bacterial genomes.
We defined a k-mer as a "clump" if it appears many times within a short interval of the genome. More formally, given integers L and t, a k-mer Pattern forms an (L, t)-clump inside a (longer) string Genome if there is an interval of Genome of length L in which this k-mer appears at least t times. (This definition assumes that the k-mer completely fits within the interval.) For example, TGCA forms a (25,3)-clump in the following Genome:
gatcagcataagggtccCTGCAATGCATGACAAGCCTGCAGTtgttttac From our previous examples of oriC regions, ATGATCAAG forms a (500,3)-clump in the Vibrio cholerae genome, and CCTACCACC forms a (500,3)-clump in the Thermotoga petrophila genome. We are now ready to formulate the following problem.
Clump Finding Problem: Find patterns forming clumps in a string. Input: A string Genome, and integers k, L, and t. Output: All distinct k-mers forming (L, t)-clumps in Genome.
def clumpFinding(genome, k, t, L):
frequentPatterns = set()
clump = [0]*((4**k))
text = genome[0:L]
frequencyArray = computingFrequencies(text, k)
for i in range(4**k):
if frequencyArray[i] >= t:
clump[i] = 1
for i in range(1, len(genome) - L + 1):
firstPattern = genome[i-1:i-1+k]
index = patternToNumber(firstPattern)
frequencyArray[index] = frequencyArray[index] - 1
lastPattern = genome[i+L-k:i+L]
index = patternToNumber(lastPattern)
frequencyArray[index] = frequencyArray[index] + 1
if frequencyArray[index] >= t:
clump[index] = 1
for i in range(4**k):
if clump[i] == 1:
pattern = numberToPattern(i, k)
frequentPatterns.add(pattern)
return frequentPatterns
genome = 'CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA'
print clumpFinding(genome, 5, 4, 50)
set(['CGACA', 'GAAGA'])
!wget https://stepic.org/media/attachments/lessons/4/E-coli.txt
--2015-09-26 12:54:41-- https://stepic.org/media/attachments/lessons/4/E-coli.txt Resolving stepic.org... 54.194.188.166 Connecting to stepic.org|54.194.188.166|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 4639675 (4.4M) [text/plain] Saving to: `E-coli.txt' 100%[======================================>] 4,639,675 313K/s in 15s 2015-09-26 12:54:58 (303 KB/s) - `E-coli.txt' saved [4639675/4639675]
ecoli_genome = ''
f = open('E-coli.txt')
for line in f:
ecoli_genome += line.rstrip()
ecoli_genome[:100]
'AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAAT'
genome = ecoli_genome
len(clumpFinding(genome, 9, 3, 500))
1904
The reverse half-strand of DNA lives a double stranded life most of the time. The forward half-strand spends a large portion of its life single stranded waiting to be replicated. This leads to difference in nucleotide frequency. Single stranded DNA has a much higher mutation rate than double-stranded DNA. Cytosine deamination- 100 fold higher in forward half strand. Reverse half strand - G low/C high #G-#C is negative decreasing as we walk along reverse half strand.
Since we don't know the location of oriC in a circular genome, let's linearize it (i.e., select an arbitrary position and pretend that the genome begins here), resulting in a linear string Genome. We define Skewi(Genome) as the difference between the total number of occurrences of G and the total number of occurrences of C in the first i nucleotides of Genome. The skew diagram is defined by plotting Skewi (Genome) (as i ranges from 0 to |Genome|), where Skew0 (Genome) is set equal to zero. The figure below shows a skew diagram for the DNA string CATGGGCATCGGCCATACGCC.
Note that we can compute Skewi+1(Genome) from Skewi(Genome) according to the nucleotide in position i of Genome. If this nucleotide is G, then Skewi+1(Genome) = Skewi(Genome) + 1; if this nucleotide is C, then Skewi+1(Genome)= Skewi(Genome) – 1; otherwise, Skewi+1(Genome) = Skewi(Genome).
Sample Input: CATGGGCATCGGCCATACGCC
Sample Output: 0 -1 -1 -1 0 1 2 1 1 1 0 1 2 1 0 0 0 0 -1 0 -1 -2
def skew(dna):
d = {'A':0, 'C':-1, 'G':1, 'T':0}
skewness = [0] * ((len(dna)) + 1)
for i in range(len(dna)):
skewness[i+1] = skewness[i] + d[dna[i]]
return skewness
def skew(dna): skewness = [0] * ((len(dna)) + 1) for i in range(len(dna)): if dna[i] == 'G': skewness[i+1] = skewness[i] + 1 if dna[i] == 'C': skewness[i+1] = skewness[i] - 1 if dna[i] == 'A' or dna[i] == 'T': skewness[i+1] = skewness[i] return skewness
array = skew('CATGGGCATCGGCCATACGCC')
#' '.join(str(i) for i in array)
# '0 -1 -1 -1 0 1 2 1 1 1 0 1 2 1 0 0 0 0 -1 0 -1 -2'
array
[0, -1, -1, -1, 0, 1, 2, 1, 1, 1, 0, 1, 2, 1, 0, 0, 0, 0, -1, 0, -1, -2]
def min_skew(dna):
array = skew(dna)
minarray = min(array)
min_index = []
for item in range(len(array)):
if array[item] == minarray:
min_index.append(item)
return min_index
min_skew('CATTCCAGTACTTCGATGATGGCGTGAAGA')
[14]
Let's follow the 5' → 3' direction of DNA and walk along the chromosome from terC to oriC (along a reverse half-strand), then continue on from oriC to terC (along a forward half-strand). In our previous discussion, we saw that the skew is decreasing along the reverse half-strand and increasing along the forward half-strand. Thus, the skew should achieve a minimum at the position where the reverse half-strand ends and the forward half-strand begins, which is exactly the location of oriC!
We have just developed an insight for a new algorithm for locating oriC: it should be found where the skew attains a minimum.
Minimum Skew Problem: Find a position in a genome where the skew diagram attains a minimum. Input: A DNA string Genome. Output: All integer(s) i minimizing Skewi (Genome) among all values of i (from 0 to |Genome|).
print skew('TAAAGACTGCCGAGAGGCCAACACGAGTGCTAGAACGAGGGGCGTAAACGCGGGTCCGAT')
[0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, -1, 0, 0, 1, 1, 2, 3, 2, 1, 1, 1, 0, 0, -1, 0, 0, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 3, 4, 5, 6, 5, 6, 6, 6, 6, 6, 5, 6, 5, 6, 7, 8, 8, 7, 6, 7, 7, 7]
array = min_skew('TAAAGACTGCCGAGAGGCCAACACGAGTGCTAGAACGAGGGGCGTAAACGCGGGTCCGAT')
print (' '.join(str(i) for i in array))
11 24
We say that position i in k-mers p1 … pk and q1 … qk is a mismatch if pi ≠ qi. For example, CGAAT and CGGAC have two mismatches. The number of mismatches between strings p and q is called the Hamming distance between these strings and is denoted HammingDistance(p, q).
Hamming Distance Problem: Compute the Hamming distance between two strings. Input: Two strings of equal length. Output: The Hamming distance between these strings.
def hammingDistance(x,y):
nmm = 0
for i in xrange(len(x)):
if x[i] != y[i]:
nmm += 1
return nmm
hammingDistance('CAGAAAGGAAGGTCCCCATACACCGACGCACCAGTTTA', 'CACGCCGTATGCATAAACGAGCCGCACGAACCAGAGAG')
23
p = 'GCGTTTTCGTTACCTTCAACTCCCGGACTGTAGTACATCAAAACTACGTGACGTTTTTAAGAGTATTGAAGAAATTCACATTGATACCTTTGATAGAGTTGTGGGGAGGAAGTCCGGGGACATCGGGGGGGTTCAGCTCTCTAACAGTATGTAAGCAAACCGACGTGGCAAATTGAGGATGTAACGCCACTGCTCGATGGAGGCTTACTTACTTATCGGGAAGGTAAAACGGGTTCACGCAGGTCGGCTTGACTCTGGAAGCCCGATCACGGAGGTGTCATATTAGACGGCCATAAGTTCTCTACAAGATGTTATACGGAAAGGGGATCCATTGACCTGGGGCGCGCAGTAGGCCCTTCTGGGCAACCTTCACTATAGGCAATCAAGCATGCCTCTCCCTCTGCGAGCTCGGGAGCAAGTGCGCCGCTTGTAAATTAGTGAGCGATGAGTTGCCGTTTCGCGTCCCGCCAGTGAGTTCGCATGTGTTCAAATTGGGACGTCGTTTGTACCAGCACGTTATTTTCTAGGTCGCTACACAACCTCAATACAAGCCCGAACGCGCGCGGTCGCCGAATTAGTCGAGTACGTTCCACGCATTGTAGACTCATCTGGGTGAGGTCGCTGCGACCATGTCATGGGACGTAAGCGGCTACCTCTCGGATAGGTGGTCGGCTACACATCGACAATGAAGTACGCGTCATCATCAAAATTTGTCCAATGTAGACCATTCACTGGAGGCCCAGTGGACAGAGATTTATGTTACGATAGCCCAGATAGATACCGATAACACCCCTTGGGGTGTAAGCGTGCGCGCCCTAACAATACCACCTAGAGCGCAAGTTGTAAATGCATCCGCCCTTGGAACGTCTGTACAGTATTACCAGCAACATCGACACCCCAAGTGGGGCGCCATGAGTGATTCATTGTGATTAGGGCTGAGTAACTAGCTTGTTGTTACTCCAGAAGACGCCGGTCACCCGCCTGTTAAAGTTTAAACTGTAAAATGGTCCCGTACGCTCGTTACGGCTGCTGGCCAAGCGGATCTACTCGTTCCGGTGTTCCCTGTTTACAACCTTTCGTTTGACCATACTAACAACCAGTAAGGGTCATGCCGCGTGGAACCCCACCGTAAATTATGGCAGTTGAATGGATCCCGCGCTCATACTAGAGACTCAG'
q = 'TTGCCGTCTTAAGCGGGATGAACCTGGCAGGGTGATGGGTCCTCTGACGTAGCTCCGCATTTGTGGGCGTAACTTAAGAGAGTTGGCAGAAAAATATTTCTCGGCAGGCTGGCCTCATGGCTTAAAGTTTACGTTTAAAACTGTTTACTACACTGTGCCTGCTGCCATTGGTTAAAAGCATGCTGAGGTGTCGGTTGGGTGACATCAGCTCTGAGCTCGGCATGGAATCGGCTTGCTTGCATAGACGCCAGGTCGCCGTACAAAACGTTCTAAACTGGTGGGATCCGTAAACGAATAGCTCAGAGATTGATATCAATAGCAGATAGCGCAACATCAAAACCTACATATGTTCGGTGTTAAGAGGGAAGACAGAATGGAAGGGCAGCTCTTCCGAACCCGTGGCTAGTGTCCGTTTGTACGATGTATGGAACAAATAATCAGTTTGCGTTGGAACCCTATGCCAAATCGAGTCGGCGTCACCTTTAGTAACTAGTACTTCCCCTACCTCCTGGACTTGTAGGTCACTGCCGTAATAGGTTCTCATGTGCTACGCACAAGCTGGCACCATACAACGACTGACGACTTCCCCACCTCAACTTCGTTTCCCTAAGTGGGTAGAGAGCCCATGCCCGACCATACATTCAAACGAGTAGGACAGACACGTGAGCATTTTTGTCCACACGCTCGACCAACGCTAATCCTTAGTCGGCAAAGCGACCCTAAGGAGTTGTTGCTATACATAACCCTGAGCGGTTGTGGGTTGAGTTGTGTGCTACACAATCGACGGACGTATGTGCATCAGTATGTCCGAATCCGCGTTGACCCAATAGATTATGCCACCCTATACGCTTACCTCGTTCCTCTCTAAGGGCCCCTGGTATACGACACACAATGCACAATGATACCGACAGCACCCATTCTGTGCCCCAGGGATCCCACTACGTTCGCCTGTGCTACTTGCAACATGCCCCTTTCCACTCCCTCCTTGCGATCTGCGGCGCAGTGGGCGGGGGACTCGCTGAGAACCATCGCACGGCTCGGGGCGAGGGAAGATATGGTCGGTTCTGGACGACATGGGGTGAACTCGTCCTTCTATACTATTGGGGATTCGAAACGTTCTGGACGCAAAGAATATGGCTTAGCCCATGACGAAGGCCAGGGGAATGGCGGTTGGCT'
hammingDistance(p,q)
880
We say that a k-mer Pattern appears as a substring of Text with at most d mismatches if there is some k-mer substring Pattern' of Text having d or fewer mismatches with Pattern, i.e., HammingDistance(Pattern, Pattern') ≤ d. Our observation that a DnaA box may appear with slight variations leads to the following generalization of the Pattern Matching Problem.
Approximate Pattern Matching Problem: Find all approximate occurrences of a pattern in a string. Input: Strings Pattern and Text along with an integer d. Output: All starting positions where Pattern appears as a substring of Text with at most d mismatches.
#Approximate matching
def naiveHamming(p, t, maxDistance):
occurences = []
for i in xrange(len(t) - len(p) + 1): #loop through every position in t where p could start
nmm = 0
match = True
for j in xrange(len(p)): #Loop over characters
if t[i+j] != p[j]: #Compare characters
nmm += 1 #mismatch
if nmm > maxDistance: #exceed max hamming distance
break
if nmm <= maxDistance:
occurences.append(i) #approximate match
return occurences
p = 'GTACTA'
t = 'ACGGAGTAAGGACTATAGGTTGTATAATTACGACTAAAGACAGAGCCGTTTAGATCGCTCGGTTATTAATTTAGGTGGTTGGCGTCAGTTACATCGAATGCGTATCACCCCGGATAACTCCACTTTGATAAGGTTAAGAGTACTAATGTAACTCGCAGATCTTAATTTGTGGGACACTAGGACGCGCTCCTATATCGTCTGCTTCCTAAAACGTGGGTAACCCAAACTACCATGTTAAATTGTTATAGTCCAAATGTTGCACGTCCAGACGCTAGACGAGGTTTCAGATTTTCCGATTTCAGGCTCAGGAACACCATCAAGTGCCGGTCGATAGAGCAGGAT'
len(naiveHamming(p, t, 3))
61
len(naiveHamming('CCC', 'CATGCCATTCGCATTGTCCCAGTGA', 2))
15
Our goal is to generate the d-neighborhood Neighbors(Pattern, d), the set of all k-mers whose Hamming distance from Pattern does not exceed d. As a warm up, we will first generate the 1-neigborhood of Pattern using the following pseudocode.
def immediateNeighbors(pattern):
nt = {'A':['C','G','T'], 'C':['A','G','T'], 'G':['A','C','T'], 'T':['A','C','G']}
neighboorhood = set()
for i in range(len(pattern)):
symbol = pattern[i]
for x in nt[symbol]:
neighbor = pattern[:i] + x + pattern[i+1:]
neighboorhood.add(neighbor)
return neighboorhood
immediateNeighbors('ACGA')
{'AAGA', 'ACAA', 'ACCA', 'ACGC', 'ACGG', 'ACGT', 'ACTA', 'AGGA', 'ATGA', 'CCGA', 'GCGA', 'TCGA'}
Our idea for generating Neighbors(Pattern, d) is as follows. If we remove the first symbol of Pattern (denoted FirstSymbol(Pattern)), then we will obtain a (k − 1)-mer that we denote by Suffix(Pattern). Now, consider a (k − 1)-mer Pattern’ belonging to Neighbors(Suffix(Pattern), d). By the definition of the d-neighborhood Neighbors(Suffix(Pattern), d), we know that HammingDistance(Pattern′, Suffix(Pattern)) is either equal to d or less than d. In the first case, we can add FirstSymbol(Pattern) to the beginning of Pattern’ in order to obtain a k-mer belonging to Neighbors(Pattern, d). In the second case, we can add any symbol to the beginning of Pattern’ and obtain a k-mer belonging to Neighbors(Pattern, d).
def neighbors(pattern, d):
'''Neighbors generates all k-mers of Hamming distance at most d from Pattern.'''
if d == 0:
return {pattern}
if len(pattern) == 0:
return {}
if len(pattern) == 1:
return {'A', 'C', 'G', 'T'}
neighborhood = set()
suffixneighbors = neighbors(pattern[1:], d)
for text in suffixneighbors:
if hammingDistance(text, pattern[1:]) < d:
for nt in ['A', 'C', 'G', 'T']:
neighborhood.add(nt + text)
else:
neighborhood.add(pattern[0] + text)
return neighborhood
pattern = 'AAGTATTGAGCA'
x = neighbors(pattern, 2)
' '.join(str(i) for i in x)
'AAGGATTCAGCA AAGTATTGACCC AAGTATTGACCA AAGTATTGACCG AAGCATTGAACA AATTATGGAGCA AAGTATTGATGA AATTCTTGAGCA AAGTATTGGGCT AAGTATTGACCT AAGTATTTAGCA AAGTATCGATCA AAGTATTTAGCC AAGTATTTAGCG ATGTACTGAGCA AAGAAGTGAGCA AAGTGTTGCGCA AAGTATTTAGCT ATTTATTGAGCA AACTAGTGAGCA AAGTATCAAGCA AAGCATTAAGCA AAGTACTTAGCA AAGTACTAAGCA AGGGATTGAGCA AAAAATTGAGCA AGATATTGAGCA AGGTATTAAGCA AAGTATTGGGCA AAGTACTGAGTA ATGTATTGCGCA AAATATTGCGCA AAGTGTAGAGCA AACTATTGATCA AAGTACTGACCA AAGAATTGAGCT AAATATTGGGCA AAGAATAGAGCA CAGTACTGAGCA GAGTCTTGAGCA AATTATTGAACA AAGTACTGCGCA AAGAATTGAGCG CACTATTGAGCA AAGTATTCAACA ATGTATTGAGTA ACATATTGAGCA AAGTCTGGAGCA AACTATCGAGCA ACGTATTGAGCG ACGTATTGAGCA ACGTATTGAGCC AAATATTGATCA AAGGACTGAGCA AAGTATAGAGAA ACGTATTGAGCT AGGTACTGAGCA ACGTATTAAGCA AAGAATTGTGCA GAGTAATGAGCA AAGTATGGAGCG AAGTAGTCAGCA GAGTAGTGAGCA CAGTATTAAGCA AAGTATGGGGCA AAGTACCGAGCA AAGTATTTTGCA AAGTATGAAGCA AAATATTGAACA AAGGAATGAGCA TAGTACTGAGCA AGGAATTGAGCA AGGTATTGCGCA GAGTATCGAGCA AATTATTAAGCA AATTATTGAGAA ATGTTTTGAGCA AACTATTCAGCA ACGTAGTGAGCA GAGTATTGAGCC CAGGATTGAGCA AAGTCTTGAGAA CAGTGTTGAGCA TAGTATTGACCA AAGTACAGAGCA AAGTATTGATCA AAGTATTGATCC CAGTATTGAACA AGGTATTGACCA AAGTATTGATCG AAGATTTGAGCA AAGTAGGGAGCA AAGTAGTGAGAA AAATATGGAGCA AGGTATTCAGCA AATCATTGAGCA GATTATTGAGCA AAGCATTTAGCA AGGTATTGAGTA AAGTATTGATCT AAGCCTTGAGCA AACTATTGCGCA ACGTTTTGAGCA AGGTTTTGAGCA AACGATTGAGCA AAGTAGTGTGCA AATTATTGTGCA CCGTATTGAGCA AGGTATAGAGCA AAATGTTGAGCA AAGTGTTGAGTA AAGTGTTGATCA GAGTATTGAGAA GAGCATTGAGCA AAGGATTAAGCA AATTATTGAGTA AAGGATTGAGGA AAGTATTCAGTA AAGTATATAGCA CAGTTTTGAGCA AAGTCTTGAACA ACGTATTGAGGA AAGTGATGAGCA GACTATTGAGCA AAGTATTGAGAA AAGTAGTGAACA AAGTATTGAGAC AACTATTGACCA AAGTATTGTGTA AAGTATTGAGAT AAGCATTGAGCT AAGGATCGAGCA AAGTAGTGATCA AAGCATTGAGCG AAGCATTGAGCA GAGTTTTGAGCA AGGTATTGGGCA AAGTATTGTCCA CAGTATTGACCA AAATAATGAGCA AAGTGTGGAGCA AAGTACTGGGCA TAGCATTGAGCA AAACATTGAGCA AAGAATTGAGTA AAGTATTTACCA TAGTATCGAGCA AAGGGTTGAGCA AAGTAATGTGCA AAGTCCTGAGCA AAGTATTGGACA AAGTCTTCAGCA AAGTTTTGACCA AAGTTTTGATCA AAGAATTCAGCA CAGAATTGAGCA AAGTATTCAGAA GAGTATTTAGCA AAGTATTAAGAA AACTATTGTGCA AAGTATAAAGCA AAGTATTAACCA AAGAACTGAGCA ACGTCTTGAGCA AAATATCGAGCA AAGTACTGAGCT AAGTAGTGAGTA AAGTATTCTGCA AAGGATAGAGCA AAGTATTTCGCA AAGTACTGAGCG AAGAATTGCGCA AAGTATGCAGCA AGGTCTTGAGCA GAGTATTCAGCA AACAATTGAGCA AAGTATCGAGTA AATTATTGATCA AGGTATTGATCA AAGGATTGGGCA AAGTACTGAACA AAGCAATGAGCA AAGGATTGAGCA AAGTATTGCCCA AAGGATTGAGCC GAGAATTGAGCA AAGGATTGAGCG AAGGATTTAGCA ACTTATTGAGCA AACTATTGGGCA AGGTATTGTGCA AAGTCTTGAGGA AAGAATTGGGCA CAGTATTGAGAA TTGTATTGAGCA AAATATTGAGAA AAGTATCGAACA AATTATTGACCA AATTTTTGAGCA AACTACTGAGCA AACTGTTGAGCA AAGTAATGAGAA TAGTATTAAGCA ACGAATTGAGCA AGGTAATGAGCA AAGTATTAAGTA AAGTGTTGTGCA GAGTATTGAACA AAGTATTGAACA AAGAATTGAGCA AAGTATTGAACC AAGCATTGAGGA ACGTATTGCGCA AAGTATTGAACT AAGTTTTGAGTA ATGTATTGACCA TAGTATTGTGCA AAGTCTTGTGCA AAGTGTTTAGCA AAGCATTGAGCC AAGTATTCATCA GAGTGTTGAGCA AAGTATTGGGGA AAATATTCAGCA TAGTATTGAACA AACTATGGAGCA AAGTAATGATCA AAGTATCGGGCA ACGTATTGATCA ACGTACTGAGCA ATGTATTGAGAA AAGTAAGGAGCA AAGTATGGTGCA AAGTATCGAGCG AAGTATCGAGCC TAGTGTTGAGCA AAATATTGAGCT AAGTATACAGCA TAGTATTGAGAA AAGTACTGAGGA AAGTATCGAGCT AAGAGTTGAGCA AAGGATTGCGCA AAGTTTTGCGCA AAGTTTGGAGCA AAGAATTGAGCC AAGTATTGCGCG AAGTATTGCGCA AAGTATTGCGCC AAGTGTTGAGAA AATTATAGAGCA AAGCATAGAGCA AAGTATTGCGCT AAGTATTGACTA ATGTAATGAGCA ATGTATTGAACA AAGTCTTGAGCT AATAATTGAGCA AAGTATTGACAA AAGTCTTGAGCA AAGTCTTGAGCC AAGTCTTGATCA AAGTCTTGAGCG AAGTATTAAACA GAGGATTGAGCA AAGTATGGAGTA AAGAATTAAGCA AAGTATCGAGCA AAGTATTTAGAA TAGTATTGGGCA ACGTATGGAGCA AAGTATAGAGTA AAGTAGTGGGCA GAGTACTGAGCA AAGTTTTGGGCA AAGTACTGTGCA AAGTATTGTGCG CAGTATTGAGCG AAGTATTGTGCA CAGTATTGAGCA AAGTATTGTGCC TAGTATTTAGCA CAGTATTGAGCT AAGTATTGTGCT AAGCATTGATCA CAATATTGAGCA AAAGATTGAGCA TACTATTGAGCA GAGTATTGAGGA AACTATTGAGCT ACGTGTTGAGCA AAGTATAGACCA AGGTATCGAGCA AACTATTGAGCC AACTATTGAGCA AACTATTGAGCG GAGTATAGAGCA AAGTATTGAAGA AATTATTTAGCA AAGTATTGTACA AAGTTTTGAGCG AAGTTTTGAGCA AAGTTTTGAGCC CAGTAGTGAGCA AATTAGTGAGCA AAGTTTTGAGCT AAGTATGGATCA AAGTATTGGGCC AAGTAATGAACA AAGTATTGGTCA AAGTATTGGGCG AAGTATCGTGCA CAGTCTTGAGCA AAGTATGGAGAA CAGTATTGGGCA AAGTCTTAAGCA AAGTATTGACGA TAGTAATGAGCA AGGCATTGAGCA AATTGTTGAGCA AATTATTGCGCA AAGTTTTGAACA TAGTATTCAGCA AAATACTGAGCA AGGTATTGAGAA ATGTCTTGAGCA AAGTGGTGAGCA AAGTATTGAGAG AAGTATCGAGGA AAGTACTCAGCA ATGTAGTGAGCA TAATATTGAGCA AAGTATTTAACA AAGGATTGTGCA ACGTAATGAGCA AGTTATTGAGCA AAGTATTTGGCA AAGTATTGCGGA ACGCATTGAGCA TAGTATTGATCA CAGTATCGAGCA AAGTATTTAGTA AACTATTGAGGA AAGTGTTGACCA AAGTATTGATAA AAGTAATGGGCA TAGTATTGCGCA ACGTATTGAACA CAGTAATGAGCA AAGTATTAAGGA AAGTAGTGCGCA AAGTCATGAGCA TATTATTGAGCA AAGGTTTGAGCA ACGTATTGAGTA AAGTATCGACCA AACTATTTAGCA AAGAATTGAACA AAGTATCTAGCA AAATATAGAGCA AAGTAGTGAGGA AAGTACGGAGCA AACTTTTGAGCA TAGAATTGAGCA AAGTTTCGAGCA CAGCATTGAGCA ACGTATCGAGCA AAATATTGACCA AAGTTATGAGCA AAGAATTGACCA AAGCATTGTGCA AAGTATTGTGGA CAGTATTGAGGA TAGTCTTGAGCA GTGTATTGAGCA AGGTATGGAGCA AAGTGTTCAGCA AAGTTGTGAGCA AAGGATTGACCA AAGTATGGACCA AATTATTGAGCT ACGTATTGGGCA GAGTATTGAGCT ATGGATTGAGCA AAGTTCTGAGCA AATTATTGAGCC AAATATTGAGTA AATTATTGAGCA AATTATTGAGCG GAGTATTGAGCA AATTATCGAGCA AAGTATTATGCA AAGTTTAGAGCA AATTATTGGGCA AAGTATTGAGGT CAGTATTGCGCA AAGTGTTGAACA GAGTATTAAGCA AAGTATTGAGGC AAGTATTGAGGA AAGTATTGAGGG AAGTTTTGAGGA AAATATTGAGCC AAGAATTTAGCA AAATATTGTGCA AAGTCTTGAGTA AAGAATTGAGAA TAGTATGGAGCA ACGTATTGAGAA AAGTATTGATTA AAGAATGGAGCA AGGTATTGAACA AAGTATGTAGCA AAGTATGGAGGA ATGTATTGATCA AAATATTTAGCA AAGTCTTGCGCA AAGTATTCACCA AAGTAGTAAGCA AAGTAACGAGCA AAGCATCGAGCA AAGTTTTGAGAA CAGTATTCAGCA AGGTGTTGAGCA AACTCTTGAGCA AAGCATTGAGAA AAGTATTTATCA AAGTATAGGGCA AAGAAATGAGCA CAGTATTTAGCA AAGTATTAAGCT ACCTATTGAGCA AAGAATTGATCA AACTATTGAACA AAGTATTGCTCA ATGTATCGAGCA AAGTATTAAGCA AAGTATTAAGCC AAGTATTAATCA AAGTATTAAGCG AAGTACTGAGCC ACGTATTCAGCA AAGTATTGGGTA AAGTCTTGGGCA GAGTATTGCGCA ATGTATTGAGGA AAGTAATGAGCG AAGTATCCAGCA AAGTATTGAACG AAGTAGAGAGCA AAGTACTGAGAA AAGTATAGAGCG AAGTATAGAGCA AAGCATGGAGCA AAGTATAGAGCC AAGTAATGAGTA AAGTAATGAGCA ATGAATTGAGCA AAGTATAGAGCT TAGTATTGAGTA CTGTATTGAGCA AAGTGTTAAGCA AAGTATAGAACA AAGGATGGAGCA AAGTATCGCGCA AAGTCTTGACCA AAGTAATGCGCA AAGTATTCCGCA CAGTATTGTGCA GAGTATTGAGCG TAGTAGTGAGCA AAGTATTGAATA AAGTGTCGAGCA AAGGATTGAGAA GAATATTGAGCA GAGTATTGAGTA AAGTAATGACCA AAGTCTCGAGCA AAGTAATTAGCA ATGTATGGAGCA AAGTATTGAGCG AAGTATTGAGCC AAGTATTGAGCA AAGTACTGAGCA AAATATTGAGCG AAATATTGAGCA AAGTAGTGAGCT AAGTATTGAGCT AGGTATTGAGGA ATGTATTCAGCA AAGACTTGAGCA AAGTAATCAGCA AAATATTAAGCA AAGTAGTGAGCA AAGTAGTGAGCC AAGTGTTGGGCA AAGTAATGAGCC AAGTATTCGGCA AAGTAGTGAGCG AAGTAATGAGCT AATGATTGAGCA AGGTATTTAGCA ACGGATTGAGCA AACTATTAAGCA AAGTTTTGTGCA CAGTATTGATCA ATCTATTGAGCA AAGTATTGGCCA AAGTATTCAGGA AAGTTTTCAGCA CAGTATGGAGCA TAGTATTGAGCC GAGTATTGGGCA AAATAGTGAGCA TAGTATAGAGCA GGGTATTGAGCA CATTATTGAGCA AACTATAGAGCA ATGTATTGAGCA ATGTATTGAGCG AAGTCGTGAGCA AAGTATTGCACA AAGTATTCAGCC AAGCATTGCGCA AAGTATTCAGCA AAGTATTCAGCG AAGTAAAGAGCA ATGTATTAAGCA ATGTATTGAGCT CGGTATTGAGCA AAGTATTCAGCT AAGTATAGAGGA AAATTTTGAGCA AGGTATTGAGCC AAGTATGGAGCC AAGTATGGAGCA TAGTATTGAGGA GAGTATTGACCA TGGTATTGAGCA AAGTATGGAGCT AAGCATTCAGCA AAGGATTGAACA AAGTATTGTTCA AAGTGTTGAGCG AAGTGTTGAGCA AAGTGTTGAGCC GAGTATGGAGCA AAGTGTTGAGCT AACTATTGAGTA AAGTACTGATCA AAGTATTGAGTG AAGTATTGCGTA AAGTATAGCGCA AAGTATTGAGTC AAGTATTGAGTA AAGCGTTGAGCA AAGTATTGAGTT AACCATTGAGCA AAGAATCGAGCA AAGCACTGAGCA AAATATTGAGGA AACTAATGAGCA AATTAATGAGCA ATGTGTTGAGCA AATTATTCAGCA AAGTCTTTAGCA AAGTATCGAGAA AAGTATTTAGGA AAGTAGTTAGCA ATGTATTGGGCA ACGTATAGAGCA GCGTATTGAGCA AGCTATTGAGCA TCGTATTGAGCA AAGGATTGATCA AAGTAATGAGGA AAGTATTACGCA AAGTATTGGGAA AAGCATTGAGTA AAGTAATAAGCA ACGTATTGACCA AAGTATTGTGAA AAGCTTTGAGCA TAGTTTTGAGCA CAGTATTGAGTA AAGTATTGAAAA AAGGATTGAGTA TAGGATTGAGCA AACTATTGAGAA AAGTGCTGAGCA AGGTAGTGAGCA AAGGATTGAGCT CAGTATAGAGCA AAGTTTTTAGCA ATGTATTTAGCA AAGCATTGGGCA AAGCAGTGAGCA ATGCATTGAGCA AAGGCTTGAGCA ATATATTGAGCA AAGAATTGAGGA AATTACTGAGCA ACGTATTGTGCA GAGTATTGTGCA AAGTATTAGGCA AAGTAGTGACCA ATGTATTGTGCA AAGTCTAGAGCA AAGGAGTGAGCA TAGTATTGAGCA ATGTATTGAGCC ACGTATTTAGCA AAATCTTGAGCA TAGTATTGAGCG AGGTATTGAGCA AAGTATAGATCA AAGTATGGAACA AGGTATTGAGCG TAGTATTGAGCT AAGTATAGTGCA AGGTATTGAGCT AAGTAGCGAGCA AAGTGTTGAGGA AAGTATTGCGAA ATGTATAGAGCA AAGCATTGACCA AAGTTTTAAGCA AAGTATGGCGCA AATTATTGAGGA CAGTATTGAGCC GAGTATTGATCA'
len(neighbors('CCAGTCAATG', 1))
31
pattern = ''
d = 1
result = neighbors(pattern, d)
a = repr(result).replace(",", "")
b = a.replace("\'","")
print b.replace(" ", "\n")
{}
#I need to correct this
def neighbors_with_exact_d(pattern, d):
'''generate all k-mers of Hamming distance exactly d from Pattern.'''
nt = {'A':['C','G','T'], 'C':['A','G','T'], 'G':['A','C','T'], 'T':['A','C','G']}
if d == 0:
return pattern
if len(pattern) == 1:
return {'A', 'C', 'G', 'T'}
neighborhood = set()
suffixneighbors = neighbors_with_exact_d(pattern[1:], d)
for text in suffixneighbors:
if hammingDistance(text, pattern[1:]) < d:
for x in nt[pattern[0]]:
neighborhood.add(x + text)
else:
neighborhood.add(pattern[0] + text)
return neighborhood
pattern = 'ACG'
d = 1
neighbors_with_exact_d(pattern, d)
{'AAG', 'ACA', 'ACC', 'ACT', 'AGG', 'ATG'}
To prevent having to generate all 4^k k-mers in order to solve the Frequent Words with Mismatches Problem, our goal is to consider only those k-mers that are close to a k-mer in Text, i.e., those with Hamming distance at most d from this k-mer. Given a k-mer Pattern, we therefore define its d-neighborhood Neighbors(Pattern, d) as the set of all k-mers that are close to Pattern. For example, Neighbors(ACG, 1) consists of nine 3-mers:
ACG CCG GCG TCG AAG AGG ATG ACA ACC ACT
STOP and Think: Estimate the size of Neighbors(Pattern, d).
We will also use an array Close of size 4^k whose values we initialize to zero. In the following pseudocode, we set Close(i) = 1 whenever Pattern = NumberToPattern(i, k) is close to some k-mer in Text. This allows us to apply ApproximatePatternCount only to close k-mers, a smarter approach than applying it to all k-mers. This pseudocode also calls the function Neighbors(Pattern, d) in order to generate the d-neighborhood of a k-mer Pattern. See CHARGING STATION: Generating the Neighborhood of a String to learn how to implement this function.
A most frequent k-mer with up to d mismatches in Text is simply a string Pattern maximizing Countd(Text, Pattern) among all k-mers. Note that Pattern does not need to actually appear as a substring of Text; for example, as we already saw, AAAAA is the most frequent 5-mer with 1 mismatch in AACAAGCTGATAAACATTTAAAGAG, even though it does not appear exactly in this string. Keep this in mind while solving the following problem.
Frequent Words with Mismatches Problem: Find the most frequent k-mers with mismatches in a string. Input: A string Text as well as integers k and d. (You may assume k ≤ 12 and d ≤ 3.) Output: All most frequent k-mers with up to d mismatches in Text.
def frequentWordsWithMismatches(text, k, d):
frequentPatterns = set()
count = [0]*((4**k))
for i in range(len(text) -k + 1):
neighborhood = neighbors(text[i:i+k], d)
for word in neighborhood:
index = patternToNumber(word)
count[index] += 1
maxCount = max(count)
for i in range(4**k):
if count[i] == maxCount:
pattern = numberToPattern(i,k)
frequentPatterns.add(pattern)
return frequentPatterns
text = 'AAGTTGTTTATCGCGAGTTGCGGTATCGCGCCTATAAAAAGTTGTTAAGTTGTTCCTATAAAAGTTGCGGCTGATAGTTTATCGCGAAGTTGTTCCTATAAACTGATAGTTAAGTTGTTAGTTGCGGCCTATAAAAAGTTGTTCCTATAAACCTATAAACCTATAAAAAGTTGTTCTGATAGTTAAGTTGTTAAGTTGTTAAGTTGTTAAGTTGTTAAGTTGTTCCTATAAACTGATAGTTCTGATAGTTCTGATAGTTCTGATAGTTTATCGCGAAGTTGTTAAGTTGTTCTGATAGTTAAGTTGTTCCTATAAATATCGCGCTGATAGTTAGTTGCGGTATCGCGAGTTGCGGAAGTTGTTCCTATAAAAAGTTGTTAGTTGCGGAGTTGCGGAAGTTGTTAGTTGCGGTATCGCGAAGTTGTTCTGATAGTTCTGATAGTTCTGATAGTTTATCGCGAGTTGCGGCTGATAGTTTATCGCGCTGATAGTTAGTTGCGGAGTTGCGGAGTTGCGGTATCGCGAGTTGCGGTATCGCGAGTTGCGGTATCGCGTATCGCGAGTTGCGGAAGTTGTTAGTTGCGGAAGTTGTTAAGTTGTTAAGTTGTTCCTATAAAAGTTGCGGCCTATAAAAGTTGCGGAGTTGCGGAGTTGCGGAAGTTGTTCTGATAGTTTATCGCGTATCGCGCTGATAGTTCTGATAGTTAGTTGCGGTATCGCGCCTATAAACTGATAGTTAGTTGCGGAGTTGCGGCTGATAGTTTATCGCGTATCGCGCCTATAAAAAGTTGTTAAGTTGTTCTGATAGTTCCTATAAA'
x = frequentWordsWithMismatches(text, 7, 2)
' '.join(str(i) for i in x)
'GTAGTTG'
We now redefine the Frequent Words Problem to account for both mismatches and reverse complements. Recall that Pattern' refers to the reverse complement of Pattern.
Frequent Words with Mismatches and Reverse Complements Problem: Find the most frequent k-mers (with mismatches and reverse complements) in a string. Input: A DNA string Text as well as integers k and d. Output: All k-mers Pattern maximizing the sum Countd(Text, Pattern)+ Countd(Text, Pattern') over all possible k-mers.
def frequentWordsWithMismatchesAndRC(text, k, d):
frequentPatterns = set()
count = [0]*((4**k))
for i in range(len(text) -k + 1):
neighborhood = neighbors(text[i:i+k], d)
cneighborhood = neighbors(reverseComplement(text[i:i+k]), d)
for word in neighborhood:
index = patternToNumber(word)
count[index] += 1
for word in cneighborhood:
index = patternToNumber(word)
count[index] += 1
maxCount = max(count)
for i in range(4**k):
if count[i] == maxCount:
pattern = numberToPattern(i,k)
frequentPatterns.add(pattern)
return frequentPatterns
text = 'TCGTCGTTGTTGTCTTGATTTTGAGTTGAGTTTCGATTGAGTCTCGAGTCGATTTTTCGTCTTGTCGTCGAGTCGAGTCGTCGTCTTGTTTTGTCTTTCTCTTGTTGAGAGTTTTTTGTCGTCGTCTCGTTGAGTCTCGAGTCTTTCTCGTCGAGTCTCTCTTGTCGTCGTTGTTGTCGTCGAGTTTCGTTGATTTT'
x = frequentWordsWithMismatches(text, 5, 3)
' '.join(str(i) for i in x)
'TTTTT'
text = 'GATGAGTCCCACCAACCCCTACCTCGATGAGTCCCTACCTCCCTACCTCCCACCAACCGATGAGTCGTTACTTTACAACTTTTCCACCAACCCCTACCTCCCTACCTCGATGAGTCACAACTTTTGTTACTTTCCACCAACCGTTACTTTGATGAGTCGTTACTTTCCTACCTCCCACCAACCGTTACTTTCCACCAACCGATGAGTCGATGAGTCCCACCAACCGTTACTTTCCTACCTCACAACTTTTACAACTTTTGTTACTTTCCTACCTCGTTACTTTGTTACTTTCCTACCTCACAACTTTTCCACCAACCCCTACCTCGATGAGTCGATGAGTCGTTACTTTACAACTTTTCCACCAACCCCTACCTCCCACCAACCACAACTTTTGATGAGTCGTTACTTTCCACCAACCGATGAGTCCCACCAACCACAACTTTTCCTACCTCACAACTTTTCCTACCTCACAACTTTTCCACCAACCACAACTTTTCCACCAACCCCTACCTCCCACCAACCACAACTTTTGATGAGTCCCTACCTCACAACTTTTCCACCAACCGTTACTTTGATGAGTCGATGAGTCCCTACCTCGTTACTTTCCACCAACCCCTACCTCACAACTTTTGTTACTTTCCACCAACCCCACCAACCCCACCAACCGATGAGTCACAACTTTTCCTACCTCGTTACTTTCCACCAACCCCTACCTCACAACTTTTCCTACCTCGTTACTTTCCTACCTCCCACCAACCGATGAGTCCCACCAACCGATGAGTCCCACCAACCACAACTTTTGTTACTTTGATGAGTC'
x = frequentWordsWithMismatchesAndRC(text, 7, 3)
' '.join(str(i) for i in x)
'GGGGGGG CCCCCCC'
def minIndex(array):
minarray = min(array)
min_index = []
for item in range(len(array)):
if array[item] == minarray:
min_index.append(item)
return min_index
%%time
array
minIndex(array)
CPU times: user 11.6 ms, sys: 3.58 ms, total: 15.2 ms Wall time: 12.4 ms
%%time
array
m=min(array)
[index for index,element in enumerate(array) if element==m]
CPU times: user 21.7 ms, sys: 3.33 ms, total: 25 ms Wall time: 22.3 ms