import random, re, bisect, time
def is_palindrome(s):
"Test if a string is a palindrome (only considering letters a-z)."
s1 = canonical(s)
return s1 == reversestr(s1)
def is_unique_palindrome(s):
"Test if string s is a palindrome where each comma-separated phrase is unique."
return is_palindrome(s) and is_unique(phrases(s))
def canonical(word, sub=re.compile('[^a-z]').sub):
"The canonical form for comparing: only lowercase a-z."
return sub('', word.lower())
def phrases(s):
"Break a string s into comma-separated phrases."
return [phrase.strip() for phrase in s.split(',')]
def reversestr(s):
"Reverse a string."
return s[::-1]
def is_unique(collection):
"Return true if collection has no duplicate elements."
return len(collection) == len(set(collection))
def test_utils():
assert is_unique_palindrome('A man, a plan, a canal, Panama!')
assert is_unique_palindrome('''A (man), a PLAN... a ``canal?'' -- Panama!''')
assert not is_unique_palindrome('A man, a plan, a radar, a canal, Panama.')
assert is_palindrome('A man, a plan, a canal, Panama.')
assert is_palindrome('Radar. Radar? Radar!')
assert not is_palindrome('radars')
assert phrases('A man, a plan, Panama') == ['A man', 'a plan', 'Panama']
assert canonical('A man, a plan, a canal, Panama') == 'amanaplanacanalpanama'
assert reversestr('foo') == 'oof'
assert is_unique([1, 2, 3])
assert not is_unique([1, 2, 2])
return 'test_utils passes'
test_utils()
'test_utils passes'
! [ -e npdict.txt ] || curl -O http://norvig.com/npdict.txt
! wc npdict.txt
126144 204928 1383045 npdict.txt
################ Reading in a dictionary
class PalDict:
"""A dictionary with the following fields:
words: a sorted list of words: ['ant', 'bee', 'sea']
rwords: a sorted list of reversed words: ['aes', 'eeb', 'tna']
truename: a dict of {canonical:true} pairs, e.g. {'anelk': 'an elk', 'anneelk': 'Anne Elk'}
k:
and the followng methods:
"""
def __init__(self, k=100, filename='npdict.txt'):
words, rwords, truename = [], [], {'': '', 'panama': 'Panama!'}
for tword in open(filename, 'r', encoding='ascii', errors='ignore').read().splitlines():
word = canonical(tword)
words.append(word)
rwords.append(reversestr(word))
truename[word] = tword
words.sort()
rwords.sort()
self.k = k
self.words = words
self.rwords = rwords
self.truename = truename
self.rangek = range(k)
self.tryharder = False
def startswith(self, prefix):
"""Return up to k canonical words that start with prefix.
If there are more than k, choose from them at random."""
return self._k_startingwith(self.words, prefix)
def endswith(self, rsuffix):
"""Return up to k canonical words that end with the reversed suffix.
If you want words ending in 'ing', ask for d.endswith('gni').
If there are more than k, choose from them at random."""
return [reversestr(s) for s in self._k_startingwith(self.rwords, rsuffix)]
def __contains__(self, word):
return word in self.truename
def _k_startingwith(self, words, prefix):
start = bisect.bisect_left(words, prefix)
end = bisect.bisect(words, prefix + 'zzzz')
n = end - start
if self.k >= n: # get all the words that start with prefix
results = words[start:end]
else: # sample from words starting with prefix
indexes = random.sample(range(start, end), self.k)
results = [words[i] for i in indexes]
random.shuffle(results)
## Consider words that are prefixes of the prefix.
## This is very slow, so don't use it until late in the game.
if self.tryharder:
for i in range(3, len(prefix)):
w = prefix[0:i]
if ((words == self.words and w in self.truename) or
(words == self.rwords and reversestr(w) in self.truename)):
results.append(w)
return results
paldict = PalDict()
def anpdictshort():
"Find the words that are valid when every phrase must start with 'a'"
def segment(word): return [s for s in word.split('a') if s]
def valid(word): return all(reversestr(s) in segments for s in segment(word))
words = [canonical(w) for w in open('anpdict.txt')]
segments = set(s for w in words for s in segment(w))
valid_words = [paldict.truename[w] for w in words if valid(w)]
file('anpdict-short2.txt', 'w').write('\n'.join(valid_words))
################ Search for a palindrome
class Panama:
def __init__(self, L='A man, a plan', R='a canal, Panama', dict=paldict):
## .left and .right hold lists of canonical words
## .diff holds the number of characters that are not matched,
## positive for words on left, negative for right.
## .stack holds (action, side, arg) tuples
self.left = []
self.right = []
self.best = 0
self.seen = {}
self.diff = 0
self.stack = []
self.starttime = time.clock()
self.dict = dict
self.steps = 0
for word in L.split(','):
self.add('left', canonical(word))
for rword in reversestr(R).split(','):
self.add('right', canonical(reversestr(rword)))
self.consider_candidates()
def search(self, steps=10*1000*1000):
"Search for palindromes."
for self.steps in range(steps):
if not self.stack:
return 'done'
action, dir, substr, arg = self.stack[-1]
if action == 'added': # undo the last word added
self.remove(dir, arg)
elif action == 'trying' and arg: # try the next word if there is one
self.add(dir, arg.pop()) and self.consider_candidates()
elif action == 'trying' and not arg: # otherwise backtrack
self.stack.pop()
else:
raise ValueError(action)
self.report()
return self
def add(self, dir, word):
"add a word"
if word in self.seen:
return False
else:
getattr(self, dir).append(word)
self.diff += factor[dir] * len(word)
self.seen[word] = True
self.stack.append(('added', dir, '?', word))
return True
def remove(self, dir, word):
"remove a word"
oldword = getattr(self, dir).pop()
assert word == oldword
self.diff -= factor[dir] * len(word)
del self.seen[word]
self.stack.pop()
def consider_candidates(self):
"""Push a new state with a set of candidate words onto stack."""
if self.diff > 0: # Left is longer, consider adding on right
dir = 'right'
substr = self.left[-1][-self.diff:]
candidates = self.dict.endswith(substr)
elif self.diff < 0: # Right is longer, consider adding on left
dir = 'left'
substr = reversestr(self.right[-1][0:-self.diff])
candidates = self.dict.startswith(substr)
else: # Both sides are same size
dir = 'left'
substr = ''
candidates = self.dict.startswith('')
if substr == reversestr(substr):
self.report()
self.stack.append(('trying', dir, substr, candidates))
def report(self):
"Report a new palindrome to log file (if it is sufficiently big)."
N = len(self)
if N > 13333:
self.dict.tryharder = True
if N > self.best and (N > 13000 or N > self.best+1000):
self.best = len(self)
self.bestphrase = str(self)
print('%5d phrases (%5d words) in %3d seconds (%6d steps)' % (
self.best, self.bestphrase.count(' ')+1, time.clock() - self.starttime,
self.steps))
assert is_unique_palindrome(self.bestphrase)
def __len__(self):
return len(self.left) + len(self.right)
def __str__(self):
truename = self.dict.truename
lefts = [truename[w] for w in self.left]
rights = [truename[w] for w in self.right]
return ', '.join(lefts + rights[::-1])
def __repr__(self):
return '<Panama with {} phrases>'.format(len(self))
factor = {'left': +1, 'right': -1}
# Note that we only allow one truename per canonical name. Occasionally
# this means we miss a good word (as in "a node" vs. "an ode"), but there
# are only 665 of these truename collisions, and most of them are of the
# form "a mark-up" vs. "a markup" so it seemed better to disallow them.
len(paldict.words)
126144
################ Unit Tests
def test2(p=PalDict()):
d = p.dict
def sameset(a, b): return set(a) == set(b)
assert 'panama' in d
assert d.words[0] in d
assert d.words[-1] in d
assert sameset(d.startswith('aword'), ['awording', 'awordbreak',
'awordiness', 'awordage', 'awordplay', 'awordlore', 'awordbook',
'awordlessness', 'aword', 'awordsmith'])
assert sameset(d.endswith('ytisob'), ['aglobosity', 'averbosity',
'asubglobosity', 'anonverbosity', 'agibbosity'])
d.tryharder = True
assert sameset(d.startswith('oklahoma'), ['oklahoma', 'okla'])
d.tryharder = False
assert d.startswith('oklahoma') == ['oklahoma']
assert d.startswith('fsfdsfdsfds') == []
print('all tests pass')
return p
p = Panama()
test2(p)
p.search().report()
p
all tests pass 1005 phrases ( 1239 words) in 0 seconds ( 18582 steps) 2012 phrases ( 2478 words) in 0 seconds ( 41886 steps) 3017 phrases ( 3710 words) in 0 seconds ( 64444 steps) 4020 phrases ( 4957 words) in 0 seconds ( 92989 steps) 5022 phrases ( 6184 words) in 1 seconds (128986 steps) 6024 phrases ( 7408 words) in 1 seconds (162634 steps) 7027 phrases ( 8607 words) in 1 seconds (204639 steps) 8036 phrases ( 9846 words) in 2 seconds (254992 steps) 9037 phrases (11050 words) in 2 seconds (320001 steps) 10039 phrases (12257 words) in 2 seconds (417723 steps) 11040 phrases (13481 words) in 3 seconds (565050 steps) 12043 phrases (14711 words) in 4 seconds (887405 steps)
<Panama with 12764 phrases>
! [ -e anpdict.txt ] || curl -O http://norvig.com/anpdict.txt
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 847k 100 847k 0 0 1037k 0 --:--:-- --:--:-- --:--:-- 1037k
anpdictshort()
! wc *npd*
4527 9055 39467 anpdict-short.txt 4527 9055 39467 anpdict-short2.txt 69241 138489 867706 anpdict.txt 126144 204928 1383045 npdict.txt 204439 361527 2329685 total
Can we go letter-by-letter instead of word-by-word? Advantages:
Process
Keep left- nad right- partial phrase lists; and the current state:
{left: ['aman', 'aplan'], right: ['acanal', panama'], left_word: True, right_word: True, extra_chars: +3, palindrome: True}
Now consider all ways of extending:
'a'
to the left, either as a new word or a continuation of the old word (perhaps going for 'a planaria'
).from collections import namedtuple
def do(state, action, side, L): action(state, side, L)
def add(state, side, L): getattr(state, side)[-1] += L
def new(state, side, L): getattr(state, side).append(L)
def undo(action, letter):
if action == add:
elif action == new:
else:
raise ValueError()