Notebook

Panama Palindrome¶

Utilities¶

In [39]:

import random, re, bisect, time

def is_palindrome(s):
    "Test if a string is a palindrome (only considering letters a-z)."
    s1 = canonical(s)
    return s1 == reversestr(s1)

def is_unique_palindrome(s):
    "Test if string s is a palindrome where each comma-separated phrase is unique."
    return is_palindrome(s) and is_unique(phrases(s))

def canonical(word, sub=re.compile('[^a-z]').sub):
    "The canonical form for comparing: only lowercase a-z."
    return sub('', word.lower())

def phrases(s):
    "Break a string s into comma-separated phrases."
    return [phrase.strip() for phrase in s.split(',')]

def reversestr(s):
    "Reverse a string."
    return s[::-1]

def is_unique(collection):
    "Return true if collection has no duplicate elements."
    return len(collection) == len(set(collection))

In [35]:

def test_utils():
    assert is_unique_palindrome('A man, a plan, a canal, Panama!')
    assert is_unique_palindrome('''A (man),     a   PLAN... a ``canal?'' -- Panama!''')
    assert not is_unique_palindrome('A man, a plan, a radar, a canal, Panama.')
    
    assert is_palindrome('A man, a plan, a canal, Panama.')
    assert is_palindrome('Radar. Radar? Radar!')
    assert not is_palindrome('radars')

    assert phrases('A man, a plan, Panama') == ['A man', 'a plan', 'Panama']
    assert canonical('A man, a plan, a canal, Panama') == 'amanaplanacanalpanama'
    assert reversestr('foo') == 'oof'
    assert is_unique([1, 2, 3])
    assert not is_unique([1, 2, 2])
    return 'test_utils passes'

test_utils()

Out[35]:

'test_utils passes'

The Dictionary¶

In [6]:

! [ -e npdict.txt ] || curl -O http://norvig.com/npdict.txt

In [7]:

! wc npdict.txt

  126144  204928 1383045 npdict.txt

In [45]:

################ Reading in a dictionary

class PalDict:
    """A dictionary with the following fields:
    words: a sorted list of words: ['ant', 'bee', 'sea']
    rwords: a sorted list of reversed words: ['aes', 'eeb', 'tna']
    truename: a dict of {canonical:true} pairs, e.g. {'anelk': 'an elk', 'anneelk': 'Anne Elk'}
    k:
    and the followng methods:
    """
    
    def __init__(self, k=100, filename='npdict.txt'):
        words, rwords, truename = [], [], {'': '', 'panama': 'Panama!'}
        for tword in open(filename, 'r', encoding='ascii', errors='ignore').read().splitlines():
            word = canonical(tword)
            words.append(word)
            rwords.append(reversestr(word))
            truename[word] = tword
        words.sort()
        rwords.sort()
        self.k = k
        self.words = words
        self.rwords = rwords
        self.truename = truename
        self.rangek = range(k)
        self.tryharder = False

    def startswith(self, prefix):
        """Return up to k canonical words that start with prefix.
        If there are more than k, choose from them at random."""
        return self._k_startingwith(self.words, prefix)

    def endswith(self, rsuffix):
        """Return up to k canonical words that end with the reversed suffix.
        If you want words ending in 'ing', ask for d.endswith('gni').
        If there are more than k, choose from them at random."""
        return [reversestr(s) for s in self._k_startingwith(self.rwords, rsuffix)]

    def __contains__(self, word):
        return word in self.truename

    def _k_startingwith(self, words, prefix):
        start = bisect.bisect_left(words, prefix)
        end = bisect.bisect(words, prefix + 'zzzz')
        n = end - start
        if self.k >= n: # get all the words that start with prefix
            results = words[start:end]
        else: # sample from words starting with prefix 
            indexes = random.sample(range(start, end), self.k)
            results = [words[i] for i in indexes]
        random.shuffle(results)
        ## Consider words that are prefixes of the prefix.
        ## This is very slow, so don't use it until late in the game.
        if self.tryharder:
            for i in range(3, len(prefix)):
                w = prefix[0:i]
                if ((words == self.words and w in self.truename) or
                    (words == self.rwords and reversestr(w) in self.truename)):
                    results.append(w)
        return results

paldict = PalDict() 

def anpdictshort():
    "Find the words that are valid when every phrase must start with 'a'"
    def segment(word):  return [s for s in word.split('a') if s]
    def valid(word): return all(reversestr(s) in segments for s in segment(word))
    words = [canonical(w) for w in open('anpdict.txt')]
    segments = set(s for w in words for s in segment(w))
    valid_words = [paldict.truename[w] for w in words if valid(w)]
    file('anpdict-short2.txt', 'w').write('\n'.join(valid_words))

################ Search for a palindrome

class Panama:
    def __init__(self, L='A man, a plan', R='a canal, Panama', dict=paldict):
        ## .left and .right hold lists of canonical words
        ## .diff holds the number of characters that are not matched,
        ##  positive for words on left, negative for right.
        ## .stack holds (action, side, arg) tuples
        self.left = []
        self.right = []
        self.best = 0
        self.seen = {}
        self.diff = 0
        self.stack = []
        self.starttime = time.clock()
        self.dict = dict
        self.steps = 0
        for word in L.split(','):
            self.add('left', canonical(word))
        for rword in reversestr(R).split(','):
            self.add('right', canonical(reversestr(rword)))
        self.consider_candidates()
        
    def search(self, steps=10*1000*1000):
        "Search for palindromes."
        for self.steps in range(steps):
            if not self.stack:
                return 'done'
            action, dir, substr, arg = self.stack[-1]
            if action == 'added': # undo the last word added
                self.remove(dir, arg)
            elif action == 'trying' and arg: # try the next word if there is one
                self.add(dir, arg.pop()) and self.consider_candidates()
            elif action == 'trying' and not arg: # otherwise backtrack
                self.stack.pop()
            else:
                raise ValueError(action)
        self.report()
        return self

    def add(self, dir, word):
        "add a word"
        if word in self.seen:
            return False
        else:
            getattr(self, dir).append(word)
            self.diff += factor[dir] * len(word)
            self.seen[word] = True
            self.stack.append(('added', dir, '?', word))
            return True

    def remove(self, dir, word):
        "remove a word"
        oldword = getattr(self, dir).pop()
        assert word == oldword
        self.diff -= factor[dir] * len(word)
        del self.seen[word]
        self.stack.pop()
        
    def consider_candidates(self):
        """Push a new state with a set of candidate words onto stack."""
        if self.diff > 0: # Left is longer, consider adding on right
            dir = 'right'
            substr =  self.left[-1][-self.diff:]
            candidates = self.dict.endswith(substr)
        elif self.diff < 0: # Right is longer, consider adding on left
            dir = 'left'
            substr =  reversestr(self.right[-1][0:-self.diff])
            candidates = self.dict.startswith(substr)
        else: # Both sides are same size
            dir = 'left'
            substr = ''
            candidates = self.dict.startswith('')
        if substr == reversestr(substr):
            self.report()
        self.stack.append(('trying', dir, substr, candidates))
                
    def report(self):
        "Report a new palindrome to log file (if it is sufficiently big)."
        N = len(self)
        if N > 13333:
            self.dict.tryharder = True
        if N > self.best and (N > 13000 or N > self.best+1000):
            self.best = len(self)
            self.bestphrase = str(self)
            print('%5d phrases (%5d words) in %3d seconds (%6d steps)' % (
                self.best, self.bestphrase.count(' ')+1, time.clock() - self.starttime,
                self.steps))
            assert is_unique_palindrome(self.bestphrase)

    def __len__(self):
        return len(self.left) + len(self.right)

    def __str__(self):
        truename = self.dict.truename
        lefts =  [truename[w] for w in self.left]
        rights = [truename[w] for w in self.right]
        return ', '.join(lefts + rights[::-1])
    
    def __repr__(self):
        return '<Panama with {} phrases>'.format(len(self))

factor = {'left': +1, 'right': -1}

# Note that we only allow one truename per canonical name.  Occasionally
# this means we miss a good word (as in "a node" vs. "an ode"), but there
# are only 665 of these truename collisions, and most of them are of the
# form "a mark-up" vs. "a markup" so it seemed better to disallow them.

In [30]:

len(paldict.words)

Out[30]:

In [46]:

################ Unit Tests
 
def test2(p=PalDict()):
    d = p.dict
    def sameset(a, b): return set(a) == set(b)
    assert 'panama' in d
    assert d.words[0] in d
    assert d.words[-1] in d
    assert sameset(d.startswith('aword'), ['awording', 'awordbreak',
        'awordiness', 'awordage', 'awordplay', 'awordlore', 'awordbook',
        'awordlessness', 'aword', 'awordsmith'])
    assert sameset(d.endswith('ytisob'), ['aglobosity', 'averbosity',
        'asubglobosity', 'anonverbosity', 'agibbosity'])
    d.tryharder = True
    assert sameset(d.startswith('oklahoma'), ['oklahoma', 'okla'])
    d.tryharder = False
    assert d.startswith('oklahoma') == ['oklahoma']
    assert d.startswith('fsfdsfdsfds') == []
    print('all tests pass')
    return p

p = Panama()
test2(p)
p.search().report()
p

all tests pass
 1005 phrases ( 1239 words) in   0 seconds ( 18582 steps)
 2012 phrases ( 2478 words) in   0 seconds ( 41886 steps)
 3017 phrases ( 3710 words) in   0 seconds ( 64444 steps)
 4020 phrases ( 4957 words) in   0 seconds ( 92989 steps)
 5022 phrases ( 6184 words) in   1 seconds (128986 steps)
 6024 phrases ( 7408 words) in   1 seconds (162634 steps)
 7027 phrases ( 8607 words) in   1 seconds (204639 steps)
 8036 phrases ( 9846 words) in   2 seconds (254992 steps)
 9037 phrases (11050 words) in   2 seconds (320001 steps)
10039 phrases (12257 words) in   2 seconds (417723 steps)
11040 phrases (13481 words) in   3 seconds (565050 steps)
12043 phrases (14711 words) in   4 seconds (887405 steps)

Out[46]:

<Panama with 12764 phrases>

In [ ]:

In [21]:

! [ -e anpdict.txt ] || curl -O http://norvig.com/anpdict.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  847k  100  847k    0     0  1037k      0 --:--:-- --:--:-- --:--:-- 1037k

In [26]:

anpdictshort()

In [27]:

! wc *npd*

    4527    9055   39467 anpdict-short.txt
    4527    9055   39467 anpdict-short2.txt
   69241  138489  867706 anpdict.txt
  126144  204928 1383045 npdict.txt
  204439  361527 2329685 total

Letter-By-Letter Approach¶

Can we go letter-by-letter instead of word-by-word? Advantages:

We can (if need be) be exhaustive at each decision point, trying all 26 possibilities.
We can try the most likely letters first.

Process

Keep left- nad right- partial phrase lists; and the current state:

{left: ['aman', 'aplan'], right: ['acanal', panama'], left_word: True, right_word: True, extra_chars: +3, palindrome: True}
Now consider all ways of extending:
- Add the letter 'a' to the left, either as a new word or a continuation of the old word (perhaps going for 'a planaria').
- Add a letter, any letter, to the right, either as a new word or a continuation of

In [ ]:

from collections import namedtuple


def do(state, action, side, L): action(state, side, L)
def add(state, side, L): getattr(state, side)[-1]  += L
def new(state, side, L): getattr(state, side).append(L)
def undo(action, letter):
    if action == add:
    elif action == new:
    else:
        raise ValueError()