#!/usr/bin/env python
# coding: utf-8

# # Bananagrams-related explorations of words
#
# Using the enable wordlist from [puzzlers.org](http://www.puzzlers.org/dokuwiki/doku.php?id=solving:wordlists:about:start&s=enable1)
#
# We could use the dictionary on all unix machines, but it's not great:

# In[1]:


import os
import re
import sys

# wordlist on all unix machines
with open('/usr/share/dict/words') as f:
    words = [ line.strip() for line in f ]
len(words)


# Instead, we will download the wordlist from puzzlers.org:

# In[2]:


from urllib.request import urlopen

def words_from_file(f):
    """Return the lines of a binary file-like object as stripped, decoded strings."""
    return [ line.strip().decode() for line in f ]

# url = 'http://www.puzzlers.org/pub/wordlists/ospd.txt'
# with urlopen(url) as f:
#     ospd = words_from_file(f)

# Get enable wordlist
url = 'http://www.puzzlers.org/pub/wordlists/enable1.txt'
with urlopen(url) as f:
    enable = words_from_file(f)

words = enable
len(words)


# In[3]:


def _match_slots(word, slots):
    """Check if a word matches substrings at a given index.

    Parameters
    ----------

    word: str
        the word to test
    slots: dict(int:str)
        substrings keyed by the index at which they must appear;
        negative indices count from the end of the word

    Returns
    -------

    True if every substring is found at its index, False otherwise.
    """
    for idx, subs in slots.items():
        if idx >= 0 and len(word) < (idx + len(subs)):
            # the word is too short to hold the substring at this index
            return False
        # When a negative idx plus the substring length is exactly 0 the
        # substring runs to the end of the word; a stop of 0 would give an
        # empty slice, so use None (slice to the end) instead.
        stop = (idx + len(subs)) or None
        if word[idx:stop] != subs:
            return False
    return True


def find_words(length=None, min_length=None, max_length=None,
               slots=None, end=None, start=None, mask=None, wordlist=None):
    """Find words matching given criteria

    Every criterion that is given (truthy) must hold for a word to be yielded.

    Parameters
    ----------

    length: int
        exact length match
    min_length: int
        minimum length of words
    max_length: int
        max length
    start: str
        text at the beginning of the word
    end: str
        text at the end of the word
    mask: str of the form 'a**l*'
        '*'-wildcard matching mask, will find substring with holes
        at any point in the word. Each '*' matches exactly one character;
        the rest of the mask is passed to `re` unchanged.
    slots: dict(int:str)
        dict of substrings at a given index.
        slots={1:'x'} will match words where word[1] = 'x'
    wordlist: iterable of str
        words to search; defaults to the module-level `words`

    Yields
    ------

    Each matching word, in the order it appears in the word list.
    """
    candidates = words if wordlist is None else wordlist
    # compile once, outside the loop
    mask_re = re.compile(mask.replace('*', '.')) if mask else None

    for word in candidates:
        n = len(word)
        if length and n != length:
            continue
        if min_length and n < min_length:
            continue
        if max_length and n > max_length:
            continue
        if end and not word.endswith(end):
            continue
        if start and not word.startswith(start):
            continue
        if slots and not _match_slots(word, slots):
            continue
        if mask_re and not mask_re.search(word):
            continue
        yield word


def print_words(word_list):
    """Print word list, one right-aligned word per line, followed by the count"""
    n = 0
    for word in word_list:
        n += 1
        print('%30s' % word)
    print("%i words" % n)


# 12 letter words that start with r and end with d.

# In[4]:


print_words(find_words(length=12, start='r', end='d'))


# 12 letter words that start with 'r' and end with 'd',
# identifying the ones that are made up of 're-[word]-ed'

# In[5]:


subwords = 0
not_subwords = 0
# set membership is O(1); testing against the list would rescan it per word
word_set = set(words)
for word in find_words(end='d', length=12, start='r'):
    # Parenthesised on purpose: `not a and b` parses as `(not a) and b`,
    # which would let words that are not 're...ed' fall through to the
    # subword test below and be reported as 're-...'.
    if not (word.startswith('re') and word.endswith('ed')):
        print(word)
        not_subwords += 1
        continue
    subword1 = word[2:-2]  # re-[word]-ed
    subword2 = word[2:-1]  # re-[word ending in e]-d
    if subword1 in word_set:
        subwords += 1
        print('re-' + subword1 + '-ed')
    elif subword2 in word_set:
        subwords += 1
        print('re-' + subword2 + '-d')
    else:
        not_subwords += 1
        print(word)
print(subwords, not_subwords, subwords + not_subwords)


# Words that end in '-ship' of length at least 10

# In[6]:


print_words(find_words(end='ship', min_length=10))


# Words of length at least 8, ending with 'p', where the 6th letter from the end is 'r':

# In[7]:


print_words(find_words(end='p', min_length=8, slots={-6: 'r'}))


# All words with 'x' two letters before 'y', with max length 6:

# In[8]:


print_words(find_words(mask='x**y', max_length=6))


# All words with 'q' and not 'qu':

# In[9]:


haveq = [ w for w in words if 'q' in w ]
qnou = [ w for w in haveq if 'qu' not in w ]
qnou


# # Distribution of word lengths

# In[10]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import matplotlib.pyplot as plt
import seaborn


# In[11]:


lengths = np.array([len(w) for w in words])
lengths.sort()
# counts[k] is the number of words of length exactly k
counts = np.bincount(lengths)
# total_counts[k] is the number of words of length <= k
total_counts = np.cumsum(counts)
total_counts
# counts_at_least_n[k] is the number of words of length > k,
# i.e. of length at least k + 1
counts_at_least_n = len(words) - total_counts
counts_at_least_n


# In[12]:


plt.hist(lengths, bins=len(counts)-2)
plt.xlabel("length")
plt.ylabel("number of words")
plt.title("Distribution of word lengths");


# In[13]:


# index k holds the count of words of length at least k + 1,
# so shift the x values by one to match the axis label
plt.plot(1 + np.arange(len(counts_at_least_n)), counts_at_least_n)
plt.title("Words at least a given length")
plt.xlabel("length")
plt.ylabel("words");


# Zooming in on the tail:

# In[14]:


n=20
plt.plot(1 + np.arange(n-1, len(counts_at_least_n)), counts_at_least_n[n-1:])
plt.title("Words at least a given length")
plt.xlabel("length")
plt.ylabel("words");


# So there are 120 words of at least 21 characters, and ~25 at least 24.
# The longest word is 28 characters.
#
# These are the longest 100 words:

# In[15]:


long_words = sorted(words, key=len)
print_words(long_words[-100:])


# All the words of at least 21 characters:

# In[16]:


print_words(find_words(min_length=21))