#coding=utf8
from __future__ import unicode_literals
import itertools
import re
import nltk
nltk.download('cmudict')
from nltk.corpus import cmudict
[nltk_data] Downloading package cmudict to /Users/tim/nltk_data... [nltk_data] Package cmudict is already up-to-date!
# Load the full CMU pronouncing dictionary: lowercase word -> list of pronunciations.
d = cmudict.dict()
def lookup(w):
    """Return the first (default) pronunciation of *w* as a list of ARPABET phonemes."""
    pronunciations = d[w]
    return pronunciations[0]
This is the CMU pronouncing dictionary. For each word, it has a list of phonemes:
# Example lookups: each word becomes a list of ARPABET phoneme strings.
words = [lookup('hat'), lookup('failure')]
words
[[u'HH', u'AE1', u'T'], [u'F', u'EY1', u'L', u'Y', u'ER0']]
First, let's get rid of the numbers. Per the cmudict README, they mark stress (1=primary, 2=secondary, 0=none) — we don't need that here!
def strip_phonemes(phs):
    """Drop the trailing stress digits (0/1/2) from each ARPABET phoneme."""
    stripped = []
    for phoneme in phs:
        stripped.append(phoneme.rstrip('0123456789'))
    return stripped
# Strip stress markers from every word.
# (This is Python 2, where map() returns a list, so `words` stays a list.)
words = map(strip_phonemes, words)
words
[[u'HH', u'AE', u'T'], [u'F', u'EY', u'L', u'Y', u'ER']]
Some of these are diphthongs. Benjamin reckons we'd better split those up, so here goes. I looked at the docs for cmudict:
# Show the file-format / phoneme-table section of the cmudict README.
print("\n\n".join(cmudict.readme().split("\n\n")[3:-6]))
File Format: Each line consists of an uppercased word, a counter (for alternative pronunciations), and a transcription. Vowels are marked for stress (1=primary, 2=secondary, 0=no stress). E.g.: NATURAL 1 N AE1 CH ER0 AH0 L The dictionary contains 127069 entries. Of these, 119400 words are assigned a unique pronunciation, 6830 words have two pronunciations, and 839 words have three or more pronunciations. Many of these are fast-speech variants. Phonemes: There are 39 phonemes, as shown below: Phoneme Example Translation Phoneme Example Translation ------- ------- ----------- ------- ------- ----------- AA odd AA D AE at AE T AH hut HH AH T AO ought AO T AW cow K AW AY hide HH AY D B be B IY CH cheese CH IY Z D dee D IY DH thee DH IY EH Ed EH D ER hurt HH ER T EY ate EY T F fee F IY G green G R IY N HH he HH IY IH it IH T IY eat IY T JH gee JH IY K key K IY L lee L IY M me M IY N knee N IY NG ping P IH NG OW oat OW T OY toy T OY P pee P IY R read R IY D S sea S IY SH she SH IY T tea T IY TH theta TH EY T AH UH hood HH UH D UW two T UW V vee V IY W we W IY Y yield Y IY L D Z zee Z IY ZH seizure S IY ZH ER (For NLTK, entries have been sorted so that, e.g. FIRE 1 and FIRE 2 are contiguous, and not separated by FIRE'S 1.)
Based on this, Benjamin came up with the following table:
# Diphthongs (and the two affricates JH/CH) mapped to a pair of simpler
# phonemes, so reversal works on the smallest sound units.
dipthongs_etc = {
    'AW': ['AE', 'UW'],
    'OW': ['AO', 'UW'],
    'EY': ['EH', 'IY'],
    'AY': ['AA', 'IY'],
    'OY': ['AO', 'IY'],
    'JH': ['D', 'ZH'],
    'CH': ['T', 'SH'],
}
def dipthify(words):
    """Expand each diphthong/affricate into its two component phonemes.

    Phonemes with no entry in ``dipthongs_etc`` pass through unchanged.
    Returns a new flat list of phonemes.
    """
    # A nested comprehension flattens in a single pass; the original
    # sum([...], []) idiom re-copies the accumulator on every step (O(n^2)).
    return [part for ph in words for part in dipthongs_etc.get(ph, [ph])]
# Split the diphthongs/affricates in every word.
words = map(dipthify, words)
words
[[u'HH', u'AE', u'T'], [u'F', u'EH', u'IY', u'L', u'Y', u'ER']]
We could use reversed()
to get the result, but it's hard to read…
# Reverse the phonemes within each word, then reverse the word order too.
list(reversed([list(reversed(w)) for w in words]))
[[u'ER', u'Y', u'L', u'IY', u'EH', u'F'], [u'T', u'AE', u'HH']]
Benjamin can read IPA, so let's try that!
# ARPABET phoneme -> IPA symbol, covering all 39 cmudict phonemes
# (diphthong/affricate entries like AW/EY/JH remain for words that
# skipped dipthify).
ipa = {
    'AA': 'ɑː',
    'AH': 'ʌ',
    'AW': 'ou',
    'B': 'b',
    'D': 'd',
    'EH': 'e',
    'EY': 'eɪ',
    'G': 'g',
    'IH': 'ɪ',
    'JH': 'dʒ',
    'L': 'l',
    'N': 'n',
    'OW': 'əʊ',
    'P': 'p',
    'S': 's',
    'T': 't',
    'UH': 'ʊ',
    'V': 'v',
    'Y': 'j',
    'ZH': 'ʒ',
    'AE': 'æ',
    'AO': 'ɔː',
    'AY': 'ʌɪ',
    'CH': 'tʃ',
    'DH': 'ð',
    'ER': 'əː',
    'F': 'f',
    'HH': 'h',
    'IY': 'iː',
    'K': 'k',
    'M': 'm',
    'NG': 'ŋ',
    'OY': 'ɔɪ',
    'R': 'r',
    'SH': 'ʃ',
    'TH': 'θ',
    'UW': 'uː',
    'W': 'w',
    'Z': 'z',
}
def ipaify(word):
    """Render a phoneme sequence as one IPA string.

    Indexes ``ipa`` directly so an unknown phoneme raises a KeyError that
    names the offending phoneme; the previous ``''.join(map(ipa.get, ...))``
    produced ``None`` for a missing key and failed with an opaque TypeError
    inside join.
    """
    return ''.join(ipa[ph] for ph in word)
# IPA rendering: reverse the phonemes of each word, then the word order.
result_ipa = ' '.join(reversed(map(ipaify, map(reversed, words))))
print(result_ipa)
əːjliːef tæh
I can't read that, but it seemed to sound ok when Benjamin tried it!
It'd be nice if we could make version which uses English words where possible…
So let's reverse the cmudict to get a dictionary mapping phonemes to words.
# Invert cmudict: (stress-stripped phoneme tuple) -> spelling, so a
# reversed phoneme sequence can be looked up as an English word.
# Compile the letter pattern locally: the transcript only defined
# `word_pat` much further down, so this cell would otherwise NameError,
# and its '[A-z]' class also wrongly matched '[', '\\', ']', '^', '_', '`'.
_letters = re.compile(r'([A-Za-z]+)')
backwards = {}
for word, pronunciations in d.items():
    # Keep only the first run of letters (drops "'s" suffixes, digits, etc.).
    word = _letters.search(word).group(1)
    for phonemes in pronunciations:
        key = tuple(strip_phonemes(phonemes))
        backwards[key] = word
# Sanity check: the reversed phonemes of 'ra' should look up a real word.
backwards[tuple(reversed(strip_phonemes(lookup('ra'))))]
u'are'
We need single-phonemes to fall back on…
# ARPABET phoneme -> rough English spelling, used as the last-resort
# fallback when no dictionary word matches in engify().
english = {
    'AA': 'o', # 'ah' not 'aw'
    'AH': 'uh',
    'AW': 'ow',
    'B': 'b',
    'D': 'd',
    'EH': 'eh',
    'EY': 'ay',
    'G': 'g',
    'IH': 'ih',
    'JH': 'jh',
    'L': 'l',
    'N': 'n',
    'OW': 'oah',
    'P': 'p',
    'S': 's',
    'T': 't',
    'UH': 'ooh',
    'V': 'v',
    'Y': 'y',
    'ZH': 'zz',
    'AE': 'aa',
    'AO': 'aww',
    'AY': 'eye',
    'CH': 'ch',
    'DH': 'th',
    'ER': 'er',
    'F': 'f',
    'HH': 'h',
    'IY': 'ee', # was 'e'; already changed to 'ee' per the bug report below
    'K': 'k',
    'M': 'm',
    'NG': 'ng',
    'OY': 'oy',
    'R': 'r',
    'SH': 'sh',
    'TH': 'th',
    'UW': 'ooo', # bug report below suggests 'oo'; currently 'ooo'
    'W': 'w',
    'Z': 'z',
}
# Merge the single-phoneme spellings into the reversed dictionary.
# NOTE(review): these keys are plain strings, but engify() only ever looks
# up tuples in `backwards` — so these entries appear unreachable there, and
# the single-phoneme fallback actually happens via `english[...]` inside
# engify(). Keying these as 1-tuples would change engify's output (it would
# shadow real one-phoneme words like 'a'), so leaving as-is; confirm intent.
for phoneme, word in english.items():
    backwards[phoneme] = word
Then we can do a dumb greedy search.
def engify(word):
    """Render a phoneme sequence as hyphen-joined English words.

    Greedy longest-prefix match against `backwards`; any phoneme that
    starts no dictionary word falls back to its `english` spelling.
    """
    remaining = list(word)
    pieces = []
    while remaining:
        for length in range(len(remaining), 0, -1):
            candidate = tuple(remaining[:length])
            if candidate in backwards:
                pieces.append(backwards[candidate])
                del remaining[:length]
                break
        else:
            # No prefix of any length matched: spell out one phoneme.
            pieces.append(english[remaining.pop(0)])
    return '-'.join(pieces)
# English-ish rendering of the reversed words.
result_english = ' '.join(reversed(map(engify, map(reversed, words))))
result_english
u'are-y-leigh-f t-aa-h'
# Match runs of ASCII letters.  NOTE: the original class was '[A-z]',
# which also matches '[', '\\', ']', '^', '_' and '`' — fixed to [A-Za-z].
word_pat = re.compile(u'([A-Za-z]+)')
# unicode_literals is in effect, so a plain literal is already unicode;
# the previous unicode(...) wrapper was redundant (and breaks Python 3).
text = "I have milk, eggs and juice if any of that suits"
# Split on the letter runs and keep only the alphabetic tokens.
raw_words = filter(word_pat.match, word_pat.split(text))
words = []  # really phonemes
for w in raw_words:
    try:
        # First listed pronunciation only.
        words.append(d[w.lower()][0])
    except KeyError:
        raise UnknownWord(w)
words = map(strip_phonemes, words)
words = map(dipthify, words)
# Reverse phonemes within each word, reverse the word order, and render.
result_english = ' '.join(reversed(map(engify, map(reversed, words))))
result_ipa = ' '.join(reversed(map(ipaify, map(reversed, words))))
print(result_english)
print(result_ipa)
stu-s t-aa-th v-uhh e-n-eh f-ih sioux-zz-d d-n-uhh z-g-eh clim v-aa-h e-awe stuːs tæð vʌ iːne fɪ suːʒd dnʌ zge klɪm væh iːɑː
bugreport: ɑː is pronounced “ah”, not “aw”
and “oo” would be better than “ou” for uː
and for that matter, the UnknownWord exception raised above still needs its definition:
class UnknownWord(Exception):
    """Raised when a word has no entry in the pronouncing dictionary."""