#!/usr/bin/env python # coding: utf-8 # In[4]: #coding=utf8 from __future__ import unicode_literals import itertools import re # In[5]: import nltk nltk.download('cmudict') from nltk.corpus import cmudict # In[3]: d = cmudict.dict() # In[81]: def lookup(w): return d[w][0] # This is the CMU pronouncing dictionary. For each word, it has a list of phonemes: # In[102]: words = [lookup('hat'), lookup('failure')] words # First, lets get rid of the numbers. I think they indicate stress. Whatever, we don't need 'em! # In[103]: def strip_phonemes(phs): return [s.rstrip('0123456789') for s in phs] # In[104]: words = map(strip_phonemes, words) words # Some of these are dipthongs. Benjamin reckons we better split those up, so here goes. I looked at the docs for cmudict: # In[140]: print("\n\n".join(cmudict.readme().split("\n\n")[3:-6])) # Based on this, Benjamin came up with the following table: # In[141]: dipthongs_etc = { 'AW': ['AE', 'UW'], 'OW': ['AO', 'UW'], 'EY': ['EH', 'IY'], 'AY': ['AA', 'IY'], 'OY': ['AO', 'IY'], 'JH': ['D', 'ZH'], 'CH': ['T', 'SH'], } # In[142]: def dipthify(words): return sum([dipthongs_etc.get(s, [s]) for s in words], []) # In[143]: words = map(dipthify, words) words # We could use `reversed()` to get the result, but it's hard to read… # In[144]: list(reversed([list(reversed(w)) for w in words])) # ## IPA output # Benjamin can read IPA, so let's try that! 
# In[112]:

# CMU phoneme -> IPA spelling.
ipa = {
    'AA': 'ɑː',
    'AH': 'ʌ',
    'AW': 'ou',
    'B': 'b',
    'D': 'd',
    'EH': 'e',
    'EY': 'eɪ',
    'G': 'g',
    'IH': 'ɪ',
    'JH': 'dʒ',
    'L': 'l',
    'N': 'n',
    'OW': 'əʊ',
    'P': 'p',
    'S': 's',
    'T': 't',
    'UH': 'ʊ',
    'V': 'v',
    'Y': 'j',
    'ZH': 'ʒ',
    'AE': 'æ',
    'AO': 'ɔː',
    'AY': 'ʌɪ',
    'CH': 'tʃ',
    'DH': 'ð',
    'ER': 'əː',
    'F': 'f',
    'HH': 'h',
    'IY': 'iː',
    'K': 'k',
    'M': 'm',
    'NG': 'ŋ',
    'OY': 'ɔɪ',
    'R': 'r',
    'SH': 'ʃ',
    'TH': 'θ',
    'UW': 'uː',
    'W': 'w',
    'Z': 'z',
}

# In[121]:


def ipaify(word):
    """Return the IPA spelling of *word*, an iterable of phonemes.

    Raises KeyError naming the phoneme when it has no entry in ``ipa``.
    """
    # Index directly: ipa.get() would hand None to join() and fail with an
    # unhelpful TypeError.
    return ''.join(ipa[phoneme] for phoneme in word)


# Reverse the phonemes inside each word, then reverse the word order.
# A list comprehension is used because reversed() needs a sequence.
result_ipa = ' '.join(reversed([ipaify(reversed(w)) for w in words]))
print(result_ipa)

# I can't read that, but it seemed to sound ok when Benjamin tried it!

# ## English output

# It'd be nice if we could make a version which uses English words where
# possible…

# So let's reverse the cmudict to get a dictionary mapping phonemes to words.

# In[126]:

# Defined here so this cell runs top-to-bottom; extracts the first run of
# ASCII letters from a dictionary key.
word_pat = re.compile('([A-Za-z]+)')

backwards = {}
for word, pronounciations in d.items():
    found = word_pat.search(word)
    if found is None:
        # No letters in this key, so there is no word to map back to.
        continue
    word = found.group(1)
    for phonemes in pronounciations:
        # When several words share a pronunciation, the last one seen wins.
        backwards[tuple(strip_phonemes(phonemes))] = word

# In[167]:

backwards[tuple(reversed(strip_phonemes(lookup('ra'))))]

# In[ ]:

# We need single-phonemes to fall back on…

# In[122]:

# CMU phoneme -> rough English spelling.
english = {
    'AA': 'o',  # 'ah' not 'aw'
    'AH': 'uh',
    'AW': 'ow',
    'B': 'b',
    'D': 'd',
    'EH': 'eh',
    'EY': 'ay',
    'G': 'g',
    'IH': 'ih',
    'JH': 'jh',
    'L': 'l',
    'N': 'n',
    'OW': 'oah',
    'P': 'p',
    'S': 's',
    'T': 't',
    'UH': 'ooh',
    'V': 'v',
    'Y': 'y',
    'ZH': 'zz',
    'AE': 'aa',
    'AO': 'aww',
    'AY': 'eye',
    'CH': 'ch',
    'DH': 'th',
    'ER': 'er',
    'F': 'f',
    'HH': 'h',
    'IY': 'ee',  # 'ee' would be better than 'e'
    'K': 'k',
    'M': 'm',
    'NG': 'ng',
    'OY': 'oy',
    'R': 'r',
    'SH': 'sh',
    'TH': 'th',
    'UW': 'ooo',  # 'oo' would be better than 'ou'
    'W': 'w',
    'Z': 'z',
}

# In[168]:

for phoneme, word in english.items():
    # Every other key in `backwards` is a tuple of phonemes, so register the
    # fallback under a 1-tuple; a bare string key could never be matched by
    # a tuple lookup. setdefault keeps any real dictionary word that already
    # has this single-phoneme pronunciation.
    backwards.setdefault((phoneme,), word)

# Then we can do a dumb greedy search.
# In[169]:


def engify(word):
    """Spell *word* (an iterable of phonemes) with English words.

    Greedy search: at each position take the longest run of phonemes that
    is a key of ``backwards``; when no run matches, spell the single
    phoneme via ``english``. The pieces are joined with '-'.

    Raises KeyError if a phoneme matches nothing in ``backwards`` and has
    no entry in ``english``.
    """
    phonemes = list(word)
    total = len(phonemes)
    pieces = []
    pos = 0
    while pos < total:
        # Try the longest candidate first, shrinking one phoneme at a time.
        for end in range(total, pos, -1):
            key = tuple(phonemes[pos:end])
            if key in backwards:
                pieces.append(backwards[key])
                pos = end
                break
        else:
            # No prefix matched at all: fall back to one phoneme.
            pieces.append(english[phonemes[pos]])
            pos += 1
    return '-'.join(pieces)


result_english = ' '.join(reversed([engify(reversed(w)) for w in words]))
result_english

# ## Put it all together

# In[ ]:


class UnknownWord(Exception):
    """Raised for an input word that has no entry in the dictionary."""


# In[177]:

# [A-Za-z], not [A-z]: the latter also matches the punctuation characters
# that sit between 'Z' and 'a' in ASCII.
word_pat = re.compile(u'([A-Za-z]+)')

text = "I have milk, eggs and juice if any of that suits"
raw_words = word_pat.findall(text)

words = []  # really phonemes
for w in raw_words:
    try:
        pronunciations = d[w.lower()]
    except KeyError:
        raise UnknownWord(w)
    words.append(pronunciations[0])

# Lists, not lazy iterators: `words` is consumed twice below.
words = [strip_phonemes(w) for w in words]
words = [dipthify(w) for w in words]

result_english = ' '.join(reversed([engify(reversed(w)) for w in words]))
result_ipa = ' '.join(reversed([ipaify(reversed(w)) for w in words]))
print(result_english)
print(result_ipa)

# ```
# bugreport: ɑː is pronounced “ah”, not “aw”
# and “oo” would be better than “ou” for uː
# and for that matter,
# ```