#!/usr/bin/env python
# coding: utf-8

# In[4]:


#coding=utf8
from __future__ import unicode_literals

import itertools
import re


# In[5]:


import nltk
nltk.download('cmudict')
from nltk.corpus import cmudict


# In[3]:


# Pronunciation table used by every later cell: `d[word]` is indexed as a list
# of pronunciations, each of which is iterated as a sequence of phoneme strings.
d = cmudict.dict()


# In[81]:


def lookup(w):
    """Return the first pronunciation listed for `w` in the dictionary `d`.

    Raises KeyError when `w` is not one of the dictionary's keys.
    """
    pronunciations = d[w]
    return pronunciations[0]


# This is the CMU pronouncing dictionary. For each word, it has a list of phonemes:

# In[102]:


# Sample data for the walkthrough: one phoneme list per example word.
words = list(map(lookup, ('hat', 'failure')))
words


# First, let's get rid of the numbers. I think they indicate stress. Whatever, we don't need 'em!

# In[103]:


def strip_phonemes(phs):
    """Return a new list with any trailing digits removed from each phoneme.

    Only digits at the end of a phoneme are dropped; leading or embedded
    digits are left alone.
    """
    digits = '0123456789'
    stripped = []
    for phoneme in phs:
        stripped.append(phoneme.rstrip(digits))
    return stripped


# In[104]:


# Drop the stress digits from every word.  The result is materialised as a
# list because `words` is iterated again by several later cells; on Python 3 a
# bare `map` object would be exhausted after its first use.
words = list(map(strip_phonemes, words))
words


# Some of these are diphthongs. Benjamin reckons we'd better split those up, so here goes. I looked at the docs for cmudict:

# In[140]:


# Print only the README paragraphs selected by the [3:-6] slice; paragraphs
# are the chunks separated by blank lines.
readme_paragraphs = cmudict.readme().split("\n\n")
print("\n\n".join(readme_paragraphs[3:-6]))


# Based on this, Benjamin came up with the following table:

# In[141]:


# Two-phoneme expansions for the diphthongs, plus the compound consonants
# JH and CH.  Phonemes that are not listed here are left as they are.
dipthongs_etc = {
    'AW': ['AE', 'UW'],
    'OW': ['AO', 'UW'],
    'EY': ['EH', 'IY'],
    'AY': ['AA', 'IY'],
    'OY': ['AO', 'IY'],
    'JH': ['D', 'ZH'],
    'CH': ['T', 'SH'],
}


# In[142]:


def dipthify(words):
    """Split every compound phoneme of one word into its two components.

    `words` is, despite the name, the phoneme sequence of a single word (any
    iterable of phoneme strings).  Returns a new flat list in which each
    phoneme found in `dipthongs_etc` is replaced by its two-element expansion
    and every other phoneme is kept unchanged.
    """
    # A flat comprehension instead of `sum(list_of_lists, [])`: the latter
    # re-copies the accumulated list on every addition (quadratic time).
    return [part for s in words for part in dipthongs_etc.get(s, [s])]


# In[143]:


# Expand the compound phonemes of every word.  A real list is built so that
# the cells below can iterate `words` more than once; on Python 3 a bare
# `map` object would be empty after its first use.
words = list(map(dipthify, words))
words


# We could use `reversed()` to get the result, but it's hard to read…

# In[144]:


# Reverse the phonemes inside each word, then reverse the order of the words.
[list(w)[::-1] for w in words][::-1]


# ## IPA output

# Benjamin can read IPA, so let's try that!

# In[112]:


# IPA rendering of each phoneme symbol used by the pronouncing dictionary.
ipa = {
    'AA': 'ɑː',
    'AH': 'ʌ',
    'AW': 'aʊ',  # was 'ou', which is not an IPA rendering of the vowel in "cow"
    'B':  'b',
    'D':  'd',
    'EH': 'e',
    'EY': 'eɪ',
    'G':  'g',
    'IH': 'ɪ',
    'JH': 'dʒ',
    'L':  'l',
    'N':  'n',
    'OW': 'əʊ',
    'P':  'p',
    'S':  's',
    'T':  't',
    'UH': 'ʊ',
    'V':  'v',
    'Y':  'j',
    'ZH': 'ʒ',
    'AE': 'æ',
    'AO': 'ɔː',
    'AY': 'ʌɪ',
    'CH': 'tʃ',
    'DH': 'ð',
    'ER': 'əː',
    'F':  'f',
    'HH': 'h',
    'IY': 'iː',
    'K':  'k',
    'M':  'm',
    'NG': 'ŋ',
    'OY': 'ɔɪ',
    'R':  'r',
    'SH': 'ʃ',
    'TH': 'θ',
    'UW': 'uː',
    'W':  'w',
    'Z':  'z',
}


# In[121]:


def ipaify(word):
    """Return the IPA spelling of `word`, an iterable of phoneme symbols.

    The symbols are looked up in `ipa` and concatenated with no separator.
    Raises KeyError (naming the offending symbol) if a phoneme is not in the
    table; indexing is used instead of `ipa.get` so that the failure is not
    an opaque TypeError from `str.join` receiving None.
    """
    return ''.join(ipa[phoneme] for phoneme in word)

# Reverse the phonemes inside each word and the order of the words, then
# render everything as IPA.  The per-word strings are collected in a list
# first because `reversed()` needs a sequence: on Python 3 a `map` object is
# not reversible and the original expression raised TypeError.
result_ipa = ' '.join(reversed([ipaify(reversed(w)) for w in words]))
print(result_ipa)


# I can't read that, but it seemed to sound ok when Benjamin tried it!

# ## English output

# It'd be nice if we could make a version which uses English words where possible…

# So let's reverse the cmudict to get a dictionary mapping phonemes to words.

# In[126]:


# Invert the dictionary: map each stress-stripped phoneme tuple to a word that
# is pronounced that way.  When several words share a pronunciation, the one
# visited last wins.
#
# The pattern is compiled here because this cell uses it, while the only other
# definition of `word_pat` comes much later in the file; run top to bottom,
# this loop used to fail with NameError.  `[A-Za-z]` replaces `[A-z]`, which
# also matched the punctuation characters between 'Z' and 'a'.
word_pat = re.compile(r'([A-Za-z]+)')

backwards = {}
for word, pronounciations in d.items():
    found = word_pat.search(word)
    if found is None:
        # The key contains no letters, so there is no word to record.
        continue
    word = found.group(1)
    for phonemes in pronounciations:
        backwards[tuple(strip_phonemes(phonemes))] = word


# In[167]:


# Spot check: which word is pronounced like 'ra' with its phonemes reversed?
backwards[tuple(strip_phonemes(lookup('ra'))[::-1])]


# In[ ]:


# We need single-phonemes to fall back on…


# In[122]:


# Rough English spelling for every individual phoneme symbol, listed
# alphabetically.  Used when no whole word is available for a phoneme.
english = {
    'AA': 'o',    # sounds like 'ah', not 'aw'
    'AE': 'aa',
    'AH': 'uh',
    'AO': 'aww',
    'AW': 'ow',
    'AY': 'eye',
    'B':  'b',
    'CH': 'ch',
    'D':  'd',
    'DH': 'th',
    'EH': 'eh',
    'ER': 'er',
    'EY': 'ay',
    'F':  'f',
    'G':  'g',
    'HH': 'h',
    'IH': 'ih',
    'IY': 'ee',   # 'ee' preferred over 'e'
    'JH': 'jh',
    'K':  'k',
    'L':  'l',
    'M':  'm',
    'N':  'n',
    'NG': 'ng',
    'OW': 'oah',
    'OY': 'oy',
    'P':  'p',
    'R':  'r',
    'S':  's',
    'SH': 'sh',
    'T':  't',
    'TH': 'th',
    'UH': 'ooh',
    'UW': 'ooo',  # an 'oo' sound was preferred over 'ou'
    'V':  'v',
    'W':  'w',
    'Y':  'y',
    'Z':  'z',
    'ZH': 'zz',
}


# In[168]:


# Register the single-phoneme spellings as fallbacks in `backwards`.
# `engify` looks keys up as tuples of phonemes, so the key has to be the
# 1-tuple `(phoneme,)`; the bare string key used before was never matched.
# `setdefault` keeps any word already stored for that one-phoneme
# pronunciation, so these spellings are used only when nothing else exists.
for phoneme, spelling in english.items():
    backwards.setdefault((phoneme,), spelling)


# Then we can do a dumb greedy search.

# In[169]:


def engify(word):
    """Spell a phoneme sequence with English words, joined by hyphens.

    Greedy search: at each position the longest run of remaining phonemes
    that is a key of `backwards` is replaced by its word.  If not even a
    single-phoneme run is a key, the phoneme's entry in `english` is used
    instead and the search moves on by one phoneme.
    """
    phonemes = list(word)
    total = len(phonemes)
    pieces = []
    start = 0
    while start < total:
        end = total
        while end > start:
            chunk = tuple(phonemes[start:end])
            if chunk in backwards:
                pieces.append(backwards[chunk])
                start = end
                break
            end -= 1
        else:
            # No run starting here is known: fall back to the phoneme table.
            pieces.append(english[phonemes[start]])
            start += 1
    return '-'.join(pieces)

# Reverse the phonemes inside each word and the order of the words, then
# spell the result with English words.  The per-word strings go into a list
# first because `reversed()` needs a sequence: on Python 3 a `map` object is
# not reversible and the original expression raised TypeError.
result_english = ' '.join(reversed([engify(reversed(w)) for w in words]))
result_english


# ## Put it all together

# In[177]:


class UnknownWord(Exception):
    """Raised when an input word has no entry in the pronouncing dictionary."""


# Runs of ASCII letters.  `[A-z]` was used before, but that range also matches
# the punctuation characters that sit between 'Z' and 'a'.
word_pat = re.compile(r'([A-Za-z]+)')

# `unicode_literals` already makes this a text string on Python 2, so the
# `unicode()` call (a NameError on Python 3) is not needed.
text = "I have milk, eggs and juice if any of that suits"
raw_words = word_pat.findall(text)

words = [] # really phonemes
for w in raw_words:
    try:
        pronunciations = d[w.lower()]
    except KeyError:
        raise UnknownWord(w)
    # Only the first listed pronunciation of each word is used.
    words.append(pronunciations[0])

# Lists rather than `map` objects: `words` is iterated twice below and
# `reversed()` needs a sequence, neither of which works with a Python 3 `map`.
words = [strip_phonemes(w) for w in words]

words = [dipthify(w) for w in words]
result_english = ' '.join(reversed([engify(reversed(w)) for w in words]))

result_ipa = ' '.join(reversed([ipaify(reversed(w)) for w in words]))

print(result_english)
print(result_ipa)


# ```
# bugreport: ɑː is pronounced “ah”, not “aw”
# and “oo” would be better than “ou” for uː
# and for that matter, 
# ```