#!/usr/bin/env python
# coding: utf-8

# In[5]:


get_ipython().run_cell_magic('html', '', '<script>\n  function code_toggle() {\n    if (code_shown){\n      $(\'div.input\').hide(\'500\');\n      $(\'#toggleButton\').val(\'Show Code\')\n    } else {\n      $(\'div.input\').show(\'500\');\n      $(\'#toggleButton\').val(\'Hide Code\')\n    }\n    code_shown = !code_shown\n  }\n\n  $( document ).ready(function(){\n    code_shown=false;\n    $(\'div.input\').hide()\n  });\n</script>\n<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>\n<style>\n.rendered_html td {\n    font-size: xx-large;\n    text-align: left; !important\n}\n.rendered_html th {\n    font-size: xx-large;\n    text-align: left; !important\n}\n</style>\n')


# In[6]:


get_ipython().run_cell_magic('capture', '', 'import sys\nsys.path.append("..")\nimport statnlpbook.util as util\nimport matplotlib\nmatplotlib.rcParams[\'figure.figsize\'] = (10.0, 6.0)\n')


# <!---
# Latex Macros
# -->
# $$
# \newcommand{\Xs}{\mathcal{X}}
# \newcommand{\Ys}{\mathcal{Y}}
# \newcommand{\y}{\mathbf{y}}
# \newcommand{\balpha}{\boldsymbol{\alpha}}
# \newcommand{\bbeta}{\boldsymbol{\beta}}
# \newcommand{\aligns}{\mathbf{a}}
# \newcommand{\align}{a}
# \newcommand{\source}{\mathbf{s}}
# \newcommand{\target}{\mathbf{t}}
# \newcommand{\ssource}{s}
# \newcommand{\starget}{t}
# \newcommand{\repr}{\mathbf{f}}
# \newcommand{\repry}{\mathbf{g}}
# \newcommand{\x}{\mathbf{x}}
# \newcommand{\prob}{p}
# \newcommand{\bar}{\,|\,}
# \newcommand{\vocab}{V}
# \newcommand{\params}{\boldsymbol{\theta}}
# \newcommand{\param}{\theta}
# \DeclareMathOperator{\perplexity}{PP}
# \DeclareMathOperator{\argmax}{argmax}
# \DeclareMathOperator{\argmin}{argmin}
# \newcommand{\train}{\mathcal{D}}
# \newcommand{\counts}[2]{\#_{#1}(#2) }
# \newcommand{\length}[1]{\text{length}(#1) }
# \newcommand{\indi}{\mathbb{I}}
# $$

# In[7]:


get_ipython().run_line_magic('load_ext', 'tikzmagic')


# # Please complete the course evaluation
# 
# ### [evaluering.ku.dk](https://evaluering.ku.dk)
# 
# Deadline: November 6

# # What languages do you speak?
# 
# ### [ucph.page.link/lang](https://ucph.page.link/lang)
# 
# ([Responses](https://www.mentimeter.com/s/389360b38fa508b4ffd4e40bf47003e4/f43e4a2212e8/edit))

# # Machine Translation

# + Challenges
# + History
# + Statistical MT
# + Neural MT
# + Evaluation
# 

# ## Languages are hard (even for humans)!
# 
# <center style="padding-top:3em;">
#   <img src="mt_figures/whatever.jpg" />
#     
#   <span style="font-size:50%;">(Source: <a href="https://www.flickr.com/photos/98991392@N00/8729849093/sizes/z/in/pool-47169589@N00/">Flickr</a>)</span>
# </center>
# 
# [随便](https://translate.google.com/#view=home&op=translate&sl=zh-CN&tl=en&text=%E9%9A%8F%E4%BE%BF)

# ## Automatic machine translation is hard!
# 
# <center style="padding-top:3em;">
# <img src="../chapters/mt_figures/avocado.png" width="100%"/>
# </center>
# 
# [J'ai besoin d'un avocat pour mon procés de guacamole.](https://translate.google.com/?sl=fr&tl=en&text=J%27ai%20besoin%20d%27un%20avocat%20pour%20mon%20proc%C3%A9s%20de%20guacamole.&op=translate)
# 
# [guacamole lawsuit](https://www.latimes.com/archives/la-xpm-2006-dec-10-fi-letters10.2-story.html)

# Many things could go wrong.

# <center><img src="../img/quiz_time.png"></center>
# 
# ### [ucph.page.link/mt](https://ucph.page.link/mt)
# 
# ([Responses](https://docs.google.com/forms/d/10UUpQqlduvYuz7-SXSvzMRDAs5qxdliKJmgnkWiHNK8/edit#responses), [Solution](https://docs.google.com/document/d/1l_NQNz6ME1mm2ra5nKMYErwI98d3TJ3aC_bXct0PNhg/edit?usp=sharing))
# 

# ## Challenges
# 
# Divergences between languages include:
# 
# * Word order
# * Grammatical (morphological) marking (e.g., gender)
# * Division of concept space (e.g., English "wall" vs. German "Mauer"/"Wand")
# 
# Addressing them requires resolving ambiguities:
# 
# * Word sense (e.g., "bass")
# * Attributes with grammatical marking (e.g., formality in Japanese)
# * Reference in pro-drop contexts (e.g., in Mandarin Chinese)

# ## History
# 
# <center>
#    <img src="https://cdn-media-1.freecodecamp.org/images/l6Av1jJRcgtfL5n9vEvrspxpEJJalwHAusRx" width="100%" />
# </center>
# (Source: <a href="https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/">freeCodeCamp</a>)

# ### Direct Machine Translation
# 
# Just use a **dictionary** and translate word-by-word.
# 
# <center>
#    <img src="https://cdn-media-1.freecodecamp.org/images/gfF1OTGY1E6QQBDW6l6kh1Daa4iingQmt86V" width="100%" />
# </center>
# (Source: <a href="https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/">freeCodeCamp</a>)

# ### Transfer-based Machine Translation
# 
# Add **rules** to inflect/join/split words based on source and target syntax.
# 
# <center>
#    <img src="https://cdn-media-1.freecodecamp.org/images/AfcjFhnoMFmb8koHf42YQeE6WfKdFNjqpkGL" width="100%" />
# </center>
# (Source: <a href="https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/">freeCodeCamp</a>)

# ## Interlingua
# 
# The ideal: transform to and from a language-independent representation.
# 
# <center>
#    <img src="https://cdn-media-1.freecodecamp.org/images/vEPJYMmjDV0LLXIy07LksIiOsecXdlHcs8nI" width="80%" />
# </center>
# (Source: <a href="https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/">freeCodeCamp</a>)
# 
# Too hard to achieve with rules!

# ### Example-based Machine Translation (EBMT)
# 
# Retrieve a similar example from a translation database, and make adjustments as necessary.
# 
# <center>
#    <img src="https://cdn-media-1.freecodecamp.org/images/H-VtzpHN02SMIhwYjqmn04Uyd-nGWLwLmBwW" width="100%" />
# </center>
# (Source: <a href="https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/">freeCodeCamp</a>)

# # Statistical Machine Translation (SMT)

# ## IBM Translation Models
# 
# In the late 80s and early 90s, IBM researchers revolutionised MT using statistical approaches instead of rules.
# 
# <img src="https://cdn-media-1.freecodecamp.org/images/1MJctJSHzUaYpUNIhvQdQtz4RKFq06nN7FJ9"/>
# (Source: <a href="https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/">freeCodeCamp</a>)

# ### IBM Model 1
# 
# Simple word-by-word translation, but with **statistical** dictionaries.
# 
# <center>
#    <img src="https://cdn-media-1.freecodecamp.org/images/4dBTKxFLuXkmeALMuxILitzmBd0zVOQVrhnP" width="100%" />
# </center>
# 
# (Source: <a href="https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/">freeCodeCamp</a>)

# <center>
#    <img src="https://cdn-media-1.freecodecamp.org/images/uF96He1UZaLMkC1TEuBGzoHAhw3PAvF8mgam" width="60%" />
# </center>
# 
# (Source: <a href="https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/">freeCodeCamp</a>)
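
# The **statistical dictionary** above can be estimated from a parallel corpus alone. Below is a toy sketch of the EM procedure used to estimate IBM Model 1 word-translation probabilities; the three-sentence corpus and the number of iterations are made up for illustration.

# In[ ]:


from collections import defaultdict

# Tiny toy parallel corpus (German-English), already tokenised.
corpus = [("das Haus".split(), "the house".split()),
          ("das Buch".split(), "the book".split()),
          ("ein Buch".split(), "a book".split())]

# IBM Model 1 EM: start from uniform word-translation probabilities t(f|e)
# and re-estimate them from the expected word alignments.
src_vocab = {f for fs, _ in corpus for f in fs}
t = defaultdict(lambda: 1.0 / len(src_vocab))    # t[(f, e)] = p(f | e)

for _ in range(10):                              # a few EM iterations
    count = defaultdict(float)                   # expected counts c(f, e)
    total = defaultdict(float)                   # expected counts c(e)
    for fs, es in corpus:
        for f in fs:
            norm = sum(t[(f, e)] for e in es)
            for e in es:
                delta = t[(f, e)] / norm         # expected probability that f aligns to e
                count[(f, e)] += delta
                total[e] += delta
    for (f, e) in count:                         # M-step: renormalise the expected counts
        t[(f, e)] = count[(f, e)] / total[e]

round(t[("Haus", "house")], 2), round(t[("das", "the")], 2)
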

# ### IBM Model 2
# Statistical translation and **reordering** model.
# 
# <center>
#    <img src="https://cdn-media-1.freecodecamp.org/images/AWurqrK2Zag9dIZSgYVGZCFxklsrZot7WLZ2" width="90%" />
# </center>
# 
# (Source: <a href="https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/">freeCodeCamp</a>)

# ### IBM Model 3
# Allows inserting new words.
# 
# <center>
#    <img src="https://cdn-media-1.freecodecamp.org/images/aPxpW2ssFd2wDio9C51Zbb0sIdXLBAV8DoYP" width="70%" />
# </center>
# 
# (Source: <a href="https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/">freeCodeCamp</a>)

# ## MT as Structured Prediction
# 
# **Model** \\(p(\target,\source)\\): how likely the target \\(\target\\) is to be a translation of the source \\(\source\\).
# 
# $$p(\textrm{I like music}, \textrm{音楽 が 好き}) \gg p(\textrm{I like persimmons}, \textrm{音楽 が 好き})$$

# * How is the scoring function defined (**modeling**)?
# $$\prob(\target,\source)\approx s_\params(\target,\source)$$
# 
# * How are the parameters \\(\params\\) learned (**training**)?
# 
# $$
# \argmax_\params \prod_{(\target,\source) \in \train} s_\params(\target, \source)
# $$
# 
# * How is translation \\(\argmax\\) found (**decoding**)?
# 
# $$\argmax_\target s_\params(\target,\source)$$

# ### Training
# Learn the parameters \\(\params\\) from data.
# 
# <center>
#    <img src="https://cdn-media-1.freecodecamp.org/images/5qBpooShbY6xVngSotA6KINHyHP7NKeTJryb" width="100%" />
# </center>
# 
# (Source: <a href="https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/">freeCodeCamp</a>)

# ## Generative Models 
# Estimate $\prob(\target,\source)$: how is the $(\target,\source)$ data **generated**?

# ### Noisy Channel
# 
# * Imagine a message $\target$ is sent through a noisy channel and $\source$ is received at the end.
# * Can we recover what $\target$ was?
# * **Language model** $\prob(\target)$: does the target $\target$ look like real language?
# * **Translation model**: $\prob(\source|\target)$: does the source $\source$ match the target $\target$?
# 
# This defines a **joint** distribution
# 
# $$\prob(\target,\source) = \prob(\target) \prob(\source|\target)$$
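# 
# To translate, decoding recovers the most likely message via Bayes' rule (the denominator $\prob(\source)$ does not depend on $\target$):
# 
# $$\argmax_\target \prob(\target|\source) = \argmax_\target \prob(\target) \prob(\source|\target)$$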

# ## Word-based SMT
# 
# Decompose source and target into words, with statistical **alignment**.
# 
# <center>
#    <img src="https://cdn-media-1.freecodecamp.org/images/jG95Sgc2W4VJbwi4LFlJeMHnjLZbdGydCCzI" width="100%" />
# </center>
# 
# 
# (Source: <a href="https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/">freeCodeCamp</a>)

# ## Phrase-based SMT
# 
# Decompose source and target into **phrases** and look them up in phrase tables.
# 
# <center>
#    <img src="https://cdn-media-1.freecodecamp.org/images/lGJNqYGZOJMjs23F8-xf6E4buXptvm2IBzjg" width="100%" />
# </center>
# 
# 
# (Source: <a href="https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/">freeCodeCamp</a>)

# # Neural Machine Translation (NMT)
# 
# Model $s_\params(\target,\source)$ directly using a neural network.
# 
# <center>
#    <img src="https://cdn-media-1.freecodecamp.org/images/DD6GvRmtxZGhC9toN1CHXUBWLhUXLpJSiJF5" width="100%" />
# </center>
# 
# 
# (Source: <a href="https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/">freeCodeCamp</a>)

# ## Sequence-to-sequence models (seq2seq)
# 
# Encoder–decoder architectures first "read" the input sequence and then generate an output sequence
# ([Sutskever et al., 2014](https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf), [Cho et al., 2014](https://arxiv.org/abs/1406.1078)).
# 
# &nbsp;
# 
# <center>
#     <img src="mt_figures/encdec.svg" width="70%" />
# </center>
# 
# (Examples are Basque–English)

# ### We can use RNNs for that!
# 
# Example architecture (see the code sketch below):
# * Encoder: word embedding layer + Bi-LSTM to capture contextual information
# * Decoder: Uni-directional LSTM (because we need to *decode* word by word) + softmax layer on top
# 
# <center>
#     <img src="mt_figures/encdec_rnn1.svg" width="70%" />
# </center>
# 
# The end-of-sequence symbol `</S>` is necessary to know when to start and stop generating.
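
# A minimal PyTorch sketch of this encoder–decoder (assuming PyTorch is installed; the `Seq2Seq` class, vocabulary sizes, and dimensions are illustrative, and the decoder is simply initialised from the concatenated final encoder states):

# In[ ]:


import torch
import torch.nn as nn

class Seq2Seq(nn.Module):
    """Toy encoder-decoder: BiLSTM encoder, unidirectional LSTM decoder, softmax output layer."""
    def __init__(self, src_vocab, tgt_vocab, emb_dim=32, hidden=64):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, emb_dim)
        self.tgt_emb = nn.Embedding(tgt_vocab, emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden, batch_first=True, bidirectional=True)
        self.decoder = nn.LSTM(emb_dim, 2 * hidden, batch_first=True)
        self.out = nn.Linear(2 * hidden, tgt_vocab)   # scores over the target vocabulary

    def forward(self, src, tgt_in):
        # Encode the whole source sentence with the bidirectional LSTM.
        _, (h, c) = self.encoder(self.src_emb(src))
        # Concatenate the forward/backward final states to initialise the decoder.
        h = torch.cat([h[0], h[1]], dim=-1).unsqueeze(0)
        c = torch.cat([c[0], c[1]], dim=-1).unsqueeze(0)
        # Decode; tgt_in is the target sequence shifted right (it starts with <S>).
        dec_out, _ = self.decoder(self.tgt_emb(tgt_in), (h, c))
        return self.out(dec_out)   # the softmax is applied in the loss / at prediction time

model = Seq2Seq(src_vocab=1000, tgt_vocab=1000)
model(torch.randint(0, 1000, (2, 7)), torch.randint(0, 1000, (2, 5))).shape  # (batch, target length, vocab)
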

# ### But something's missing (again)...
# 
# &nbsp;
# 
# <center>
#     <img src="mt_figures/encdec_rnn1.svg" width="70%" />
# </center>

# Output words **depend on each other**!

# ## Autoregressive MT
# 
# At each step, feed the predicted word to the decoder as input for predicting the next word.
# 
# <center>
#     <img src="mt_figures/encdec_rnn2.svg" width="70%" />
# </center>

# ### Training
# 
# + Loss function: negative log-likelihood
# + **Teacher forcing:**  always feed the ground truth into the decoder.
# 
# *Alternative:*
# 
# + **Scheduled sampling:** with a certain probability, use model predictions instead.
# 
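
# A sketch of one training step with teacher forcing, reusing the toy `Seq2Seq` model from the sketch above (the `BOS`/`EOS` indices and all sizes are illustrative):

# In[ ]:


import torch
import torch.nn as nn

BOS, EOS = 1, 2                                  # illustrative special-token indices
src = torch.randint(3, 1000, (2, 7))             # a toy batch of source sentences
tgt = torch.cat([torch.full((2, 1), BOS, dtype=torch.long),
                 torch.randint(3, 1000, (2, 4)),
                 torch.full((2, 1), EOS, dtype=torch.long)], dim=1)

loss_fn = nn.CrossEntropyLoss()                  # negative log-likelihood of the correct next word
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Teacher forcing: the decoder reads the *gold* target shifted right,
# and every position is trained to predict the true next word.
decoder_input, gold_next = tgt[:, :-1], tgt[:, 1:]
optimizer.zero_grad()
scores = model(src, decoder_input)               # (batch, length, vocab)
loss = loss_fn(scores.reshape(-1, scores.size(-1)), gold_next.reshape(-1))
loss.backward()
optimizer.step()
# Scheduled sampling would, with some probability, replace words in decoder_input
# with the model's own predictions from the previous step.
loss.item()
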

# ### Decoding
# 
# + Greedy decoding:
#     + Always pick the **most likely word** (according to the softmax)
#     + Continue generating more words **until the `</S>` symbol is predicted**

# + Beam search:
#     * Maintains a list of the top-$k$ partial translations (hypotheses) in a **beam**
#     * In each step, **extends** every hypothesis with its most likely next target words
#     * Keeps only the $k$ highest-scoring hypotheses and stops when they end with `</S>`
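
# A sketch of greedy autoregressive decoding with the same toy `Seq2Seq` model; `greedy_decode` and its token indices are illustrative (beam search would instead keep the $k$ best-scoring partial translations at every step):

# In[ ]:


import torch

def greedy_decode(model, src, bos=1, eos=2, max_len=20):
    """Feed each predicted word back into the decoder until </S> is predicted."""
    tgt = torch.full((src.size(0), 1), bos, dtype=torch.long)      # start with <S>
    for _ in range(max_len):
        scores = model(src, tgt)                                   # (batch, current length, vocab)
        next_word = scores[:, -1].argmax(dim=-1, keepdim=True)     # most likely next word
        tgt = torch.cat([tgt, next_word], dim=1)
        if (next_word == eos).all():                               # stop once every sentence emitted </S>
            break
    return tgt

greedy_decode(model, torch.randint(3, 1000, (2, 7)))
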

# ## More problems with our approach
# 
# &nbsp;
# 
# <center>
#     <img src="mt_figures/encdec_rnn3.svg" width="70%" />
# </center>

# ### Word alignment
# 
# In rule-based and statistical MT, word alignments are crucial.
# 
# <center style="padding: 1em 0;">
#     <img src="mt_figures/align.svg" width="20%" />
# </center>
# 
# Can we use them in neural MT?

# ### Attention mechanism
# 
# Jointly learning to align and to translate.
# 
# <center>
#     <img src="mt_figures/encdec_att.svg" width="40%" />
# </center>
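
# A minimal sketch of one attention step with made-up numbers: score every encoder state against the current decoder state, normalise with a softmax, and take the weighted sum as the context vector. (Dot-product scoring is used here for brevity; Bahdanau et al. use a small additive feed-forward scorer.)

# In[ ]:


import torch
import torch.nn.functional as F

encoder_states = torch.randn(4, 6)         # source length 4, hidden size 6 (illustrative)
decoder_state = torch.randn(6)             # current decoder hidden state

scores = encoder_states @ decoder_state    # one score per source position
weights = F.softmax(scores, dim=0)         # soft alignment over the source words
context = weights @ encoder_states         # context vector fed to the decoder at this step

weights, context
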

# ### Attention matrix
# 
# <center>
#     <img src="mt_figures/att_matrix.png" width="40%" />
# </center>
# 
# <div style="text-align: right;">
#     (from <a href="https://arxiv.org/abs/1409.0473">Bahdanau et al., 2014</a>)
# </div>

# ### Transformers
# 
# Replace LSTMs by self-attention. Attend to encoded input *and* to partial output (autoregressive).
# 
# <center>
#     <img src="http://jalammar.github.io/images/xlnet/transformer-encoder-decoder.png" width=80%/>
# </center>
# 
# <div style="text-align: right;">
#     (from <a href="http://jalammar.github.io/illustrated-gpt2/">The Illustrated GPT-2</a>)
# </div>

# # MT evaluation
# 
# We're training the model with *negative log-likelihood*, but that's not the best way to *evaluate* it.

# Consider:
#     
# + After lunch, he went to the gym.
# + After he had lunch, he went to the gym.
# + He went to the gym after lunch.
# + He went to the gym after lunchtime.
# 
# In machine translation, there are often **several acceptable variations!**

# ## Human evaluation
# 
# * **Faithfulness** (or meaning preservation) to evaluate the "translation model"
# * **Fluency** to evaluate the "target language model"

# In general, manual evaluation is the best way, but it is not scalable. **Automatic** metrics are therefore often used.

# ## BLEU score
# 
# A widely used *reference-based* metric ([Papineni et al., 2002](https://aclanthology.org/P02-1040/)):
# 
# + Compare the prediction to one or more reference translations.
# + Count the number of matching $n$-grams between them.
#   - It is common to consider $1 \le n \le 4$
# + Divide by the total number of $n$-grams in the prediction, i.e., an $n$-gram *precision* (matches of a repeated $n$-gram are clipped to its count in the reference).
# + Combine the precisions and multiply by a *brevity penalty*, so that very short translations cannot score highly.
# 
# The BLEU score ranges from 0 (*no match at all*) to 1.0 (*perfect match*); sacreBLEU and most other tools report it on a 0–100 scale.
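
# A simplified sketch of the core of BLEU, the clipped $n$-gram precision of one hypothesis against a single reference; full BLEU, as computed with sacreBLEU below, also combines the four precisions geometrically, handles multiple references, and applies the brevity penalty.

# In[ ]:


from collections import Counter

def ngram_precision(hypothesis, reference, n):
    """Clipped n-gram precision of one hypothesis against one reference (both pre-tokenised)."""
    hyp, ref = hypothesis.split(), reference.split()
    hyp_ngrams = Counter(tuple(hyp[i:i + n]) for i in range(len(hyp) - n + 1))
    ref_ngrams = Counter(tuple(ref[i:i + n]) for i in range(len(ref) - n + 1))
    matches = sum(min(c, ref_ngrams[ng]) for ng, c in hyp_ngrams.items())
    return matches / max(sum(hyp_ngrams.values()), 1)

hyp = "He went to the gym after lunchtime ."
ref = "He went to the gym after lunch ."
[round(ngram_precision(hyp, ref, n), 2) for n in range(1, 5)]
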

# Recommended library: [sacreBLEU](https://github.com/mjpost/sacrebleu)

# ### BLEU score examples

# In[2]:


get_ipython().system('pip install sacrebleu')


# In[15]:


from sacrebleu.metrics import BLEU
bleu = BLEU()

# Two alternative references for a one-sentence "corpus":
# sacreBLEU expects one list of sentences per reference set.
refs = [["After lunch, he went to the gym."],
        ["He went to the gym after lunch."]]
bleu.corpus_score(["After lunch, he went to the gym."], refs).score


# In[17]:


bleu.corpus_score(["Turtles are great animals to the gym."], refs).score


# In[18]:


bleu.corpus_score(["After he had lunch, he went to the gym."], refs).score


# In[19]:


bleu.corpus_score(["Before lunch, he went to the gym."], refs).score


# + BLEU is very simplistic
# + Many alternatives have been proposed, including chrF ([Popović, 2015](https://aclanthology.org/W15-3049/)) and BERTScore ([Zhang et al., 2020](https://arxiv.org/abs/1904.09675))
# + ...but BLEU still remains very popular

# ## Improving efficiency and quality in MT
# 
# + More data
# + Bigger models
# + Better neural network architectures
# + Semi-supervised learning
# + **Transfer learning**

# ## Further reading
# 
# * Non-neural machine translation:
#   + Ilya Pestov's article [A history of machine translation from the Cold War to deep learning](https://www.freecodecamp.org/news/a-history-of-machine-translation-from-the-cold-war-to-deep-learning-f1d335ce8b5/)
#   + [Slides on SMT from this repo](word_mt_slides.ipynb)
#   + Mike Collins's [Lecture notes on IBM Model 1 and 2](http://www.cs.columbia.edu/~mcollins/courses/nlp2011/notes/ibm12.pdf)
# 
# * Sequence-to-sequence models:
#   + Graham Neubig, [Neural Machine Translation and Sequence-to-sequence Models: A Tutorial](https://arxiv.org/abs/1703.01619)
# 
# * And beyond...
#   + Philipp Koehn, [Neural Machine Translation, §13.6–13.8](https://arxiv.org/abs/1709.07809) gives a great overview of further refinements and challenges