In these exercises you will develop a machine translation system that can turn modern English into Shakespeare.
%load_ext autoreload
%autoreload 2
%matplotlib inline
import sys, os
_snlp_book_dir = ".."
sys.path.append(_snlp_book_dir)
import statnlpbook.word_mt as word_mt
# %cd ..
import sys
sys.path.append("..")
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)
from collections import defaultdict
import statnlpbook.util as util
from statnlpbook.lm import *
from statnlpbook.util import safe_log as log
import statnlpbook.mt as mt
%%sh
cd ../data
if [ ! -d "shakespeare" ]; then
git clone https://github.com/tokestermw/tensorflow-shakespeare.git shakespeare
cd shakespeare
cat ./data/shakespeare/sparknotes/merged/*_modern.snt.aligned > modern.txt
cat ./data/shakespeare/sparknotes/merged/*_original.snt.aligned > original.txt
cd ..
fi
head -n 1 shakespeare/modern.txt
head -n 1 shakespeare/original.txt
I have half a mind to hit you before you speak again. I have a mind to strike thee ere thou speak’st.
Write methods for loading and tokenizing the aligned corpus.
import re
NULL = "NULL"
def tokenize(sentence):
return [] # todo
def pre_process(sentence):
return [] # todo
def load_shakespeare(corpus):
with open("../data/shakespeare/%s.txt" % corpus, "r") as f:
return [pre_process(x.rstrip('\n')) for x in f.readlines()]
modern = load_shakespeare("modern")
original = load_shakespeare("original")
MAX_LENGTH = 6
def create_wordmt_pairs(modern, original):
alignments = []
for i in range(len(modern)):
if len(modern[i]) <= MAX_LENGTH and len(original[i]) <= MAX_LENGTH:
alignments.append(([NULL] + modern[i], original[i]))
return alignments
train = create_wordmt_pairs(modern, original)
for i in range(10):
(mod, org) = train[i]
print(" ".join(mod), "|", " ".join(org))
print("\nTotal number of aligned sentence pairs", len(train))
NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | Total number of aligned sentence pairs 21079
# todo
Try a better language model for machine translation. How does the translation quality change for the examples you found earlier?
# todo
How can you change the decoder to work to translate to shorter or longer target sequences than the source?
# todo