! pip install datasets transformers[sentencepiece]
from datasets import load_dataset, load_metric
raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")
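# Each raw example nests both languages under a "translation" dict, e.g.
# {'id': '0', 'translation': {'en': ..., 'fr': ...}}; the map below flattens
# this into separate "inputs" and "targets" columns:
print(raw_datasets["train"][0])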
def extract_languages(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    return {"inputs": inputs, "targets": targets}
raw_datasets = raw_datasets.map(extract_languages, batched=True, remove_columns=["id", "translation"])
raw_datasets
print(raw_datasets["train"][10])
print(raw_datasets["train"][11])
print(raw_datasets["train"][12])
from transformers import AutoTokenizer
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
sample = raw_datasets["train"][12]
inputs = tokenizer(sample["inputs"])
# Tokenizing the French targets directly uses the source (English) vocabulary,
# which splits the French words into poor subword pieces:
targets = tokenizer(sample["targets"])
print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))
# Switching to the target tokenizer with as_target_tokenizer() yields the
# correct French subwords:
with tokenizer.as_target_tokenizer():
    targets = tokenizer(sample["targets"])
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))
max_input_length = 128
max_target_length = 128
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["inputs"], max_length=max_input_length, truncation=True
    )
    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["targets"], max_length=max_target_length, truncation=True
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
tokenized_datasets = raw_datasets.map(
    preprocess_function, batched=True, remove_columns=["inputs", "targets"]
)
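# Optional check: after mapping, each example should expose only the tokenized
# fields, with "labels" holding the target token ids (expected column names:
# input_ids, attention_mask, labels):
print(tokenized_datasets["train"].column_names)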
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
# The collator needs the model so it can prepare decoder inputs from the labels.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
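# A quick sanity check (a sketch, assuming the cells above ran): the collator
# pads "labels" with -100 so padded positions are ignored by the loss, and may
# also add shifted decoder_input_ids when the model supports it.
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
print(batch.keys())
print(batch["labels"])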