This notebook contains the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/ma1TrR7gE7I?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset, DatasetDict
ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="train")
raw_datasets = DatasetDict(
    {
        "train": ds_train,
        "valid": ds_valid,
    }
)
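You can print the resulting DatasetDict to check the size of each split (a quick sanity check, not a cell from the video):
print(raw_datasets)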
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")
model = AutoModelForCausalLM.from_pretrained("huggingface-course/codeparrot-ds")
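As a quick smoke test of the pretrained model, you can generate a short completion with the text-generation pipeline. This is a sketch, not a cell from the video, and the prompt is an arbitrary example:
from transformers import pipeline

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
# Generate a few tokens of code from an arbitrary prompt
print(pipe("import numpy as np\n", max_new_tokens=20)[0]["generated_text"])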
batch = tokenizer(["import numpy as np"], return_tensors="pt")
text = "import numpy as np\n"*20
context_length = 128
outputs = tokenizer(
    text,
    truncation=True,
    max_length=16,
    return_overflowing_tokens=True,
    return_length=True,
)
print(f"Input chunk lengths: {(outputs['length'])}")
def tokenize(element):
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Keep only the chunks that exactly fill the context window
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}
tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
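After mapping, every remaining example should be exactly context_length tokens long. A quick check (a sketch, not a cell from the video):
print(tokenized_datasets)
print(len(tokenized_datasets["train"][0]["input_ids"]))  # expected: 128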
# Passing the inputs as labels makes the model compute the shifted causal language modeling loss
output = model(input_ids=batch["input_ids"], labels=batch["input_ids"])
loss = output.loss
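To turn the fixed-length chunks into training batches with labels, a common choice is DataCollatorForLanguageModeling with mlm=False, which copies input_ids into labels. A minimal sketch, assuming the tokenizer has no pad token (as is typical for GPT-2-style tokenizers):
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token  # GPT-2-style tokenizers define no pad token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
# Collate three examples; with mlm=False the collator copies input_ids into labels
out = data_collator([tokenized_datasets["train"][i] for i in range(3)])
print(out["input_ids"].shape, out["labels"].shape)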