This notebook regroups the code sample of the video below, which is a part of the Hugging Face course.

In [ ]:

#@title
from IPython.display import HTML

HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/JwISwTCPPWo?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>')

Out[ ]:

Install the Transformers and Datasets libraries to run this notebook.

In [ ]:

! pip install datasets transformers[sentencepiece]

In [ ]:

from datasets import load_dataset

data_files = "https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
large_dataset = load_dataset("json", data_files=data_files, split="train")
size_gb = large_dataset.dataset_size / (1024 ** 3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

In [ ]:

import psutil

# Process.memory_info is expressed in bytes, so convert to megabytes
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

In [ ]:

import timeit

code_snippet = """batch_size = 1000

for idx in range(0, len(large_dataset), batch_size):
    _ = large_dataset[idx:idx + batch_size]
"""

time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(large_dataset)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)

In [ ]:

large_dataset_streamed = load_dataset(
    "json", data_files=data_files, split="train", streaming=True)

next(iter(large_dataset_streamed))

In [ ]:

type(large_dataset_streamed)

In [ ]:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = large_dataset_streamed.map(lambda x: tokenizer(x["text"]))
next(iter(tokenized_dataset))

In [ ]:

# Select the first 5 examples 
dataset_head = large_dataset_streamed.take(5)
list(dataset_head)

In [ ]:

# Skip the first 1,000 examples and include the rest in the training set
train_dataset = large_dataset_streamed.skip(1000)
# Take the first 1,000 examples for the validation set
validation_dataset = large_dataset_streamed.take(1000)