!pip install datasets evaluate transformers[sentencepiece]
!pip install zstandard

from datasets import load_dataset

# This takes a few minutes to run, so go grab a tea or coffee while you wait :)
data_files = "https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
pubmed_dataset = load_dataset("json", data_files=data_files, split="train")
pubmed_dataset

# Inspect the first example
pubmed_dataset[0]

!pip install psutil

import psutil

# Process.memory_info is expressed in bytes, so convert to megabytes
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

# dataset_size is the size of the dataset on disk, in bytes
print(f"Dataset size in bytes : {pubmed_dataset.dataset_size}")
size_gb = pubmed_dataset.dataset_size / (1024**3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

import timeit

# Time how long it takes to iterate over the whole dataset in batches of 1,000
code_snippet = """batch_size = 1000

for idx in range(0, len(pubmed_dataset), batch_size):
    _ = pubmed_dataset[idx:idx + batch_size]
"""

time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(pubmed_dataset)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)

# Stream the dataset instead of downloading and caching it locally
pubmed_dataset_streamed = load_dataset(
    "json", data_files=data_files, split="train", streaming=True
)

next(iter(pubmed_dataset_streamed))

from transformers import AutoTokenizer

# Streamed datasets are processed lazily: map() is applied on the fly as you iterate
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = pubmed_dataset_streamed.map(lambda x: tokenizer(x["text"]))
next(iter(tokenized_dataset))

# Shuffle the stream using a buffer of 10,000 examples
shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=10_000, seed=42)
next(iter(shuffled_dataset))

# Take the first 5 examples from the stream
dataset_head = pubmed_dataset_streamed.take(5)
list(dataset_head)

# Skip the first 1,000 examples and include the rest in the training set
train_dataset = shuffled_dataset.skip(1000)
# Take the first 1,000 examples for the validation set
validation_dataset = shuffled_dataset.take(1000)

# Stream the FreeLaw subset of the Pile
law_dataset_streamed = load_dataset(
    "json",
    data_files="https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst",
    split="train",
    streaming=True,
)
next(iter(law_dataset_streamed))

from itertools import islice
from datasets import interleave_datasets

# Alternate between examples from the two streamed corpora
combined_dataset = interleave_datasets([pubmed_dataset_streamed, law_dataset_streamed])
list(islice(combined_dataset, 2))

# Stream the full Pile by pointing at its train/validation/test files
base_url = "https://the-eye.eu/public/AI/pile/"
data_files = {
    "train": [base_url + "train/" + f"{idx:02d}.jsonl.zst" for idx in range(30)],
    "validation": base_url + "val.jsonl.zst",
    "test": base_url + "test.jsonl.zst",
}
pile_dataset = load_dataset("json", data_files=data_files, streaming=True)
next(iter(pile_dataset["train"]))
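# Hedged sketch (not in the original notebook): a streamed dataset can only be
# iterated, so quick corpus statistics are gathered by looping over a slice of
# the stream. The 1,000-example sample size below is an arbitrary choice for
# illustration.
sample_size = 1_000
total_chars = sum(
    len(example["text"]) for example in islice(pubmed_dataset_streamed, sample_size)
)
print(f"Average document length over {sample_size} examples: {total_chars / sample_size:.0f} characters")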
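# Hedged sketch (not in the original notebook): interleave_datasets also accepts
# `probabilities` and `seed`, so the streamed corpora can be mixed at chosen rates
# instead of strictly alternating. The 0.8/0.2 mix below is an arbitrary
# illustration, not a recommended setting.
weighted_dataset = interleave_datasets(
    [pubmed_dataset_streamed, law_dataset_streamed],
    probabilities=[0.8, 0.2],
    seed=42,
)
list(islice(weighted_dataset, 2))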