This notebook gathers the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML

# Embed the course video inline; the iframe markup is built around the embed URL.
video_src = "https://www.youtube.com/embed/blF9uxYcKHo?rel=0&controls=0&showinfo=0"
HTML(f'<iframe width="560" height="315" src="{video_src}" frameborder="0" allowfullscreen></iframe>')
Install the Transformers and Datasets libraries to run this notebook.
# Quote the extras specifier so the shell never treats the square brackets as a glob pattern.
! pip install datasets "transformers[sentencepiece]"
from datasets import load_dataset, load_from_disk

# Load the "allocine" dataset; the result maps split names to datasets.
raw_datasets = load_dataset("allocine")
# Look up the cache files that back the loaded splits.
raw_datasets.cache_files

# Write every split to one local directory, then read it back from there.
arrow_dir = "my-arrow-datasets"
raw_datasets.save_to_disk(arrow_dir)
arrow_datasets_reloaded = load_from_disk(arrow_dir)
arrow_datasets_reloaded
# Export each split to its own CSV file.
# `index` is given as a real boolean: False states explicitly that no row
# index column should be written, instead of relying on None being falsy.
for split, dataset in raw_datasets.items():
    dataset.to_csv(f"my-dataset-{split}.csv", index=False)

# Map each split name to the CSV file written for it by the loop above.
data_files = {
    "train": "my-dataset-train.csv",
    "validation": "my-dataset-validation.csv",
    "test": "my-dataset-test.csv",
}
# Rebuild the splits from the CSV files using the generic "csv" loader.
csv_datasets_reloaded = load_dataset("csv", data_files=data_files)
csv_datasets_reloaded
# --- JSON Lines: write one file per split, then load them back ---
for split, dataset in raw_datasets.items():
    dataset.to_json(f"my-dataset-{split}.jsonl")

json_data_files = {
    "train": "my-dataset-train.jsonl",
    "validation": "my-dataset-validation.jsonl",
    "test": "my-dataset-test.jsonl",
}
# The generic "json" loader reads the files listed for each split.
json_datasets_reloaded = load_dataset("json", data_files=json_data_files)

# --- Parquet: write one file per split, then load them back ---
for split, dataset in raw_datasets.items():
    dataset.to_parquet(f"my-dataset-{split}.parquet")

parquet_data_files = {
    "train": "my-dataset-train.parquet",
    "validation": "my-dataset-validation.parquet",
    "test": "my-dataset-test.parquet",
}
# The generic "parquet" loader reads the files listed for each split.
parquet_datasets_reloaded = load_dataset("parquet", data_files=parquet_data_files)