This notebook collects the code samples from the video below, which is part of the Hugging Face course.

In [ ]:

#@title
from IPython.display import HTML

# Markup for the embedded YouTube player of the accompanying course video.
video_iframe = '<iframe width="560" height="315" src="https://www.youtube.com/embed/HyQgpJTkRdE?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>'

# Last expression of the cell, so the notebook renders it as rich HTML output.
HTML(video_iframe)

Out[ ]:

Install the Transformers and Datasets libraries to run this notebook.

In [ ]:

! pip install datasets transformers[sentencepiece]

In [ ]:

!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv

In [ ]:

from datasets import load_dataset

# Load the CSV file sitting next to the notebook with the "csv" loader.
# sep=";" is passed because this file is loaded as semicolon-separated.
local_csv_dataset = load_dataset(
    path="csv",
    data_files="winequality-white.csv",
    sep=";",
)

# Display the "train" split of the loaded dataset.
local_csv_dataset["train"]

In [ ]:

# Point the "csv" loader at a URL instead of a local file path.
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"

remote_csv_dataset = load_dataset(
    path="csv",
    data_files=dataset_url,
    sep=";",
)

# Display the whole returned object (not just one split).
remote_csv_dataset

In [ ]:

# Load a raw text file from a URL with the "text" loader.
dataset_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
text_dataset = load_dataset(
    path="text",
    data_files=dataset_url,
)

# Preview the first five entries of the "train" split.
text_dataset["train"][:5]

In [ ]:

# Load a JSON Lines (.jsonl) file from a URL with the "json" loader.
dataset_url = "https://raw.githubusercontent.com/hirupert/sede/main/data/sede/train.jsonl"
json_lines_dataset = load_dataset(
    path="json",
    data_files=dataset_url,
)

# Preview the first two entries of the "train" split.
json_lines_dataset["train"][:2]

In [ ]:

# Load a single JSON file from a URL; field="data" tells the "json" loader
# which top-level key of the document to read the records from.
dataset_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
json_dataset = load_dataset(
    path="json",
    data_files=dataset_url,
    field="data",
)

# Display the returned object.
json_dataset

In [ ]:

# Map each split name to its own file so one call loads both splits.
url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"
data_files = {
    "train": f"{url}train-v2.0.json",
    "validation": f"{url}dev-v2.0.json",
}

# NOTE: this rebinds json_dataset, replacing the value from the previous cell.
json_dataset = load_dataset(
    path="json",
    data_files=data_files,
    field="data",
)

# Display the returned object.
json_dataset

In [ ]: