# Reconstructed from a notebook export whose cells had been collapsed onto a
# single comment line (so nothing actually ran).
"""Inspect a Hugging Face dataset with pandas, then tokenize its text.

Walkthrough:

1. Load the ``train`` split of ``swiss_judgment_prediction``
   (``all_languages`` configuration).
2. Switch the dataset's output format to pandas and look at one row.
3. Export the split to a ``DataFrame`` and compute two frequency tables.
4. Try to tokenize the ``text`` column while the pandas format is active,
   then reset the format and tokenize again.

Notebook setup (shell magic, not valid in a plain Python file)::

    !pip install datasets transformers[sentencepiece]

The notebook's first cell also rendered an empty ``IPython.display.HTML('')``
embed; it has no effect outside a notebook and is omitted here.
"""

DATASET_NAME = "swiss_judgment_prediction"
DATASET_CONFIG = "all_languages"
DATASET_SPLIT = "train"
TOKENIZER_CHECKPOINT = "bert-base-uncased"


def main() -> None:
    """Run the walkthrough end to end, printing each intermediate result.

    Downloads the dataset and the pretrained tokenizer, so it needs network
    access (or a warm local cache) plus the ``datasets`` and ``transformers``
    packages.
    """
    # Imported here so that importing this module stays free of side effects
    # and does not require the third-party packages to be installed.
    from datasets import load_dataset
    from transformers import AutoTokenizer

    dataset = load_dataset(DATASET_NAME, DATASET_CONFIG, split=DATASET_SPLIT)
    print(dataset[0])

    # Convert the output format to pandas.DataFrame
    dataset.set_format("pandas")
    print(dataset[0])

    df = dataset.to_pandas()
    print(df.head())

    # How are languages distributed across regions?
    print(df.groupby("region")["language"].value_counts())

    # Which legal area is most common?
    print(df["legal area"].value_counts())

    # Load a pretrained tokenizer
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

    # Tokenize the `text` column while the pandas format is still active.
    # NOTE(review): the original notebook's follow-up ("Reset back to Arrow
    # format ... Now we can tokenize!") implies this call fails here. The
    # exception type is not visible from this file, so it is caught broadly
    # and reported rather than left to abort the script - confirm against
    # the installed `datasets`/`transformers` versions.
    try:
        dataset.map(lambda x: tokenizer(x["text"]))
    except Exception as err:  # demo boundary: report and carry on
        print(f"Tokenizing in pandas format failed: {type(err).__name__}: {err}")
    else:
        print("Tokenizing in pandas format unexpectedly succeeded.")

    # Reset back to Arrow format
    dataset.reset_format()

    # Now we can tokenize!
    tokenized = dataset.map(lambda x: tokenizer(x["text"]))
    print(tokenized)


if __name__ == "__main__":
    main()