!pip install datasets transformers[sentencepiece]

from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np

raw_datasets = load_dataset("glue", "mrpc")

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_dataset(dataset):
    # Tokenize each sentence pair, truncating to at most 128 tokens.
    encoded = tokenizer(
        dataset["sentence1"],
        dataset["sentence2"],
        max_length=128,
        truncation=True,
    )
    # .data returns the plain dict inside the BatchEncoding, which is
    # what Dataset.map expects back.
    return encoded.data


tokenized_datasets = raw_datasets.map(tokenize_dataset, batched=True)

# Convert the tokenized splits into tf.data.Dataset objects Keras can consume.
train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "token_type_ids"],
    label_cols=["label"],
    shuffle=True,
    batch_size=8,
)
validation_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,  # no reason to shuffle the validation split
    batch_size=8,
)

# Peek at the labels of the first training batch.
next(iter(train_dataset))[1]

import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

# Reuse the same checkpoint the tokenizer was loaded from; the original cell
# mixed an uncased tokenizer with a cased model ("bert-base-cased"), which
# would feed the model token IDs from the wrong vocabulary.
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# The model outputs raw logits, so the loss needs from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Note: Keras's default Adam learning rate (1e-3) is high for fine-tuning
# transformers; a smaller value such as 5e-5 typically trains more stably.
model.compile(optimizer="adam", loss=loss)
model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=3,
)
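# --- Optional follow-up: a quick accuracy check on the validation split ---
# A minimal sketch, not part of the original cells. It assumes `model`,
# `validation_dataset`, `raw_datasets`, and `np` from above are still in
# scope, and that the validation dataset was built with shuffle=False so
# prediction order matches the raw label order.
preds = model.predict(validation_dataset)["logits"]
predicted_classes = np.argmax(preds, axis=-1)
labels = np.array(raw_datasets["validation"]["label"])
print(f"Validation accuracy: {np.mean(predicted_classes == labels):.3f}")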