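# Fine-tune BERT on the GLUE MRPC paraphrase task with TensorFlow/Keras.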
! pip install datasets evaluate transformers[sentencepiece]
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np
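
# Load the MRPC dataset: pairs of sentences labeled as paraphrases or not.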
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
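
# Tokenize both sentences of each pair together, padding and truncating to a
# shared length, and return plain NumPy arrays that Keras can consume.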
def tokenize_dataset(dataset):
    encoded = tokenizer(
        dataset["sentence1"],
        dataset["sentence2"],
        padding=True,
        truncation=True,
        return_tensors="np",
    )
    return encoded.data
tokenized_datasets = {
    split: tokenize_dataset(raw_datasets[split]) for split in raw_datasets.keys()
}
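# Keep the training input_ids around to count examples for the LR schedule.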
train_tokens = tokenized_datasets['train']['input_ids']
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
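
# Reuse the same checkpoint as the tokenizer; the sequence-classification head
# is freshly initialized with num_labels=2.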
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
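# The model outputs raw logits, so the loss must not expect probabilities.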
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
from tensorflow.keras.optimizers.schedules import PolynomialDecay
batch_size = 8
num_epochs = 3
num_train_steps = (len(train_tokens) // batch_size) * num_epochs
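# Decay the learning rate linearly from 5e-5 down to zero over training.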
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.0,
    decay_steps=num_train_steps,
)
from tensorflow.keras.optimizers import Adam
opt = Adam(learning_rate=lr_scheduler)
model.compile(loss=loss, optimizer=opt)
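
# Keras accepts the dict of named NumPy feature arrays directly as inputs.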
model.fit(
    tokenized_datasets['train'],
    np.array(raw_datasets['train']['label']),
    validation_data=(
        tokenized_datasets['validation'],
        np.array(raw_datasets['validation']['label']),
    ),
    batch_size=batch_size,
    epochs=num_epochs,
)
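
# Convert validation logits to probabilities, then to hard class predictions.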
preds = model.predict(tokenized_datasets['validation'])['logits']
probabilities = tf.nn.softmax(preds)
class_preds = np.argmax(probabilities, axis=1)
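
# Score with the official GLUE MRPC metric, which reports accuracy and F1.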
import evaluate
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=class_preds, references=raw_datasets['validation']['label'])