This notebook gathers the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML

# Embed the course video that this notebook accompanies. The markup is split
# across adjacent literals only for readability; the resulting string is the
# same single <iframe> tag.
HTML(
    '<iframe width="560" height="315" '
    'src="https://www.youtube.com/embed/pUh5cGmNV8Y?rel=0&controls=0&showinfo=0" '
    'frameborder="0" allowfullscreen></iframe>'
)
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from inside the notebook. Later cells call
# push_to_hub, so this has to run first.
# NOTE(review): presumably the token needs write access for those pushes — confirm.
notebook_login()
from datasets import load_dataset, load_metric
# Load the "cola" configuration of the "glue" dataset. The "train" and
# "validation" splits of the tokenized version are used further down.
raw_datasets = load_dataset("glue", "cola")
# Bare expression: as the last line of a notebook cell this displays the object.
raw_datasets
from transformers import AutoTokenizer
# Checkpoint name shared by the tokenizer here and the model loaded later.
model_checkpoint = "bert-base-cased"
# Tokenizer loaded from the same checkpoint; used by preprocess_function,
# the data collator and the Hub callback below.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_function(examples):
    """Tokenize the ``"sentence"`` field of ``examples`` with truncation on.

    Uses the module-level ``tokenizer`` and returns its output unchanged.
    No padding is requested here; the data collator created later in the
    notebook is the component that receives the tokenizer for that purpose.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded
# Run preprocess_function over raw_datasets via map, with batched=True.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
# Bare expression: displays the result when run as the last line of a cell.
tokenized_datasets
from transformers import DataCollatorWithPadding

# Collator passed as collate_fn to both to_tf_dataset calls below; it is
# configured with the notebook's tokenizer and asked to return "tf" tensors.
collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
# Training split: batches of 32, shuffled, collated with `collator`.
train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    batch_size=32,
    shuffle=True,
    collate_fn=collator,
)

# Validation split: same columns and batch size, but not shuffled.
validation_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    batch_size=32,
    shuffle=False,
    collate_fn=collator,
)
from transformers import TFAutoModelForSequenceClassification
# Load the sequence-classification model from the same checkpoint as the tokenizer.
# NOTE(review): no num_labels is passed, so the library default applies —
# confirm it matches the number of classes in the dataset's "label" feature.
model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint)
from transformers import AdamWeightDecay
# AdamWeightDecay with weight_decay_rate=0.01; the positional 2e-5 is
# presumably the learning rate — confirm against the optimizer's signature.
optimizer = AdamWeightDecay(2e-5, weight_decay_rate=0.01)
# compile() is called with an optimizer only, no loss argument.
# NOTE(review): this relies on the model providing its own loss — confirm.
model.compile(optimizer=optimizer)
from transformers import PushToHubCallback

# Single Keras callback passed to model.fit below. It is given the local
# path "model_output/", the notebook's tokenizer and the Hub repo id.
callbacks = [
    PushToHubCallback(
        "model_output/",
        tokenizer=tokenizer,
        hub_model_id="bert-fine-tuned-cola",
    )
]
# Train for 2 epochs on train_dataset, evaluating on validation_dataset and
# running the callbacks defined earlier (the Hub callback).
model.fit(train_dataset, validation_data=validation_dataset, epochs=2, callbacks=callbacks)
# Explicit push of the trained model with a custom commit message.
# NOTE(review): the PushToHubCallback passed to fit() may already upload to the
# same repo id — confirm whether this extra push is intentional.
model.push_to_hub("bert-fine-tuned-cola", commit_message="End of training")
# Human-readable class names, read from the dataset's "label" feature.
label_names = raw_datasets["train"].features["label"].names
# Bare expression: displays the names when run as the last line of a cell.
label_names
# Record the id <-> name mappings on the model config using integer class ids
# (id2label: int -> str, label2id: str -> int). With string ids, a lookup such
# as id2label[predicted_index] with an int index raises KeyError, and label2id
# would hold "0"/"1" strings instead of usable indices.
model.config.id2label = dict(enumerate(label_names))
model.config.label2id = {name: idx for idx, name in enumerate(label_names)}
# Same repo id that the training cells pushed to.
repo_name = "bert-fine-tuned-cola"
# Push only the config object, so the label mappings set on model.config
# reach the Hub repo.
model.config.push_to_hub(repo_name)
# Reload a fine-tuned model from the Hub by its full "<namespace>/<repo>" id.
# NOTE(review): the namespace here is hard-coded to 'Rocketknight1'; to load the
# model pushed by this notebook, it presumably needs to be your own username — confirm.
loaded_model = TFAutoModelForSequenceClassification.from_pretrained('Rocketknight1/bert-fine-tuned-cola')