#@title Fine-tune bert-base-cased on GLUE/CoLA and push it to the Hugging Face Hub.
# NOTE(review): this was a Colab notebook collapsed onto one line. The
# notebook-only cells are reproduced as comments so the file runs as a plain
# Python script:
#   - `from IPython.display import HTML; HTML('')` was a no-op display cell.
#   - `! pip install datasets transformers[sentencepiece]` is IPython shell
#     magic and a SyntaxError in a .py file. Install dependencies first:
#         pip install datasets "transformers[sentencepiece]"

import numpy as np
from datasets import load_dataset, load_metric
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

model_checkpoint = "bert-base-cased"
# Single source of truth for the Hub repo / output dir (the original assigned
# this twice and hard-coded it a third time in TrainingArguments).
repo_name = "bert-fine-tuned-cola"

# CoLA: single-sentence acceptability classification (2 labels), so the
# default `num_labels=2` of the sequence-classification head matches.
raw_datasets = load_dataset("glue", "cola")

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    """Tokenize the `sentence` column, truncating to the model's max length."""
    return tokenizer(examples["sentence"], truncation=True)


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favor of the `evaluate` library; kept to avoid introducing a new dependency.
metric = load_metric("glue", "cola")


def compute_metrics(eval_pred):
    """Compute the GLUE/CoLA metric (Matthews correlation) from raw logits.

    `eval_pred` is a (logits, labels) pair as supplied by `Trainer`; logits
    are reduced to class ids via argmax over the last axis.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


args = TrainingArguments(
    repo_name,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,  # checkpoints are pushed to the Hub during training
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()
trainer.push_to_hub("End of training")

# Explicit pushes of the final model/tokenizer (in addition to the Trainer's
# own pushes), as in the original notebook.
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

# Attach human-readable label names to the config and push the updated config.
# NOTE(review): HF configs conventionally use int keys for id2label / int
# values for label2id; the string form below matches the original notebook
# and serializes identically to JSON.
label_names = raw_datasets["train"].features["label"].names
model.config.id2label = {str(i): lbl for i, lbl in enumerate(label_names)}
model.config.label2id = {lbl: str(i) for i, lbl in enumerate(label_names)}
model.config.push_to_hub(repo_name)