"""Fine-tune bert-base-cased on a small slice of the yelp_review_full dataset.

Pipeline: load dataset -> inspect -> build tokenizer/model -> subset and
tokenize train/test -> define accuracy metric -> train with HF Trainer,
evaluating before and after training.

Requires: pip install transformers datasets evaluate
(The original ``!pip install transformers datasets evaluate`` line was IPython
shell magic — invalid in a plain .py file — so it is kept as documentation.)
"""

import numpy as np

import evaluate
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# --- Dataset -----------------------------------------------------------------
# yelp_review_full: full-text reviews with a 5-way star-rating label.
dataset = load_dataset("yelp_review_full")
print(dataset)

# Inspect the training split: schema, size, and one raw example.
print(dataset['train'])
print(dataset['train'].features)
print(dataset['train'].num_rows)
print(dataset['train'][0])

# --- Model & tokenizer -------------------------------------------------------
# num_labels=5 matches the 5 star-rating classes of yelp_review_full.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=5
).to("cpu")

# Smoke-test one forward pass through tokenizer + model.
test_output1 = tokenizer("hello world", return_tensors="pt").to("cpu")
test_output2 = model(**test_output1)
print(test_output1)
print(test_output2)

# --- Train/test subsets ------------------------------------------------------
# Keep runs short: 1000 training examples, 100 eval examples.
train_dataset = dataset["train"].select(range(1000))
test_dataset = dataset["test"].select(range(100))


def tokenize_function(example):
    """Tokenize the ``text`` field, padded/truncated to the model max length."""
    return tokenizer(example["text"], padding="max_length", truncation=True)


train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Show what tokenize_function adds to a raw example (input_ids, masks, ...).
print(dataset["train"][0])
print(dataset["train"][0].keys())
print("\n")
print(tokenize_function(dataset["train"][0]))
print(tokenize_function(dataset["train"][0]).keys())

# After .map(), the dataset keeps its original columns plus tokenizer outputs.
print(train_dataset[0])
print(train_dataset[0].keys())

# --- Metrics -----------------------------------------------------------------
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from the (logits, labels) pair the Trainer passes in."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # predicted class = argmax logit
    return metric.compute(predictions=predictions, references=labels)


# --- Training ----------------------------------------------------------------
training_args = TrainingArguments(
    output_dir="./some_local_dir",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    dataloader_num_workers=2,
    max_steps=100,
    logging_steps=1,
    # NOTE(review): renamed to `eval_strategy` in transformers >= 4.41 —
    # confirm the installed version accepts this keyword.
    evaluation_strategy="steps",
    eval_steps=5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Accuracy BEFORE training (classification head is randomly initialized).
# print() added: the notebook REPL echoed the bare expression; a script would
# silently discard the metrics dict.
print(trainer.evaluate())

trainer.train()

# Accuracy AFTER training.
print(trainer.evaluate())