!pip install datasets evaluate transformers[sentencepiece] !pip install accelerate # To run the training on TPU, you will need to uncomment the following line: # !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl !apt install git-lfs !git config --global user.email "you@example.com" !git config --global user.name "Your Name" from huggingface_hub import notebook_login notebook_login() from datasets import load_dataset raw_datasets = load_dataset("squad") raw_datasets print("Context: ", raw_datasets["train"][0]["context"]) print("Question: ", raw_datasets["train"][0]["question"]) print("Answer: ", raw_datasets["train"][0]["answers"]) raw_datasets["train"].filter(lambda x: len(x["answers"]["text"]) != 1) print(raw_datasets["validation"][0]["answers"]) print(raw_datasets["validation"][2]["answers"]) print(raw_datasets["validation"][2]["context"]) print(raw_datasets["validation"][2]["question"]) from transformers import AutoTokenizer model_checkpoint = "bert-base-cased" tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) tokenizer.is_fast context = raw_datasets["train"][0]["context"] question = raw_datasets["train"][0]["question"] inputs = tokenizer(question, context) tokenizer.decode(inputs["input_ids"]) inputs = tokenizer( question, context, max_length=100, truncation="only_second", stride=50, return_overflowing_tokens=True, ) for ids in inputs["input_ids"]: print(tokenizer.decode(ids)) inputs = tokenizer( question, context, max_length=100, truncation="only_second", stride=50, return_overflowing_tokens=True, return_offsets_mapping=True, ) inputs.keys() inputs["overflow_to_sample_mapping"] inputs = tokenizer( raw_datasets["train"][2:6]["question"], raw_datasets["train"][2:6]["context"], max_length=100, truncation="only_second", stride=50, return_overflowing_tokens=True, return_offsets_mapping=True, ) print(f"The 4 examples gave {len(inputs['input_ids'])} features.") print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.") answers = raw_datasets["train"][2:6]["answers"] start_positions = [] end_positions = [] for i, offset in enumerate(inputs["offset_mapping"]): sample_idx = inputs["overflow_to_sample_mapping"][i] answer = answers[sample_idx] start_char = answer["answer_start"][0] end_char = answer["answer_start"][0] + len(answer["text"][0]) sequence_ids = inputs.sequence_ids(i) # Find the start and end of the context idx = 0 while sequence_ids[idx] != 1: idx += 1 context_start = idx while sequence_ids[idx] == 1: idx += 1 context_end = idx - 1 # If the answer is not fully inside the context, label is (0, 0) if offset[context_start][0] > start_char or offset[context_end][1] < end_char: start_positions.append(0) end_positions.append(0) else: # Otherwise it's the start and end token positions idx = context_start while idx <= context_end and offset[idx][0] <= start_char: idx += 1 start_positions.append(idx - 1) idx = context_end while idx >= context_start and offset[idx][1] >= end_char: idx -= 1 end_positions.append(idx + 1) start_positions, end_positions idx = 0 sample_idx = inputs["overflow_to_sample_mapping"][idx] answer = answers[sample_idx]["text"][0] start = start_positions[idx] end = end_positions[idx] labeled_answer = tokenizer.decode(inputs["input_ids"][idx][start : end + 1]) print(f"Theoretical answer: {answer}, labels give: {labeled_answer}") idx = 4 sample_idx = inputs["overflow_to_sample_mapping"][idx] answer = answers[sample_idx]["text"][0] decoded_example = tokenizer.decode(inputs["input_ids"][idx]) print(f"Theoretical answer: {answer}, decoded example: {decoded_example}") max_length = 384 stride = 128 def preprocess_training_examples(examples): questions = [q.strip() for q in examples["question"]] inputs = tokenizer( questions, examples["context"], max_length=max_length, truncation="only_second", stride=stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length", ) offset_mapping = inputs.pop("offset_mapping") sample_map = inputs.pop("overflow_to_sample_mapping") answers = examples["answers"] start_positions = [] end_positions = [] for i, offset in enumerate(offset_mapping): sample_idx = sample_map[i] answer = answers[sample_idx] start_char = answer["answer_start"][0] end_char = answer["answer_start"][0] + len(answer["text"][0]) sequence_ids = inputs.sequence_ids(i) # Find the start and end of the context idx = 0 while sequence_ids[idx] != 1: idx += 1 context_start = idx while sequence_ids[idx] == 1: idx += 1 context_end = idx - 1 # If the answer is not fully inside the context, label is (0, 0) if offset[context_start][0] > start_char or offset[context_end][1] < end_char: start_positions.append(0) end_positions.append(0) else: # Otherwise it's the start and end token positions idx = context_start while idx <= context_end and offset[idx][0] <= start_char: idx += 1 start_positions.append(idx - 1) idx = context_end while idx >= context_start and offset[idx][1] >= end_char: idx -= 1 end_positions.append(idx + 1) inputs["start_positions"] = start_positions inputs["end_positions"] = end_positions return inputs train_dataset = raw_datasets["train"].map( preprocess_training_examples, batched=True, remove_columns=raw_datasets["train"].column_names, ) len(raw_datasets["train"]), len(train_dataset) def preprocess_validation_examples(examples): questions = [q.strip() for q in examples["question"]] inputs = tokenizer( questions, examples["context"], max_length=max_length, truncation="only_second", stride=stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length", ) sample_map = inputs.pop("overflow_to_sample_mapping") example_ids = [] for i in range(len(inputs["input_ids"])): sample_idx = sample_map[i] example_ids.append(examples["id"][sample_idx]) sequence_ids = inputs.sequence_ids(i) offset = inputs["offset_mapping"][i] inputs["offset_mapping"][i] = [ o if sequence_ids[k] == 1 else None for k, o in enumerate(offset) ] inputs["example_id"] = example_ids return inputs validation_dataset = raw_datasets["validation"].map( preprocess_validation_examples, batched=True, remove_columns=raw_datasets["validation"].column_names, ) len(raw_datasets["validation"]), len(validation_dataset) small_eval_set = raw_datasets["validation"].select(range(100)) trained_checkpoint = "distilbert-base-cased-distilled-squad" tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint) eval_set = small_eval_set.map( preprocess_validation_examples, batched=True, remove_columns=raw_datasets["validation"].column_names, ) tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) import torch from transformers import AutoModelForQuestionAnswering eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"]) eval_set_for_model.set_format("torch") device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names} trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to( device ) with torch.no_grad(): outputs = trained_model(**batch) start_logits = outputs.start_logits.cpu().numpy() end_logits = outputs.end_logits.cpu().numpy() import collections example_to_features = collections.defaultdict(list) for idx, feature in enumerate(eval_set): example_to_features[feature["example_id"]].append(idx) import numpy as np n_best = 20 max_answer_length = 30 predicted_answers = [] for example in small_eval_set: example_id = example["id"] context = example["context"] answers = [] for feature_index in example_to_features[example_id]: start_logit = start_logits[feature_index] end_logit = end_logits[feature_index] offsets = eval_set["offset_mapping"][feature_index] start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist() end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist() for start_index in start_indexes: for end_index in end_indexes: # Skip answers that are not fully in the context if offsets[start_index] is None or offsets[end_index] is None: continue # Skip answers with a length that is either < 0 or > max_answer_length. if ( end_index < start_index or end_index - start_index + 1 > max_answer_length ): continue answers.append( { "text": context[offsets[start_index][0] : offsets[end_index][1]], "logit_score": start_logit[start_index] + end_logit[end_index], } ) best_answer = max(answers, key=lambda x: x["logit_score"]) predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]}) import evaluate metric = evaluate.load("squad") theoretical_answers = [ {"id": ex["id"], "answers": ex["answers"]} for ex in small_eval_set ] print(predicted_answers[0]) print(theoretical_answers[0]) metric.compute(predictions=predicted_answers, references=theoretical_answers) from tqdm.auto import tqdm def compute_metrics(start_logits, end_logits, features, examples): example_to_features = collections.defaultdict(list) for idx, feature in enumerate(features): example_to_features[feature["example_id"]].append(idx) predicted_answers = [] for example in tqdm(examples): example_id = example["id"] context = example["context"] answers = [] # Loop through all features associated with that example for feature_index in example_to_features[example_id]: start_logit = start_logits[feature_index] end_logit = end_logits[feature_index] offsets = features[feature_index]["offset_mapping"] start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist() end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist() for start_index in start_indexes: for end_index in end_indexes: # Skip answers that are not fully in the context if offsets[start_index] is None or offsets[end_index] is None: continue # Skip answers with a length that is either < 0 or > max_answer_length if ( end_index < start_index or end_index - start_index + 1 > max_answer_length ): continue answer = { "text": context[offsets[start_index][0] : offsets[end_index][1]], "logit_score": start_logit[start_index] + end_logit[end_index], } answers.append(answer) # Select the answer with the best score if len(answers) > 0: best_answer = max(answers, key=lambda x: x["logit_score"]) predicted_answers.append( {"id": example_id, "prediction_text": best_answer["text"]} ) else: predicted_answers.append({"id": example_id, "prediction_text": ""}) theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples] return metric.compute(predictions=predicted_answers, references=theoretical_answers) compute_metrics(start_logits, end_logits, eval_set, small_eval_set) model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint) from huggingface_hub import notebook_login notebook_login() from transformers import TrainingArguments args = TrainingArguments( "bert-finetuned-squad", evaluation_strategy="no", save_strategy="epoch", learning_rate=2e-5, num_train_epochs=3, weight_decay=0.01, fp16=True, push_to_hub=True, ) from transformers import Trainer trainer = Trainer( model=model, args=args, train_dataset=train_dataset, eval_dataset=validation_dataset, tokenizer=tokenizer, ) trainer.train() predictions, _, _ = trainer.predict(validation_dataset) start_logits, end_logits = predictions compute_metrics(start_logits, end_logits, validation_dataset, raw_datasets["validation"]) trainer.push_to_hub(commit_message="Training complete") from torch.utils.data import DataLoader from transformers import default_data_collator train_dataset.set_format("torch") validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"]) validation_set.set_format("torch") train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=8, ) eval_dataloader = DataLoader( validation_set, collate_fn=default_data_collator, batch_size=8 ) model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint) from torch.optim import AdamW optimizer = AdamW(model.parameters(), lr=2e-5) from accelerate import Accelerator accelerator = Accelerator(fp16=True) model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader ) from transformers import get_scheduler num_train_epochs = 3 num_update_steps_per_epoch = len(train_dataloader) num_training_steps = num_train_epochs * num_update_steps_per_epoch lr_scheduler = get_scheduler( "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps, ) from huggingface_hub import Repository, get_full_repo_name model_name = "bert-finetuned-squad-accelerate" repo_name = get_full_repo_name(model_name) repo_name output_dir = "bert-finetuned-squad-accelerate" repo = Repository(output_dir, clone_from=repo_name) from tqdm.auto import tqdm import torch progress_bar = tqdm(range(num_training_steps)) for epoch in range(num_train_epochs): # Training model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) # Evaluation model.eval() start_logits = [] end_logits = [] accelerator.print("Evaluation!") for batch in tqdm(eval_dataloader): with torch.no_grad(): outputs = model(**batch) start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy()) end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy()) start_logits = np.concatenate(start_logits) end_logits = np.concatenate(end_logits) start_logits = start_logits[: len(validation_dataset)] end_logits = end_logits[: len(validation_dataset)] metrics = compute_metrics( start_logits, end_logits, validation_dataset, raw_datasets["validation"] ) print(f"epoch {epoch}:", metrics) # Save and upload accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save) if accelerator.is_main_process: tokenizer.save_pretrained(output_dir) repo.push_to_hub( commit_message=f"Training in progress epoch {epoch}", blocking=False ) accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save) from transformers import pipeline # Replace this with your own checkpoint model_checkpoint = "huggingface-course/bert-finetuned-squad" question_answerer = pipeline("question-answering", model=model_checkpoint) context = """ 🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other. """ question = "Which deep learning libraries back 🤗 Transformers?" question_answerer(question=question, context=context)