!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

from huggingface_hub import notebook_login

notebook_login()

from transformers import AutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
print(f"'>>> BERT number of parameters: 110M'")

text = "This is a great [MASK]."

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

import torch

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

from datasets import load_dataset

imdb_dataset = load_dataset("imdb")
imdb_dataset

sample = imdb_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")

def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result
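# Optional sanity check (not part of the original notebook): for a fast tokenizer,
# word_ids() maps every token back to the index of the word it came from, which is
# what the whole word masking collator defined further below relies on.
sample_encoding = tokenizer("This movie was great!")
print(sample_encoding.tokens())    # subword tokens, including [CLS] and [SEP]
print(sample_encoding.word_ids())  # word index per token; None for special tokens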
# Use batched=True to activate fast multithreading!
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

tokenizer.model_max_length

chunk_size = 128

# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute the length of the concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split into chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

tokenizer.decode(lm_datasets["train"][1]["input_ids"])

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

import collections

import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2

def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and the corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

from huggingface_hub import notebook_login

notebook_login()

from transformers import TrainingArguments

batch_size = 64
# Show the training loss at every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)
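# Quick arithmetic check (illustration only): with 10,000 training chunks and a batch
# size of 64, each epoch contains 10_000 // 64 = 156 full batches (plus one final
# partial batch), so logging_steps above makes the loss get reported once per epoch.
print(len(downsampled_dataset["train"]) // batch_size)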
evaluation_strategy="epoch", learning_rate=2e-5, weight_decay=0.01, per_device_train_batch_size=batch_size, per_device_eval_batch_size=batch_size, push_to_hub=True, fp16=True, logging_steps=logging_steps, ) from transformers import Trainer trainer = Trainer( model=model, args=training_args, train_dataset=downsampled_dataset["train"], eval_dataset=downsampled_dataset["test"], data_collator=data_collator, tokenizer=tokenizer, ) import math eval_results = trainer.evaluate() print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}") trainer.train() eval_results = trainer.evaluate() print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}") trainer.push_to_hub() def insert_random_mask(batch): features = [dict(zip(batch, t)) for t in zip(*batch.values())] masked_inputs = data_collator(features) # Tạo ra một cột "masked" mới cho mỗi cột trong bộ dữ liệu return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()} downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"]) eval_dataset = downsampled_dataset["test"].map( insert_random_mask, batched=True, remove_columns=downsampled_dataset["test"].column_names, ) eval_dataset = eval_dataset.rename_columns( { "masked_input_ids": "input_ids", "masked_attention_mask": "attention_mask", "masked_labels": "labels", } ) from torch.utils.data import DataLoader from transformers import default_data_collator batch_size = 64 train_dataloader = DataLoader( downsampled_dataset["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator, ) eval_dataloader = DataLoader( eval_dataset, batch_size=batch_size, collate_fn=default_data_collator ) from torch.optim import AdamW optimizer = AdamW(model.parameters(), lr=5e-5) from accelerate import Accelerator accelerator = Accelerator() model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader ) from transformers import get_scheduler num_train_epochs = 3 num_update_steps_per_epoch = len(train_dataloader) num_training_steps = num_train_epochs * num_update_steps_per_epoch lr_scheduler = get_scheduler( "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps, ) from huggingface_hub import get_full_repo_name model_name = "distilbert-base-uncased-finetuned-imdb-accelerate" repo_name = get_full_repo_name(model_name) repo_name from huggingface_hub import Repository output_dir = model_name repo = Repository(output_dir, clone_from=repo_name) from tqdm.auto import tqdm import torch import math progress_bar = tqdm(range(num_training_steps)) for epoch in range(num_train_epochs): # Huấn luyện model.train() for batch in train_dataloader: outputs = model(**batch) loss = outputs.loss accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) # Đánh giá model.eval() losses = [] for step, batch in enumerate(eval_dataloader): with torch.no_grad(): outputs = model(**batch) loss = outputs.loss losses.append(accelerator.gather(loss.repeat(batch_size))) losses = torch.cat(losses) losses = losses[: len(eval_dataset)] try: perplexity = math.exp(torch.mean(losses)) except OverflowError: perplexity = float("inf") print(f">>> Epoch {epoch}: Perplexity: {perplexity}") # Lưu và tải accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save) if accelerator.is_main_process: tokenizer.save_pretrained(output_dir) repo.push_to_hub( commit_message=f"Training in 
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model="huggingface-course/distilbert-base-uncased-finetuned-imdb"
)

preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")