!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

from huggingface_hub import notebook_login

notebook_login()

from transformers import TFAutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

model(model.dummy_inputs)  # Build the model
model.summary()

text = "This is a great [MASK]."

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

import numpy as np
import tensorflow as tf

inputs = tokenizer(text, return_tensors="np")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = np.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
# We negate the array before argsort to get the largest, not the smallest, logits
top_5_tokens = np.argsort(-mask_token_logits)[:5].tolist()

for token in top_5_tokens:
    print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")

from datasets import load_dataset

imdb_dataset = load_dataset("imdb")
imdb_dataset

sample = imdb_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")


def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

tokenizer.model_max_length

chunk_size = 128

# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")


def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result


lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

tokenizer.decode(lm_datasets["train"][1]["input_ids"])

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")
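# Not part of the original notebook: a hedged sanity check on the collator's masking.
# The collator keeps the true token id in "labels" for the positions it selected and
# sets every other position to -100, so counting non--100 labels should land close to
# mlm_probability (15%) of the tokens in the batch. Variable names here are our own.
check_batch = data_collator(samples)
check_labels = np.array(check_batch["labels"])
print(f"{int((check_labels != -100).sum())} of {check_labels.size} tokens selected for masking")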
import collections
import numpy as np

from transformers.data.data_collator import tf_default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return tf_default_data_collator(features)


samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

# Downsample to 10,000 training examples to keep fine-tuning fast
train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

from huggingface_hub import notebook_login

notebook_login()

tf_train_dataset = downsampled_dataset["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

tf_eval_dataset = downsampled_dataset["test"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

num_train_steps = len(tf_train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Derive the repo/output directory name from the checkpoint name
model_name = model_checkpoint.split("/")[-1]
callback = PushToHubCallback(
    output_dir=f"{model_name}-finetuned-imdb", tokenizer=tokenizer
)

import math

eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

model.fit(tf_train_dataset, validation_data=tf_eval_dataset, callbacks=[callback])

eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model="huggingface-course/distilbert-base-uncased-finetuned-imdb"
)

preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")
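# Not part of the original notebook: a hedged sketch for testing your own fine-tuned
# checkpoint instead of the course's Hub model. PushToHubCallback saves the model and
# tokenizer into output_dir (and pushes them to your Hub repo), so the pipeline can be
# pointed at that local directory; the path below simply mirrors the output_dir above.
mask_filler_local = pipeline(
    "fill-mask", model=f"{model_name}-finetuned-imdb", framework="tf"
)

for pred in mask_filler_local(text):
    print(f">>> {pred['sequence']}")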