#!pip install transformers
#!pip install tensorflow
#!pip install scikit-learn
CONFIG = {
    "model": "distilbert-base-uncased",
    "seq_length": 512,
    "num_classes": 20,
    "batch_size": 64,
}
import tensorflow as tf
# Check for TensorFlow GPU access
print(f"TensorFlow has access to the following devices:\n{tf.config.list_physical_devices()}")
# Check TensorFlow version
print(f"TensorFlow version: {tf.__version__}")
TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.8.0
Exercise: load the 20NG dataset using the sklearn.datasets module.
from sklearn.datasets import fetch_20newsgroups
raw_data = fetch_20newsgroups(
    data_home="./",
    remove=("headers", "footers", "quotes"),
    subset="all",
)
class_mapping = dict(enumerate(raw_data["target_names"]))
print(class_mapping)
print(raw_data.keys())
print(raw_data["data"][0])
print(class_mapping[raw_data["target"][0]])
{0: 'alt.atheism', 1: 'comp.graphics', 2: 'comp.os.ms-windows.misc', 3: 'comp.sys.ibm.pc.hardware', 4: 'comp.sys.mac.hardware', 5: 'comp.windows.x', 6: 'misc.forsale', 7: 'rec.autos', 8: 'rec.motorcycles', 9: 'rec.sport.baseball', 10: 'rec.sport.hockey', 11: 'sci.crypt', 12: 'sci.electronics', 13: 'sci.med', 14: 'sci.space', 15: 'soc.religion.christian', 16: 'talk.politics.guns', 17: 'talk.politics.mideast', 18: 'talk.politics.misc', 19: 'talk.religion.misc'}
dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])
I am sure some bashers of Pens fans are pretty confused about the lack of any kind of posts about the recent Pens massacre of the Devils. Actually, I am bit puzzled too and a bit relieved. However, I am going to put an end to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they are killing those Devils worse than I thought. Jagr just showed you why he is much better than his regular season stats. He is also a lot fo fun to watch in the playoffs. Bowman should let JAgr have a lot of fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final regular season game. PENS RULE!!!
rec.sport.hockey
Exercise: split the data into training, validation, and test sets (80/10/10).
from sklearn.model_selection import train_test_split
X = raw_data["data"]
y = raw_data["target"]
# Hold out 10% for testing, then 1/9 of the remaining 90% (i.e. 10% overall) for validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1/9, random_state=42)
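A quick optional sanity check that the proportions come out as intended:
# Verify the 80/10/10 split
total = len(X)
for name, split in [("train", X_train), ("val", X_val), ("test", X_test)]:
    print(f"{name}: {len(split)} documents ({len(split) / total:.1%})")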
Exercise: load the DistilBertTokenizer pretrained tokenizer and apply it to tokenize the 20NG data.
from transformers import DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained(CONFIG["model"])
tokenizer_config = {
    "return_tensors": "tf",
    "return_attention_mask": True,
    "return_token_type_ids": True,
    "padding": "max_length",
    "truncation": True,
    "max_length": CONFIG["seq_length"],
}
X_train_tokenized = tokenizer.batch_encode_plus(X_train, **tokenizer_config).data
X_val_tokenized = tokenizer.batch_encode_plus(X_val, **tokenizer_config).data
X_test_tokenized = tokenizer.batch_encode_plus(X_test, **tokenizer_config).data
print(" ".join(tokenizer.convert_ids_to_tokens(X_train_tokenized["input_ids"][0][:50])), "...")
[CLS] your ignorance is showing . the bat ##f warrant was un ##sea ##led . the entire operation was illegal from day one . [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] ...
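The encoded ids can also be decoded back into plain text as an optional round-trip check (skip_special_tokens drops [CLS], [SEP], and [PAD]):
# Decode the first training example back to text
print(tokenizer.decode(X_train_tokenized["input_ids"][0], skip_special_tokens=True))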
Exercise: convert the dataset into a tf.data.Dataset
train_ds = tf.data.Dataset.from_tensor_slices((X_train_tokenized, y_train))
val_ds = tf.data.Dataset.from_tensor_slices((X_val_tokenized, y_val))
test_ds = tf.data.Dataset.from_tensor_slices((X_test_tokenized, y_test))
print(tf.data.DatasetSpec.from_value(train_ds))
DatasetSpec(({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.int64, name=None)), TensorShape([]))
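An optional aside: DistilBERT has no token-type embeddings, so the token_type_ids produced by the tokenizer are never consumed by the model built below. A minimal sketch for dropping the key from the datasets, if you want to slim the pipeline (the spec printed above still includes it):
# Optional: drop token_type_ids, which DistilBERT does not use
def drop_token_type_ids(features, label):
    return {k: v for k, v in features.items() if k != "token_type_ids"}, label
# train_ds = train_ds.map(drop_token_type_ids)  # likewise for val_ds and test_ds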
Exercise: shuffle, repeat, and batch the training data; batch the validation and test data.
batched_train_ds = (
    train_ds
    .shuffle(2 * CONFIG["batch_size"])
    .repeat()
    .batch(CONFIG["batch_size"])
)
batched_val_ds = val_ds.batch(CONFIG["batch_size"])
batched_test_ds = test_ds.batch(CONFIG["batch_size"])
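As an optional performance tweak, tf.data pipelines usually benefit from prefetching, so that batch preparation overlaps with training:
# Optional: prefetch batches while the previous ones are being consumed
batched_train_ds = batched_train_ds.prefetch(tf.data.AUTOTUNE)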
Exercise: import the TFDistilBertModel from Hugging Face, with the model name as specified in the config.
from transformers import TFDistilBertModel
transformer = TFDistilBertModel.from_pretrained(CONFIG["model"])
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
Exercise: define two input layers to feed the input_ids and attention_mask sequences into the transformer layer.
input_ids = tf.keras.layers.Input(shape=(CONFIG["seq_length"],), name='input_ids', dtype='int32')
attention_mask = tf.keras.layers.Input(shape=(CONFIG["seq_length"],), name='attention_mask', dtype='int32')
Exercise: extract the hidden representation of the [CLS] special token (always the first token of every document).
transformer_output = transformer(input_ids=input_ids, attention_mask=attention_mask)
# We only need DistilBERT's last-layer hidden states
seq_embeddings = transformer_output.last_hidden_state
# We only need DistilBERT's output for the [CLS] token, which sits at
# index 0 of every encoded sequence:
# seq_embeddings.shape = (batch_size, seq_length, representation_length)
cls_embedding = seq_embeddings[:, 0, :]
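A quick optional shape check; the symbolic tensor already carries its static shape (DistilBERT's hidden size is 768):
# Expect (None, 768): unknown batch size, hidden size 768
print(cls_embedding.shape)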
Exercise: define a classification head with 5 layers: Dropout, Dense, Dropout, Dense, Output. The output layer needs to have the same number of dimensions as there are classes (20).
x = tf.keras.layers.Dropout(0.2)(cls_embedding)
x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
output = tf.keras.layers.Dense(CONFIG["num_classes"], activation='softmax')(x)
Exercise: use the previously defined layer stack to define a model.
# Define the model
model = tf.keras.Model([input_ids, attention_mask], output)
Exercise: set the pretrained layer (tf_distil_bert_model) to not trainable (we only want to train the head).
# Set the pretrained part as non-trainable (we only want to train the head)
#model.get_layer("input_ids").trainable = False
#model.get_layer("attention_mask").trainable = False
model.get_layer("tf_distil_bert_model").trainable = False
model.summary(show_trainable=True)
Model: "model" _____________________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to Trainable ============================================================================================================= input_ids (InputLayer) [(None, 512)] 0 [] Y attention_mask (InputLayer) [(None, 512)] 0 [] Y tf_distil_bert_model (TFDistil TFBaseModelOutput(l 66362880 ['input_ids[0][0]', N BertModel) ast_hidden_state=(N 'attention_mask[0][0]'] one, 512, 768), hidden_states=None , attentions=None) tf.__operators__.getitem (Slic (None, 768) 0 ['tf_distil_bert_model[0][0]'] Y ingOpLambda) dropout_19 (Dropout) (None, 768) 0 ['tf.__operators__.getitem[0][0 Y ]'] dense (Dense) (None, 128) 98432 ['dropout_19[0][0]'] Y dropout_20 (Dropout) (None, 128) 0 ['dense[0][0]'] Y dense_1 (Dense) (None, 64) 8256 ['dropout_20[0][0]'] Y dense_2 (Dense) (None, 20) 1300 ['dense_1[0][0]'] Y ============================================================================================================= Total params: 66,470,868 Trainable params: 107,988 Non-trainable params: 66,362,880 _____________________________________________________________________________________________________________
Exercise: compile the model with SparseCategoricalCrossentropy as the loss function and the Adam optimizer.
# Compile the model (from_logits=False because the output layer applies softmax)
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy']
)
Exercise: define three callbacks: ModelCheckpoint, EarlyStopping, and TensorBoard.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="finetuned_bert_20ng.hdf5",
    save_best_only=True
)
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=3,          # Stop after 3 epochs without improvement
    monitor='val_loss',  # Watch the validation loss
    min_delta=0.01,      # Changes smaller than 0.01 don't count as improvement
    mode='min',          # Stop when the monitored quantity has stopped decreasing
    verbose=1
)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir='./logs')
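To inspect the logged metrics during or after training, point TensorBoard at the log directory (e.g. from a shell):
#!tensorboard --logdir ./logs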
Exercise: fit the model.
history = model.fit(
    batched_train_ds,
    validation_data=batched_val_ds,
    epochs=10,
    # Define how many steps make up an epoch (the repeated dataset is infinite)
    steps_per_epoch=len(X_train) // CONFIG["batch_size"],
    callbacks=[model_checkpoint, early_stopping, tensorboard]
)
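Optionally, the returned history object can be used to plot the learning curves; a minimal sketch, assuming matplotlib is installed:
import matplotlib.pyplot as plt
# Plot training vs. validation loss per epoch
plt.plot(history.history["loss"], label="train loss")
plt.plot(history.history["val_loss"], label="val loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.show()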
Exercise: load the best checkpoint from disk.
Notes: the saved model contains a custom layer, so you need to pass the TFDistilBertModel pretrained class as a custom object when loading.
#!curl https://files.webis.de/bdlt-ss22/finetuned_bert_20ng.hdf5 --output finetuned_bert_20ng.hdf5
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  254M  100  254M    0     0  5596k      0  0:00:46  0:00:46 --:--:-- 7908k
m = tf.keras.models.load_model(
    "finetuned_bert_20ng.hdf5",
    custom_objects={
        'TFDistilBertModel': TFDistilBertModel,
    }
)
WARNING:tensorflow:Error in loading the saved optimizer state. As a result, your model is starting with a freshly initialized optimizer.
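As a final optional check (not part of the original exercises), the restored model can be evaluated on the held-out test set:
# Evaluate the restored model on the batched test split
test_loss, test_acc = m.evaluate(batched_test_ds)
print(f"Test accuracy: {test_acc:.3f}")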
#!jupyter nbconvert --to python 401-Finetuned-Classification.ipynb