A tutorial on training an NLP model with Hugging Face's pre-trained BERT in TF/Keras
This notebook shows how to train a neural network model with pre-trained BERT in Tensorflow/Keras. It is based on @xhlulu's Disaster NLP: Keras BERT using TFHub notebook and Text Extraction with BERT example at Keras.
This is a code competition in which notebooks run without internet access, so we add the transformers
tokenizer and pre-trained BERT model through Kaggle Datasets instead.
Hope it helps.
Version | CV Score | Public Score | Changes | Comment |
---|---|---|---|---|
v9 | to be updated | to be updated | use transformers' tokenizer | |
v8 | 0.653635 | 0.606 | add 5-fold CV + early-stopping back. | |
v7 | N/A | 0.617 | fix the bug in learning rate scheduler | overfitting to train? (n=20) |
v6 | N/A | 0.566 | add the warm-up learning rate scheduler | With a bug. Don't use it |
v5 | N/A | 0.531 | roll back to v3 | |
v4 | N/A | 0.573 | add early-stopping | seemed to stop too early with patience=1 (n=5) |
v3 | N/A | 0.530 | initial baseline | |
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from copy import copy
import joblib
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import sys
from warnings import simplefilter
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from transformers import TFBertModel, BertConfig, BertTokenizerFast
# Silence library warnings and pick the plotting theme used by the notebook.
simplefilter('ignore')
plt.style.use('fivethirtyeight')

# Enable on-demand GPU memory growth. TensorFlow requires the memory-growth
# setting to be the same on every visible GPU, so apply it to all of them
# rather than only the first one. With no GPU present the loop is a no-op.
gpu = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu))
for device in gpu:
    tf.config.experimental.set_memory_growth(device, True)
Num GPUs Available: 1
# --- Experiment name and input locations ---------------------------------
model_name = 'bert_v9'
data_dir = Path('../input/commonlitreadabilityprize')
train_file, test_file, sample_file = (
    data_dir / fname
    for fname in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# --- Output locations (everything for this run lives under output_dir) ---
build_dir = Path('../build/')
output_dir = build_dir / model_name
trn_encoded_file = output_dir / 'trn.enc.joblib'
tokenizer_file = output_dir / 'tokenizer.joblib'
val_predict_file = output_dir / f'{model_name}.val.txt'
submission_file = 'submission.csv'
# NOTE(review): module_url is not referenced by the code visible here;
# presumably a leftover from an earlier TF-Hub based version -- confirm.
module_url = "../input/bert-en-uncased-l24-h1024-a16"

# --- Column names in the CSV files ---------------------------------------
id_col, target_col, text_col = 'id', 'target', 'excerpt'

# --- Hyper-parameters ----------------------------------------------------
max_len = 205      # tokenizer max_length / model input width
n_fold = 5         # number of KFold splits
n_est = 2          # epochs passed to fit()
n_stop = 2         # patience given to EarlyStopping
batch_size = 8     # batch size passed to fit()
seed = 42          # KFold random_state and base for tf.random.set_seed

# --- Load the data -------------------------------------------------------
output_dir.mkdir(parents=True, exist_ok=True)
trn = pd.read_csv(train_file, index_col=id_col)
tst = pd.read_csv(test_file, index_col=id_col)
y = trn[target_col].values
print(trn.shape, y.shape, tst.shape)
trn.head()
(2834, 5) (2834,) (7, 3)
url_legal | license | excerpt | target | standard_error | |
---|---|---|---|---|---|
id | |||||
c12129c31 | NaN | NaN | When the young people returned to the ballroom... | -0.340259 | 0.464009 |
85aa80a4c | NaN | NaN | All through dinner time, Mrs. Fayre was somewh... | -0.315372 | 0.480805 |
b69ac6792 | NaN | NaN | As Roger had predicted, the snow departed as q... | -0.580118 | 0.476676 |
dd1000b26 | NaN | NaN | And outside before the palace a great garden w... | -1.054013 | 0.450007 |
37c1b32fb | NaN | NaN | Once upon a time there were Three Bears who li... | 0.247197 | 0.510845 |
transformers
# Local directory where the pretrained tokenizer and model files are cached.
# (The stray heading-anchor character that was fused onto this assignment
# has been removed; it made the line invalid Python.)
pretrained_dir = output_dir / "bert_base_uncased/"
pretrained_dir.mkdir(parents=True, exist_ok=True)
def load_tokenizer():
    """Load the BERT tokenizer and model config, preferring the local cache.

    The tokenizer is read from ``pretrained_dir`` when ``vocab.txt`` is
    already there; otherwise it is fetched as ``"bert-base-uncased"`` and
    saved into ``pretrained_dir``.

    The config is read from ``pretrained_dir`` only when a ``config.json``
    exists there. Saving the tokenizer alone does not necessarily create
    that file, so on a fresh cache the config is loaded by model name
    instead of failing on the missing file.

    Returns:
        tuple: ``(tokenizer, model_config)`` where ``model_config`` has
        ``output_hidden_states`` set to ``True``.
    """
    if (pretrained_dir / 'vocab.txt').exists():
        print('loading the saved pretrained tokenizer')
        tokenizer = BertTokenizerFast.from_pretrained(str(pretrained_dir))
    else:
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        tokenizer.save_pretrained(pretrained_dir)

    if (pretrained_dir / 'config.json').exists():
        model_config = BertConfig.from_pretrained(str(pretrained_dir))
    else:
        # Fresh cache: no config.json has been written yet.
        model_config = BertConfig.from_pretrained("bert-base-uncased")
    model_config.output_hidden_states = True
    return tokenizer, model_config
def load_bert(config):
    """Return a ``TFBertModel`` built with ``config``.

    If ``tf_model.h5`` is already present in ``pretrained_dir`` the model is
    loaded from that directory. Otherwise ``"bert-base-uncased"`` is fetched
    by name and saved into ``pretrained_dir`` before being returned.
    """
    saved_weights = pretrained_dir / 'tf_model.h5'
    if saved_weights.exists():
        print('loading the saved pretrained model')
        return TFBertModel.from_pretrained(pretrained_dir, config=config)

    fetched = TFBertModel.from_pretrained("bert-base-uncased", config=config)
    fetched.save_pretrained(pretrained_dir)
    return fetched
def bert_encode(texts, tokenizer, max_len=max_len):
    """Tokenize every text and stack the results into three arrays.

    Each text is tokenized on its own with special tokens added, truncated
    to ``max_len`` and padded to ``max_len``.

    Args:
        texts: iterable of strings to encode.
        tokenizer: callable tokenizer returning a mapping with the keys
            ``input_ids``, ``token_type_ids`` and ``attention_mask``.
        max_len: length every encoded sequence is truncated/padded to.

    Returns:
        tuple: ``(input_ids, token_type_ids, attention_mask)`` as numpy
        arrays, one row per input text.
    """
    encoded = [
        tokenizer(text, max_length=max_len, truncation=True, padding='max_length',
                  add_special_tokens=True)
        for text in texts
    ]
    fields = ('input_ids', 'token_type_ids', 'attention_mask')
    return tuple(np.array([item[field] for item in encoded]) for field in fields)
# Load the tokenizer/config once, then encode the train and test excerpts
# with identical settings. X and X_tst are each a tuple of
# (input_ids, token_type_ids, attention_mask).
tokenizer, bert_config = load_tokenizer()
X, X_tst = (
    bert_encode(frame[text_col].values, tokenizer, max_len=max_len)
    for frame in (trn, tst)
)
y = trn[target_col].values
print(X[0].shape, X_tst[0].shape, y.shape)
loading the saved pretrained tokenizer (2834, 205) (7, 205) (2834,)
# Persist the encoded training inputs and the tokenizer object under output_dir.
joblib.dump(X, trn_encoded_file)
joblib.dump(tokenizer, tokenizer_file)
['../build/bert_v9/tokenizer.joblib']
Simple model with only an output dense layer added to the pre-trained BERT model.
def build_model(bert_model, max_len=max_len):
    """Build and compile a regression model on top of ``bert_model``.

    The network takes three int32 inputs of width ``max_len``
    (``input_ids``, ``token_type_ids``, ``attention_mask``), runs them
    through ``bert_model``, takes the vector at the first token position of
    its first output, applies dropout and a single linear unit.

    Args:
        bert_model: the BERT model to call on the three inputs.
        max_len: sequence length of each input.

    Returns:
        A compiled Keras ``Model`` trained with mean squared error and
        reporting ``RootMeanSquaredError``.
    """
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    token_type_ids = Input(shape=(max_len,), dtype=tf.int32, name="token_type_ids")
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    sequence_output = bert_model(input_ids,
                                 token_type_ids=token_type_ids,
                                 attention_mask=attention_mask)[0]
    # Position 0 along the sequence axis; with special tokens added by the
    # BERT tokenizer this is presumably the [CLS] token.
    clf_output = sequence_output[:, 0, :]
    clf_output = Dropout(.1)(clf_output)
    out = Dense(1, activation='linear')(clf_output)

    model = Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(Adam(learning_rate=1e-5), loss='mean_squared_error',
                  metrics=[RootMeanSquaredError()])
    return model
Training the model with early stopping and a learning-rate scheduler
def scheduler(epoch, lr, warmup=5, decay_start=10):
if epoch <= warmup:
return lr / (warmup - epoch + 1)
elif warmup < epoch <= decay_start:
return lr
else:
return lr * tf.math.exp(-.1)
# Callbacks shared by all folds: the learning-rate schedule and early
# stopping with patience n_stop that restores the best weights.
ls = LearningRateScheduler(scheduler)
es = EarlyStopping(patience=n_stop, restore_best_weights=True)

cv = KFold(n_splits=n_fold, shuffle=True, random_state=seed)

# p collects out-of-fold predictions for the training set;
# p_tst accumulates the average of the per-fold test predictions.
p = np.zeros_like(y, dtype=float)
p_tst = np.zeros((X_tst[0].shape[0], ), dtype=float)
for i, (i_trn, i_val) in enumerate(cv.split(X[0]), 1):
    print(f'training CV #{i}:')
    tf.random.set_seed(seed + i)

    # A fresh pretrained BERT and head for every fold.
    bert_model = load_bert(bert_config)
    clf = build_model(bert_model, max_len=max_len)
    if i == 1:
        # summary() prints by itself; wrapping it in print() adds a stray "None".
        clf.summary()

    history = clf.fit([x[i_trn] for x in X], y[i_trn],
                      validation_data=([x[i_val] for x in X], y[i_val]),
                      epochs=n_est,
                      batch_size=batch_size,
                      # `es` was created but never passed, so early stopping
                      # never took effect.
                      callbacks=[ls, es])
    clf.save_weights(f'{model_name}_cv{i}.h5')

    p[i_val] = clf.predict([x[i_val] for x in X]).flatten()
    # Pass the inputs as a list, as in the other fit/predict calls.
    p_tst += clf.predict(list(X_tst)).flatten() / n_fold
training CV #1: loading the saved pretrained model
All model checkpoint layers were used when initializing TFBertModel. All the layers of TFBertModel were initialized from the model checkpoint at ../build/bert_v9/bert_base_uncased. If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. Model: "model_1" __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input_ids (InputLayer) [(None, 205)] 0 __________________________________________________________________________________________________ attention_mask (InputLayer) [(None, 205)] 0 __________________________________________________________________________________________________ token_type_ids (InputLayer) [(None, 205)] 0 __________________________________________________________________________________________________ tf_bert_model_1 (TFBertModel) TFBaseModelOutputWit 109482240 input_ids[0][0] attention_mask[0][0] token_type_ids[0][0] __________________________________________________________________________________________________ tf.__operators__.getitem_1 (Sli (None, 768) 0 tf_bert_model_1[0][13] __________________________________________________________________________________________________ dropout_75 (Dropout) (None, 768) 0 tf.__operators__.getitem_1[0][0] __________________________________________________________________________________________________ dense_1 (Dense) (None, 1) 769 dropout_75[0][0] ================================================================================================== Total params: 109,483,009 Trainable params: 109,483,009 Non-trainable params: 0 __________________________________________________________________________________________________ None Epoch 1/2 WARNING:tensorflow:The parameters 
`output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_1/bert/pooler/dense/kernel:0', 'tf_bert_model_1/bert/pooler/dense/bias:0'] when minimizing the loss. WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_1/bert/pooler/dense/kernel:0', 'tf_bert_model_1/bert/pooler/dense/bias:0'] when minimizing the loss. 284/284 [==============================] - ETA: 0s - loss: 1.0671WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. 284/284 [==============================] - 118s 366ms/step - loss: 1.0662 - val_loss: 0.4776 Epoch 2/2 284/284 [==============================] - 104s 367ms/step - loss: 0.5244 - val_loss: 0.4544 WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). 
WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. training CV #2: loading the saved pretrained model
All model checkpoint layers were used when initializing TFBertModel. All the layers of TFBertModel were initialized from the model checkpoint at ../build/bert_v9/bert_base_uncased. If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. Epoch 1/2 WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_2/bert/pooler/dense/kernel:0', 'tf_bert_model_2/bert/pooler/dense/bias:0'] when minimizing the loss. WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_2/bert/pooler/dense/kernel:0', 'tf_bert_model_2/bert/pooler/dense/bias:0'] when minimizing the loss. 284/284 [==============================] - ETA: 0s - loss: 1.0675WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. 
284/284 [==============================] - 117s 365ms/step - loss: 1.0667 - val_loss: 0.5301 Epoch 2/2 284/284 [==============================] - 101s 356ms/step - loss: 0.5712 - val_loss: 0.4714 WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. training CV #3: loading the saved pretrained model
All model checkpoint layers were used when initializing TFBertModel. All the layers of TFBertModel were initialized from the model checkpoint at ../build/bert_v9/bert_base_uncased. If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. Epoch 1/2 WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_3/bert/pooler/dense/kernel:0', 'tf_bert_model_3/bert/pooler/dense/bias:0'] when minimizing the loss. WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_3/bert/pooler/dense/kernel:0', 'tf_bert_model_3/bert/pooler/dense/bias:0'] when minimizing the loss. 284/284 [==============================] - ETA: 0s - loss: 0.9928WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. 
284/284 [==============================] - 118s 365ms/step - loss: 0.9922 - val_loss: 0.5096 Epoch 2/2 284/284 [==============================] - 102s 358ms/step - loss: 0.5822 - val_loss: 0.5252 WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. training CV #4: loading the saved pretrained model
All model checkpoint layers were used when initializing TFBertModel. All the layers of TFBertModel were initialized from the model checkpoint at ../build/bert_v9/bert_base_uncased. If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. Epoch 1/2 WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_4/bert/pooler/dense/kernel:0', 'tf_bert_model_4/bert/pooler/dense/bias:0'] when minimizing the loss. WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_4/bert/pooler/dense/kernel:0', 'tf_bert_model_4/bert/pooler/dense/bias:0'] when minimizing the loss. 284/284 [==============================] - ETA: 0s - loss: 1.0345WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. 
284/284 [==============================] - 120s 375ms/step - loss: 1.0337 - val_loss: 0.5380 Epoch 2/2 284/284 [==============================] - 102s 358ms/step - loss: 0.5264 - val_loss: 0.4960 WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. training CV #5: loading the saved pretrained model
All model checkpoint layers were used when initializing TFBertModel. All the layers of TFBertModel were initialized from the model checkpoint at ../build/bert_v9/bert_base_uncased. If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. Epoch 1/2 WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_5/bert/pooler/dense/kernel:0', 'tf_bert_model_5/bert/pooler/dense/bias:0'] when minimizing the loss. WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_5/bert/pooler/dense/kernel:0', 'tf_bert_model_5/bert/pooler/dense/bias:0'] when minimizing the loss. 284/284 [==============================] - ETA: 0s - loss: 1.0071WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`. 
284/284 [==============================] - 120s 371ms/step - loss: 1.0063 - val_loss: 0.5089 Epoch 2/2 284/284 [==============================] - 103s 363ms/step - loss: 0.4906 - val_loss: 0.5032 WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`). WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.
# Out-of-fold RMSE over the whole training set. The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error is
# deprecated and no longer accepted by recent scikit-learn releases.
print(f'CV RMSE: {np.sqrt(mean_squared_error(y, p)):.6f}')

# Save the out-of-fold predictions, one value per line.
np.savetxt(val_predict_file, p, fmt='%.6f')

# Fill the sample submission with the fold-averaged test predictions.
sub = pd.read_csv(sample_file, index_col=id_col)
sub[target_col] = p_tst
sub.to_csv(submission_file)
sub.head()