from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_matching/ant/main')
!pip install transformers
from transformers import BertTokenizer, TFBertModel
from sklearn.metrics import classification_report
import json
import time
import logging
import pprint
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
print("TensorFlow Version", tf.__version__)
print('GPU Enabled:', tf.test.is_gpu_available())
params = {
    'pretrain_path': 'bert-base-chinese',
    'train_path': '../data/train.json',
    'test_path': '../data/dev.json',
    'batch_size': 32,
    'max_len': 128,
    'buffer_size': 34334,  # shuffle buffer size; also used to derive the cyclical LR step size below
    'init_lr': 1e-5,
    'max_lr': 4e-5,
    'n_epochs': 12,
    'num_patience': 7,
}
# BERT tokenizer for bert-base-chinese; [CLS]/[SEP] are added manually in the
# data generator below, so only id conversion is needed here
tokenizer = BertTokenizer.from_pretrained(params['pretrain_path'],
                                          do_lower_case=True)
# stream data from text files
def data_generator(f_path, params):
    with open(f_path) as f:
        print('Reading', f_path)
        for line in f:
            line = json.loads(line.rstrip())
            text1, text2, label = line['sentence1'], line['sentence2'], line['label']
            # truncate both sentences so that [CLS] s1 [SEP] s2 [SEP] fits into max_len
            if len(text1) + len(text2) + 3 > params['max_len']:
                _max_len = (params['max_len'] - 3) // 2
                text1 = text1[:_max_len]
                text2 = text2[:_max_len]
            text1 = list(text1)
            text2 = list(text2)
            text = ['[CLS]'] + text1 + ['[SEP]'] + text2 + ['[SEP]']
            seg = [0] + [0] * len(text1) + [0] + [1] * len(text2) + [1]
            text = tokenizer.convert_tokens_to_ids(text)
            yield (text, seg), int(label)
def dataset(is_training, params):
    _shapes = (([None], [None]), ())
    _types = ((tf.int32, tf.int32), tf.int32)
    _pads = ((0, 0), -1)
    if is_training:
        ds = tf.data.Dataset.from_generator(
            lambda: data_generator(params['train_path'], params),
            output_shapes = _shapes,
            output_types = _types,)
        ds = ds.shuffle(params['buffer_size'])
        ds = ds.padded_batch(params['batch_size'], _shapes, _pads)
        ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    else:
        ds = tf.data.Dataset.from_generator(
            lambda: data_generator(params['test_path'], params),
            output_shapes = _shapes,
            output_types = _types,)
        ds = ds.padded_batch(params['batch_size'], _shapes, _pads)
        ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds
# input stream ids check
(text, seg), _ = next(data_generator(params['train_path'], params))
print(text)
print(seg)
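# Optional sanity check (not in the original notebook): map the ids back to
# tokens to confirm the [CLS]/[SEP] layout and that ids and segment ids line up.
# Uses only objects defined above.
print(tokenizer.convert_ids_to_tokens(text))
assert len(text) == len(seg)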
class BertFinetune(tf.keras.Model):
    def __init__(self, params):
        super(BertFinetune, self).__init__()
        self.bert = TFBertModel.from_pretrained(params['pretrain_path'],
                                                trainable = True)
        # warm-start from the further-pretrained checkpoint
        self.bert.load_weights('../model/bert_further_pretrain.h5',
                               by_name = True,
                               skip_mismatch = True)
        self.drop_1 = tf.keras.layers.Dropout(.1)
        self.fc = tf.keras.layers.Dense(300, tf.nn.swish, name='down_stream/fc')
        self.drop_2 = tf.keras.layers.Dropout(.1)
        self.out = tf.keras.layers.Dense(1, name='down_stream/out')

    def call(self, bert_inputs, training):
        # bert_inputs: [input_ids, attention_mask, token_type_ids]
        bert_inputs = [tf.cast(inp, tf.int32) for inp in bert_inputs]
        x = self.bert(bert_inputs, training=training)
        x = x[1]  # pooled [CLS] representation
        x = self.drop_1(x, training=training)
        x = self.fc(x)
        x = self.drop_2(x, training=training)
        x = self.out(x)
        x = tf.squeeze(x, 1)
        return x
model = BertFinetune(params)
model.build([[None, None], [None, None], [None, None]])
print(model.weights[5])  # spot-check a weight to verify the checkpoint loaded
step_size = 2 * params['buffer_size'] // params['batch_size']
decay_lr = tfa.optimizers.Triangular2CyclicalLearningRate(
    initial_learning_rate = params['init_lr'],
    maximal_learning_rate = params['max_lr'],
    step_size = step_size,)
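# Optional preview (not in the original notebook): evaluate the cyclical schedule
# at a few steps to confirm it rises from init_lr towards max_lr and that the
# amplitude halves each cycle (the Triangular2 policy).
for _step in [0, step_size // 2, step_size, 3 * step_size // 2, 2 * step_size]:
    print('step', _step, '-> lr', float(decay_lr(_step)))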
optim = tf.optimizers.Adam(params['init_lr'])
global_step = 0
best_acc = .0
count = 0
t0 = time.time()
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)
for _ in range(params['n_epochs']):
    # TRAINING
    for ((text, seg), labels) in dataset(is_training=True, params=params):
        with tf.GradientTape() as tape:
            # inputs: [input_ids, attention_mask (1 where id != 0), token_type_ids]
            logits = model([text, tf.sign(text), seg], training=True)
            labels = tf.cast(labels, tf.float32)
            # re-weight positives per batch to counter class imbalance
            num_neg = tf.reduce_sum(tf.cast(tf.equal(labels, 0.), tf.float32)).numpy()
            num_pos = tf.reduce_sum(labels).numpy()
            if num_pos == 0.:
                pos_weight = 1.
            else:
                pos_weight = num_neg / num_pos
            loss = tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(
                labels = labels,
                logits = logits,
                pos_weight = pos_weight))

        optim.lr.assign(decay_lr(global_step))
        grads = tape.gradient(loss, model.trainable_variables)
        grads, _ = tf.clip_by_global_norm(grads, 5.)
        optim.apply_gradients(zip(grads, model.trainable_variables))

        if global_step % 100 == 0:
            logger.info("Step {} | Loss: {:.4f} | Spent: {:.1f} secs | LR: {:.6f}".format(
                global_step, loss.numpy().item(), time.time()-t0, optim.lr.numpy().item()))
            t0 = time.time()
        global_step += 1
    # EVALUATION
    m = tf.keras.metrics.Accuracy()
    intent_true = []
    intent_pred = []
    for ((text, seg), labels) in dataset(is_training=False, params=params):
        logits = tf.sigmoid(model([text, tf.sign(text), seg], training=False))
        y_pred = tf.cast(tf.math.greater_equal(logits, .5), tf.int32)
        m.update_state(y_true=labels, y_pred=y_pred)
        intent_true += labels.numpy().flatten().tolist()
        intent_pred += y_pred.numpy().flatten().tolist()

    acc = m.result().numpy()
    logger.info("Evaluation: Testing Accuracy: {:.3f}".format(acc))
    logger.info('\n'+classification_report(y_true = intent_true,
                                           y_pred = intent_pred,
                                           labels = [0, 1],
                                           target_names = ['Not Matched', 'Matched'],
                                           digits = 3))

    if acc > best_acc:
        best_acc = acc
        # you can save the model here
        count = 0
    else:
        count += 1
    logger.info("Best Accuracy: {:.3f}".format(best_acc))

    if count == params['num_patience']:
        print(params['num_patience'], "evaluations without improvement over the best result, stopping training")
        break
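# Usage sketch (not part of the original notebook): score one sentence pair with
# the finetuned model, mirroring the character-level preprocessing in
# data_generator (max_len truncation omitted for brevity). The example sentences
# in the commented call are placeholders.
def predict_pair(s1, s2):
    tokens = ['[CLS]'] + list(s1) + ['[SEP]'] + list(s2) + ['[SEP]']
    seg = [0] * (len(s1) + 2) + [1] * (len(s2) + 1)
    ids = tokenizer.convert_tokens_to_ids(tokens)
    ids = tf.constant([ids], tf.int32)
    seg = tf.constant([seg], tf.int32)
    prob = tf.sigmoid(model([ids, tf.sign(ids), seg], training=False))
    return float(prob[0])

# print(predict_pair('placeholder sentence 1', 'placeholder sentence 2'))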