class AlbertConfig:
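    """ALBERT hyperparameters; the defaults match the xxlarge configuration."""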
def __init__(
self,
vocab_size,
embedding_size=128,
hidden_size=4096,
num_hidden_layers=12,
num_hidden_groups=1,
num_attention_heads=64,
intermediate_size=16384,
inner_group_num=1,
down_scale_factor=1,
hidden_act="gelu",
hidden_dropout_prob=0,
attention_probs_dropout_prob=0,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02):
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_hidden_groups = num_hidden_groups
self.num_attention_heads = num_attention_heads
self.inner_group_num = inner_group_num
self.down_scale_factor = down_scale_factor
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
config = AlbertConfig(vocab_size=32000,
hidden_size=512,
num_hidden_layers=8,
num_attention_heads=6,
intermediate_size=1024)
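# Why embedding_size (128) differs from hidden_size: ALBERT factorizes the
# embedding matrix, so the vocabulary projection costs V*E + E*H parameters
# instead of BERT's V*H. A quick count for the config above:
V, E, H = 32000, 128, 512
print(V * E + E * H)   # 4,161,536 parameters, factorized
print(V * H)           # 16,384,000 parameters, unfactorized -- roughly 4x more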
import collections
import copy
import json
import math
import re
import numpy as np
import six
from six.moves import range
import tensorflow.compat.v1 as tf
class AlbertModel:
def __init__(
self,
config,
is_training,
input_ids,
input_mask=None,
token_type_ids=None,
use_one_hot_embeddings=False,
use_einsum=True,
scope=None):
config = copy.deepcopy(config)
input_shape = get_shape_list(input_ids, expected_rank=2)
batch_size, seq_length = input_shape
with tf.variable_scope(scope, default_name="bert"):
with tf.variable_scope("embeddings"):
self.word_embedding_output, self.output_embedding_table = embedding_lookup(
input_ids=input_ids,
vocab_size=config.vocab_size,
embedding_size=config.embedding_size,
initializer_range=config.initializer_range,
word_embedding_name="word_embeddings",
use_one_hot_embeddings=use_one_hot_embeddings)
# Add positional embeddings and token type embeddings,
# then layer normalize and perform dropout.
self.embedding_output = embedding_postprocessor(
input_tensor=self.word_embedding_output,
use_token_type=True,
token_type_ids=token_type_ids,
token_type_vocab_size=config.type_vocab_size,
token_type_embedding_name="token_type_embeddings",
use_position_embeddings=True,
position_embedding_name="position_embeddings",
initializer_range=config.initializer_range,
max_position_embeddings=config.max_position_embeddings,
dropout_prob=config.hidden_dropout_prob,
use_one_hot_embeddings=use_one_hot_embeddings)
with tf.variable_scope("encoder"):
# Run the stacked transformer.
# `sequence_output` shape = [batch_size, seq_length, hidden_size].
self.all_encoder_layers = transformer_model(
input_tensor=self.embedding_output,
attention_mask=input_mask,
hidden_size=config.hidden_size,
num_hidden_layers=config.num_hidden_layers,
num_hidden_groups=config.num_hidden_groups,
num_attention_heads=config.num_attention_heads,
intermediate_size=config.intermediate_size,
inner_group_num=config.inner_group_num,
intermediate_act_fn=get_activation(config.hidden_act),
hidden_dropout_prob=config.hidden_dropout_prob,
attention_probs_dropout_prob=config.attention_probs_dropout_prob,
initializer_range=config.initializer_range,
do_return_all_layers=True,
use_einsum=use_einsum)
self.sequence_output = self.all_encoder_layers[-1]
# The "pooler" converts the encoded sequence tensor of shape
# [batch_size, seq_length, hidden_size] to a tensor of shape
# [batch_size, hidden_size]. This is necessary for segment-level
# (or segment-pair-level) classification tasks where we need a fixed
# dimensional representation of the segment.
with tf.variable_scope("pooler"):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token. We assume that this has been pre-trained
first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
self.pooled_output = tf.layers.dense(
first_token_tensor,
config.hidden_size,
activation=tf.tanh,
kernel_initializer=create_initializer(config.initializer_range))
# (batch_size, seq_length)
input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
token_type_ids = tf.constant([[0, 0, 1], [0, 0, 1]])
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
    # (Simplified: the one-hot matmul path taken when use_one_hot_embeddings=True
    # is omitted here; a sketch of it follows the demo below.)
# This function assumes that the input is of shape [batch_size, seq_length, num_inputs].
# If the input is a 2D tensor of shape [batch_size, seq_length], we
# reshape to [batch_size, seq_length, 1].
if input_ids.shape.ndims == 2:
# [batch_size, seq_length, 1]
input_ids = tf.expand_dims(input_ids, axis=[-1])
embedding_table = tf.get_variable(
name=word_embedding_name,
shape=[vocab_size, embedding_size],
        initializer=create_initializer(initializer_range))
output = tf.nn.embedding_lookup(embedding_table, input_ids)
print(output.shape)
input_shape = get_shape_list(input_ids)
output = tf.reshape(output, input_shape[0:-1] + [input_shape[-1] * embedding_size])
return (output, embedding_table)
input_ids.shape.ndims
2
tf.expand_dims(input_ids, axis=[-1]).shape
TensorShape([2, 3, 1])
input_shape = get_shape_list(input_ids)
word_embedding_output, output_embedding_table = embedding_lookup(input_ids, 100)
(2, 3, 1, 128)
output_embedding_table.shape
TensorShape([100, 128])
word_embedding_output.shape
TensorShape([2, 3, 128])
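# When use_one_hot_embeddings=True, the full embedding_lookup replaces the
# gather with a one-hot matmul (typically faster on TPUs). A minimal sketch:
flat_input_ids = tf.reshape(input_ids, [-1])                 # [6]
one_hot_ids = tf.one_hot(flat_input_ids, depth=100)          # [6, 100]
tf.reshape(tf.matmul(one_hot_ids, output_embedding_table), [2, 3, 128]).shape
TensorShape([2, 3, 128])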
def embedding_postprocessor(input_tensor,
                            token_type_ids,
                            use_token_type=True,
                            token_type_vocab_size=2,
                            use_position_embeddings=True,
                            max_position_embeddings=512,
                            dropout_prob=0.1,
                            initializer_range=0.02,
                            use_one_hot_embeddings=False,
                            token_type_embedding_name="token_type_embeddings",
                            position_embedding_name="position_embeddings"):
    # (Simplified: both embedding types are always applied and the one-hot
    # lookup path is omitted; the flags are kept for signature compatibility.)
input_shape = get_shape_list(input_tensor, expected_rank=3)
batch_size, seq_length, width = input_shape
output = input_tensor
token_type_table = tf.get_variable(
token_type_embedding_name,
shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range)
)
    # Look up the segment (token type) embeddings and add them to the output.
token_type_embeddings = tf.nn.embedding_lookup(token_type_table, token_type_ids)
print(token_type_embeddings.shape)
output += token_type_embeddings
full_position_embeddings = tf.get_variable(
name=position_embedding_name,
shape=[max_position_embeddings, width],
        initializer=create_initializer(initializer_range))
position_embeddings = tf.slice(full_position_embeddings, [0, 0], [seq_length, -1])
num_dims = len(output.shape.as_list())
position_broadcast_shape = []
for _ in range(num_dims - 2):
position_broadcast_shape.append(1)
position_broadcast_shape.extend([seq_length, width])
position_embeddings = tf.reshape(position_embeddings, position_broadcast_shape)
print(position_embeddings.shape)
output += position_embeddings
output = layer_norm_and_dropout(output, dropout_prob)
return output
full_position_embeddings = tf.get_variable(
name="position_embedding_name",
shape=[512, 128],
initializer=create_initializer(0.02))
full_position_embeddings.shape
TensorShape([512, 128])
position_broadcast_shape
[1, 3, 128]
position_embeddings = tf.slice(full_position_embeddings, [0, 0], [3, -1])
position_embeddings.shape
TensorShape([3, 128])
token_type_ids = tf.zeros(shape=[2, 3], dtype=tf.int32)
embedding_output = embedding_postprocessor(word_embedding_output, token_type_ids)
(2, 3, 128) (1, 3, 128)
embedding_output.shape
TensorShape([2, 3, 128])
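# The full embedding_postprocessor looks up token-type embeddings with a
# one-hot matmul, since the segment vocabulary is tiny (2) and one-hot is
# faster for small vocabularies. A sketch with a throwaway table:
demo_token_type_table = tf.get_variable(
    "demo_token_type_table", shape=[2, 128], initializer=create_initializer(0.02))
one_hot_ids = tf.one_hot(tf.reshape(token_type_ids, [-1]), depth=2)   # [6, 2]
tf.reshape(tf.matmul(one_hot_ids, demo_token_type_table), [2, 3, 128]).shape
TensorShape([2, 3, 128])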
def transformer_model(input_tensor,
attention_mask=None,
hidden_size=768,
num_hidden_layers=12,
num_hidden_groups=1,
num_attention_heads=12,
intermediate_size=3072,
inner_group_num=1,
intermediate_act_fn="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
initializer_range=0.02,
do_return_all_layers=False,
use_einsum=True):
attention_head_size = hidden_size // num_attention_heads
input_shape = get_shape_list(input_tensor, expected_rank=3)
input_width = input_shape[2]
all_layer_outputs = []
if input_width != hidden_size:
      # ALBERT's case (the first adjustment point): project the factorized
      # embedding (width 128) up to the hidden size.
prev_output = dense_layer_2d(
input_tensor, hidden_size, create_initializer(initializer_range),
None, use_einsum=use_einsum, name="embedding_hidden_mapping_in")
else:
      # The usual case (as in BERT): the embedding width already equals the hidden size.
prev_output = input_tensor
with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE):
for layer_idx in range(num_hidden_layers):
group_idx = int(layer_idx / num_hidden_layers * num_hidden_groups)
with tf.variable_scope("group_%d" % group_idx):
with tf.name_scope("layer_%d" % layer_idx):
layer_output = prev_output
for inner_group_idx in range(inner_group_num):
with tf.variable_scope("inner_group_%d" % inner_group_idx):
layer_output = attention_ffn_block(
layer_input=layer_output,
hidden_size=hidden_size,
attention_mask=attention_mask,
num_attention_heads=num_attention_heads,
attention_head_size=attention_head_size,
attention_probs_dropout_prob=attention_probs_dropout_prob,
intermediate_size=intermediate_size,
intermediate_act_fn=intermediate_act_fn,
initializer_range=initializer_range,
hidden_dropout_prob=hidden_dropout_prob,
use_einsum=use_einsum)
prev_output = layer_output
all_layer_outputs.append(layer_output)
if do_return_all_layers:
return all_layer_outputs
else:
return all_layer_outputs[-1]
embedding_output.shape
TensorShape([2, 3, 128])
prev_output = dense_layer_2d(
embedding_output, 768, create_initializer(0.02),
None, use_einsum=True, name="embedding_hidden_mapping_in")
prev_output.shape
TensorShape([2, 3, 768])
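# Cross-layer parameter sharing: each layer enters "group_%d" with
# reuse=tf.AUTO_REUSE, so all layers mapped to the same group share weights.
# With num_hidden_groups=1 every layer maps to group 0; with 4 groups, say:
[int(layer_idx / 12 * 4) for layer_idx in range(12)]
[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]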
def attention_ffn_block(layer_input,
attention_mask,
hidden_size=768,
num_attention_heads=12,
attention_head_size=64,
attention_probs_dropout_prob=0.1,
intermediate_size=3072,
intermediate_act_fn="gleu",
initializer_range=0.02,
hidden_dropout_prob=0.1,
use_einsum=True):
with tf.variable_scope("attention_1"):
with tf.variable_scope("self"):
attention_output = attention_layer(
from_tensor=layer_input,
to_tensor=layer_input,
attention_mask=attention_mask,
num_attention_heads=num_attention_heads,
attention_probs_dropout_prob=attention_probs_dropout_prob,
initializer_range=initializer_range,
use_einsum=use_einsum
)
# Run a linear projection of `hidden_size` then add a residual
# with `layer_input`.
with tf.variable_scope("output"):
attention_output = dense_layer_3d_proj(
attention_output,
hidden_size,
attention_head_size,
create_initializer(initializer_range),
None,
use_einsum=use_einsum,
name="dense"
)
attention_output = dropout(attention_output, hidden_dropout_prob)
attention_output = layer_norm(attention_output + layer_input)
with tf.variable_scope("ffn_1"):
with tf.variable_scope("intermediate"):
intermediate_output = dense_layer_2d(
attention_output,
intermediate_size,
create_initializer(initializer_range),
intermediate_act_fn,
use_einsum=use_einsum,
num_attention_heads=num_attention_heads,
name="dense")
with tf.variable_scope("output"):
ffn_output = dense_layer_2d(
intermediate_output,
hidden_size,
create_initializer(initializer_range),
None,
use_einsum=use_einsum,
num_attention_heads=num_attention_heads,
name="dense")
ffn_output = dropout(ffn_output, hidden_dropout_prob)
ffn_output = layer_norm(ffn_output + attention_output)
return ffn_output
get_shape_list(prev_output, expected_rank=[2, 3])
[2, 3, 768]
def attention_layer(from_tensor,
to_tensor,
attention_mask,
num_attention_heads=12,
query_act=None,
key_act=None,
value_act=None,
attention_probs_dropout_prob=0.1,
initializer_range=0.02,
batch_size=None,
from_seq_length=None,
to_seq_length=None,
use_einsum=True):
# (batch_size, seq_length, hidden_size)
from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
# 768/12 = 64
size_per_head = int(from_shape[2]/num_attention_heads)
batch_size = from_shape[0]
from_seq_length = from_shape[1]
to_seq_length = to_shape[1]
# `query_layer` = [B, F, N, H]
q = dense_layer_3d(from_tensor, num_attention_heads, size_per_head,
create_initializer(initializer_range), query_act, use_einsum, "query")
# `key_layer` = [B, T, N, H]
k = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
create_initializer(initializer_range), key_act, use_einsum, "key")
# `value_layer` = [B, T, N, H]
v = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
create_initializer(initializer_range), value_act, use_einsum, "value")
q = tf.transpose(q, [0, 2, 1, 3])
k = tf.transpose(k, [0, 2, 1, 3])
v = tf.transpose(v, [0, 2, 1, 3])
if attention_mask is not None:
attention_mask = tf.reshape(attention_mask, [batch_size, 1, to_seq_length, 1])
# 'new_embeddings = [B, N, F, H]'
new_embeddings = dot_product_attention(q, k, v, attention_mask, attention_probs_dropout_prob)
return tf.transpose(new_embeddings, [0, 2, 1, 3])
q = dense_layer_3d(prev_output, 12, 64, create_initializer(0.02), None, True, "query")
q.shape
TensorShape([2, 3, 12, 64])
from_shape=get_shape_list(q)
from_shape
[2, 3, 12, 64]
def dot_product_attention(q, k, v, mask, dropout_rate=0.1):
    # logits shape: (batch_size, num_heads, q_length, kv_length)
logits = tf.matmul(q, k, transpose_b=True) # [..., length_q, length_kv]
logits = tf.multiply(logits, 1.0 / math.sqrt(float(get_shape_list(q)[-1])))
if mask is not None:
        # `mask` arrives as [B, 1, T, 1] after the reshape in attention_layer
from_shape = get_shape_list(q)
broadcast_ones = tf.ones([from_shape[0], 1, from_shape[2], 1], tf.float32)
mask = tf.matmul(broadcast_ones,
tf.cast(mask, tf.float32), transpose_b=True)
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
adder = (1.0 - mask) * -10000.0
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
logits += adder
else:
adder = 0.0
attention_probs = tf.nn.softmax(logits, name="attention_probs")
attention_probs = dropout(attention_probs, dropout_rate)
return tf.matmul(attention_probs, v)
q = tf.transpose(q, [0, 2, 1, 3])
q.shape
TensorShape([2, 12, 3, 64])
# Inside dot_product_attention, from_shape is taken from the transposed q, so
# from_shape[2] is the query length F (=3), not the number of heads.
from_shape = get_shape_list(q)
broadcast_ones = tf.ones([from_shape[0], 1, from_shape[2], 1], tf.float32)
broadcast_ones.shape
TensorShape([2, 1, 3, 1])
attention_mask = tf.reshape(input_mask, [2, 1, 3, 1])
attention_mask.shape
TensorShape([2, 1, 3, 1])
bias = tf.matmul(broadcast_ones, tf.cast(attention_mask, tf.float32), transpose_b=True)
bias.shape
TensorShape([2, 1, 3, 3])
input_mask
<tf.Tensor: shape=(2, 3), dtype=int32, numpy= array([[1, 1, 1], [1, 1, 0]], dtype=int32)>
bias
<tf.Tensor: shape=(2, 1, 3, 3), dtype=float32, numpy= array([[[[1., 1., 1.], [1., 1., 1.], [1., 1., 1.]]], [[[1., 1., 0.], [1., 1., 0.], [1., 1., 0.]]]], dtype=float32)>
adder = (1.0 - bias) * -10000.0
adder
<tf.Tensor: shape=(2, 1, 3, 3), dtype=float32, numpy= array([[[[ -0., -0., -0.], [ -0., -0., -0.], [ -0., -0., -0.]]], [[[ -0., -0., -10000.], [ -0., -0., -10000.], [ -0., -0., -10000.]]]], dtype=float32)>
logits = tf.matmul(q, q, transpose_b=True) # [..., length_q, length_kv]
logits = tf.multiply(logits, 1.0 / math.sqrt(float(get_shape_list(q)[-1])))
logits.shape
TensorShape([2, 12, 3, 3])
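# Completing the demo (a sketch): build a value tensor the same way as q, apply
# the additive mask, then softmax and weight the values. The (2, 1, 3, 3) adder
# broadcasts across all 12 heads, and the padded position of the second example
# ends up with ~0 attention probability.
v = dense_layer_3d(prev_output, 12, 64, create_initializer(0.02), None, True, "value")
v = tf.transpose(v, [0, 2, 1, 3])   # [2, 12, 3, 64]
logits += adder
attention_probs = tf.nn.softmax(logits, axis=-1)
context = tf.matmul(attention_probs, v)
context.shape
TensorShape([2, 12, 3, 64])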
# With do_return_all_layers left at its default (False), only the final layer is returned.
final_layer_output = transformer_model(embedding_output, input_mask)
final_layer_output.shape
TensorShape([2, 3, 768])
first_token_tensor = tf.squeeze(final_layer_output[:, 0:1, :], axis=1)
first_token_tensor.shape
TensorShape([2, 768])
pool = tf.keras.layers.Dense(768, activation=tf.tanh, kernel_initializer=create_initializer(0.02))
pooled_output = pool(first_token_tensor)
pooled_output.shape
TensorShape([2, 768])
output_weights = tf.get_variable(
"output_weights",
shape=[2, 768],
initializer=create_initializer(0.02))
output_bias = tf.get_variable(
"output_bias", shape=[2], initializer=tf.zeros_initializer())
output_weights
<tf.Variable 'output_weights:0' shape=(2, 768) dtype=float32, numpy= array([[ 0.00258045, 0.00957985, -0.00063457, ..., -0.00678589, 0.01690069, 0.02294636], [ 0.00786927, 0.0039197 , -0.00218187, ..., -0.026448 , -0.03057633, -0.01545808]], dtype=float32)>
output_bias
<tf.Variable 'output_bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>
logits = tf.matmul(pooled_output, output_weights, transpose_b=True)
logits.shape
TensorShape([2, 2])
logits = tf.nn.bias_add(logits, output_bias)
logits.shape
TensorShape([2, 2])
logits
<tf.Tensor: shape=(2, 2), dtype=float32, numpy= array([[ 0.28498146, -0.3474557 ], [ 0.38374817, -0.06567392]], dtype=float32)>
log_probs = tf.nn.log_softmax(logits, axis=-1)
log_probs.shape
TensorShape([2, 2])
log_probs
<tf.Tensor: shape=(2, 2), dtype=float32, numpy= array([[-0.426114 , -1.0585512 ], [-0.49347395, -0.942896 ]], dtype=float32)>
tf.nn.softmax(logits, axis=-1)
<tf.Tensor: shape=(2, 2), dtype=float32, numpy= array([[0.6530419 , 0.34695813], [0.6105018 , 0.38949817]], dtype=float32)>
labels = tf.constant([[0], [1]])
labels.shape
TensorShape([2, 1])
labels = tf.reshape(labels, [-1])
labels.shape
TensorShape([2])
one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
one_hot_labels.shape
TensorShape([2, 2])
one_hot_labels
<tf.Tensor: shape=(2, 2), dtype=float32, numpy= array([[1., 0.], [0., 1.]], dtype=float32)>
per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
loss = tf.reduce_mean(per_example_loss)
per_example_loss.shape
TensorShape([2])
per_example_loss
<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.426114, 0.942896], dtype=float32)>
loss
<tf.Tensor: shape=(), dtype=float32, numpy=0.684505>
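# Cross-check (sketch): the one-hot construction above matches the built-in
# sparse cross entropy on the raw logits.
tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.426114, 0.942896], dtype=float32)>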
def layer_norm(input_tensor, name=None):
    return tf.keras.layers.LayerNormalization(
        name=name, axis=-1, epsilon=1e-12, dtype=tf.float32)(input_tensor)
def dropout(input_tensor, dropout_prob):
if dropout_prob is None or dropout_prob == 0.0:
return input_tensor
output = tf.nn.dropout(input_tensor, rate=dropout_prob)
return output
def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
output_tensor = layer_norm(input_tensor, name)
output_tensor = dropout(output_tensor, dropout_prob)
return output_tensor
def get_shape_list(tensor, expected_rank=None, name=None):
    # Simplified: assumes every dimension is statically known; expected_rank is
    # kept only for signature compatibility and is not checked here.
    return tensor.shape.as_list()
def create_initializer(initializer_range=0.02):
return tf.truncated_normal_initializer(stddev=initializer_range)
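# The simplified get_shape_list above assumes fully static shapes. A sketch of
# a fuller helper (close to the original) that mixes static and dynamic dims:
def get_shape_list_dynamic(tensor, expected_rank=None, name=None):
    shape = tensor.shape.as_list()
    non_static_indexes = [i for (i, dim) in enumerate(shape) if dim is None]
    if not non_static_indexes:
        return shape
    dyn_shape = tf.shape(tensor)
    for index in non_static_indexes:
        shape[index] = dyn_shape[index]
    return shape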
def dense_layer_3d_proj(input_tensor,
hidden_size,
head_size,
initializer,
activation,
use_einsum,
name=None):
input_shape = get_shape_list(input_tensor)
num_attention_heads = input_shape[2]
with tf.variable_scope(name):
w = tf.get_variable(
name="kernel", shape=[num_attention_heads * head_size, hidden_size], initializer=initializer)
w = tf.reshape(w, [num_attention_heads, head_size, hidden_size])
b = tf.get_variable(
name="bias", shape=[hidden_size], initializer=tf.zeros_initializer)
if use_einsum:
ret = tf.einsum("BFND,NDH->BFH", input_tensor, w)
else:
ret = einsum_via_matmul(input_tensor, w, 2)
ret += b
if activation is not None:
return activation(ret)
else:
return ret
def dense_layer_3d(input_tensor,
num_attention_heads,
head_size,
initializer,
activation,
use_einsum,
name=None):
input_shape = get_shape_list(input_tensor)
hidden_size = input_shape[2]
with tf.variable_scope(name):
w = tf.get_variable(
name="kernel", shape=[hidden_size, num_attention_heads * head_size], initializer=initializer)
w = tf.reshape(
w, [hidden_size, num_attention_heads, head_size])
b = tf.get_variable(
name="bias", shape=[num_attention_heads * head_size], initializer=tf.zeros_initializer)
b = tf.reshape(b, [num_attention_heads, head_size])
if use_einsum:
ret = tf.einsum("BFH,HND->BFND", input_tensor, w)
else:
ret = einsum_via_matmul(input_tensor, w, 1)
ret += b
if activation is not None:
return get_activation(activation)(ret)
else:
return ret
def dense_layer_2d(input_tensor,
output_size,
initializer,
activation,
use_einsum,
num_attention_heads=1,
name=None):
input_shape = get_shape_list(input_tensor)
hidden_size = input_shape[2]
with tf.variable_scope(name):
w = tf.get_variable(name="kernel", shape=[hidden_size, output_size], initializer=initializer)
b = tf.get_variable(name="bias", shape=[output_size], initializer=tf.zeros_initializer)
if use_einsum:
ret = tf.einsum("BFH,HO->BFO", input_tensor, w)
else:
ret = tf.matmul(input_tensor, w)
ret += b
if activation is not None:
return get_activation(activation)(ret)
else:
return ret
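# dense_layer_3d_proj and dense_layer_3d fall back to einsum_via_matmul when
# use_einsum=False, but it was never defined above. A reshape+matmul sketch
# that matches the two einsums used here ("BFH,HND->BFND" and "BFND,NDH->BFH"):
def einsum_via_matmul(input_tensor, w, num_inner_dims):
    input_shape = get_shape_list(input_tensor)
    w_shape = get_shape_list(w)
    batch_dims = input_shape[:-num_inner_dims]   # e.g. [B, F]
    inner_dims = input_shape[-num_inner_dims:]   # contracted with w's leading dims
    outer_dims = w_shape[num_inner_dims:]
    inner_dim = int(np.prod(inner_dims))
    outer_dim = int(np.prod(outer_dims))
    if num_inner_dims > 1:
        input_tensor = tf.reshape(input_tensor, batch_dims + [inner_dim])
    if len(w_shape) > 2:
        w = tf.reshape(w, [inner_dim, outer_dim])
    ret = tf.matmul(input_tensor, w)             # contract the inner dims
    if len(outer_dims) > 1:
        ret = tf.reshape(ret, batch_dims + outer_dims)
    return ret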
def gelu(x):
"""Gaussian Error Linear Unit.
This is a smoother version of the RELU.
Original paper: https://arxiv.org/abs/1606.08415
Args:
x: float Tensor to perform activation.
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
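# Sanity check (sketch): the tanh formula above is an approximation of the
# exact GELU x * Phi(x); the two agree closely (max gap well under 1e-2 here).
x = tf.constant([-2.0, -1.0, 0.0, 1.0, 2.0])
exact = x * 0.5 * (1.0 + tf.math.erf(x / tf.sqrt(2.0)))
tf.reduce_max(tf.abs(gelu(x) - exact))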
def get_activation(activation_string):
if not isinstance(activation_string, six.string_types):
return activation_string
if not activation_string:
return None
act = activation_string.lower()
if act == "linear":
return None
elif act == "relu":
return tf.nn.relu
elif act == "gelu":
return gelu
elif act == "tanh":
return tf.tanh
x = tf.constant([[[1, 2, 3], [4, 5, 6]]])
x.shape
TensorShape([1, 2, 3])
w = tf.constant([[1], [2], [3]])
w.shape
TensorShape([3, 1])
tf.matmul(x, w)
<tf.Tensor: shape=(1, 2, 1), dtype=int32, numpy= array([[[14], [32]]], dtype=int32)>
tf.einsum("BFH,HO->BFO", x, w)
<tf.Tensor: shape=(1, 2, 1), dtype=int32, numpy= array([[[14], [32]]], dtype=int32)>
tf.tensordot(x, w, 1)
<tf.Tensor: shape=(1, 2, 1), dtype=int32, numpy= array([[[14], [32]]], dtype=int32)>
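# Next, the n-gram masking used when building pretraining data: ALBERT samples
# an n-gram length in {1, 2, 3} with probability proportional to 1/n.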
pvals = 1. / np.arange(1, 3 + 1)
pvals /= pvals.sum(keepdims=True)
pvals
array([0.54545455, 0.27272727, 0.18181818])
i = 2
p=pvals[:i] /pvals[:i].sum(keepdims=True)
p
array([0.66666667, 0.33333333])
ngrams = np.arange(1, 3 + 1, dtype=np.int64)
ngrams
array([1, 2, 3])
n = np.random.choice(ngrams[:i], p=pvals[:i] /pvals[:i].sum(keepdims=True))
n
1
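# Sampling many times confirms the 1/n weighting: with all three lengths
# available, the frequencies approach pvals = [0.545..., 0.272..., 0.181...].
samples = np.random.choice(ngrams, size=10000, p=pvals)
np.bincount(samples)[1:] / 10000.0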
import random
rng = random.Random(12345)
cd ~/Documents/Study/DL-Models/albert/
/Users/HaoShaochun/Documents/Study/DL-Models/albert
import tokenization
tokenizer = tokenization.FullTokenizer(
vocab_file="vocab.txt", do_lower_case=True,
spm_model_file=None)
tokens_a = "Text should be one-sentence-per-line, with empty lines between documents.".split()
tokens_b = "This sample text is public domain and was randomly selected from Project Guttenberg.".split()
tokens = []
tokens.append("[CLS]")
for token in tokens_a:
tokens.append(token)
tokens.append("[SEP]")
for token in tokens_b:
tokens.append(token)
tokens.append("[SEP]")
MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
["index", "label"])
(output_tokens,
masked_lm_positions,
masked_lm_labels) = create_masked_lm_predictions(tokens)
num_to_predict:  4
[[[9]], [[9], [11]], [[9], [11], [12]]]
[[[20]], [[20], [21]], [[20], [21], [22]]]
[[[3]], [[3], [4]], [[3], [4], [5]]]
masked_lm_positions
[9, 11, 12, 20]
masked_lm_labels
['documents.', 'This', 'sample', 'selected']
print(" ".join(tokens))
[CLS] Text should be one-sentence-per-line, with empty lines between documents. [SEP] This sample text is public domain and was randomly selected from Project Guttenberg. [SEP]
print(" ".join(output_tokens))
[CLS] Text should be one-sentence-per-line, with empty lines between [MASK] [SEP] [MASK] [MASK] text is public domain and was randomly 屿 from Project Guttenberg. [SEP]
def create_masked_lm_predictions(
tokens,
masked_lm_prob=0.15,
max_predictions_per_seq=20,
vocab_words=list(tokenizer.vocab.keys()),
rng=rng):
cand_indexes = []
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
continue
cand_indexes.append([i])
output_tokens = list(tokens)
masked_lm_positions = []
masked_lm_labels = []
num_to_predict = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
print("num_to_predict: ", num_to_predict)
ngrams = np.arange(1, 3 + 1, dtype=np.int64)
pvals = 1. / np.arange(1, 3 + 1)
pvals /= pvals.sum(keepdims=True)
ngram_indexes = []
for idx in range(len(cand_indexes)):
ngram_index = []
for n in ngrams:
ngram_index.append(cand_indexes[idx:idx+n])
ngram_indexes.append(ngram_index)
rng.shuffle(ngram_indexes)
masked_lms = []
covered_indexes = set()
for cand_index_set in ngram_indexes:
print(cand_index_set)
if len(masked_lms) >= num_to_predict:
break
n = np.random.choice(
ngrams[:len(cand_index_set)],
p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True))
# [16, 17] = sum([[16], [17]], [])
index_set = sum(cand_index_set[n - 1], [])
for index in index_set:
masked_token = None
if rng.random() < 0.8:
masked_token = "[MASK]"
else:
if rng.random() < 0.5:
masked_token = tokens[index]
else:
masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
output_tokens[index] = masked_token
masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
    rng.shuffle(ngram_indexes)  # vestigial here; in the original source this precedes an optional permutation pass
masked_lms = sorted(masked_lms, key=lambda x: x.index)
for p in masked_lms:
masked_lm_positions.append(p.index)
masked_lm_labels.append(p.label)
return (output_tokens, masked_lm_positions, masked_lm_labels)
def create_masked_lm_predictions(
tokens,
masked_lm_prob=0.15,
max_predictions_per_seq=20,
vocab_words=list(tokenizer.vocab.keys()),
rng=rng):
cand_indexes = []
token_boundary = [0] * len(tokens)
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
token_boundary[i] = 1
continue
cand_indexes.append([i])
token_boundary[i] = 1
output_tokens = list(tokens)
masked_lm_positions = []
masked_lm_labels = []
num_to_predict = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
print("num_to_predict: ", num_to_predict)
ngrams = np.arange(1, 3 + 1, dtype=np.int64)
pvals = 1. / np.arange(1, 3 + 1)
pvals /= pvals.sum(keepdims=True)
ngram_indexes = []
for idx in range(len(cand_indexes)):
ngram_index = []
for n in ngrams:
ngram_index.append(cand_indexes[idx:idx+n])
ngram_indexes.append(ngram_index)
rng.shuffle(ngram_indexes)
masked_lms = []
covered_indexes = set()
for cand_index_set in ngram_indexes:
print("cand_index_set", cand_index_set)
if len(masked_lms) >= num_to_predict:
break
if not cand_index_set:
continue
for index_set in cand_index_set[0]:
print(index_set)
print("H?")
for index in index_set:
print(index, covered_indexes)
if index in covered_indexes:
continue
n = np.random.choice(
ngrams[:len(cand_index_set)],
p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True))
# [16, 17] = sum([[16], [17]], [])
index_set = sum(cand_index_set[n - 1], [])
n -= 1
while len(masked_lms) + len(index_set) > num_to_predict:
print("I?")
if n == 0:
break
index_set = sum(cand_index_set[n - 1], [])
n -= 1
if len(masked_lms) + len(index_set) > num_to_predict:
continue
is_any_index_covered = False
for index in index_set:
print("J?")
if index in covered_indexes:
is_any_index_covered = True
break
if is_any_index_covered:
continue
for index in index_set:
covered_indexes.add(index)
masked_token = None
if rng.random() < 0.8:
masked_token = "[MASK]"
else:
if rng.random() < 0.5:
masked_token = tokens[index]
else:
masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
output_tokens[index] = masked_token
masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
    rng.shuffle(ngram_indexes)  # vestigial here; in the original source this precedes an optional permutation pass
masked_lms = sorted(masked_lms, key=lambda x: x.index)
for p in masked_lms:
masked_lm_positions.append(p.index)
masked_lm_labels.append(p.label)
return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary)
(output_tokens,
masked_lm_positions,
masked_lm_labels,
token_boundary) = create_masked_lm_predictions(tokens)
num_to_predict:  4
cand_index_set [[[12]], [[12], [13]], [[12], [13], [14]]]
[12]
H?
12 set()
J?
J?
cand_index_set [[[3]], [[3], [4]], [[3], [4], [5]]]
[3]
H?
3 {12, 13}
J?
cand_index_set [[[22]], [[22], [23]], [[22], [23]]]
[22]
H?
22 {3, 12, 13}
J?
cand_index_set [[[15]], [[15], [16]], [[15], [16], [17]]]
masked_lm_positions
[5, 21, 22, 23]
masked_lm_labels
['with', 'from', 'Project', 'Guttenberg.']
output_tokens
['[CLS]', 'Text', 'should', 'be', 'one-sentence-per-line,', '##聰', 'empty', 'lines', 'between', 'documents.', '[SEP]', 'This', 'sample', 'text', 'is', 'public', 'domain', 'and', 'was', 'randomly', 'selected', '[MASK]', '[MASK]', 'Guttenberg.', '[SEP]']
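# Consistency check: the labels are the original tokens at the masked positions.
# Note the last position keeps its original token: 10% of selected positions are
# left unchanged (and 10% get a random vocab token, like '##聰' above).
[tokens[i] for i in masked_lm_positions]
['with', 'from', 'Project', 'Guttenberg.']
[output_tokens[i] for i in masked_lm_positions]
['##聰', '[MASK]', '[MASK]', 'Guttenberg.']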