In [0]:
"""
We use following lines because we are running on Google Colab
If you are running notebook on a local computer, you don't need this cell
"""
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow1/free_chat/chinese_gaoq1/main')
In [0]:
%tensorflow_version 1.x
!pip install texar
In [0]:
import tensorflow as tf
import texar.tf as tx

import numpy as np
import pprint
import logging
import copy

from pathlib import Path
from texar.tf.modules import TransformerEncoder, TransformerDecoder

print("TensorFlow Version", tf.__version__)
print('GPU Enabled:', tf.test.is_gpu_available())
TensorFlow Version 1.15.0
GPU Enabled: True
In [0]:
# stream data from text files
def data_generator(f_paths, params):
  char2idx = params['char2idx']
  for f_path in f_paths:
    with open(f_path) as f:
      print('Reading', f_path)
      for line in f:
        line = line.rstrip()
        sp = line.split('|')
        if len(sp) == 2:
          source, target = sp
        else:
          continue
        source = [char2idx.get(c, len(char2idx)) for c in list(source)]
        target = [char2idx.get(c, len(char2idx)) for c in list(target)]
        if len(source) > params['max_len']:
          source = source[:params['max_len']]
        if len(target) > params['max_len']:
          target = target[:params['max_len']]
        target_in = [1] + target
        target_out = target + [2]
        if len(source) > 0:
          yield (source, (target_in, target_out))
In [0]:
def dataset(is_training, params):
  _shapes = ([None], ([None], [None]))
  _types = (tf.int32, (tf.int32, tf.int32))
  _pads = (0, (0, 0))
  
  if is_training:
    ds = tf.data.Dataset.from_generator(
      lambda: data_generator(params['train_paths'], params),
      output_shapes = _shapes,
      output_types = _types,)
    ds = ds.shuffle(params['buffer_size'])
    ds = ds.padded_batch(params['batch_size'], _shapes, _pads)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
  else:
    ds = tf.data.Dataset.from_generator(
      lambda: data_generator(params['test_paths'], params),
      output_shapes = _shapes,
      output_types = _types,)
    ds = ds.padded_batch(params['batch_size'], _shapes, _pads)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
  
  return ds
In [0]:
def clip_grads(loss):
    variables = tf.trainable_variables()
    pprint.pprint(variables)
    grads = tf.gradients(loss, variables)
    clipped_grads, _ = tf.clip_by_global_norm(grads, params['clip_norm'])
    return zip(clipped_grads, variables)
In [0]:
def forward(words, labels, mode):
    words_len = tf.count_nonzero(words, 1, dtype=tf.int32)
    
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    batch_sz = tf.shape(words)[0]
    
  
    with tf.variable_scope('Embedding'):
        embedding = tf.Variable(np.load('../vocab/char.npy'),
                                dtype=tf.float32,
                                name='fasttext_vectors')
        embedding = tf.concat([tf.zeros(shape=[1, params['embed_dim']]), embedding[1:, :]], axis=0)
        x = tf.nn.embedding_lookup(embedding, words)
        pos_embedder = tx.modules.SinusoidsPositionEmbedder(
            position_size = params['max_len'] + 1,
            hparams = config_model.position_embedder_hparams)
        x = (x * config_model.hidden_dim ** 0.5) + pos_embedder(sequence_length=words_len)


    with tf.variable_scope('Encoder'):
        encoder = TransformerEncoder(hparams=config_model.encoder)
        enc_out = encoder(inputs=x, sequence_length=words_len)
    
    
    with tf.variable_scope('Decoder'):
        decoder = TransformerDecoder(vocab_size=len(params['char2idx'])+1,
                                     output_layer=tf.transpose(embedding, (1, 0)),
                                     hparams=config_model.decoder)
        
        if is_training or (mode == tf.estimator.ModeKeys.EVAL):
            dec_inputs, dec_outputs = labels
            dec_seq_len = tf.count_nonzero(dec_inputs, 1, dtype=tf.int32)
            dec_inputs = tf.nn.embedding_lookup(embedding, dec_inputs)
            dec_inputs = (dec_inputs * config_model.hidden_dim ** 0.5) + pos_embedder(sequence_length=dec_seq_len)

            outputs = decoder(
                memory=enc_out,
                memory_sequence_length=words_len,
                inputs=dec_inputs,
                decoding_strategy='train_greedy',
                mode=tf.estimator.ModeKeys.TRAIN
            )
            
            return outputs.logits
        else:
            start_tokens = tf.fill([batch_sz], 1)

            def _embedding_fn(x, y):
                x_w_embed = tf.nn.embedding_lookup(embedding, x)
                y_p_embed = pos_embedder(y)
                return x_w_embed * config_model.hidden_dim ** 0.5 + y_p_embed

            predictions = decoder(
                memory=enc_out,
                memory_sequence_length=words_len,
                beam_width=params['beam_width'],
                length_penalty=params['length_penalty'],
                start_tokens=start_tokens,
                end_token=2,
                embedding=_embedding_fn,
                max_decoding_length=params['max_len'],
                mode=tf.estimator.ModeKeys.PREDICT)
            
            return predictions['sample_id'][:, :, :params['top_k']]
In [0]:
def model_fn(features, labels, mode, params):
    logits_or_ids = forward(features, labels, mode)
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=logits_or_ids)
    
    dec_inputs, dec_outputs = labels
    if (params['label_smoothing'] <= .0) or (mode == tf.estimator.ModeKeys.EVAL):
      loss_op = tf.contrib.seq2seq.sequence_loss(logits = logits_or_ids,
                                                 targets = dec_outputs,
                                                 weights = tf.to_float(tf.sign(dec_outputs)))
    else:
      loss_op = tf.losses.softmax_cross_entropy(onehot_labels = tf.one_hot(dec_outputs, len(params['char2idx'])+1),
                                                logits = logits_or_ids,
                                                weights = tf.to_float(tf.sign(dec_outputs)),
                                                label_smoothing = params['label_smoothing'],)
      
    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step=tf.train.get_or_create_global_step()
        
        decay_lr = tf.train.exponential_decay(
            params['lr'], global_step, 1000, .99)
        
        train_op = tf.train.AdamOptimizer(decay_lr).apply_gradients(
            clip_grads(loss_op), global_step=global_step)
        
        hook = tf.train.LoggingTensorHook({'lr': decay_lr}, every_n_iter=100)
        
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss_op, train_op=train_op, training_hooks=[hook],)
      
    if mode == tf.estimator.ModeKeys.EVAL:
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss_op)
In [0]:
def get_vocab(f_path):
  word2idx = {}
  with open(f_path) as f:
    for i, line in enumerate(f):
      line = line.rstrip('\n')
      word2idx[line] = i
  return word2idx
In [0]:
def pad(test_strs):
  max_len = max([len(test_str) for test_str in test_strs])
  for test_str in test_strs:
    if len(test_str) < max_len:
      test_str += ['<pad>'] * (max_len - len(test_str))


def minimal_test(estimator):
  test_strs = [
    '你好',
    '早上好',
    '晚上好',
    '再见',
    '好久不见',
    '想死你了',
    '谢谢你',
    '爱你',
    '你好厉害啊',
    '你叫什么',
    '你几岁了',
    '现在几点',
    '今天天气怎么样',
    '我们现在在哪里',
    '讲个笑话',
    '你会几种语言呀',
    '你觉得我帅吗',
    '讨厌的周一',
    '好烦啊',
    '天气真好',
    '今天好冷',
    '今天好热',
    '下雨了',
    '风好大',
    '终于周五了',
    '我想去K歌',
    '红烧肉吃了会发胖吗',
    '你觉得梅西厉害吗',
  ]
  test_strs = [list(test_str) for test_str in test_strs]
  pad(test_strs)
  test_arrs = [[params['char2idx'].get(c, len(params['char2idx'])) for c in test_str] for test_str in test_strs]
  predicted = list(estimator.predict(tf.estimator.inputs.numpy_input_fn(
    x = np.asarray(test_arrs), shuffle = False)))
  predicted = np.asarray(predicted)
  print('-'*12)
  print('minimal test')
  for i, test_str in enumerate(test_strs):
    print('Q:', ' '.join([c for c in test_str if c != '<pad>']))
    for j in range(params['top_k']):
      sent = ' '.join([params['idx2char'].get(idx, '<unk>') for idx in predicted[i, :, j] if (idx != 0 and idx != 2)])
      print('A{}:'.format(j+1), sent)
    print()
  print('-'*12)
In [0]:
def is_increasing(history: list) -> bool:
  history = history[-(params['num_patience']+1):]
  for i in range(1, len(history)):
    if history[i-1] >= history[i]:
      return False
  return True  
In [0]:
class config_model:
    hidden_dim = 300
    num_heads = 8
    dropout_rate = .2
    num_blocks = 6

    position_embedder_hparams = {
        'dim': hidden_dim
    }

    encoder = {
        'dim': hidden_dim,
        'embedding_dropout': dropout_rate,
        'residual_dropout': dropout_rate,
        'num_blocks': num_blocks,
        'initializer': {
            'type': 'variance_scaling_initializer',
            'kwargs': {
                'scale': 1.0,
                'mode': 'fan_avg',
                'distribution': 'uniform',
            },
        },
        'multihead_attention': {
            'dropout_rate': dropout_rate,
            'num_heads': num_heads,
            'output_dim': hidden_dim,
            'use_bias': True,
        },
        'poswise_feedforward': {
          'name': 'fnn',
          'layers': [
              {
                  'type': 'Dense',
                  'kwargs': {
                      'name': 'conv1',
                      'units': hidden_dim * 4,
                      'activation': 'relu',
                      'use_bias': True,
                  },
              },
              {
                  'type': 'Dropout',
                  'kwargs': {
                      'rate': dropout_rate,
                  }
              },
              {
                  'type': 'Dense',
                  'kwargs': {
                      'name': 'conv2',
                      'units': hidden_dim,
                      'use_bias': True,
                  }
              }
          ],
        },
    }

    decoder = copy.deepcopy(encoder)
    decoder['output_layer_bias'] = True


params = {
    'model_dir': '../model/transformer',
    'log_path': '../log/transformer.txt',
    'train_paths': ['../data/train.txt', '../data/test.txt'],
    'test_paths': ['../data/core.txt'],
    'vocab_path': '../vocab/char.txt',
    'max_len': 30,
    'embed_dim': config_model.hidden_dim,
    'beam_width': 5,
    'top_k': 3,
    'length_penalty': .6,
    'label_smoothing': .2,
    'lr': 1e-3,
    'clip_norm': 5.,
    'buffer_size': 410000,
    'batch_size': 128,
    'num_patience': 5,
}
In [0]:
params['char2idx'] = get_vocab(params['vocab_path'])
params['idx2char'] = {idx: char for char, idx in params['char2idx'].items()}
In [14]:
# Create directory if not exist
Path(os.path.dirname(params['log_path'])).mkdir(exist_ok=True)
Path(params['model_dir']).mkdir(exist_ok=True, parents=True)

# Logging
logger = logging.getLogger('tensorflow')
logger.propagate = False
logger.setLevel(logging.INFO)
fh = logging.FileHandler(params['log_path'])
logger.addHandler(fh)

# Create an estimator
estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  model_dir=params['model_dir'],
  config=tf.estimator.RunConfig(
    save_checkpoints_steps=params['buffer_size']//params['batch_size'], keep_checkpoint_max=3),
  params=params)

best_ppl = 10000.
history_ppl = []
tf.enable_eager_execution()

while True:
  estimator.train(input_fn=lambda: dataset(is_training=True, params=params))

  minimal_test(estimator)
  
  loss = estimator.evaluate(input_fn=lambda: dataset(is_training=False, params=params))['loss']
  ppl = np.exp(loss)
  logger.info("Perplexity: {:.3f}".format(ppl))
  history_ppl.append(ppl)

  if ppl < best_ppl:
    best_ppl = ppl
  logger.info("Best Perplexity: {:.3f}".format(best_ppl))

  if len(history_ppl) > params['num_patience'] and is_increasing(history_ppl):
    logger.info("Perplexity not improved over {} epochs, Early Stop".format(params['num_patience']))
    break
  
  # BLEU
  labels = [label for _, (_, label) in dataset(is_training=False, params=params)]
  labels = [j for i in labels for j in i.numpy()]
  labels = [[params['idx2char'].get(idx, '<unk>') for idx in arr if (idx!=0 and idx!=2)] for arr in labels]

  preds = list(estimator.predict(input_fn=lambda: dataset(is_training=False, params=params)))
  assert len(labels) == len(preds)
  preds = [[params['idx2char'].get(idx, '<unk>') for idx in arr[:, 0] if (idx!=0 and idx!=2)] for arr in preds]

  bleu, bleu_1, bleu_2, bleu_3, bleu_4 = tx.evals.corpus_bleu_moses(list_of_references=[[l] for l in labels], hypotheses=preds, return_all=True)
  logger.info("BLEU-2: {:.1f}".format(bleu_2))
INFO:tensorflow:Using config: {'_model_dir': '../model/transformer', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 3203, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 3, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f6b10a29400>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py:507: calling count_nonzero (from tensorflow.python.ops.math_ops) with axis is deprecated and will be removed in a future version.
Instructions for updating:
reduction_indices is deprecated, use axis instead
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/texar/tf/module_base.py:83: The name tf.make_template is deprecated. Please use tf.compat.v1.make_template instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/texar/tf/modules/embedders/position_embedders.py:338: The name tf.mod is deprecated. Please use tf.math.mod instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/texar/tf/modules/encoders/transformer_encoders.py:142: The name tf.get_variable_scope is deprecated. Please use tf.compat.v1.get_variable_scope instead.

WARNING:tensorflow:From /usr/lib/python3.6/pydoc.py:1595: The name tf.layers.Dense is deprecated. Please use tf.compat.v1.layers.Dense instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/texar/tf/core/layers.py:608: The name tf.layers.Layer is deprecated. Please use tf.compat.v1.layers.Layer instead.

WARNING:tensorflow:From /usr/lib/python3.6/pydoc.py:1595: The name tf.layers.Dropout is deprecated. Please use tf.compat.v1.layers.Dropout instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/texar/tf/context.py:53: The name tf.get_collection_ref is deprecated. Please use tf.compat.v1.get_collection_ref instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/texar/tf/context.py:56: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/texar/tf/modules/encoders/transformer_encoders.py:317: dropout (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dropout instead.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/layers/core.py:271: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.__call__` method instead.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/texar/tf/utils/transformer_utils.py:73: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/texar/tf/module_base.py:140: The name tf.get_collection is deprecated. Please use tf.compat.v1.get_collection instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/texar/tf/module_base.py:141: The name tf.GraphKeys is deprecated. Please use tf.compat.v1.GraphKeys instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/texar/tf/modules/networks/network_base.py:125: The name tf.layers.BatchNormalization is deprecated. Please use tf.compat.v1.layers.BatchNormalization instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/texar/tf/modules/decoders/rnn_decoder_base.py:57: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/texar/tf/utils/transformer_attentions.py:109: The name tf.matrix_band_part is deprecated. Please use tf.linalg.band_part instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/texar/tf/modules/decoders/transformer_decoders.py:611: The name tf.AUTO_REUSE is deprecated. Please use tf.compat.v1.AUTO_REUSE instead.

WARNING:tensorflow:From <ipython-input-8-aafb468fc0d9>:15: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
[<tf.Variable 'Embedding/fasttext_vectors:0' shape=(5904, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/affine_bias:0' shape=(5904,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/gamma:0' shape=(300,) dtype=float32_ref>]
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-650856
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py:1069: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file utilities to get mtimes.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 650856 into ../model/transformer/model.ckpt.
Reading ../data/train.txt
INFO:tensorflow:loss = 5.048138, step = 650856
INFO:tensorflow:lr = 1.4425898e-06
INFO:tensorflow:global_step/sec: 3.98632
INFO:tensorflow:loss = 4.8630433, step = 650956 (25.091 sec)
INFO:tensorflow:lr = 1.4411402e-06 (25.090 sec)
INFO:tensorflow:global_step/sec: 5.02871
INFO:tensorflow:loss = 4.8994503, step = 651056 (19.885 sec)
INFO:tensorflow:lr = 1.4396928e-06 (19.882 sec)
INFO:tensorflow:global_step/sec: 5.08361
INFO:tensorflow:loss = 4.9150076, step = 651156 (19.672 sec)
INFO:tensorflow:lr = 1.4382471e-06 (19.672 sec)
INFO:tensorflow:global_step/sec: 5.04785
INFO:tensorflow:loss = 4.845054, step = 651256 (19.807 sec)
INFO:tensorflow:lr = 1.4368017e-06 (19.807 sec)
INFO:tensorflow:global_step/sec: 5.06356
INFO:tensorflow:loss = 4.8662877, step = 651356 (19.752 sec)
INFO:tensorflow:lr = 1.4353589e-06 (19.752 sec)
INFO:tensorflow:global_step/sec: 5.08223
INFO:tensorflow:loss = 4.941926, step = 651456 (19.677 sec)
INFO:tensorflow:lr = 1.4339164e-06 (19.678 sec)
INFO:tensorflow:global_step/sec: 5.05975
INFO:tensorflow:loss = 5.020806, step = 651556 (19.761 sec)
INFO:tensorflow:lr = 1.4324763e-06 (19.764 sec)
INFO:tensorflow:global_step/sec: 5.07532
INFO:tensorflow:loss = 4.971525, step = 651656 (19.706 sec)
INFO:tensorflow:lr = 1.4310377e-06 (19.701 sec)
INFO:tensorflow:global_step/sec: 5.0321
INFO:tensorflow:loss = 4.882325, step = 651756 (19.871 sec)
INFO:tensorflow:lr = 1.4295998e-06 (19.871 sec)
INFO:tensorflow:global_step/sec: 5.0626
INFO:tensorflow:loss = 4.9295044, step = 651856 (19.755 sec)
INFO:tensorflow:lr = 1.4281641e-06 (19.755 sec)
INFO:tensorflow:global_step/sec: 5.07049
INFO:tensorflow:loss = 5.0388036, step = 651956 (19.721 sec)
INFO:tensorflow:lr = 1.4267288e-06 (19.721 sec)
INFO:tensorflow:global_step/sec: 5.07254
INFO:tensorflow:loss = 4.874211, step = 652056 (19.712 sec)
INFO:tensorflow:lr = 1.425296e-06 (19.714 sec)
INFO:tensorflow:global_step/sec: 5.09335
INFO:tensorflow:loss = 5.0144157, step = 652156 (19.636 sec)
INFO:tensorflow:lr = 1.4238645e-06 (19.635 sec)
INFO:tensorflow:global_step/sec: 5.07083
INFO:tensorflow:loss = 4.929363, step = 652256 (19.720 sec)
INFO:tensorflow:lr = 1.4224337e-06 (19.720 sec)
INFO:tensorflow:global_step/sec: 5.10076
INFO:tensorflow:loss = 4.9470797, step = 652356 (19.605 sec)
INFO:tensorflow:lr = 1.4210052e-06 (19.606 sec)
INFO:tensorflow:global_step/sec: 5.10776
INFO:tensorflow:loss = 4.98228, step = 652456 (19.577 sec)
INFO:tensorflow:lr = 1.4195773e-06 (19.577 sec)
INFO:tensorflow:global_step/sec: 5.09744
INFO:tensorflow:loss = 4.9890523, step = 652556 (19.618 sec)
INFO:tensorflow:lr = 1.4181518e-06 (19.617 sec)
INFO:tensorflow:global_step/sec: 5.07529
INFO:tensorflow:loss = 4.9220424, step = 652656 (19.712 sec)
INFO:tensorflow:lr = 1.4167274e-06 (19.714 sec)
INFO:tensorflow:global_step/sec: 5.11015
INFO:tensorflow:loss = 4.9812136, step = 652756 (19.559 sec)
INFO:tensorflow:lr = 1.4153037e-06 (19.557 sec)
INFO:tensorflow:global_step/sec: 5.09053
INFO:tensorflow:loss = 4.826167, step = 652856 (19.645 sec)
INFO:tensorflow:lr = 1.4138823e-06 (19.645 sec)
INFO:tensorflow:global_step/sec: 5.07759
INFO:tensorflow:loss = 4.897809, step = 652956 (19.694 sec)
INFO:tensorflow:lr = 1.4124615e-06 (19.696 sec)
INFO:tensorflow:global_step/sec: 5.08837
INFO:tensorflow:loss = 4.9714694, step = 653056 (19.651 sec)
INFO:tensorflow:lr = 1.411043e-06 (19.650 sec)
INFO:tensorflow:global_step/sec: 5.09934
INFO:tensorflow:loss = 4.994254, step = 653156 (19.610 sec)
INFO:tensorflow:lr = 1.4096258e-06 (19.609 sec)
INFO:tensorflow:global_step/sec: 5.11357
INFO:tensorflow:loss = 4.89083, step = 653256 (19.558 sec)
INFO:tensorflow:lr = 1.4082094e-06 (19.559 sec)
INFO:tensorflow:global_step/sec: 5.08909
INFO:tensorflow:loss = 4.8537574, step = 653356 (19.650 sec)
INFO:tensorflow:lr = 1.4067951e-06 (19.652 sec)
INFO:tensorflow:global_step/sec: 5.07151
INFO:tensorflow:loss = 5.0255413, step = 653456 (19.718 sec)
INFO:tensorflow:lr = 1.4053816e-06 (19.715 sec)
INFO:tensorflow:global_step/sec: 5.08803
INFO:tensorflow:loss = 4.911609, step = 653556 (19.653 sec)
INFO:tensorflow:lr = 1.4039701e-06 (19.654 sec)
INFO:tensorflow:global_step/sec: 5.10388
INFO:tensorflow:loss = 4.868167, step = 653656 (19.594 sec)
INFO:tensorflow:lr = 1.4025601e-06 (19.596 sec)
INFO:tensorflow:global_step/sec: 5.08915
INFO:tensorflow:loss = 4.97602, step = 653756 (19.646 sec)
INFO:tensorflow:lr = 1.4011507e-06 (19.646 sec)
INFO:tensorflow:global_step/sec: 5.11741
INFO:tensorflow:loss = 4.942892, step = 653856 (19.544 sec)
INFO:tensorflow:lr = 1.3997436e-06 (19.545 sec)
INFO:tensorflow:global_step/sec: 5.11833
INFO:tensorflow:loss = 4.9817877, step = 653956 (19.537 sec)
INFO:tensorflow:lr = 1.398337e-06 (19.536 sec)
INFO:tensorflow:global_step/sec: 5.10963
INFO:tensorflow:loss = 4.8749976, step = 654056 (19.571 sec)
INFO:tensorflow:lr = 1.3969326e-06 (19.570 sec)
INFO:tensorflow:Saving checkpoints for 654059 into ../model/transformer/model.ckpt.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py:963: remove_checkpoint (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to delete files with this prefix.
INFO:tensorflow:global_step/sec: 4.59063
INFO:tensorflow:loss = 4.902522, step = 654156 (21.782 sec)
INFO:tensorflow:lr = 1.3955298e-06 (21.783 sec)
INFO:tensorflow:global_step/sec: 5.09699
INFO:tensorflow:loss = 4.8814692, step = 654256 (19.620 sec)
INFO:tensorflow:lr = 1.3941274e-06 (19.623 sec)
INFO:tensorflow:global_step/sec: 5.11221
INFO:tensorflow:loss = 4.9226937, step = 654356 (19.559 sec)
INFO:tensorflow:lr = 1.3927273e-06 (19.555 sec)
INFO:tensorflow:global_step/sec: 5.08753
INFO:tensorflow:loss = 4.97724, step = 654456 (19.658 sec)
INFO:tensorflow:lr = 1.3913276e-06 (19.659 sec)
INFO:tensorflow:global_step/sec: 5.11818
INFO:tensorflow:loss = 4.874299, step = 654556 (19.537 sec)
INFO:tensorflow:lr = 1.3899304e-06 (19.536 sec)
INFO:tensorflow:global_step/sec: 5.10421
INFO:tensorflow:loss = 4.8666754, step = 654656 (19.593 sec)
INFO:tensorflow:lr = 1.3885345e-06 (19.595 sec)
INFO:tensorflow:global_step/sec: 5.12092
INFO:tensorflow:loss = 4.946087, step = 654756 (19.528 sec)
INFO:tensorflow:lr = 1.3871393e-06 (19.527 sec)
INFO:tensorflow:global_step/sec: 5.0998
INFO:tensorflow:loss = 4.8817616, step = 654856 (19.609 sec)
INFO:tensorflow:lr = 1.3857461e-06 (19.609 sec)
INFO:tensorflow:global_step/sec: 5.07824
INFO:tensorflow:loss = 5.013923, step = 654956 (19.692 sec)
INFO:tensorflow:lr = 1.3843536e-06 (19.692 sec)
INFO:tensorflow:global_step/sec: 5.10062
INFO:tensorflow:loss = 4.8343573, step = 655056 (19.603 sec)
INFO:tensorflow:lr = 1.3829633e-06 (19.602 sec)
INFO:tensorflow:global_step/sec: 5.13039
INFO:tensorflow:loss = 5.0005765, step = 655156 (19.494 sec)
INFO:tensorflow:lr = 1.3815744e-06 (19.495 sec)
INFO:tensorflow:global_step/sec: 5.1295
INFO:tensorflow:loss = 4.9272656, step = 655256 (19.496 sec)
INFO:tensorflow:lr = 1.3801862e-06 (19.495 sec)
INFO:tensorflow:global_step/sec: 5.13454
INFO:tensorflow:loss = 4.906322, step = 655356 (19.475 sec)
INFO:tensorflow:lr = 1.3788e-06 (19.475 sec)
INFO:tensorflow:global_step/sec: 5.14248
INFO:tensorflow:loss = 5.044228, step = 655456 (19.444 sec)
INFO:tensorflow:lr = 1.3774144e-06 (19.444 sec)
INFO:tensorflow:global_step/sec: 5.11669
INFO:tensorflow:loss = 5.004399, step = 655556 (19.546 sec)
INFO:tensorflow:lr = 1.376031e-06 (19.547 sec)
INFO:tensorflow:global_step/sec: 5.13849
INFO:tensorflow:loss = 4.891444, step = 655656 (19.460 sec)
INFO:tensorflow:lr = 1.3746491e-06 (19.459 sec)
INFO:tensorflow:global_step/sec: 5.11514
INFO:tensorflow:loss = 4.9581423, step = 655756 (19.548 sec)
INFO:tensorflow:lr = 1.3732678e-06 (19.548 sec)
INFO:tensorflow:global_step/sec: 5.12834
INFO:tensorflow:loss = 4.982334, step = 655856 (19.500 sec)
INFO:tensorflow:lr = 1.3718886e-06 (19.500 sec)
INFO:tensorflow:global_step/sec: 5.10244
INFO:tensorflow:loss = 4.978611, step = 655956 (19.599 sec)
INFO:tensorflow:lr = 1.3705102e-06 (19.599 sec)
INFO:tensorflow:global_step/sec: 5.13682
INFO:tensorflow:loss = 4.9138856, step = 656056 (19.468 sec)
INFO:tensorflow:lr = 1.3691337e-06 (19.468 sec)
INFO:tensorflow:global_step/sec: 5.14282
INFO:tensorflow:loss = 5.062878, step = 656156 (19.444 sec)
INFO:tensorflow:lr = 1.3677587e-06 (19.444 sec)
INFO:tensorflow:global_step/sec: 5.13187
INFO:tensorflow:loss = 4.967395, step = 656256 (19.487 sec)
INFO:tensorflow:lr = 1.3663841e-06 (19.486 sec)
INFO:tensorflow:global_step/sec: 5.12443
INFO:tensorflow:loss = 4.916181, step = 656356 (19.513 sec)
INFO:tensorflow:lr = 1.3650118e-06 (19.513 sec)
INFO:tensorflow:global_step/sec: 5.10817
INFO:tensorflow:loss = 5.040177, step = 656456 (19.576 sec)
INFO:tensorflow:lr = 1.3636402e-06 (19.577 sec)
INFO:tensorflow:global_step/sec: 5.10714
INFO:tensorflow:loss = 4.8451858, step = 656556 (19.582 sec)
INFO:tensorflow:lr = 1.3622707e-06 (19.583 sec)
INFO:tensorflow:global_step/sec: 5.12472
INFO:tensorflow:loss = 4.8967586, step = 656656 (19.514 sec)
INFO:tensorflow:lr = 1.3609027e-06 (19.514 sec)
INFO:tensorflow:global_step/sec: 5.12403
INFO:tensorflow:loss = 4.8584347, step = 656756 (19.515 sec)
INFO:tensorflow:lr = 1.3595351e-06 (19.514 sec)
INFO:tensorflow:global_step/sec: 5.09274
INFO:tensorflow:loss = 4.9920006, step = 656856 (19.639 sec)
INFO:tensorflow:lr = 1.3581698e-06 (19.639 sec)
INFO:tensorflow:global_step/sec: 5.11755
INFO:tensorflow:loss = 4.976631, step = 656956 (19.539 sec)
INFO:tensorflow:lr = 1.3568048e-06 (19.540 sec)
INFO:tensorflow:global_step/sec: 5.11649
INFO:tensorflow:loss = 4.9308963, step = 657056 (19.544 sec)
INFO:tensorflow:lr = 1.3554422e-06 (19.544 sec)
INFO:tensorflow:global_step/sec: 5.12674
INFO:tensorflow:loss = 4.9675937, step = 657156 (19.507 sec)
INFO:tensorflow:lr = 1.3540811e-06 (19.508 sec)
INFO:tensorflow:global_step/sec: 5.13098
INFO:tensorflow:loss = 4.8716006, step = 657256 (19.489 sec)
INFO:tensorflow:lr = 1.3527203e-06 (19.491 sec)
INFO:tensorflow:Saving checkpoints for 657262 into ../model/transformer/model.ckpt.
INFO:tensorflow:global_step/sec: 4.48289
INFO:tensorflow:loss = 5.025392, step = 657356 (22.307 sec)
INFO:tensorflow:lr = 1.3513618e-06 (22.308 sec)
INFO:tensorflow:global_step/sec: 5.12383
INFO:tensorflow:loss = 4.9975967, step = 657456 (19.513 sec)
INFO:tensorflow:lr = 1.3500039e-06 (19.513 sec)
INFO:tensorflow:global_step/sec: 5.12394
INFO:tensorflow:loss = 4.9402432, step = 657556 (19.521 sec)
INFO:tensorflow:lr = 1.3486482e-06 (19.518 sec)
INFO:tensorflow:global_step/sec: 5.10971
INFO:tensorflow:loss = 5.002862, step = 657656 (19.568 sec)
INFO:tensorflow:lr = 1.3472938e-06 (19.570 sec)
INFO:tensorflow:global_step/sec: 5.12354
INFO:tensorflow:loss = 4.982622, step = 657756 (19.520 sec)
INFO:tensorflow:lr = 1.3459397e-06 (19.518 sec)
INFO:tensorflow:global_step/sec: 5.12664
INFO:tensorflow:loss = 4.9657936, step = 657856 (19.505 sec)
INFO:tensorflow:lr = 1.3445881e-06 (19.505 sec)
INFO:tensorflow:global_step/sec: 5.13465
INFO:tensorflow:loss = 4.9409304, step = 657956 (19.476 sec)
INFO:tensorflow:lr = 1.3432368e-06 (19.477 sec)
INFO:tensorflow:global_step/sec: 5.08647
INFO:tensorflow:loss = 5.024118, step = 658056 (19.655 sec)
INFO:tensorflow:lr = 1.3418879e-06 (19.655 sec)
INFO:tensorflow:global_step/sec: 5.11269
INFO:tensorflow:loss = 4.9371686, step = 658156 (19.563 sec)
INFO:tensorflow:lr = 1.3405403e-06 (19.562 sec)
INFO:tensorflow:global_step/sec: 5.13946
INFO:tensorflow:loss = 4.95532, step = 658256 (19.456 sec)
INFO:tensorflow:lr = 1.3391933e-06 (19.457 sec)
INFO:tensorflow:global_step/sec: 5.14588
INFO:tensorflow:loss = 5.0598507, step = 658356 (19.431 sec)
INFO:tensorflow:lr = 1.3378483e-06 (19.430 sec)
INFO:tensorflow:global_step/sec: 5.12622
INFO:tensorflow:loss = 5.0347967, step = 658456 (19.513 sec)
INFO:tensorflow:lr = 1.3365038e-06 (19.513 sec)
INFO:tensorflow:global_step/sec: 5.12098
INFO:tensorflow:loss = 5.028099, step = 658556 (19.522 sec)
INFO:tensorflow:lr = 1.3351615e-06 (19.522 sec)
INFO:tensorflow:global_step/sec: 5.11064
INFO:tensorflow:loss = 4.898343, step = 658656 (19.568 sec)
INFO:tensorflow:lr = 1.3338207e-06 (19.568 sec)
INFO:tensorflow:global_step/sec: 5.12987
INFO:tensorflow:loss = 4.933578, step = 658756 (19.497 sec)
INFO:tensorflow:lr = 1.3324805e-06 (19.497 sec)
INFO:tensorflow:global_step/sec: 5.13133
INFO:tensorflow:loss = 5.094851, step = 658856 (19.488 sec)
INFO:tensorflow:lr = 1.3311421e-06 (19.489 sec)
INFO:tensorflow:global_step/sec: 5.11196
INFO:tensorflow:loss = 4.917619, step = 658956 (19.561 sec)
INFO:tensorflow:lr = 1.3298046e-06 (19.561 sec)
INFO:tensorflow:global_step/sec: 5.12432
INFO:tensorflow:loss = 5.0091143, step = 659056 (19.513 sec)
INFO:tensorflow:lr = 1.3284691e-06 (19.513 sec)
INFO:tensorflow:global_step/sec: 5.13036
INFO:tensorflow:loss = 4.940724, step = 659156 (19.493 sec)
INFO:tensorflow:lr = 1.3271349e-06 (19.493 sec)
INFO:tensorflow:global_step/sec: 5.11764
INFO:tensorflow:loss = 4.8617887, step = 659256 (19.537 sec)
INFO:tensorflow:lr = 1.3258012e-06 (19.540 sec)
INFO:tensorflow:global_step/sec: 5.12408
INFO:tensorflow:loss = 4.982927, step = 659356 (19.516 sec)
INFO:tensorflow:lr = 1.3244697e-06 (19.515 sec)
INFO:tensorflow:global_step/sec: 5.11763
INFO:tensorflow:loss = 4.9486423, step = 659456 (19.540 sec)
INFO:tensorflow:lr = 1.3231388e-06 (19.542 sec)
INFO:tensorflow:global_step/sec: 5.10709
INFO:tensorflow:loss = 4.920685, step = 659556 (19.582 sec)
INFO:tensorflow:lr = 1.32181e-06 (19.581 sec)
INFO:tensorflow:global_step/sec: 5.05871
INFO:tensorflow:loss = 5.0133615, step = 659656 (19.766 sec)
INFO:tensorflow:lr = 1.3204825e-06 (19.765 sec)
Reading ../data/test.txt
INFO:tensorflow:global_step/sec: 5.04728
INFO:tensorflow:loss = 4.981304, step = 659756 (19.820 sec)
INFO:tensorflow:lr = 1.3191557e-06 (19.820 sec)
INFO:tensorflow:global_step/sec: 5.22313
INFO:tensorflow:loss = 4.9983068, step = 659856 (19.143 sec)
INFO:tensorflow:lr = 1.3178309e-06 (19.143 sec)
INFO:tensorflow:global_step/sec: 5.22767
INFO:tensorflow:loss = 4.954889, step = 659956 (19.131 sec)
INFO:tensorflow:lr = 1.3165065e-06 (19.131 sec)
INFO:tensorflow:global_step/sec: 5.22358
INFO:tensorflow:loss = 4.91079, step = 660056 (19.144 sec)
INFO:tensorflow:lr = 1.3151845e-06 (19.146 sec)
INFO:tensorflow:global_step/sec: 5.22408
INFO:tensorflow:loss = 5.0549793, step = 660156 (19.136 sec)
INFO:tensorflow:lr = 1.3138637e-06 (19.136 sec)
INFO:tensorflow:global_step/sec: 5.23565
INFO:tensorflow:loss = 4.962602, step = 660256 (19.099 sec)
INFO:tensorflow:lr = 1.3125432e-06 (19.101 sec)
INFO:tensorflow:global_step/sec: 5.2229
INFO:tensorflow:loss = 4.8649783, step = 660356 (19.147 sec)
INFO:tensorflow:lr = 1.3112252e-06 (19.144 sec)
INFO:tensorflow:global_step/sec: 5.21232
INFO:tensorflow:loss = 5.001238, step = 660456 (19.185 sec)
INFO:tensorflow:lr = 1.3099075e-06 (19.186 sec)
INFO:tensorflow:Saving checkpoints for 660465 into ../model/transformer/model.ckpt.
INFO:tensorflow:global_step/sec: 4.64286
INFO:tensorflow:loss = 4.988467, step = 660556 (21.542 sec)
INFO:tensorflow:lr = 1.308592e-06 (21.541 sec)
INFO:tensorflow:global_step/sec: 5.23046
INFO:tensorflow:loss = 4.90874, step = 660656 (19.115 sec)
INFO:tensorflow:lr = 1.3072778e-06 (19.118 sec)
INFO:tensorflow:global_step/sec: 5.22503
INFO:tensorflow:loss = 4.909614, step = 660756 (19.144 sec)
INFO:tensorflow:lr = 1.3059641e-06 (19.140 sec)
INFO:tensorflow:global_step/sec: 5.2207
INFO:tensorflow:loss = 5.0095596, step = 660856 (19.150 sec)
INFO:tensorflow:lr = 1.3046526e-06 (19.150 sec)
INFO:tensorflow:global_step/sec: 5.21682
INFO:tensorflow:loss = 4.9713793, step = 660956 (19.172 sec)
INFO:tensorflow:lr = 1.3033414e-06 (19.172 sec)
INFO:tensorflow:global_step/sec: 5.23095
INFO:tensorflow:loss = 4.972864, step = 661056 (19.113 sec)
INFO:tensorflow:lr = 1.3020324e-06 (19.114 sec)
INFO:tensorflow:global_step/sec: 5.22563
INFO:tensorflow:loss = 4.919397, step = 661156 (19.144 sec)
INFO:tensorflow:lr = 1.3007249e-06 (19.143 sec)
INFO:tensorflow:global_step/sec: 5.15717
INFO:tensorflow:loss = 4.9289775, step = 661256 (19.383 sec)
INFO:tensorflow:lr = 1.2994178e-06 (19.384 sec)
INFO:tensorflow:global_step/sec: 5.21418
INFO:tensorflow:loss = 4.9515405, step = 661356 (19.182 sec)
INFO:tensorflow:lr = 1.2981128e-06 (19.182 sec)
INFO:tensorflow:global_step/sec: 5.23608
INFO:tensorflow:loss = 4.979062, step = 661456 (19.100 sec)
INFO:tensorflow:lr = 1.2968083e-06 (19.100 sec)
INFO:tensorflow:global_step/sec: 5.22903
INFO:tensorflow:loss = 4.996896, step = 661556 (19.120 sec)
INFO:tensorflow:lr = 1.295506e-06 (19.121 sec)
INFO:tensorflow:global_step/sec: 5.22189
INFO:tensorflow:loss = 5.149022, step = 661656 (19.154 sec)
INFO:tensorflow:lr = 1.2942049e-06 (19.155 sec)
INFO:tensorflow:global_step/sec: 5.21454
INFO:tensorflow:loss = 4.947382, step = 661756 (19.172 sec)
INFO:tensorflow:lr = 1.2929045e-06 (19.173 sec)
INFO:tensorflow:global_step/sec: 5.22182
INFO:tensorflow:loss = 4.8876843, step = 661856 (19.155 sec)
INFO:tensorflow:lr = 1.2916059e-06 (19.151 sec)
INFO:tensorflow:global_step/sec: 5.23254
INFO:tensorflow:loss = 4.974611, step = 661956 (19.109 sec)
INFO:tensorflow:lr = 1.2903081e-06 (19.110 sec)
INFO:tensorflow:global_step/sec: 5.21895
INFO:tensorflow:loss = 4.968402, step = 662056 (19.162 sec)
INFO:tensorflow:lr = 1.2890123e-06 (19.161 sec)
INFO:tensorflow:global_step/sec: 5.21889
INFO:tensorflow:loss = 4.9360113, step = 662156 (19.163 sec)
INFO:tensorflow:lr = 1.2877177e-06 (19.162 sec)
INFO:tensorflow:global_step/sec: 5.22888
INFO:tensorflow:loss = 5.0261517, step = 662256 (19.120 sec)
INFO:tensorflow:lr = 1.2864238e-06 (19.120 sec)
INFO:tensorflow:global_step/sec: 5.21923
INFO:tensorflow:loss = 4.896244, step = 662356 (19.165 sec)
INFO:tensorflow:lr = 1.2851317e-06 (19.165 sec)
INFO:tensorflow:global_step/sec: 5.22811
INFO:tensorflow:loss = 5.0056753, step = 662456 (19.126 sec)
INFO:tensorflow:lr = 1.2838402e-06 (19.126 sec)
INFO:tensorflow:global_step/sec: 5.21608
INFO:tensorflow:loss = 5.0000963, step = 662556 (19.172 sec)
INFO:tensorflow:lr = 1.2825509e-06 (19.172 sec)
INFO:tensorflow:global_step/sec: 5.22758
INFO:tensorflow:loss = 4.961546, step = 662656 (19.125 sec)
INFO:tensorflow:lr = 1.2812628e-06 (19.125 sec)
INFO:tensorflow:global_step/sec: 5.22684
INFO:tensorflow:loss = 5.000926, step = 662756 (19.135 sec)
INFO:tensorflow:lr = 1.2799754e-06 (19.137 sec)
INFO:tensorflow:global_step/sec: 5.17786
INFO:tensorflow:loss = 4.9267254, step = 662856 (19.311 sec)
INFO:tensorflow:lr = 1.27869e-06 (19.310 sec)
INFO:tensorflow:Saving checkpoints for 662955 into ../model/transformer/model.ckpt.
INFO:tensorflow:Loss for final step: 4.9801307.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/inputs/queues/feeding_queue_runner.py:62: QueueRunner.__init__ (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/inputs/queues/feeding_functions.py:500: add_queue_runner (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-662955
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py:882: start_queue_runners (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
------------
minimal test
Q: 你 好
A1: 你 好
A2: 你 好 你 好
A3: 谢 谢

Q: 早 上 好
A1: 早 上 好
A2: 早 上 好 !
A3: 晚 上 好

Q: 晚 上 好
A1: 晚 上 好
A2: 早 上 好
A3: 晚 上 好 !

Q: 再 见
A1: 哈 哈 哈
A2: 哈 哈 哈 哈
A3: 好 的

Q: 好 久 不 见
A1: 好 久 不 见
A2: 是 啊
A3: 哈 哈

Q: 想 死 你 了
A1: 我 也 是
A2: 么 么 哒
A3: 我 也 想 你

Q: 谢 谢 你
A1: 不 客 气
A2: 客 气
A3: 不 客 气 不 客 气

Q: 爱 你
A1: 么 么 哒
A2: 我 也 爱 你
A3: 我 也 是

Q: 你 好 厉 害 啊
A1: 哈 哈 哈
A2: 哈 哈 哈 哈
A3: 哈 哈

Q: 你 叫 什 么
A1: 叫 你
A2: 我 叫 你
A3: 我 叫 你 啊

Q: 你 几 岁 了
A1: 岁
A2: 岁 了
A3: 一 岁

Q: 现 在 几 点
A1: 点 多
A2: 十 二 点
A3: 十 点

Q: 今 天 天 气 怎 么 样
A1: 还 可 以
A2: 还 行
A3: 还 行 吧

Q: 我 们 现 在 在 哪 里
A1: 你 在 哪 里
A2: 你 在 哪
A3: 你 在 哪 里 ?

Q: 讲 个 笑 话
A1: 哈 哈 哈
A2: 哈 哈 哈 哈
A3: 哈 哈 哈 哈 哈

Q: 你 会 几 种 语 言 呀
A1: 不 会
A2: 两 种
A3: 两 个

Q: 你 觉 得 我 帅 吗
A1: 没 有
A2: 不 觉 得
A3: 不 是

Q: 讨 厌 的 周 一
A1: 哈 哈 哈
A2: 我 也 是
A3: 哈 哈

Q: 好 烦 啊
A1: 怎 么 了
A2: 不 烦 不 烦
A3: 怎 么 啦

Q: 天 气 真 好
A1: 对 啊
A2: 嗯 嗯
A3: 是 的

Q: 今 天 好 冷
A1: 不 冷
A2: 不 冷 啊
A3: 不 冷 不 冷

Q: 今 天 好 热
A1: 今 天 天 气 不 错
A2: 今 天 天 气 好
A3: 今 天 天 气 不 好

Q: 下 雨 了
A1: 是 的
A2: 下 雨 了
A3: 是 啊

Q: 风 好 大
A1: 是 的
A2: 是 啊
A3: 好 吧

Q: 终 于 周 五 了
A1: 哈 哈 哈
A2: 周 五
A3: 哈 哈 哈 哈

Q: 我 想 去 K 歌
A1: 来 啊
A2: 去 啊
A3: 去 吧

Q: 红 烧 肉 吃 了 会 发 胖 吗
A1: 不 会
A2: 不 会 啊
A3: 不 会 的

Q: 你 觉 得 梅 西 厉 害 吗
A1: 不 厉 害
A2: 我 觉 得
A3: 哈 哈 哈

------------
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-01-09T02:55:34Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-662955
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Reading ../data/core.txt
INFO:tensorflow:Finished evaluation at 2020-01-09-02:55:38
INFO:tensorflow:Saving dict for global step 662955: global_step = 662955, loss = 3.42836
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 662955: ../model/transformer/model.ckpt-662955
INFO:tensorflow:Perplexity: 30.826
INFO:tensorflow:Best Perplexity: 30.826
Reading ../data/core.txt
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-662955
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Reading ../data/core.txt
INFO:tensorflow:BLEU-2: 14.4
INFO:tensorflow:Calling model_fn.
[<tf.Variable 'Embedding/fasttext_vectors:0' shape=(5904, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/affine_bias:0' shape=(5904,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/gamma:0' shape=(300,) dtype=float32_ref>]
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-662955
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 662955 into ../model/transformer/model.ckpt.
Reading ../data/train.txt
INFO:tensorflow:loss = 4.8141522, step = 662955
INFO:tensorflow:lr = 1.2774183e-06
INFO:tensorflow:global_step/sec: 4.04148
INFO:tensorflow:loss = 4.875339, step = 663055 (24.748 sec)
INFO:tensorflow:lr = 1.2761345e-06 (24.744 sec)
INFO:tensorflow:global_step/sec: 5.10319
INFO:tensorflow:loss = 4.9293923, step = 663155 (19.595 sec)
INFO:tensorflow:lr = 1.2748529e-06 (19.596 sec)
INFO:tensorflow:global_step/sec: 5.12832
INFO:tensorflow:loss = 4.969243, step = 663255 (19.502 sec)
INFO:tensorflow:lr = 1.2735727e-06 (19.502 sec)
INFO:tensorflow:global_step/sec: 5.10603
INFO:tensorflow:loss = 5.0514183, step = 663355 (19.580 sec)
INFO:tensorflow:lr = 1.2722929e-06 (19.580 sec)
INFO:tensorflow:global_step/sec: 5.11313
INFO:tensorflow:loss = 4.9484763, step = 663455 (19.557 sec)
INFO:tensorflow:lr = 1.2710151e-06 (19.559 sec)
INFO:tensorflow:global_step/sec: 5.09506
INFO:tensorflow:loss = 4.9654245, step = 663555 (19.627 sec)
INFO:tensorflow:lr = 1.2697379e-06 (19.629 sec)
INFO:tensorflow:global_step/sec: 5.11607
INFO:tensorflow:loss = 4.952407, step = 663655 (19.550 sec)
INFO:tensorflow:lr = 1.2684628e-06 (19.547 sec)
INFO:tensorflow:global_step/sec: 5.09531
INFO:tensorflow:loss = 4.9992533, step = 663755 (19.623 sec)
INFO:tensorflow:lr = 1.2671888e-06 (19.627 sec)
INFO:tensorflow:global_step/sec: 5.10206
INFO:tensorflow:loss = 4.8966875, step = 663855 (19.599 sec)
INFO:tensorflow:lr = 1.2659156e-06 (19.598 sec)
INFO:tensorflow:global_step/sec: 5.10265
INFO:tensorflow:loss = 4.890871, step = 663955 (19.599 sec)
INFO:tensorflow:lr = 1.2646442e-06 (19.597 sec)
INFO:tensorflow:global_step/sec: 5.11194
INFO:tensorflow:loss = 4.985154, step = 664055 (19.566 sec)
INFO:tensorflow:lr = 1.2633733e-06 (19.566 sec)
INFO:tensorflow:global_step/sec: 5.11996
INFO:tensorflow:loss = 4.959223, step = 664155 (19.530 sec)
INFO:tensorflow:lr = 1.2621045e-06 (19.529 sec)
INFO:tensorflow:global_step/sec: 5.12189
INFO:tensorflow:loss = 5.0286584, step = 664255 (19.521 sec)
INFO:tensorflow:lr = 1.260837e-06 (19.521 sec)
INFO:tensorflow:global_step/sec: 5.09944
INFO:tensorflow:loss = 4.9045973, step = 664355 (19.615 sec)
INFO:tensorflow:lr = 1.2595701e-06 (19.615 sec)
INFO:tensorflow:global_step/sec: 5.1054
INFO:tensorflow:loss = 4.9127536, step = 664455 (19.582 sec)
INFO:tensorflow:lr = 1.2583051e-06 (19.585 sec)
INFO:tensorflow:global_step/sec: 5.10518
INFO:tensorflow:loss = 4.956955, step = 664555 (19.587 sec)
INFO:tensorflow:lr = 1.2570406e-06 (19.587 sec)
INFO:tensorflow:global_step/sec: 5.10688
INFO:tensorflow:loss = 5.0097737, step = 664655 (19.583 sec)
INFO:tensorflow:lr = 1.2557782e-06 (19.580 sec)
INFO:tensorflow:global_step/sec: 5.1222
INFO:tensorflow:loss = 4.9245577, step = 664755 (19.522 sec)
INFO:tensorflow:lr = 1.2545171e-06 (19.525 sec)
INFO:tensorflow:global_step/sec: 5.11389
INFO:tensorflow:loss = 5.0004625, step = 664855 (19.558 sec)
INFO:tensorflow:lr = 1.2532564e-06 (19.554 sec)
INFO:tensorflow:global_step/sec: 5.10368
INFO:tensorflow:loss = 4.898492, step = 664955 (19.591 sec)
INFO:tensorflow:lr = 1.2519976e-06 (19.591 sec)
INFO:tensorflow:global_step/sec: 5.12861
INFO:tensorflow:loss = 5.0891695, step = 665055 (19.501 sec)
INFO:tensorflow:lr = 1.2507396e-06 (19.502 sec)
INFO:tensorflow:global_step/sec: 5.09238
INFO:tensorflow:loss = 4.971229, step = 665155 (19.633 sec)
INFO:tensorflow:lr = 1.2494835e-06 (19.634 sec)
INFO:tensorflow:global_step/sec: 5.09745
INFO:tensorflow:loss = 4.8698845, step = 665255 (19.618 sec)
INFO:tensorflow:lr = 1.2482286e-06 (19.617 sec)
INFO:tensorflow:global_step/sec: 5.13009
INFO:tensorflow:loss = 4.9307346, step = 665355 (19.498 sec)
INFO:tensorflow:lr = 1.2469744e-06 (19.498 sec)
INFO:tensorflow:global_step/sec: 5.10923
INFO:tensorflow:loss = 4.9169836, step = 665455 (19.567 sec)
INFO:tensorflow:lr = 1.2457222e-06 (19.568 sec)
INFO:tensorflow:global_step/sec: 5.11737
INFO:tensorflow:loss = 5.0640416, step = 665555 (19.541 sec)
INFO:tensorflow:lr = 1.2444701e-06 (19.544 sec)
INFO:tensorflow:global_step/sec: 5.10476
INFO:tensorflow:loss = 4.977833, step = 665655 (19.596 sec)
INFO:tensorflow:lr = 1.2432204e-06 (19.592 sec)
INFO:tensorflow:global_step/sec: 5.11955
INFO:tensorflow:loss = 4.9789295, step = 665755 (19.527 sec)
INFO:tensorflow:lr = 1.2419717e-06 (19.527 sec)
INFO:tensorflow:global_step/sec: 5.11453
INFO:tensorflow:loss = 5.0566754, step = 665855 (19.556 sec)
INFO:tensorflow:lr = 1.2407238e-06 (19.555 sec)
INFO:tensorflow:global_step/sec: 5.13326
INFO:tensorflow:loss = 5.0130744, step = 665955 (19.481 sec)
INFO:tensorflow:lr = 1.2394777e-06 (19.481 sec)
INFO:tensorflow:global_step/sec: 5.0795
INFO:tensorflow:loss = 5.080052, step = 666055 (19.686 sec)
INFO:tensorflow:lr = 1.2382324e-06 (19.686 sec)
INFO:tensorflow:global_step/sec: 5.10242
INFO:tensorflow:loss = 4.9850864, step = 666155 (19.600 sec)
INFO:tensorflow:lr = 1.2369887e-06 (19.599 sec)
INFO:tensorflow:Saving checkpoints for 666158 into ../model/transformer/model.ckpt.
INFO:tensorflow:global_step/sec: 4.56247
INFO:tensorflow:loss = 4.9643674, step = 666255 (21.916 sec)
INFO:tensorflow:lr = 1.2357464e-06 (21.917 sec)
INFO:tensorflow:global_step/sec: 5.11461
INFO:tensorflow:loss = 4.9450245, step = 666355 (19.551 sec)
INFO:tensorflow:lr = 1.2345046e-06 (19.550 sec)
INFO:tensorflow:global_step/sec: 5.10916
INFO:tensorflow:loss = 4.843778, step = 666455 (19.572 sec)
INFO:tensorflow:lr = 1.2332648e-06 (19.572 sec)
INFO:tensorflow:global_step/sec: 5.12336
INFO:tensorflow:loss = 4.8696527, step = 666555 (19.519 sec)
INFO:tensorflow:lr = 1.2320255e-06 (19.524 sec)
INFO:tensorflow:global_step/sec: 5.13627
INFO:tensorflow:loss = 4.8954463, step = 666655 (19.475 sec)
INFO:tensorflow:lr = 1.2307883e-06 (19.469 sec)
INFO:tensorflow:global_step/sec: 5.11355
INFO:tensorflow:loss = 4.9577804, step = 666755 (19.556 sec)
INFO:tensorflow:lr = 1.2295521e-06 (19.559 sec)
INFO:tensorflow:global_step/sec: 5.09883
INFO:tensorflow:loss = 4.976067, step = 666855 (19.607 sec)
INFO:tensorflow:lr = 1.2283166e-06 (19.608 sec)
INFO:tensorflow:global_step/sec: 5.08379
INFO:tensorflow:loss = 4.928569, step = 666955 (19.671 sec)
INFO:tensorflow:lr = 1.227083e-06 (19.668 sec)
INFO:tensorflow:global_step/sec: 5.1099
INFO:tensorflow:loss = 5.028079, step = 667055 (19.570 sec)
INFO:tensorflow:lr = 1.2258498e-06 (19.570 sec)
INFO:tensorflow:global_step/sec: 5.11702
INFO:tensorflow:loss = 5.0077724, step = 667155 (19.546 sec)
INFO:tensorflow:lr = 1.2246187e-06 (19.547 sec)
INFO:tensorflow:global_step/sec: 5.09418
INFO:tensorflow:loss = 5.0327826, step = 667255 (19.627 sec)
INFO:tensorflow:lr = 1.2233888e-06 (19.627 sec)
INFO:tensorflow:global_step/sec: 5.12405
INFO:tensorflow:loss = 4.8829236, step = 667355 (19.519 sec)
INFO:tensorflow:lr = 1.2221597e-06 (19.518 sec)
INFO:tensorflow:global_step/sec: 5.10932
INFO:tensorflow:loss = 4.8924975, step = 667455 (19.571 sec)
INFO:tensorflow:lr = 1.2209322e-06 (19.571 sec)
INFO:tensorflow:global_step/sec: 5.08889
INFO:tensorflow:loss = 4.9705343, step = 667555 (19.648 sec)
INFO:tensorflow:lr = 1.2197053e-06 (19.648 sec)
INFO:tensorflow:global_step/sec: 5.07713
INFO:tensorflow:loss = 5.0114713, step = 667655 (19.700 sec)
INFO:tensorflow:lr = 1.2184803e-06 (19.700 sec)
INFO:tensorflow:global_step/sec: 5.08652
INFO:tensorflow:loss = 4.9815655, step = 667755 (19.660 sec)
INFO:tensorflow:lr = 1.2172566e-06 (19.660 sec)
INFO:tensorflow:global_step/sec: 5.08227
INFO:tensorflow:loss = 4.9700937, step = 667855 (19.674 sec)
INFO:tensorflow:lr = 1.2160333e-06 (19.677 sec)
INFO:tensorflow:global_step/sec: 5.09818
INFO:tensorflow:loss = 4.9395394, step = 667955 (19.617 sec)
INFO:tensorflow:lr = 1.2148121e-06 (19.616 sec)
INFO:tensorflow:global_step/sec: 5.08254
INFO:tensorflow:loss = 4.982976, step = 668055 (19.670 sec)
INFO:tensorflow:lr = 1.2135914e-06 (19.670 sec)
INFO:tensorflow:global_step/sec: 5.06573
INFO:tensorflow:loss = 4.9357696, step = 668155 (19.744 sec)
INFO:tensorflow:lr = 1.2123727e-06 (19.743 sec)
INFO:tensorflow:global_step/sec: 5.10915
INFO:tensorflow:loss = 4.892908, step = 668255 (19.573 sec)
INFO:tensorflow:lr = 1.2111551e-06 (19.573 sec)
INFO:tensorflow:global_step/sec: 5.0577
INFO:tensorflow:loss = 5.0735254, step = 668355 (19.768 sec)
INFO:tensorflow:lr = 1.209938e-06 (19.769 sec)
INFO:tensorflow:global_step/sec: 5.07501
INFO:tensorflow:loss = 4.994915, step = 668455 (19.705 sec)
INFO:tensorflow:lr = 1.2087229e-06 (19.708 sec)
INFO:tensorflow:global_step/sec: 5.03645
INFO:tensorflow:loss = 4.979548, step = 668555 (19.855 sec)
INFO:tensorflow:lr = 1.2075081e-06 (19.851 sec)
INFO:tensorflow:global_step/sec: 5.07974
INFO:tensorflow:loss = 4.8441954, step = 668655 (19.691 sec)
INFO:tensorflow:lr = 1.2062955e-06 (19.692 sec)
INFO:tensorflow:global_step/sec: 5.07708
INFO:tensorflow:loss = 5.0573854, step = 668755 (19.694 sec)
INFO:tensorflow:lr = 1.205084e-06 (19.694 sec)
INFO:tensorflow:global_step/sec: 5.08431
INFO:tensorflow:loss = 4.960089, step = 668855 (19.669 sec)
INFO:tensorflow:lr = 1.2038731e-06 (19.671 sec)
INFO:tensorflow:global_step/sec: 5.09156
INFO:tensorflow:loss = 4.9782777, step = 668955 (19.642 sec)
INFO:tensorflow:lr = 1.2026641e-06 (19.640 sec)
INFO:tensorflow:global_step/sec: 5.07901
INFO:tensorflow:loss = 4.916501, step = 669055 (19.685 sec)
INFO:tensorflow:lr = 1.2014556e-06 (19.686 sec)
INFO:tensorflow:global_step/sec: 5.04453
INFO:tensorflow:loss = 4.8961263, step = 669155 (19.826 sec)
INFO:tensorflow:lr = 1.200249e-06 (19.825 sec)
INFO:tensorflow:global_step/sec: 5.06576
INFO:tensorflow:loss = 4.843474, step = 669255 (19.738 sec)
INFO:tensorflow:lr = 1.1990435e-06 (19.739 sec)
INFO:tensorflow:global_step/sec: 5.06774
INFO:tensorflow:loss = 4.882683, step = 669355 (19.735 sec)
INFO:tensorflow:lr = 1.1978385e-06 (19.735 sec)
INFO:tensorflow:Saving checkpoints for 669361 into ../model/transformer/model.ckpt.
INFO:tensorflow:global_step/sec: 4.5605
INFO:tensorflow:loss = 4.934825, step = 669455 (21.923 sec)
INFO:tensorflow:lr = 1.1966356e-06 (21.923 sec)
INFO:tensorflow:global_step/sec: 5.06974
INFO:tensorflow:loss = 5.0007772, step = 669555 (19.728 sec)
INFO:tensorflow:lr = 1.1954332e-06 (19.728 sec)
INFO:tensorflow:global_step/sec: 5.08619
INFO:tensorflow:loss = 4.8769627, step = 669655 (19.658 sec)
INFO:tensorflow:lr = 1.1942326e-06 (19.658 sec)
INFO:tensorflow:global_step/sec: 5.08504
INFO:tensorflow:loss = 4.904562, step = 669755 (19.666 sec)
INFO:tensorflow:lr = 1.1930332e-06 (19.669 sec)
INFO:tensorflow:global_step/sec: 5.09326
INFO:tensorflow:loss = 4.9721484, step = 669855 (19.633 sec)
INFO:tensorflow:lr = 1.1918344e-06 (19.631 sec)
INFO:tensorflow:global_step/sec: 5.02691
INFO:tensorflow:loss = 4.897151, step = 669955 (19.893 sec)
INFO:tensorflow:lr = 1.1906375e-06 (19.893 sec)
INFO:tensorflow:global_step/sec: 5.07095
INFO:tensorflow:loss = 4.9641705, step = 670055 (19.720 sec)
INFO:tensorflow:lr = 1.1894409e-06 (19.725 sec)
INFO:tensorflow:global_step/sec: 5.05254
INFO:tensorflow:loss = 4.899669, step = 670155 (19.796 sec)
INFO:tensorflow:lr = 1.1882463e-06 (19.791 sec)
INFO:tensorflow:global_step/sec: 5.08513
INFO:tensorflow:loss = 5.0089045, step = 670255 (19.665 sec)
INFO:tensorflow:lr = 1.1870529e-06 (19.667 sec)
INFO:tensorflow:global_step/sec: 5.07922
INFO:tensorflow:loss = 4.9193306, step = 670355 (19.684 sec)
INFO:tensorflow:lr = 1.1858604e-06 (19.686 sec)
INFO:tensorflow:global_step/sec: 5.08803
INFO:tensorflow:loss = 4.9338403, step = 670455 (19.660 sec)
INFO:tensorflow:lr = 1.1846694e-06 (19.657 sec)
INFO:tensorflow:global_step/sec: 5.06116
INFO:tensorflow:loss = 5.1277213, step = 670555 (19.758 sec)
INFO:tensorflow:lr = 1.183479e-06 (19.758 sec)
INFO:tensorflow:global_step/sec: 5.0519
INFO:tensorflow:loss = 4.9437265, step = 670655 (19.797 sec)
INFO:tensorflow:lr = 1.1822904e-06 (19.796 sec)
INFO:tensorflow:global_step/sec: 5.08045
INFO:tensorflow:loss = 4.958161, step = 670755 (19.676 sec)
INFO:tensorflow:lr = 1.181103e-06 (19.679 sec)
INFO:tensorflow:global_step/sec: 5.06198
INFO:tensorflow:loss = 4.9010787, step = 670855 (19.759 sec)
INFO:tensorflow:lr = 1.1799159e-06 (19.756 sec)
INFO:tensorflow:global_step/sec: 5.07554
INFO:tensorflow:loss = 4.9242086, step = 670955 (19.698 sec)
INFO:tensorflow:lr = 1.1787309e-06 (19.703 sec)
INFO:tensorflow:global_step/sec: 5.09299
INFO:tensorflow:loss = 4.9825544, step = 671055 (19.639 sec)
INFO:tensorflow:lr = 1.1775467e-06 (19.635 sec)
INFO:tensorflow:global_step/sec: 5.07646
INFO:tensorflow:loss = 4.8742185, step = 671155 (19.699 sec)
INFO:tensorflow:lr = 1.176364e-06 (19.699 sec)
INFO:tensorflow:global_step/sec: 5.07412
INFO:tensorflow:loss = 5.064085, step = 671255 (19.706 sec)
INFO:tensorflow:lr = 1.1751825e-06 (19.707 sec)
INFO:tensorflow:global_step/sec: 5.07943
INFO:tensorflow:loss = 5.011388, step = 671355 (19.690 sec)
INFO:tensorflow:lr = 1.1740018e-06 (19.691 sec)
INFO:tensorflow:global_step/sec: 5.05005
INFO:tensorflow:loss = 4.985449, step = 671455 (19.797 sec)
INFO:tensorflow:lr = 1.1728226e-06 (19.796 sec)
INFO:tensorflow:global_step/sec: 5.07335
INFO:tensorflow:loss = 4.9106016, step = 671555 (19.714 sec)
INFO:tensorflow:lr = 1.171644e-06 (19.713 sec)
INFO:tensorflow:global_step/sec: 5.07434
INFO:tensorflow:loss = 4.9931293, step = 671655 (19.703 sec)
INFO:tensorflow:lr = 1.1704674e-06 (19.708 sec)
INFO:tensorflow:global_step/sec: 5.05354
INFO:tensorflow:loss = 4.8999267, step = 671755 (19.791 sec)
INFO:tensorflow:lr = 1.169292e-06 (19.787 sec)
Reading ../data/test.txt
INFO:tensorflow:global_step/sec: 5.06704
INFO:tensorflow:loss = 4.9460125, step = 671855 (19.738 sec)
INFO:tensorflow:lr = 1.168117e-06 (19.738 sec)
INFO:tensorflow:global_step/sec: 5.19188
INFO:tensorflow:loss = 4.9419265, step = 671955 (19.258 sec)
INFO:tensorflow:lr = 1.1669438e-06 (19.259 sec)
INFO:tensorflow:global_step/sec: 5.19826
INFO:tensorflow:loss = 4.92948, step = 672055 (19.234 sec)
INFO:tensorflow:lr = 1.1657711e-06 (19.241 sec)
INFO:tensorflow:global_step/sec: 5.17292
INFO:tensorflow:loss = 4.9868855, step = 672155 (19.332 sec)
INFO:tensorflow:lr = 1.1646005e-06 (19.324 sec)
INFO:tensorflow:global_step/sec: 5.16592
INFO:tensorflow:loss = 5.0124035, step = 672255 (19.358 sec)
INFO:tensorflow:lr = 1.1634309e-06 (19.361 sec)
INFO:tensorflow:global_step/sec: 5.19581
INFO:tensorflow:loss = 4.988905, step = 672355 (19.246 sec)
INFO:tensorflow:lr = 1.1622617e-06 (19.247 sec)
INFO:tensorflow:global_step/sec: 5.1915
INFO:tensorflow:loss = 4.928968, step = 672455 (19.267 sec)
INFO:tensorflow:lr = 1.1610944e-06 (19.263 sec)
INFO:tensorflow:global_step/sec: 5.18176
INFO:tensorflow:loss = 4.9622874, step = 672555 (19.298 sec)
INFO:tensorflow:lr = 1.1599276e-06 (19.298 sec)
INFO:tensorflow:Saving checkpoints for 672564 into ../model/transformer/model.ckpt.
INFO:tensorflow:global_step/sec: 4.6349
INFO:tensorflow:loss = 4.9601517, step = 672655 (21.571 sec)
INFO:tensorflow:lr = 1.1587628e-06 (21.571 sec)
INFO:tensorflow:global_step/sec: 5.18857
INFO:tensorflow:loss = 4.876265, step = 672755 (19.279 sec)
INFO:tensorflow:lr = 1.1575991e-06 (19.278 sec)
INFO:tensorflow:global_step/sec: 5.18361
INFO:tensorflow:loss = 4.959883, step = 672855 (19.286 sec)
INFO:tensorflow:lr = 1.1564357e-06 (19.286 sec)
INFO:tensorflow:global_step/sec: 5.16188
INFO:tensorflow:loss = 4.9683924, step = 672955 (19.377 sec)
INFO:tensorflow:lr = 1.1552744e-06 (19.378 sec)
INFO:tensorflow:global_step/sec: 5.17645
INFO:tensorflow:loss = 4.9584804, step = 673055 (19.318 sec)
INFO:tensorflow:lr = 1.1541134e-06 (19.317 sec)
INFO:tensorflow:global_step/sec: 5.20195
INFO:tensorflow:loss = 4.9750276, step = 673155 (19.224 sec)
INFO:tensorflow:lr = 1.1529544e-06 (19.224 sec)
INFO:tensorflow:global_step/sec: 5.17123
INFO:tensorflow:loss = 5.0614576, step = 673255 (19.336 sec)
INFO:tensorflow:lr = 1.1517965e-06 (19.338 sec)
INFO:tensorflow:global_step/sec: 5.20567
INFO:tensorflow:loss = 4.8833413, step = 673355 (19.208 sec)
INFO:tensorflow:lr = 1.150639e-06 (19.207 sec)
INFO:tensorflow:global_step/sec: 5.21144
INFO:tensorflow:loss = 4.996898, step = 673455 (19.191 sec)
INFO:tensorflow:lr = 1.1494835e-06 (19.191 sec)
INFO:tensorflow:global_step/sec: 5.21312
INFO:tensorflow:loss = 4.92876, step = 673555 (19.183 sec)
INFO:tensorflow:lr = 1.1483284e-06 (19.185 sec)
INFO:tensorflow:global_step/sec: 5.22489
INFO:tensorflow:loss = 4.952419, step = 673655 (19.135 sec)
INFO:tensorflow:lr = 1.1471751e-06 (19.134 sec)
INFO:tensorflow:global_step/sec: 5.22458
INFO:tensorflow:loss = 4.869579, step = 673755 (19.144 sec)
INFO:tensorflow:lr = 1.1460231e-06 (19.143 sec)
INFO:tensorflow:global_step/sec: 5.20806
INFO:tensorflow:loss = 4.908499, step = 673855 (19.198 sec)
INFO:tensorflow:lr = 1.1448714e-06 (19.199 sec)
INFO:tensorflow:global_step/sec: 5.21761
INFO:tensorflow:loss = 5.046917, step = 673955 (19.168 sec)
INFO:tensorflow:lr = 1.1437215e-06 (19.167 sec)
INFO:tensorflow:global_step/sec: 5.2277
INFO:tensorflow:loss = 4.868713, step = 674055 (19.129 sec)
INFO:tensorflow:lr = 1.1425723e-06 (19.129 sec)
INFO:tensorflow:global_step/sec: 5.22141
INFO:tensorflow:loss = 4.866466, step = 674155 (19.149 sec)
INFO:tensorflow:lr = 1.1414248e-06 (19.149 sec)
INFO:tensorflow:global_step/sec: 5.19162
INFO:tensorflow:loss = 4.9682593, step = 674255 (19.264 sec)
INFO:tensorflow:lr = 1.1402784e-06 (19.266 sec)
INFO:tensorflow:global_step/sec: 5.22146
INFO:tensorflow:loss = 4.8868346, step = 674355 (19.152 sec)
INFO:tensorflow:lr = 1.1391327e-06 (19.151 sec)
INFO:tensorflow:global_step/sec: 5.22933
INFO:tensorflow:loss = 4.9308023, step = 674455 (19.123 sec)
INFO:tensorflow:lr = 1.1379886e-06 (19.123 sec)
INFO:tensorflow:global_step/sec: 5.22369
INFO:tensorflow:loss = 4.9625397, step = 674555 (19.140 sec)
INFO:tensorflow:lr = 1.1368451e-06 (19.143 sec)
INFO:tensorflow:global_step/sec: 5.19968
INFO:tensorflow:loss = 4.917578, step = 674655 (19.239 sec)
INFO:tensorflow:lr = 1.1357033e-06 (19.238 sec)
INFO:tensorflow:global_step/sec: 5.21902
INFO:tensorflow:loss = 4.9797573, step = 674755 (19.158 sec)
INFO:tensorflow:lr = 1.1345627e-06 (19.156 sec)
INFO:tensorflow:global_step/sec: 5.20576
INFO:tensorflow:loss = 4.9902744, step = 674855 (19.205 sec)
INFO:tensorflow:lr = 1.1334228e-06 (19.205 sec)
INFO:tensorflow:global_step/sec: 5.20237
INFO:tensorflow:loss = 4.873797, step = 674955 (19.225 sec)
INFO:tensorflow:lr = 1.1322844e-06 (19.225 sec)
INFO:tensorflow:Saving checkpoints for 675054 into ../model/transformer/model.ckpt.
INFO:tensorflow:Loss for final step: 4.913788.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-675054
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
------------
minimal test
Q: 你 好
A1: 你 好
A2: 你 好 !
A3: 我 好

Q: 早 上 好
A1: 早 上 好
A2: 你 好
A3: 晚 上 好

Q: 晚 上 好
A1: 晚 上 好
A2: 晚 安
A3: 晚 上 好 !

Q: 再 见
A1: 哈 哈 哈
A2: 不 见
A3: 哈 哈

Q: 好 久 不 见
A1: 好 久 不 见
A2: 好 久 不 见 了
A3: 是 啊

Q: 想 死 你 了
A1: 我 也 想 你
A2: 我 也 是
A3: 我 也 想 你 了

Q: 谢 谢 你
A1: 不 客 气
A2: 不 谢
A3: 不 用 谢

Q: 爱 你
A1: 么 么 哒
A2: 我 也 爱 你
A3: 爱 你

Q: 你 好 厉 害 啊
A1: 哈 哈
A2: 哈 哈 哈
A3: 我 也 觉 得

Q: 你 叫 什 么
A1: 你 叫 什 么
A2: 我 叫 你
A3: 我 叫 什 么

Q: 你 几 岁 了
A1: 你 猜
A2: 一 岁
A3: 哈 哈 哈

Q: 现 在 几 点
A1: 点
A2: 点 半
A3: 点 左 右

Q: 今 天 天 气 怎 么 样
A1: 还 行
A2: 天 气
A3: 还 不 错

Q: 我 们 现 在 在 哪 里
A1: 你 在 哪
A2: 你 们 在 哪
A3: 我 们 学 校

Q: 讲 个 笑 话
A1: 哈 哈 哈
A2: 哈 哈
A3: 哈 哈 哈 哈

Q: 你 会 几 种 语 言 呀
A1: 哈 哈 哈
A2: 哈 哈 哈 哈
A3: 哈 哈 哈 哈 哈

Q: 你 觉 得 我 帅 吗
A1: 我 觉 得
A2: 我 觉 得 你 很 帅
A3: 哈 哈

Q: 讨 厌 的 周 一
A1: 哈 哈 哈
A2: 哈 哈 哈 哈
A3: 哈 哈

Q: 好 烦 啊
A1: 咋 啦
A2: 怎 么 了
A3: 怎 么 啦

Q: 天 气 真 好
A1: 哈 哈
A2: 是 啊
A3: 是 的

Q: 今 天 好 冷
A1: 我 们 这 里 度
A2: 我 们 这 里 也 冷
A3: 这 几 天 天 气 不 好

Q: 今 天 好 热
A1: 是 的
A2: 哈 哈
A3: 是 啊

Q: 下 雨 了
A1: 下 雨 了
A2: 下 雨 天
A3: 今 天 下 了

Q: 风 好 大
A1: 是 的
A2: 是 啊
A3: 哈 哈

Q: 终 于 周 五 了
A1: 哈 哈 哈 哈
A2: 哈 哈 哈
A3: 是 的

Q: 我 想 去 K 歌
A1: 来 来 来
A2: 来 啊 来 啊
A3: 来 吧

Q: 红 烧 肉 吃 了 会 发 胖 吗
A1: 不 会
A2: 吃 了
A3: 不 知 道

Q: 你 觉 得 梅 西 厉 害 吗
A1: 不 是
A2: 是 的
A3: 没 有

------------
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-01-09T03:37:13Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-675054
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Reading ../data/core.txt
INFO:tensorflow:Finished evaluation at 2020-01-09-03:37:17
INFO:tensorflow:Saving dict for global step 675054: global_step = 675054, loss = 3.386772
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 675054: ../model/transformer/model.ckpt-675054
INFO:tensorflow:Perplexity: 29.570
INFO:tensorflow:Best Perplexity: 29.570
Reading ../data/core.txt
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-675054
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Reading ../data/core.txt
INFO:tensorflow:BLEU-2: 9.7
INFO:tensorflow:Calling model_fn.
[<tf.Variable 'Embedding/fasttext_vectors:0' shape=(5904, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/affine_bias:0' shape=(5904,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/gamma:0' shape=(300,) dtype=float32_ref>]
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-675054
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 675054 into ../model/transformer/model.ckpt.
Reading ../data/train.txt
INFO:tensorflow:loss = 4.9308743, step = 675054
INFO:tensorflow:lr = 1.1311583e-06
INFO:tensorflow:global_step/sec: 4.02269
INFO:tensorflow:loss = 5.0233836, step = 675154 (24.866 sec)
INFO:tensorflow:lr = 1.1300217e-06 (24.867 sec)
INFO:tensorflow:global_step/sec: 5.08675
INFO:tensorflow:loss = 4.905008, step = 675254 (19.656 sec)
INFO:tensorflow:lr = 1.1288869e-06 (19.656 sec)
INFO:tensorflow:global_step/sec: 5.09298
INFO:tensorflow:loss = 4.9825096, step = 675354 (19.638 sec)
INFO:tensorflow:lr = 1.1277531e-06 (19.637 sec)
INFO:tensorflow:global_step/sec: 5.12513
INFO:tensorflow:loss = 4.8779507, step = 675454 (19.512 sec)
INFO:tensorflow:lr = 1.12662e-06 (19.512 sec)
INFO:tensorflow:global_step/sec: 5.0712
INFO:tensorflow:loss = 4.9595313, step = 675554 (19.714 sec)
INFO:tensorflow:lr = 1.1254883e-06 (19.714 sec)
INFO:tensorflow:global_step/sec: 5.1336
INFO:tensorflow:loss = 4.841404, step = 675654 (19.479 sec)
INFO:tensorflow:lr = 1.1243574e-06 (19.479 sec)
INFO:tensorflow:global_step/sec: 5.12389
INFO:tensorflow:loss = 4.980285, step = 675754 (19.517 sec)
INFO:tensorflow:lr = 1.1232282e-06 (19.520 sec)
INFO:tensorflow:global_step/sec: 5.11394
INFO:tensorflow:loss = 5.0196495, step = 675854 (19.554 sec)
INFO:tensorflow:lr = 1.1221002e-06 (19.553 sec)
INFO:tensorflow:global_step/sec: 5.13774
INFO:tensorflow:loss = 4.9464273, step = 675954 (19.468 sec)
INFO:tensorflow:lr = 1.1209726e-06 (19.465 sec)
INFO:tensorflow:global_step/sec: 5.1424
INFO:tensorflow:loss = 4.9263864, step = 676054 (19.446 sec)
INFO:tensorflow:lr = 1.1198468e-06 (19.446 sec)
INFO:tensorflow:global_step/sec: 5.15231
INFO:tensorflow:loss = 4.9268975, step = 676154 (19.408 sec)
INFO:tensorflow:lr = 1.1187215e-06 (19.408 sec)
INFO:tensorflow:global_step/sec: 5.16621
INFO:tensorflow:loss = 4.8613977, step = 676254 (19.357 sec)
INFO:tensorflow:lr = 1.117598e-06 (19.360 sec)
INFO:tensorflow:global_step/sec: 5.15203
INFO:tensorflow:loss = 5.0302353, step = 676354 (19.408 sec)
INFO:tensorflow:lr = 1.1164757e-06 (19.406 sec)
INFO:tensorflow:global_step/sec: 5.15072
INFO:tensorflow:loss = 4.9455276, step = 676454 (19.416 sec)
INFO:tensorflow:lr = 1.1153537e-06 (19.415 sec)
INFO:tensorflow:global_step/sec: 5.15008
INFO:tensorflow:loss = 5.0335608, step = 676554 (19.419 sec)
INFO:tensorflow:lr = 1.1142336e-06 (19.419 sec)
INFO:tensorflow:global_step/sec: 5.16259
INFO:tensorflow:loss = 4.9902525, step = 676654 (19.364 sec)
INFO:tensorflow:lr = 1.1131139e-06 (19.368 sec)
INFO:tensorflow:global_step/sec: 5.14339
INFO:tensorflow:loss = 4.863207, step = 676754 (19.447 sec)
INFO:tensorflow:lr = 1.111996e-06 (19.444 sec)
INFO:tensorflow:global_step/sec: 5.16334
INFO:tensorflow:loss = 4.91811, step = 676854 (19.367 sec)
INFO:tensorflow:lr = 1.1108792e-06 (19.367 sec)
INFO:tensorflow:global_step/sec: 5.14825
INFO:tensorflow:loss = 5.023487, step = 676954 (19.424 sec)
INFO:tensorflow:lr = 1.1097628e-06 (19.424 sec)
INFO:tensorflow:global_step/sec: 5.15196
INFO:tensorflow:loss = 4.960225, step = 677054 (19.406 sec)
INFO:tensorflow:lr = 1.1086483e-06 (19.407 sec)
INFO:tensorflow:global_step/sec: 5.11675
INFO:tensorflow:loss = 4.9260836, step = 677154 (19.543 sec)
INFO:tensorflow:lr = 1.1075343e-06 (19.543 sec)
INFO:tensorflow:global_step/sec: 5.16169
INFO:tensorflow:loss = 4.974646, step = 677254 (19.377 sec)
INFO:tensorflow:lr = 1.106422e-06 (19.379 sec)
INFO:tensorflow:global_step/sec: 5.14339
INFO:tensorflow:loss = 4.9472075, step = 677354 (19.438 sec)
INFO:tensorflow:lr = 1.1053108e-06 (19.437 sec)
INFO:tensorflow:global_step/sec: 5.08867
INFO:tensorflow:loss = 4.8374915, step = 677454 (19.656 sec)
INFO:tensorflow:lr = 1.1042001e-06 (19.655 sec)
INFO:tensorflow:global_step/sec: 5.13624
INFO:tensorflow:loss = 4.8497944, step = 677554 (19.469 sec)
INFO:tensorflow:lr = 1.1030912e-06 (19.469 sec)
INFO:tensorflow:global_step/sec: 5.14884
INFO:tensorflow:loss = 4.945532, step = 677654 (19.418 sec)
INFO:tensorflow:lr = 1.1019827e-06 (19.419 sec)
INFO:tensorflow:global_step/sec: 5.15062
INFO:tensorflow:loss = 4.857283, step = 677754 (19.421 sec)
INFO:tensorflow:lr = 1.100876e-06 (19.421 sec)
INFO:tensorflow:global_step/sec: 5.14435
INFO:tensorflow:loss = 4.899746, step = 677854 (19.435 sec)
INFO:tensorflow:lr = 1.0997704e-06 (19.436 sec)
INFO:tensorflow:global_step/sec: 5.15748
INFO:tensorflow:loss = 4.9584208, step = 677954 (19.387 sec)
INFO:tensorflow:lr = 1.0986653e-06 (19.386 sec)
INFO:tensorflow:global_step/sec: 5.16235
INFO:tensorflow:loss = 4.856601, step = 678054 (19.371 sec)
INFO:tensorflow:lr = 1.0975618e-06 (19.375 sec)
INFO:tensorflow:global_step/sec: 5.12531
INFO:tensorflow:loss = 4.8721886, step = 678154 (19.511 sec)
INFO:tensorflow:lr = 1.0964591e-06 (19.512 sec)
INFO:tensorflow:global_step/sec: 5.14995
INFO:tensorflow:loss = 4.824185, step = 678254 (19.422 sec)
INFO:tensorflow:lr = 1.0953579e-06 (19.417 sec)
INFO:tensorflow:Saving checkpoints for 678257 into ../model/transformer/model.ckpt.
INFO:tensorflow:global_step/sec: 4.4936
INFO:tensorflow:loss = 4.8824663, step = 678354 (22.251 sec)
INFO:tensorflow:lr = 1.0942578e-06 (22.254 sec)
INFO:tensorflow:global_step/sec: 5.13171
INFO:tensorflow:loss = 4.8718243, step = 678454 (19.490 sec)
INFO:tensorflow:lr = 1.0931581e-06 (19.487 sec)
INFO:tensorflow:global_step/sec: 5.1347
INFO:tensorflow:loss = 4.7506633, step = 678554 (19.474 sec)
INFO:tensorflow:lr = 1.0920603e-06 (19.474 sec)
INFO:tensorflow:global_step/sec: 5.14068
INFO:tensorflow:loss = 4.8627663, step = 678654 (19.451 sec)
INFO:tensorflow:lr = 1.0909629e-06 (19.454 sec)
INFO:tensorflow:global_step/sec: 5.13665
INFO:tensorflow:loss = 5.024792, step = 678754 (19.464 sec)
INFO:tensorflow:lr = 1.0898673e-06 (19.463 sec)
INFO:tensorflow:global_step/sec: 5.13855
INFO:tensorflow:loss = 4.8935156, step = 678854 (19.467 sec)
INFO:tensorflow:lr = 1.0887727e-06 (19.465 sec)
INFO:tensorflow:global_step/sec: 5.12322
INFO:tensorflow:loss = 4.9490323, step = 678954 (19.517 sec)
INFO:tensorflow:lr = 1.0876787e-06 (19.519 sec)
INFO:tensorflow:global_step/sec: 5.0977
INFO:tensorflow:loss = 4.943809, step = 679054 (19.612 sec)
INFO:tensorflow:lr = 1.0865863e-06 (19.611 sec)
INFO:tensorflow:global_step/sec: 5.16287
INFO:tensorflow:loss = 4.871459, step = 679154 (19.373 sec)
INFO:tensorflow:lr = 1.0854943e-06 (19.375 sec)
INFO:tensorflow:global_step/sec: 5.15527
INFO:tensorflow:loss = 4.883729, step = 679254 (19.393 sec)
INFO:tensorflow:lr = 1.0844043e-06 (19.393 sec)
INFO:tensorflow:global_step/sec: 5.13948
INFO:tensorflow:loss = 4.9161954, step = 679354 (19.457 sec)
INFO:tensorflow:lr = 1.0833152e-06 (19.458 sec)
INFO:tensorflow:global_step/sec: 5.18057
INFO:tensorflow:loss = 5.024586, step = 679454 (19.308 sec)
INFO:tensorflow:lr = 1.0822266e-06 (19.306 sec)
INFO:tensorflow:global_step/sec: 5.08805
INFO:tensorflow:loss = 4.905444, step = 679554 (19.653 sec)
INFO:tensorflow:lr = 1.0811398e-06 (19.654 sec)
INFO:tensorflow:global_step/sec: 5.14896
INFO:tensorflow:loss = 4.8901644, step = 679654 (19.420 sec)
INFO:tensorflow:lr = 1.0800534e-06 (19.420 sec)
INFO:tensorflow:global_step/sec: 5.12783
INFO:tensorflow:loss = 5.094094, step = 679754 (19.505 sec)
INFO:tensorflow:lr = 1.0789687e-06 (19.504 sec)
INFO:tensorflow:global_step/sec: 5.17279
INFO:tensorflow:loss = 5.022425, step = 679854 (19.326 sec)
INFO:tensorflow:lr = 1.0778851e-06 (19.328 sec)
INFO:tensorflow:global_step/sec: 5.15794
INFO:tensorflow:loss = 5.0063577, step = 679954 (19.388 sec)
INFO:tensorflow:lr = 1.0768018e-06 (19.387 sec)
INFO:tensorflow:global_step/sec: 5.14084
INFO:tensorflow:loss = 4.947064, step = 680054 (19.455 sec)
INFO:tensorflow:lr = 1.0757204e-06 (19.457 sec)
INFO:tensorflow:global_step/sec: 5.14633
INFO:tensorflow:loss = 5.0113316, step = 680154 (19.431 sec)
INFO:tensorflow:lr = 1.0746394e-06 (19.430 sec)
INFO:tensorflow:global_step/sec: 5.13051
INFO:tensorflow:loss = 4.936766, step = 680254 (19.495 sec)
INFO:tensorflow:lr = 1.0735602e-06 (19.492 sec)
INFO:tensorflow:global_step/sec: 5.14998
INFO:tensorflow:loss = 4.932782, step = 680354 (19.412 sec)
INFO:tensorflow:lr = 1.0724822e-06 (19.413 sec)
INFO:tensorflow:global_step/sec: 5.13979
INFO:tensorflow:loss = 4.946468, step = 680454 (19.459 sec)
INFO:tensorflow:lr = 1.0714043e-06 (19.458 sec)
INFO:tensorflow:global_step/sec: 5.15644
INFO:tensorflow:loss = 4.975125, step = 680554 (19.391 sec)
INFO:tensorflow:lr = 1.0703284e-06 (19.391 sec)
INFO:tensorflow:global_step/sec: 5.11496
INFO:tensorflow:loss = 4.9292836, step = 680654 (19.550 sec)
INFO:tensorflow:lr = 1.0692528e-06 (19.553 sec)
INFO:tensorflow:global_step/sec: 5.15621
INFO:tensorflow:loss = 4.9932065, step = 680754 (19.395 sec)
INFO:tensorflow:lr = 1.0681789e-06 (19.391 sec)
INFO:tensorflow:global_step/sec: 5.1601
INFO:tensorflow:loss = 4.995244, step = 680854 (19.382 sec)
INFO:tensorflow:lr = 1.0671062e-06 (19.383 sec)
INFO:tensorflow:global_step/sec: 5.16783
INFO:tensorflow:loss = 4.9968863, step = 680954 (19.348 sec)
INFO:tensorflow:lr = 1.0660339e-06 (19.347 sec)
INFO:tensorflow:global_step/sec: 5.15084
INFO:tensorflow:loss = 4.9162807, step = 681054 (19.414 sec)
INFO:tensorflow:lr = 1.0649634e-06 (19.415 sec)
INFO:tensorflow:global_step/sec: 5.15781
INFO:tensorflow:loss = 5.044767, step = 681154 (19.391 sec)
INFO:tensorflow:lr = 1.0638931e-06 (19.391 sec)
INFO:tensorflow:global_step/sec: 5.16159
INFO:tensorflow:loss = 4.9743986, step = 681254 (19.376 sec)
INFO:tensorflow:lr = 1.0628247e-06 (19.374 sec)
INFO:tensorflow:global_step/sec: 5.15489
INFO:tensorflow:loss = 5.064058, step = 681354 (19.395 sec)
INFO:tensorflow:lr = 1.0617573e-06 (19.398 sec)
INFO:tensorflow:global_step/sec: 5.16056
INFO:tensorflow:loss = 4.8747582, step = 681454 (19.378 sec)
INFO:tensorflow:lr = 1.0606904e-06 (19.379 sec)
INFO:tensorflow:Saving checkpoints for 681460 into ../model/transformer/model.ckpt.
INFO:tensorflow:global_step/sec: 4.64193
INFO:tensorflow:loss = 5.018409, step = 681554 (21.543 sec)
INFO:tensorflow:lr = 1.0596251e-06 (21.541 sec)
INFO:tensorflow:global_step/sec: 5.12663
INFO:tensorflow:loss = 5.101055, step = 681654 (19.509 sec)
INFO:tensorflow:lr = 1.0585603e-06 (19.508 sec)
INFO:tensorflow:global_step/sec: 5.15953
INFO:tensorflow:loss = 4.831344, step = 681754 (19.382 sec)
INFO:tensorflow:lr = 1.0574971e-06 (19.381 sec)
INFO:tensorflow:global_step/sec: 5.15313
INFO:tensorflow:loss = 4.9887776, step = 681854 (19.402 sec)
INFO:tensorflow:lr = 1.056435e-06 (19.403 sec)
INFO:tensorflow:global_step/sec: 5.1687
INFO:tensorflow:loss = 4.9701796, step = 681954 (19.350 sec)
INFO:tensorflow:lr = 1.0553736e-06 (19.350 sec)
INFO:tensorflow:global_step/sec: 5.15118
INFO:tensorflow:loss = 5.024588, step = 682054 (19.414 sec)
INFO:tensorflow:lr = 1.0543137e-06 (19.415 sec)
INFO:tensorflow:global_step/sec: 5.13115
INFO:tensorflow:loss = 4.823765, step = 682154 (19.485 sec)
INFO:tensorflow:lr = 1.0532542e-06 (19.483 sec)
INFO:tensorflow:global_step/sec: 5.10764
INFO:tensorflow:loss = 4.87919, step = 682254 (19.583 sec)
INFO:tensorflow:lr = 1.0521965e-06 (19.585 sec)
INFO:tensorflow:global_step/sec: 5.15714
INFO:tensorflow:loss = 4.9927163, step = 682354 (19.387 sec)
INFO:tensorflow:lr = 1.0511397e-06 (19.386 sec)
INFO:tensorflow:global_step/sec: 5.14595
INFO:tensorflow:loss = 4.933419, step = 682454 (19.432 sec)
INFO:tensorflow:lr = 1.0500834e-06 (19.432 sec)
INFO:tensorflow:global_step/sec: 5.15402
INFO:tensorflow:loss = 5.005822, step = 682554 (19.408 sec)
INFO:tensorflow:lr = 1.0490289e-06 (19.407 sec)
INFO:tensorflow:global_step/sec: 5.13305
INFO:tensorflow:loss = 4.829187, step = 682654 (19.481 sec)
INFO:tensorflow:lr = 1.0479747e-06 (19.481 sec)
INFO:tensorflow:global_step/sec: 5.16955
INFO:tensorflow:loss = 4.9689174, step = 682754 (19.345 sec)
INFO:tensorflow:lr = 1.0469222e-06 (19.345 sec)
INFO:tensorflow:global_step/sec: 5.14092
INFO:tensorflow:loss = 4.9187717, step = 682854 (19.446 sec)
INFO:tensorflow:lr = 1.0458708e-06 (19.447 sec)
INFO:tensorflow:global_step/sec: 5.15518
INFO:tensorflow:loss = 4.9609737, step = 682954 (19.398 sec)
INFO:tensorflow:lr = 1.0448198e-06 (19.399 sec)
INFO:tensorflow:global_step/sec: 5.14818
INFO:tensorflow:loss = 4.929585, step = 683054 (19.425 sec)
INFO:tensorflow:lr = 1.0437703e-06 (19.427 sec)
INFO:tensorflow:global_step/sec: 5.16111
INFO:tensorflow:loss = 4.9654245, step = 683154 (19.378 sec)
INFO:tensorflow:lr = 1.0427215e-06 (19.376 sec)
INFO:tensorflow:global_step/sec: 5.15713
INFO:tensorflow:loss = 4.9668884, step = 683254 (19.388 sec)
INFO:tensorflow:lr = 1.0416744e-06 (19.387 sec)
INFO:tensorflow:global_step/sec: 5.12428
INFO:tensorflow:loss = 5.0167375, step = 683354 (19.517 sec)
INFO:tensorflow:lr = 1.0406284e-06 (19.517 sec)
INFO:tensorflow:global_step/sec: 5.14542
INFO:tensorflow:loss = 4.8925705, step = 683454 (19.436 sec)
INFO:tensorflow:lr = 1.0395826e-06 (19.436 sec)
INFO:tensorflow:global_step/sec: 5.12923
INFO:tensorflow:loss = 5.02041, step = 683554 (19.497 sec)
INFO:tensorflow:lr = 1.0385386e-06 (19.497 sec)
INFO:tensorflow:global_step/sec: 5.1432
INFO:tensorflow:loss = 4.9887943, step = 683654 (19.442 sec)
INFO:tensorflow:lr = 1.0374949e-06 (19.442 sec)
INFO:tensorflow:global_step/sec: 5.15677
INFO:tensorflow:loss = 4.8906255, step = 683754 (19.392 sec)
INFO:tensorflow:lr = 1.036453e-06 (19.393 sec)
INFO:tensorflow:global_step/sec: 5.1096
INFO:tensorflow:loss = 4.908335, step = 683854 (19.572 sec)
INFO:tensorflow:lr = 1.0354121e-06 (19.573 sec)
Reading ../data/test.txt
INFO:tensorflow:global_step/sec: 5.13398
INFO:tensorflow:loss = 5.0420423, step = 683954 (19.473 sec)
INFO:tensorflow:lr = 1.0343717e-06 (19.473 sec)
INFO:tensorflow:global_step/sec: 5.24997
INFO:tensorflow:loss = 5.025483, step = 684054 (19.052 sec)
INFO:tensorflow:lr = 1.0333329e-06 (19.051 sec)
INFO:tensorflow:global_step/sec: 5.23556
INFO:tensorflow:loss = 4.983535, step = 684154 (19.096 sec)
INFO:tensorflow:lr = 1.0322944e-06 (19.101 sec)
INFO:tensorflow:global_step/sec: 5.23949
INFO:tensorflow:loss = 5.1789565, step = 684254 (19.085 sec)
INFO:tensorflow:lr = 1.0312577e-06 (19.083 sec)
INFO:tensorflow:global_step/sec: 5.22302
INFO:tensorflow:loss = 4.943187, step = 684354 (19.149 sec)
INFO:tensorflow:lr = 1.030222e-06 (19.148 sec)
INFO:tensorflow:global_step/sec: 5.23958
INFO:tensorflow:loss = 4.9531674, step = 684454 (19.082 sec)
INFO:tensorflow:lr = 1.0291868e-06 (19.082 sec)
INFO:tensorflow:global_step/sec: 5.24344
INFO:tensorflow:loss = 4.9054155, step = 684554 (19.077 sec)
INFO:tensorflow:lr = 1.0281532e-06 (19.077 sec)
INFO:tensorflow:global_step/sec: 5.2403
INFO:tensorflow:loss = 4.8892765, step = 684654 (19.082 sec)
INFO:tensorflow:lr = 1.02712e-06 (19.081 sec)
INFO:tensorflow:Saving checkpoints for 684663 into ../model/transformer/model.ckpt.
INFO:tensorflow:global_step/sec: 4.6532
INFO:tensorflow:loss = 4.949104, step = 684754 (21.489 sec)
INFO:tensorflow:lr = 1.0260884e-06 (21.491 sec)
INFO:tensorflow:global_step/sec: 5.24035
INFO:tensorflow:loss = 4.932747, step = 684854 (19.082 sec)
INFO:tensorflow:lr = 1.0250579e-06 (19.081 sec)
INFO:tensorflow:global_step/sec: 5.22195
INFO:tensorflow:loss = 4.951329, step = 684954 (19.153 sec)
INFO:tensorflow:lr = 1.0240279e-06 (19.153 sec)
INFO:tensorflow:global_step/sec: 5.23349
INFO:tensorflow:loss = 5.018296, step = 685054 (19.104 sec)
INFO:tensorflow:lr = 1.0229995e-06 (19.105 sec)
INFO:tensorflow:global_step/sec: 5.23416
INFO:tensorflow:loss = 4.9884877, step = 685154 (19.107 sec)
INFO:tensorflow:lr = 1.0219715e-06 (19.107 sec)
INFO:tensorflow:global_step/sec: 5.24345
INFO:tensorflow:loss = 4.849711, step = 685254 (19.068 sec)
INFO:tensorflow:lr = 1.0209452e-06 (19.069 sec)
INFO:tensorflow:global_step/sec: 5.24368
INFO:tensorflow:loss = 4.967324, step = 685354 (19.070 sec)
INFO:tensorflow:lr = 1.0199198e-06 (19.072 sec)
INFO:tensorflow:global_step/sec: 5.20932
INFO:tensorflow:loss = 5.0278196, step = 685454 (19.200 sec)
INFO:tensorflow:lr = 1.0188949e-06 (19.197 sec)
INFO:tensorflow:global_step/sec: 5.22981
INFO:tensorflow:loss = 4.9408293, step = 685554 (19.122 sec)
INFO:tensorflow:lr = 1.0178717e-06 (19.122 sec)
INFO:tensorflow:global_step/sec: 5.24021
INFO:tensorflow:loss = 5.025908, step = 685654 (19.081 sec)
INFO:tensorflow:lr = 1.0168488e-06 (19.082 sec)
INFO:tensorflow:global_step/sec: 5.2472
INFO:tensorflow:loss = 4.980867, step = 685754 (19.056 sec)
INFO:tensorflow:lr = 1.0158276e-06 (19.058 sec)
INFO:tensorflow:global_step/sec: 5.24583
INFO:tensorflow:loss = 4.9960117, step = 685854 (19.066 sec)
INFO:tensorflow:lr = 1.0148075e-06 (19.065 sec)
INFO:tensorflow:global_step/sec: 5.23404
INFO:tensorflow:loss = 5.016256, step = 685954 (19.105 sec)
INFO:tensorflow:lr = 1.0137876e-06 (19.103 sec)
INFO:tensorflow:global_step/sec: 5.22725
INFO:tensorflow:loss = 4.9670935, step = 686054 (19.130 sec)
INFO:tensorflow:lr = 1.0127695e-06 (19.130 sec)
INFO:tensorflow:global_step/sec: 5.23796
INFO:tensorflow:loss = 4.903215, step = 686154 (19.091 sec)
INFO:tensorflow:lr = 1.0117518e-06 (19.092 sec)
INFO:tensorflow:global_step/sec: 5.22434
INFO:tensorflow:loss = 4.9338617, step = 686254 (19.144 sec)
INFO:tensorflow:lr = 1.0107357e-06 (19.144 sec)
INFO:tensorflow:global_step/sec: 5.23697
INFO:tensorflow:loss = 4.9349976, step = 686354 (19.090 sec)
INFO:tensorflow:lr = 1.0097206e-06 (19.094 sec)
INFO:tensorflow:global_step/sec: 5.23
INFO:tensorflow:loss = 4.873442, step = 686454 (19.126 sec)
INFO:tensorflow:lr = 1.008706e-06 (19.120 sec)
INFO:tensorflow:global_step/sec: 5.23024
INFO:tensorflow:loss = 4.969791, step = 686554 (19.119 sec)
INFO:tensorflow:lr = 1.007693e-06 (19.119 sec)
INFO:tensorflow:global_step/sec: 5.23511
INFO:tensorflow:loss = 4.8918633, step = 686654 (19.101 sec)
INFO:tensorflow:lr = 1.0066803e-06 (19.101 sec)
INFO:tensorflow:global_step/sec: 5.24461
INFO:tensorflow:loss = 4.942817, step = 686754 (19.068 sec)
INFO:tensorflow:lr = 1.0056694e-06 (19.069 sec)
INFO:tensorflow:global_step/sec: 5.22647
INFO:tensorflow:loss = 4.9725113, step = 686854 (19.128 sec)
INFO:tensorflow:lr = 1.0046593e-06 (19.129 sec)
INFO:tensorflow:global_step/sec: 5.23216
INFO:tensorflow:loss = 4.8320837, step = 686954 (19.113 sec)
INFO:tensorflow:lr = 1.0036497e-06 (19.113 sec)
INFO:tensorflow:global_step/sec: 5.18872
INFO:tensorflow:loss = 5.0282636, step = 687054 (19.277 sec)
INFO:tensorflow:lr = 1.0026417e-06 (19.275 sec)
INFO:tensorflow:Saving checkpoints for 687153 into ../model/transformer/model.ckpt.
INFO:tensorflow:Loss for final step: 4.9582477.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-687153
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
------------
minimal test
Q: 你 好
A1: 你 好
A2: 你 好 !
A3: 你 也 好

Q: 早 上 好
A1: 早 上 好
A2: 早 安
A3: 早 上 好 !

Q: 晚 上 好
A1: 晚 上 好
A2: 晚 安
A3: 晚 好

Q: 再 见
A1: 哈 哈 哈
A2: 哈 哈
A3: 哈 哈 哈 哈 哈

Q: 好 久 不 见
A1: 好 久 不 见
A2: 是 啊
A3: 是 的

Q: 想 死 你 了
A1: 我 也 想 你
A2: 我 也 是
A3: 哈 哈 哈

Q: 谢 谢 你
A1: 不 客 气
A2: 不 用 客 气
A3: 不 客 气 啦

Q: 爱 你
A1: 我 也 爱 你
A2: 哈 哈 哈
A3: 我 不 爱 你

Q: 你 好 厉 害 啊
A1: 哈 哈 哈
A2: 那 必 须 的
A3: 哈 哈

Q: 你 叫 什 么
A1: 你 猜
A2: 我 叫 你
A3: 我 是 你 爸 爸

Q: 你 几 岁 了
A1: 岁
A2: 你 猜
A3: 岁 了

Q: 现 在 几 点
A1: 点 半
A2: 点 多
A3: 点 半 吧

Q: 今 天 天 气 怎 么 样
A1: 还 不 错
A2: 天 气 不 错
A3: 今 天 下 雨

Q: 我 们 现 在 在 哪 里
A1: 你 猜
A2: 你 在 哪
A3: 你 在 哪 呢

Q: 讲 个 笑 话
A1: 哈 哈 哈 哈
A2: 哈 哈 哈
A3: 哈 哈

Q: 你 会 几 种 语 言 呀
A1: 你 猜
A2: 不 知 道
A3: 不 知 道 啊

Q: 你 觉 得 我 帅 吗
A1: 不 是
A2: 你 觉 得 呢
A3: 不 帅

Q: 讨 厌 的 周 一
A1: 哈 哈 哈
A2: 哈 哈
A3: 哈 哈 哈 哈

Q: 好 烦 啊
A1: 哈 哈 哈
A2: 不 烦 不 烦
A3: 我 也 是

Q: 天 气 真 好
A1: 是 的
A2: 天 气 好
A3: 是 啊

Q: 今 天 好 冷
A1: 不 冷
A2: 这 几 天 都 冷
A3: 今 天 还 好

Q: 今 天 好 热
A1: 哈 哈
A2: 哈 哈 哈
A3: 哈 哈 哈 哈

Q: 下 雨 了
A1: 下 雨 了
A2: 是 啊
A3: 哈 哈

Q: 风 好 大
A1: 是 啊
A2: 哈 哈 哈
A3: 哈 哈

Q: 终 于 周 五 了
A1: 哈 哈 哈
A2: 哈 哈
A3: 哈 哈 哈 哈 哈

Q: 我 想 去 K 歌
A1: 去 吧
A2: 来 啊
A3: 我 也 想 去

Q: 红 烧 肉 吃 了 会 发 胖 吗
A1: 会 啊
A2: 不 会
A3: 不 会 啊

Q: 你 觉 得 梅 西 厉 害 吗
A1: 不 厉 害
A2: 哈 哈 哈 哈
A3: 哈 哈

------------
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-01-09T04:18:30Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-687153
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Reading ../data/core.txt
INFO:tensorflow:Finished evaluation at 2020-01-09-04:18:34
INFO:tensorflow:Saving dict for global step 687153: global_step = 687153, loss = 3.4968934
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 687153: ../model/transformer/model.ckpt-687153
INFO:tensorflow:Perplexity: 33.013
INFO:tensorflow:Best Perplexity: 29.570
Reading ../data/core.txt
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-687153
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Reading ../data/core.txt
INFO:tensorflow:BLEU-2: 12.2
INFO:tensorflow:Calling model_fn.
[<tf.Variable 'Embedding/fasttext_vectors:0' shape=(5904, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/affine_bias:0' shape=(5904,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/gamma:0' shape=(300,) dtype=float32_ref>]
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-687153
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 687153 into ../model/transformer/model.ckpt.
Reading ../data/train.txt
INFO:tensorflow:loss = 4.9720006, step = 687153
INFO:tensorflow:lr = 1.0016447e-06
INFO:tensorflow:global_step/sec: 4.07047
INFO:tensorflow:loss = 5.0158787, step = 687253 (24.569 sec)
INFO:tensorflow:lr = 1.0006381e-06 (24.567 sec)
INFO:tensorflow:global_step/sec: 5.15309
INFO:tensorflow:loss = 4.947072, step = 687353 (19.410 sec)
INFO:tensorflow:lr = 9.996332e-07 (19.409 sec)
INFO:tensorflow:global_step/sec: 5.12538
INFO:tensorflow:loss = 4.950285, step = 687453 (19.512 sec)
INFO:tensorflow:lr = 9.986293e-07 (19.513 sec)
INFO:tensorflow:global_step/sec: 5.14613
INFO:tensorflow:loss = 4.888521, step = 687553 (19.427 sec)
INFO:tensorflow:lr = 9.976258e-07 (19.426 sec)
INFO:tensorflow:global_step/sec: 5.13695
INFO:tensorflow:loss = 4.8836727, step = 687653 (19.471 sec)
INFO:tensorflow:lr = 9.96624e-07 (19.471 sec)
INFO:tensorflow:global_step/sec: 5.15475
INFO:tensorflow:loss = 4.87634, step = 687753 (19.400 sec)
INFO:tensorflow:lr = 9.956224e-07 (19.399 sec)
INFO:tensorflow:global_step/sec: 5.14644
INFO:tensorflow:loss = 4.932794, step = 687853 (19.430 sec)
INFO:tensorflow:lr = 9.946225e-07 (19.431 sec)
INFO:tensorflow:global_step/sec: 5.14827
INFO:tensorflow:loss = 4.9000716, step = 687953 (19.421 sec)
INFO:tensorflow:lr = 9.936236e-07 (19.420 sec)
INFO:tensorflow:global_step/sec: 5.12503
INFO:tensorflow:loss = 4.877325, step = 688053 (19.512 sec)
INFO:tensorflow:lr = 9.92625e-07 (19.513 sec)
INFO:tensorflow:global_step/sec: 5.12838
INFO:tensorflow:loss = 4.932374, step = 688153 (19.500 sec)
INFO:tensorflow:lr = 9.916283e-07 (19.499 sec)
INFO:tensorflow:global_step/sec: 5.12881
INFO:tensorflow:loss = 4.9321356, step = 688253 (19.501 sec)
INFO:tensorflow:lr = 9.906319e-07 (19.501 sec)
INFO:tensorflow:global_step/sec: 5.14213
INFO:tensorflow:loss = 4.881751, step = 688353 (19.446 sec)
INFO:tensorflow:lr = 9.896369e-07 (19.448 sec)
INFO:tensorflow:global_step/sec: 5.14256
INFO:tensorflow:loss = 4.861979, step = 688453 (19.447 sec)
INFO:tensorflow:lr = 9.886429e-07 (19.444 sec)
INFO:tensorflow:global_step/sec: 5.16191
INFO:tensorflow:loss = 4.940867, step = 688553 (19.373 sec)
INFO:tensorflow:lr = 9.876496e-07 (19.373 sec)
INFO:tensorflow:global_step/sec: 5.15352
INFO:tensorflow:loss = 5.089053, step = 688653 (19.403 sec)
INFO:tensorflow:lr = 9.866578e-07 (19.403 sec)
INFO:tensorflow:global_step/sec: 5.16113
INFO:tensorflow:loss = 5.0857563, step = 688753 (19.376 sec)
INFO:tensorflow:lr = 9.856662e-07 (19.378 sec)
INFO:tensorflow:global_step/sec: 5.15127
INFO:tensorflow:loss = 4.898282, step = 688853 (19.410 sec)
INFO:tensorflow:lr = 9.846764e-07 (19.410 sec)
INFO:tensorflow:global_step/sec: 5.15036
INFO:tensorflow:loss = 4.794425, step = 688953 (19.416 sec)
INFO:tensorflow:lr = 9.836874e-07 (19.416 sec)
INFO:tensorflow:global_step/sec: 5.10861
INFO:tensorflow:loss = 4.9453306, step = 689053 (19.579 sec)
INFO:tensorflow:lr = 9.826989e-07 (19.578 sec)
INFO:tensorflow:global_step/sec: 5.14642
INFO:tensorflow:loss = 4.946598, step = 689153 (19.431 sec)
INFO:tensorflow:lr = 9.81712e-07 (19.429 sec)
INFO:tensorflow:global_step/sec: 5.1385
INFO:tensorflow:loss = 4.882892, step = 689253 (19.461 sec)
INFO:tensorflow:lr = 9.807255e-07 (19.461 sec)
INFO:tensorflow:global_step/sec: 5.16508
INFO:tensorflow:loss = 4.9224277, step = 689353 (19.361 sec)
INFO:tensorflow:lr = 9.797405e-07 (19.361 sec)
INFO:tensorflow:global_step/sec: 5.14114
INFO:tensorflow:loss = 5.0047474, step = 689453 (19.448 sec)
INFO:tensorflow:lr = 9.787566e-07 (19.447 sec)
INFO:tensorflow:global_step/sec: 5.11009
INFO:tensorflow:loss = 4.8117814, step = 689553 (19.568 sec)
INFO:tensorflow:lr = 9.777732e-07 (19.569 sec)
INFO:tensorflow:global_step/sec: 5.13838
INFO:tensorflow:loss = 4.966133, step = 689653 (19.461 sec)
INFO:tensorflow:lr = 9.767911e-07 (19.461 sec)
INFO:tensorflow:global_step/sec: 5.15577
INFO:tensorflow:loss = 4.9660034, step = 689753 (19.400 sec)
INFO:tensorflow:lr = 9.758095e-07 (19.399 sec)
INFO:tensorflow:global_step/sec: 5.12688
INFO:tensorflow:loss = 4.933957, step = 689853 (19.506 sec)
INFO:tensorflow:lr = 9.748295e-07 (19.507 sec)
INFO:tensorflow:global_step/sec: 5.16856
INFO:tensorflow:loss = 4.911001, step = 689953 (19.348 sec)
INFO:tensorflow:lr = 9.738505e-07 (19.347 sec)
INFO:tensorflow:global_step/sec: 5.14948
INFO:tensorflow:loss = 4.953211, step = 690053 (19.417 sec)
INFO:tensorflow:lr = 9.728719e-07 (19.417 sec)
INFO:tensorflow:global_step/sec: 5.13769
INFO:tensorflow:loss = 4.946312, step = 690153 (19.465 sec)
INFO:tensorflow:lr = 9.718948e-07 (19.465 sec)
INFO:tensorflow:global_step/sec: 5.15816
INFO:tensorflow:loss = 4.879356, step = 690253 (19.387 sec)
INFO:tensorflow:lr = 9.709182e-07 (19.388 sec)
INFO:tensorflow:global_step/sec: 5.13721
INFO:tensorflow:loss = 4.951071, step = 690353 (19.472 sec)
INFO:tensorflow:lr = 9.699431e-07 (19.472 sec)
INFO:tensorflow:Saving checkpoints for 690356 into ../model/transformer/model.ckpt.
INFO:tensorflow:global_step/sec: 4.4885
INFO:tensorflow:loss = 4.8830557, step = 690453 (22.272 sec)
INFO:tensorflow:lr = 9.689691e-07 (22.272 sec)
INFO:tensorflow:global_step/sec: 5.16238
INFO:tensorflow:loss = 4.9147615, step = 690553 (19.368 sec)
INFO:tensorflow:lr = 9.679953e-07 (19.370 sec)
INFO:tensorflow:global_step/sec: 5.13505
INFO:tensorflow:loss = 4.9561605, step = 690653 (19.474 sec)
INFO:tensorflow:lr = 9.670232e-07 (19.475 sec)
INFO:tensorflow:global_step/sec: 5.13745
INFO:tensorflow:loss = 4.9421735, step = 690753 (19.465 sec)
INFO:tensorflow:lr = 9.660514e-07 (19.463 sec)
INFO:tensorflow:global_step/sec: 5.12989
INFO:tensorflow:loss = 4.897192, step = 690853 (19.494 sec)
INFO:tensorflow:lr = 9.650813e-07 (19.492 sec)
INFO:tensorflow:global_step/sec: 5.11341
INFO:tensorflow:loss = 4.889584, step = 690953 (19.561 sec)
INFO:tensorflow:lr = 9.64112e-07 (19.562 sec)
INFO:tensorflow:global_step/sec: 5.13713
INFO:tensorflow:loss = 4.9040337, step = 691053 (19.465 sec)
INFO:tensorflow:lr = 9.631432e-07 (19.464 sec)
INFO:tensorflow:global_step/sec: 5.10521
INFO:tensorflow:loss = 4.8790727, step = 691153 (19.587 sec)
INFO:tensorflow:lr = 9.621759e-07 (19.588 sec)
INFO:tensorflow:global_step/sec: 5.12288
INFO:tensorflow:loss = 4.970057, step = 691253 (19.517 sec)
INFO:tensorflow:lr = 9.61209e-07 (19.519 sec)
INFO:tensorflow:global_step/sec: 5.14992
INFO:tensorflow:loss = 4.9955077, step = 691353 (19.419 sec)
INFO:tensorflow:lr = 9.602437e-07 (19.417 sec)
INFO:tensorflow:global_step/sec: 5.14649
INFO:tensorflow:loss = 4.946452, step = 691453 (19.433 sec)
INFO:tensorflow:lr = 9.592793e-07 (19.433 sec)
INFO:tensorflow:global_step/sec: 5.16111
INFO:tensorflow:loss = 4.90373, step = 691553 (19.376 sec)
INFO:tensorflow:lr = 9.583154e-07 (19.376 sec)
INFO:tensorflow:global_step/sec: 5.16038
INFO:tensorflow:loss = 4.9902706, step = 691653 (19.374 sec)
INFO:tensorflow:lr = 9.57353e-07 (19.374 sec)
INFO:tensorflow:global_step/sec: 5.14604
INFO:tensorflow:loss = 4.961497, step = 691753 (19.436 sec)
INFO:tensorflow:lr = 9.56391e-07 (19.437 sec)
INFO:tensorflow:global_step/sec: 5.16775
INFO:tensorflow:loss = 4.944961, step = 691853 (19.351 sec)
INFO:tensorflow:lr = 9.554304e-07 (19.350 sec)
INFO:tensorflow:global_step/sec: 5.14911
INFO:tensorflow:loss = 4.911937, step = 691953 (19.424 sec)
INFO:tensorflow:lr = 9.544709e-07 (19.424 sec)
INFO:tensorflow:global_step/sec: 5.16528
INFO:tensorflow:loss = 4.9416304, step = 692053 (19.357 sec)
INFO:tensorflow:lr = 9.535118e-07 (19.358 sec)
INFO:tensorflow:global_step/sec: 5.12058
INFO:tensorflow:loss = 4.863657, step = 692153 (19.529 sec)
INFO:tensorflow:lr = 9.5255416e-07 (19.528 sec)
INFO:tensorflow:global_step/sec: 5.15008
INFO:tensorflow:loss = 4.9188476, step = 692253 (19.416 sec)
INFO:tensorflow:lr = 9.51597e-07 (19.416 sec)
INFO:tensorflow:global_step/sec: 5.16183
INFO:tensorflow:loss = 4.9743, step = 692353 (19.370 sec)
INFO:tensorflow:lr = 9.5064127e-07 (19.373 sec)
INFO:tensorflow:global_step/sec: 5.14159
INFO:tensorflow:loss = 5.017494, step = 692453 (19.451 sec)
INFO:tensorflow:lr = 9.496865e-07 (19.449 sec)
INFO:tensorflow:global_step/sec: 5.14335
INFO:tensorflow:loss = 4.963124, step = 692553 (19.444 sec)
INFO:tensorflow:lr = 9.487323e-07 (19.443 sec)
INFO:tensorflow:global_step/sec: 5.15716
INFO:tensorflow:loss = 4.9037776, step = 692653 (19.391 sec)
INFO:tensorflow:lr = 9.477794e-07 (19.391 sec)
INFO:tensorflow:global_step/sec: 5.09561
INFO:tensorflow:loss = 5.001163, step = 692753 (19.621 sec)
INFO:tensorflow:lr = 9.468271e-07 (19.623 sec)
INFO:tensorflow:global_step/sec: 5.14136
INFO:tensorflow:loss = 4.965095, step = 692853 (19.454 sec)
INFO:tensorflow:lr = 9.4587614e-07 (19.451 sec)
INFO:tensorflow:global_step/sec: 5.14524
INFO:tensorflow:loss = 4.9764204, step = 692953 (19.435 sec)
INFO:tensorflow:lr = 9.449262e-07 (19.436 sec)
INFO:tensorflow:global_step/sec: 5.13134
INFO:tensorflow:loss = 4.902689, step = 693053 (19.485 sec)
INFO:tensorflow:lr = 9.439766e-07 (19.488 sec)
INFO:tensorflow:global_step/sec: 5.1467
INFO:tensorflow:loss = 5.0218444, step = 693153 (19.429 sec)
INFO:tensorflow:lr = 9.430286e-07 (19.428 sec)
INFO:tensorflow:global_step/sec: 5.1679
INFO:tensorflow:loss = 5.016225, step = 693253 (19.353 sec)
INFO:tensorflow:lr = 9.42081e-07 (19.352 sec)
INFO:tensorflow:global_step/sec: 5.1411
INFO:tensorflow:loss = 4.998582, step = 693353 (19.449 sec)
INFO:tensorflow:lr = 9.4113494e-07 (19.448 sec)
INFO:tensorflow:global_step/sec: 5.15021
INFO:tensorflow:loss = 5.0632377, step = 693453 (19.420 sec)
INFO:tensorflow:lr = 9.401898e-07 (19.420 sec)
INFO:tensorflow:global_step/sec: 5.17014
INFO:tensorflow:loss = 4.850567, step = 693553 (19.343 sec)
INFO:tensorflow:lr = 9.3924496e-07 (19.344 sec)
INFO:tensorflow:Saving checkpoints for 693559 into ../model/transformer/model.ckpt.
INFO:tensorflow:global_step/sec: 4.56643
INFO:tensorflow:loss = 4.8500886, step = 693653 (21.899 sec)
INFO:tensorflow:lr = 9.383017e-07 (21.903 sec)
INFO:tensorflow:global_step/sec: 5.12657
INFO:tensorflow:loss = 4.9538803, step = 693753 (19.505 sec)
INFO:tensorflow:lr = 9.373588e-07 (19.500 sec)
INFO:tensorflow:global_step/sec: 5.14261
INFO:tensorflow:loss = 4.9473305, step = 693853 (19.443 sec)
INFO:tensorflow:lr = 9.3641734e-07 (19.442 sec)
INFO:tensorflow:global_step/sec: 5.17199
INFO:tensorflow:loss = 4.9755497, step = 693953 (19.336 sec)
INFO:tensorflow:lr = 9.35477e-07 (19.335 sec)
INFO:tensorflow:global_step/sec: 5.16375
INFO:tensorflow:loss = 4.9232326, step = 694053 (19.370 sec)
INFO:tensorflow:lr = 9.3453696e-07 (19.370 sec)
INFO:tensorflow:global_step/sec: 5.15273
INFO:tensorflow:loss = 4.9926343, step = 694153 (19.407 sec)
INFO:tensorflow:lr = 9.335984e-07 (19.407 sec)
INFO:tensorflow:global_step/sec: 5.18547
INFO:tensorflow:loss = 4.953304, step = 694253 (19.280 sec)
INFO:tensorflow:lr = 9.326602e-07 (19.283 sec)
INFO:tensorflow:global_step/sec: 5.11248
INFO:tensorflow:loss = 4.853437, step = 694353 (19.565 sec)
INFO:tensorflow:lr = 9.3172355e-07 (19.561 sec)
INFO:tensorflow:global_step/sec: 5.17004
INFO:tensorflow:loss = 4.8413033, step = 694453 (19.339 sec)
INFO:tensorflow:lr = 9.307878e-07 (19.339 sec)
INFO:tensorflow:global_step/sec: 5.16816
INFO:tensorflow:loss = 4.913798, step = 694553 (19.353 sec)
INFO:tensorflow:lr = 9.298525e-07 (19.352 sec)
INFO:tensorflow:global_step/sec: 5.152
INFO:tensorflow:loss = 4.890079, step = 694653 (19.410 sec)
INFO:tensorflow:lr = 9.2891867e-07 (19.410 sec)
INFO:tensorflow:global_step/sec: 5.17531
INFO:tensorflow:loss = 4.9300404, step = 694753 (19.318 sec)
INFO:tensorflow:lr = 9.279851e-07 (19.318 sec)
INFO:tensorflow:global_step/sec: 5.14586
INFO:tensorflow:loss = 4.8607388, step = 694853 (19.437 sec)
INFO:tensorflow:lr = 9.270532e-07 (19.438 sec)
INFO:tensorflow:global_step/sec: 5.17125
INFO:tensorflow:loss = 4.878534, step = 694953 (19.339 sec)
INFO:tensorflow:lr = 9.261222e-07 (19.339 sec)
INFO:tensorflow:global_step/sec: 5.14974
INFO:tensorflow:loss = 5.0475698, step = 695053 (19.413 sec)
INFO:tensorflow:lr = 9.2519156e-07 (19.416 sec)
INFO:tensorflow:global_step/sec: 5.16363
INFO:tensorflow:loss = 4.999761, step = 695153 (19.371 sec)
INFO:tensorflow:lr = 9.2426245e-07 (19.368 sec)
INFO:tensorflow:global_step/sec: 5.13734
INFO:tensorflow:loss = 4.8577085, step = 695253 (19.461 sec)
INFO:tensorflow:lr = 9.2333363e-07 (19.461 sec)
INFO:tensorflow:global_step/sec: 5.1486
INFO:tensorflow:loss = 4.947336, step = 695353 (19.423 sec)
INFO:tensorflow:lr = 9.224063e-07 (19.423 sec)
INFO:tensorflow:global_step/sec: 5.165
INFO:tensorflow:loss = 5.0686617, step = 695453 (19.361 sec)
INFO:tensorflow:lr = 9.2147997e-07 (19.362 sec)
INFO:tensorflow:global_step/sec: 5.16462
INFO:tensorflow:loss = 4.876643, step = 695553 (19.367 sec)
INFO:tensorflow:lr = 9.20554e-07 (19.366 sec)
INFO:tensorflow:global_step/sec: 5.15925
INFO:tensorflow:loss = 4.9856434, step = 695653 (19.384 sec)
INFO:tensorflow:lr = 9.1962954e-07 (19.384 sec)
INFO:tensorflow:global_step/sec: 5.15793
INFO:tensorflow:loss = 4.9645915, step = 695753 (19.382 sec)
INFO:tensorflow:lr = 9.187054e-07 (19.383 sec)
INFO:tensorflow:global_step/sec: 5.1622
INFO:tensorflow:loss = 4.907149, step = 695853 (19.372 sec)
INFO:tensorflow:lr = 9.177827e-07 (19.372 sec)
INFO:tensorflow:global_step/sec: 5.14102
INFO:tensorflow:loss = 4.9918942, step = 695953 (19.455 sec)
INFO:tensorflow:lr = 9.1686104e-07 (19.456 sec)
Reading ../data/test.txt
INFO:tensorflow:global_step/sec: 5.16403
INFO:tensorflow:loss = 4.867622, step = 696053 (19.361 sec)
INFO:tensorflow:lr = 9.1593967e-07 (19.360 sec)
INFO:tensorflow:global_step/sec: 5.24015
INFO:tensorflow:loss = 4.9409776, step = 696153 (19.087 sec)
INFO:tensorflow:lr = 9.1501977e-07 (19.087 sec)
INFO:tensorflow:global_step/sec: 5.26928
INFO:tensorflow:loss = 5.0770802, step = 696253 (18.978 sec)
INFO:tensorflow:lr = 9.141002e-07 (18.979 sec)
INFO:tensorflow:global_step/sec: 5.25863
INFO:tensorflow:loss = 4.8067923, step = 696353 (19.013 sec)
INFO:tensorflow:lr = 9.1318225e-07 (19.012 sec)
INFO:tensorflow:global_step/sec: 5.25129
INFO:tensorflow:loss = 4.978089, step = 696453 (19.047 sec)
INFO:tensorflow:lr = 9.1226514e-07 (19.046 sec)
INFO:tensorflow:global_step/sec: 5.25326
INFO:tensorflow:loss = 4.87521, step = 696553 (19.037 sec)
INFO:tensorflow:lr = 9.113485e-07 (19.037 sec)
INFO:tensorflow:global_step/sec: 5.25727
INFO:tensorflow:loss = 4.8826404, step = 696653 (19.021 sec)
INFO:tensorflow:lr = 9.1043324e-07 (19.022 sec)
INFO:tensorflow:global_step/sec: 5.25922
INFO:tensorflow:loss = 4.9340186, step = 696753 (19.014 sec)
INFO:tensorflow:lr = 9.0951823e-07 (19.016 sec)
INFO:tensorflow:Saving checkpoints for 696762 into ../model/transformer/model.ckpt.
INFO:tensorflow:global_step/sec: 4.68879
INFO:tensorflow:loss = 4.908461, step = 696853 (21.325 sec)
INFO:tensorflow:lr = 9.086049e-07 (21.323 sec)
INFO:tensorflow:global_step/sec: 5.24782
INFO:tensorflow:loss = 4.926461, step = 696953 (19.055 sec)
INFO:tensorflow:lr = 9.076924e-07 (19.055 sec)
INFO:tensorflow:global_step/sec: 5.26295
INFO:tensorflow:loss = 5.0209713, step = 697053 (18.999 sec)
INFO:tensorflow:lr = 9.0678026e-07 (19.002 sec)
INFO:tensorflow:global_step/sec: 5.25017
INFO:tensorflow:loss = 4.965448, step = 697153 (19.051 sec)
INFO:tensorflow:lr = 9.0586957e-07 (19.047 sec)
INFO:tensorflow:global_step/sec: 5.25087
INFO:tensorflow:loss = 4.9258814, step = 697253 (19.042 sec)
INFO:tensorflow:lr = 9.0495934e-07 (19.046 sec)
INFO:tensorflow:global_step/sec: 5.25205
INFO:tensorflow:loss = 4.9021935, step = 697353 (19.043 sec)
INFO:tensorflow:lr = 9.0405047e-07 (19.039 sec)
INFO:tensorflow:global_step/sec: 5.2591
INFO:tensorflow:loss = 5.1157613, step = 697453 (19.016 sec)
INFO:tensorflow:lr = 9.0314256e-07 (19.017 sec)
INFO:tensorflow:global_step/sec: 5.21726
INFO:tensorflow:loss = 4.9752507, step = 697553 (19.168 sec)
INFO:tensorflow:lr = 9.0223494e-07 (19.166 sec)
INFO:tensorflow:global_step/sec: 5.26029
INFO:tensorflow:loss = 5.0111666, step = 697653 (19.005 sec)
INFO:tensorflow:lr = 9.0132886e-07 (19.008 sec)
INFO:tensorflow:global_step/sec: 5.2413
INFO:tensorflow:loss = 5.0124707, step = 697753 (19.085 sec)
INFO:tensorflow:lr = 9.0042306e-07 (19.082 sec)
INFO:tensorflow:global_step/sec: 5.23811
INFO:tensorflow:loss = 4.9163876, step = 697853 (19.090 sec)
INFO:tensorflow:lr = 8.9951885e-07 (19.090 sec)
INFO:tensorflow:global_step/sec: 5.20648
INFO:tensorflow:loss = 4.9446316, step = 697953 (19.207 sec)
INFO:tensorflow:lr = 8.986154e-07 (19.207 sec)
INFO:tensorflow:global_step/sec: 5.19999
INFO:tensorflow:loss = 4.949003, step = 698053 (19.227 sec)
INFO:tensorflow:lr = 8.977125e-07 (19.228 sec)
INFO:tensorflow:global_step/sec: 5.20544
INFO:tensorflow:loss = 5.1094794, step = 698153 (19.210 sec)
INFO:tensorflow:lr = 8.9681095e-07 (19.210 sec)
INFO:tensorflow:global_step/sec: 5.22954
INFO:tensorflow:loss = 4.9431233, step = 698253 (19.122 sec)
INFO:tensorflow:lr = 8.959097e-07 (19.121 sec)
INFO:tensorflow:global_step/sec: 5.23592
INFO:tensorflow:loss = 4.9271502, step = 698353 (19.101 sec)
INFO:tensorflow:lr = 8.950099e-07 (19.102 sec)
INFO:tensorflow:global_step/sec: 5.23593
INFO:tensorflow:loss = 5.039077, step = 698453 (19.096 sec)
INFO:tensorflow:lr = 8.9411117e-07 (19.097 sec)
INFO:tensorflow:global_step/sec: 5.23763
INFO:tensorflow:loss = 4.8878818, step = 698553 (19.093 sec)
INFO:tensorflow:lr = 8.932126e-07 (19.091 sec)
INFO:tensorflow:global_step/sec: 5.21924
INFO:tensorflow:loss = 4.943321, step = 698653 (19.161 sec)
INFO:tensorflow:lr = 8.9231554e-07 (19.162 sec)
INFO:tensorflow:global_step/sec: 5.19762
INFO:tensorflow:loss = 4.961584, step = 698753 (19.239 sec)
INFO:tensorflow:lr = 8.914189e-07 (19.237 sec)
INFO:tensorflow:global_step/sec: 5.21595
INFO:tensorflow:loss = 4.8739753, step = 698853 (19.172 sec)
INFO:tensorflow:lr = 8.9052367e-07 (19.173 sec)
INFO:tensorflow:global_step/sec: 5.23967
INFO:tensorflow:loss = 4.856079, step = 698953 (19.089 sec)
INFO:tensorflow:lr = 8.8962935e-07 (19.088 sec)
INFO:tensorflow:global_step/sec: 5.23466
INFO:tensorflow:loss = 4.980717, step = 699053 (19.104 sec)
INFO:tensorflow:lr = 8.887353e-07 (19.105 sec)
INFO:tensorflow:global_step/sec: 5.20627
INFO:tensorflow:loss = 5.0072045, step = 699153 (19.206 sec)
INFO:tensorflow:lr = 8.878428e-07 (19.206 sec)
INFO:tensorflow:Saving checkpoints for 699252 into ../model/transformer/model.ckpt.
INFO:tensorflow:Loss for final step: 4.865859.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-699252
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
------------
minimal test
Q: 你 好
A1: 你 好
A2: 你 好 你 好
A3: 你 也 好

Q: 早 上 好
A1: 早 上 好
A2: 下 午 好
A3: 早 安

Q: 晚 上 好
A1: 晚 上 好
A2: 晚 安
A3: 晚 上 好 !

Q: 再 见
A1: 图 片 评 论
A2: 哈 哈 哈 哈
A3: 哈 哈 哈

Q: 好 久 不 见
A1: 好 久 不 见
A2: 是 啊
A3: 是 的

Q: 想 死 你 了
A1: 哈 哈
A2: 哈 哈 哈 哈 哈
A3: 我 也 是

Q: 谢 谢 你
A1: 不 客 气
A2: 不 用 谢
A3: 客 气

Q: 爱 你
A1: 我 也 爱 你
A2: 么 么 哒
A3: 爱 你

Q: 你 好 厉 害 啊
A1: 哈 哈
A2: 哈 哈 哈
A3: 你 也 厉 害

Q: 你 叫 什 么
A1: 你 猜
A2: 我 叫 你
A3: 我 是 小 可 爱

Q: 你 几 岁 了
A1: 三 岁
A2: 你 猜
A3: 十 五

Q: 现 在 几 点
A1: 点 多
A2: 十 二 点
A3: 点 半

Q: 今 天 天 气 怎 么 样
A1: 还 不 错
A2: 还 好
A3: 还 可 以

Q: 我 们 现 在 在 哪 里
A1: 在 哪
A2: 在 家
A3: 你 们 在 哪 里

Q: 讲 个 笑 话
A1: 图 片 评 论
A2: 哈 哈 哈 哈
A3: 哈 哈 哈

Q: 你 会 几 种 语 言 呀
A1: 一 般
A2: 不 会
A3: 不 知 道

Q: 你 觉 得 我 帅 吗
A1: 不 帅
A2: 帅 帅
A3: 没 有

Q: 讨 厌 的 周 一
A1: 哈 哈 哈
A2: 哈 哈 哈 哈
A3: 哈 哈

Q: 好 烦 啊
A1: 怎 么 了
A2: 咋 啦
A3: 怎 么 啦

Q: 天 气 真 好
A1: 是 啊
A2: 嗯 嗯
A3: 是 的

Q: 今 天 好 冷
A1: 是 啊
A2: 哈 哈
A3: 哈 哈 哈

Q: 今 天 好 热
A1: 哈 哈 哈
A2: 哈 哈
A3: 是 啊

Q: 下 雨 了
A1: 哈 哈 哈
A2: 哈 哈
A3: 下 了

Q: 风 好 大
A1: 哈 哈
A2: 是 的
A3: 哈 哈 哈

Q: 终 于 周 五 了
A1: 哈 哈 哈
A2: 哈 哈 哈 哈
A3: 是 的

Q: 我 想 去 K 歌
A1: 去 吧
A2: 来 来 来
A3: 来 啊

Q: 红 烧 肉 吃 了 会 发 胖 吗
A1: 不 会
A2: 不 会 啊
A3: 不 会 不 会

Q: 你 觉 得 梅 西 厉 害 吗
A1: 不 厉 害
A2: 没 有
A3: 我 觉 得 不 厉 害

------------
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-01-09T04:59:45Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-699252
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Reading ../data/core.txt
INFO:tensorflow:Finished evaluation at 2020-01-09-04:59:49
INFO:tensorflow:Saving dict for global step 699252: global_step = 699252, loss = 3.3806677
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 699252: ../model/transformer/model.ckpt-699252
INFO:tensorflow:Perplexity: 29.390
INFO:tensorflow:Best Perplexity: 29.390
Reading ../data/core.txt
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-699252
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Reading ../data/core.txt
INFO:tensorflow:BLEU-2: 12.1
INFO:tensorflow:Calling model_fn.
[<tf.Variable 'Embedding/fasttext_vectors:0' shape=(5904, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/affine_bias:0' shape=(5904,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/gamma:0' shape=(300,) dtype=float32_ref>]
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-699252
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 699252 into ../model/transformer/model.ckpt.
Reading ../data/train.txt
INFO:tensorflow:loss = 4.9029717, step = 699252
INFO:tensorflow:lr = 8.8695987e-07
INFO:tensorflow:global_step/sec: 4.04986
INFO:tensorflow:loss = 5.076115, step = 699352 (24.694 sec)
INFO:tensorflow:lr = 8.860685e-07 (24.695 sec)
INFO:tensorflow:global_step/sec: 5.14416
INFO:tensorflow:loss = 4.88351, step = 699452 (19.439 sec)
INFO:tensorflow:lr = 8.8517874e-07 (19.440 sec)
INFO:tensorflow:global_step/sec: 5.15773
INFO:tensorflow:loss = 4.961347, step = 699552 (19.392 sec)
INFO:tensorflow:lr = 8.842892e-07 (19.392 sec)
INFO:tensorflow:global_step/sec: 5.12904
INFO:tensorflow:loss = 4.9886494, step = 699652 (19.493 sec)
INFO:tensorflow:lr = 8.834011e-07 (19.495 sec)
INFO:tensorflow:global_step/sec: 5.13132
INFO:tensorflow:loss = 4.932857, step = 699752 (19.492 sec)
INFO:tensorflow:lr = 8.825139e-07 (19.489 sec)
INFO:tensorflow:global_step/sec: 5.15809
INFO:tensorflow:loss = 4.941634, step = 699852 (19.384 sec)
INFO:tensorflow:lr = 8.8162705e-07 (19.385 sec)
INFO:tensorflow:global_step/sec: 5.1414
INFO:tensorflow:loss = 4.9530053, step = 699952 (19.455 sec)
INFO:tensorflow:lr = 8.8074165e-07 (19.453 sec)
INFO:tensorflow:global_step/sec: 5.14868
INFO:tensorflow:loss = 4.9832115, step = 700052 (19.421 sec)
INFO:tensorflow:lr = 8.7985654e-07 (19.422 sec)
INFO:tensorflow:global_step/sec: 5.12372
INFO:tensorflow:loss = 4.844157, step = 700152 (19.517 sec)
INFO:tensorflow:lr = 8.7897297e-07 (19.518 sec)
INFO:tensorflow:global_step/sec: 5.12918
INFO:tensorflow:loss = 4.877806, step = 700252 (19.493 sec)
INFO:tensorflow:lr = 8.780902e-07 (19.496 sec)
INFO:tensorflow:global_step/sec: 5.16467
INFO:tensorflow:loss = 4.9987435, step = 700352 (19.362 sec)
INFO:tensorflow:lr = 8.7720787e-07 (19.358 sec)
INFO:tensorflow:global_step/sec: 5.1545
INFO:tensorflow:loss = 5.0221505, step = 700452 (19.404 sec)
INFO:tensorflow:lr = 8.7632696e-07 (19.404 sec)
INFO:tensorflow:global_step/sec: 5.13811
INFO:tensorflow:loss = 4.890877, step = 700552 (19.463 sec)
INFO:tensorflow:lr = 8.754463e-07 (19.463 sec)
INFO:tensorflow:global_step/sec: 5.15048
INFO:tensorflow:loss = 4.8903313, step = 700652 (19.416 sec)
INFO:tensorflow:lr = 8.745671e-07 (19.416 sec)
INFO:tensorflow:global_step/sec: 5.16215
INFO:tensorflow:loss = 4.898708, step = 700752 (19.370 sec)
INFO:tensorflow:lr = 8.736888e-07 (19.371 sec)
INFO:tensorflow:global_step/sec: 5.14517
INFO:tensorflow:loss = 4.896408, step = 700852 (19.434 sec)
INFO:tensorflow:lr = 8.728108e-07 (19.435 sec)
INFO:tensorflow:global_step/sec: 5.13916
INFO:tensorflow:loss = 4.9617805, step = 700952 (19.457 sec)
INFO:tensorflow:lr = 8.719343e-07 (19.457 sec)
INFO:tensorflow:global_step/sec: 5.16515
INFO:tensorflow:loss = 4.961663, step = 701052 (19.360 sec)
INFO:tensorflow:lr = 8.7105815e-07 (19.363 sec)
INFO:tensorflow:global_step/sec: 5.13841
INFO:tensorflow:loss = 5.011018, step = 701152 (19.461 sec)
INFO:tensorflow:lr = 8.701833e-07 (19.461 sec)
INFO:tensorflow:global_step/sec: 5.16846
INFO:tensorflow:loss = 4.8541656, step = 701252 (19.348 sec)
INFO:tensorflow:lr = 8.693094e-07 (19.348 sec)
INFO:tensorflow:global_step/sec: 5.14747
INFO:tensorflow:loss = 4.93289, step = 701352 (19.427 sec)
INFO:tensorflow:lr = 8.6843585e-07 (19.425 sec)
INFO:tensorflow:global_step/sec: 5.1613
INFO:tensorflow:loss = 4.9462137, step = 701452 (19.379 sec)
INFO:tensorflow:lr = 8.6756364e-07 (19.378 sec)
INFO:tensorflow:global_step/sec: 5.12899
INFO:tensorflow:loss = 4.888146, step = 701552 (19.502 sec)
INFO:tensorflow:lr = 8.666918e-07 (19.502 sec)
INFO:tensorflow:global_step/sec: 5.15389
INFO:tensorflow:loss = 4.809617, step = 701652 (19.394 sec)
INFO:tensorflow:lr = 8.658214e-07 (19.397 sec)
INFO:tensorflow:global_step/sec: 5.09483
INFO:tensorflow:loss = 5.0501204, step = 701752 (19.631 sec)
INFO:tensorflow:lr = 8.6495186e-07 (19.627 sec)
INFO:tensorflow:global_step/sec: 5.14973
INFO:tensorflow:loss = 4.9791865, step = 701852 (19.415 sec)
INFO:tensorflow:lr = 8.640828e-07 (19.416 sec)
INFO:tensorflow:global_step/sec: 5.1608
INFO:tensorflow:loss = 4.9464736, step = 701952 (19.377 sec)
INFO:tensorflow:lr = 8.63215e-07 (19.376 sec)
INFO:tensorflow:global_step/sec: 5.14924
INFO:tensorflow:loss = 5.0071855, step = 702052 (19.426 sec)
INFO:tensorflow:lr = 8.6234746e-07 (19.427 sec)
INFO:tensorflow:global_step/sec: 5.13022
INFO:tensorflow:loss = 4.973911, step = 702152 (19.492 sec)
INFO:tensorflow:lr = 8.6148145e-07 (19.491 sec)
INFO:tensorflow:global_step/sec: 5.14548
INFO:tensorflow:loss = 4.8843446, step = 702252 (19.433 sec)
INFO:tensorflow:lr = 8.606163e-07 (19.433 sec)
INFO:tensorflow:global_step/sec: 5.15366
INFO:tensorflow:loss = 4.942857, step = 702352 (19.402 sec)
INFO:tensorflow:lr = 8.597514e-07 (19.401 sec)
INFO:tensorflow:global_step/sec: 5.16458
INFO:tensorflow:loss = 4.943082, step = 702452 (19.364 sec)
INFO:tensorflow:lr = 8.5888803e-07 (19.366 sec)
INFO:tensorflow:Saving checkpoints for 702455 into ../model/transformer/model.ckpt.
INFO:tensorflow:global_step/sec: 4.58546
INFO:tensorflow:loss = 4.925069, step = 702552 (21.810 sec)
INFO:tensorflow:lr = 8.5802503e-07 (21.810 sec)
INFO:tensorflow:global_step/sec: 5.1413
INFO:tensorflow:loss = 4.815398, step = 702652 (19.449 sec)
INFO:tensorflow:lr = 8.571633e-07 (19.448 sec)
INFO:tensorflow:global_step/sec: 5.15029
INFO:tensorflow:loss = 4.8284645, step = 702752 (19.413 sec)
INFO:tensorflow:lr = 8.5630245e-07 (19.417 sec)
INFO:tensorflow:global_step/sec: 5.14864
INFO:tensorflow:loss = 5.0910616, step = 702852 (19.425 sec)
INFO:tensorflow:lr = 8.5544195e-07 (19.422 sec)
INFO:tensorflow:global_step/sec: 5.04477
INFO:tensorflow:loss = 5.048201, step = 702952 (19.827 sec)
INFO:tensorflow:lr = 8.545828e-07 (19.828 sec)
INFO:tensorflow:global_step/sec: 5.13189
INFO:tensorflow:loss = 4.982502, step = 703052 (19.486 sec)
INFO:tensorflow:lr = 8.537241e-07 (19.485 sec)
INFO:tensorflow:global_step/sec: 5.11839
INFO:tensorflow:loss = 4.9726458, step = 703152 (19.535 sec)
INFO:tensorflow:lr = 8.5286666e-07 (19.535 sec)
INFO:tensorflow:global_step/sec: 5.16094
INFO:tensorflow:loss = 4.984554, step = 703252 (19.371 sec)
INFO:tensorflow:lr = 8.5201015e-07 (19.371 sec)
INFO:tensorflow:global_step/sec: 5.08731
INFO:tensorflow:loss = 4.9315176, step = 703352 (19.657 sec)
INFO:tensorflow:lr = 8.51154e-07 (19.657 sec)
INFO:tensorflow:global_step/sec: 5.14867
INFO:tensorflow:loss = 4.8714676, step = 703452 (19.422 sec)
INFO:tensorflow:lr = 8.5029916e-07 (19.422 sec)
INFO:tensorflow:global_step/sec: 5.12898
INFO:tensorflow:loss = 4.944737, step = 703552 (19.501 sec)
INFO:tensorflow:lr = 8.494447e-07 (19.502 sec)
INFO:tensorflow:global_step/sec: 5.13399
INFO:tensorflow:loss = 4.968346, step = 703652 (19.478 sec)
INFO:tensorflow:lr = 8.4859164e-07 (19.477 sec)
INFO:tensorflow:global_step/sec: 5.14103
INFO:tensorflow:loss = 4.8724895, step = 703752 (19.451 sec)
INFO:tensorflow:lr = 8.477394e-07 (19.452 sec)
INFO:tensorflow:global_step/sec: 5.13976
INFO:tensorflow:loss = 4.94501, step = 703852 (19.454 sec)
INFO:tensorflow:lr = 8.4688753e-07 (19.453 sec)
INFO:tensorflow:global_step/sec: 5.16293
INFO:tensorflow:loss = 4.8129, step = 703952 (19.368 sec)
INFO:tensorflow:lr = 8.46037e-07 (19.371 sec)
INFO:tensorflow:global_step/sec: 5.14268
INFO:tensorflow:loss = 5.0112095, step = 704052 (19.445 sec)
INFO:tensorflow:lr = 8.4518683e-07 (19.445 sec)
INFO:tensorflow:global_step/sec: 5.11971
INFO:tensorflow:loss = 5.024304, step = 704152 (19.532 sec)
INFO:tensorflow:lr = 8.44338e-07 (19.531 sec)
INFO:tensorflow:global_step/sec: 5.13933
INFO:tensorflow:loss = 4.9757347, step = 704252 (19.458 sec)
INFO:tensorflow:lr = 8.434901e-07 (19.459 sec)
INFO:tensorflow:global_step/sec: 5.12053
INFO:tensorflow:loss = 4.9369144, step = 704352 (19.529 sec)
INFO:tensorflow:lr = 8.426424e-07 (19.527 sec)
INFO:tensorflow:global_step/sec: 5.11951
INFO:tensorflow:loss = 4.8365755, step = 704452 (19.538 sec)
INFO:tensorflow:lr = 8.417962e-07 (19.537 sec)
INFO:tensorflow:global_step/sec: 5.12784
INFO:tensorflow:loss = 4.982062, step = 704552 (19.497 sec)
INFO:tensorflow:lr = 8.409503e-07 (19.497 sec)
INFO:tensorflow:global_step/sec: 5.15058
INFO:tensorflow:loss = 4.9438643, step = 704652 (19.421 sec)
INFO:tensorflow:lr = 8.4010566e-07 (19.424 sec)
INFO:tensorflow:global_step/sec: 5.16148
INFO:tensorflow:loss = 4.899953, step = 704752 (19.369 sec)
INFO:tensorflow:lr = 8.39262e-07 (19.367 sec)
INFO:tensorflow:global_step/sec: 5.13518
INFO:tensorflow:loss = 4.9176173, step = 704852 (19.474 sec)
INFO:tensorflow:lr = 8.384187e-07 (19.473 sec)
INFO:tensorflow:global_step/sec: 5.10247
INFO:tensorflow:loss = 4.92602, step = 704952 (19.598 sec)
INFO:tensorflow:lr = 8.375767e-07 (19.600 sec)
INFO:tensorflow:global_step/sec: 5.13556
INFO:tensorflow:loss = 4.920562, step = 705052 (19.476 sec)
INFO:tensorflow:lr = 8.3673496e-07 (19.472 sec)
INFO:tensorflow:global_step/sec: 5.15569
INFO:tensorflow:loss = 4.9289956, step = 705152 (19.396 sec)
INFO:tensorflow:lr = 8.3589464e-07 (19.397 sec)
INFO:tensorflow:global_step/sec: 5.16071
INFO:tensorflow:loss = 4.8920665, step = 705252 (19.378 sec)
INFO:tensorflow:lr = 8.350551e-07 (19.377 sec)
INFO:tensorflow:global_step/sec: 5.1483
INFO:tensorflow:loss = 4.9995146, step = 705352 (19.424 sec)
INFO:tensorflow:lr = 8.34216e-07 (19.425 sec)
INFO:tensorflow:global_step/sec: 5.12728
INFO:tensorflow:loss = 5.039324, step = 705452 (19.499 sec)
INFO:tensorflow:lr = 8.3337824e-07 (19.501 sec)
INFO:tensorflow:global_step/sec: 5.14009
INFO:tensorflow:loss = 4.9656057, step = 705552 (19.459 sec)
INFO:tensorflow:lr = 8.325407e-07 (19.456 sec)
INFO:tensorflow:global_step/sec: 5.11474
INFO:tensorflow:loss = 4.92409, step = 705652 (19.547 sec)
INFO:tensorflow:lr = 8.317047e-07 (19.549 sec)
INFO:tensorflow:Saving checkpoints for 705658 into ../model/transformer/model.ckpt.
INFO:tensorflow:global_step/sec: 4.48558
INFO:tensorflow:loss = 4.9109416, step = 705752 (22.294 sec)
INFO:tensorflow:lr = 8.3086945e-07 (22.293 sec)
INFO:tensorflow:global_step/sec: 5.13565
INFO:tensorflow:loss = 4.853748, step = 705852 (19.472 sec)
INFO:tensorflow:lr = 8.300345e-07 (19.473 sec)
INFO:tensorflow:global_step/sec: 5.15169
INFO:tensorflow:loss = 4.9620566, step = 705952 (19.410 sec)
INFO:tensorflow:lr = 8.2920087e-07 (19.412 sec)
INFO:tensorflow:global_step/sec: 5.14536
INFO:tensorflow:loss = 5.0181518, step = 706052 (19.435 sec)
INFO:tensorflow:lr = 8.2836766e-07 (19.432 sec)
INFO:tensorflow:global_step/sec: 5.13019
INFO:tensorflow:loss = 5.0109615, step = 706152 (19.497 sec)
INFO:tensorflow:lr = 8.275357e-07 (19.497 sec)
INFO:tensorflow:global_step/sec: 5.14712
INFO:tensorflow:loss = 5.0479765, step = 706252 (19.428 sec)
INFO:tensorflow:lr = 8.267046e-07 (19.428 sec)
INFO:tensorflow:global_step/sec: 5.12583
INFO:tensorflow:loss = 5.0469203, step = 706352 (19.510 sec)
INFO:tensorflow:lr = 8.258738e-07 (19.511 sec)
INFO:tensorflow:global_step/sec: 5.1353
INFO:tensorflow:loss = 4.971116, step = 706452 (19.472 sec)
INFO:tensorflow:lr = 8.250444e-07 (19.472 sec)
INFO:tensorflow:global_step/sec: 5.10834
INFO:tensorflow:loss = 4.9818325, step = 706552 (19.579 sec)
INFO:tensorflow:lr = 8.242154e-07 (19.580 sec)
INFO:tensorflow:global_step/sec: 5.11954
INFO:tensorflow:loss = 5.060797, step = 706652 (19.525 sec)
INFO:tensorflow:lr = 8.233876e-07 (19.525 sec)
INFO:tensorflow:global_step/sec: 5.14348
INFO:tensorflow:loss = 4.840772, step = 706752 (19.442 sec)
INFO:tensorflow:lr = 8.225607e-07 (19.441 sec)
INFO:tensorflow:global_step/sec: 5.13767
INFO:tensorflow:loss = 4.9157405, step = 706852 (19.464 sec)
INFO:tensorflow:lr = 8.2173415e-07 (19.464 sec)
INFO:tensorflow:global_step/sec: 5.15126
INFO:tensorflow:loss = 5.0062704, step = 706952 (19.418 sec)
INFO:tensorflow:lr = 8.209088e-07 (19.418 sec)
INFO:tensorflow:global_step/sec: 5.14016
INFO:tensorflow:loss = 4.9798965, step = 707052 (19.450 sec)
INFO:tensorflow:lr = 8.2008387e-07 (19.451 sec)
INFO:tensorflow:global_step/sec: 5.1308
INFO:tensorflow:loss = 4.893226, step = 707152 (19.490 sec)
INFO:tensorflow:lr = 8.192604e-07 (19.489 sec)
INFO:tensorflow:global_step/sec: 5.11376
INFO:tensorflow:loss = 4.880021, step = 707252 (19.560 sec)
INFO:tensorflow:lr = 8.1843757e-07 (19.560 sec)
INFO:tensorflow:global_step/sec: 5.14958
INFO:tensorflow:loss = 4.959422, step = 707352 (19.418 sec)
INFO:tensorflow:lr = 8.1761516e-07 (19.418 sec)
INFO:tensorflow:global_step/sec: 5.12715
INFO:tensorflow:loss = 5.0690236, step = 707452 (19.503 sec)
INFO:tensorflow:lr = 8.1679406e-07 (19.503 sec)
INFO:tensorflow:global_step/sec: 5.12025
INFO:tensorflow:loss = 5.0705323, step = 707552 (19.529 sec)
INFO:tensorflow:lr = 8.159732e-07 (19.529 sec)
INFO:tensorflow:global_step/sec: 5.12796
INFO:tensorflow:loss = 4.982247, step = 707652 (19.504 sec)
INFO:tensorflow:lr = 8.151537e-07 (19.506 sec)
INFO:tensorflow:global_step/sec: 5.12786
INFO:tensorflow:loss = 4.9352117, step = 707752 (19.496 sec)
INFO:tensorflow:lr = 8.143351e-07 (19.495 sec)
INFO:tensorflow:global_step/sec: 5.11514
INFO:tensorflow:loss = 4.9760633, step = 707852 (19.551 sec)
INFO:tensorflow:lr = 8.1351675e-07 (19.552 sec)
INFO:tensorflow:global_step/sec: 5.12203
INFO:tensorflow:loss = 4.935376, step = 707952 (19.528 sec)
INFO:tensorflow:lr = 8.1269974e-07 (19.526 sec)
INFO:tensorflow:global_step/sec: 5.11396
INFO:tensorflow:loss = 5.0178804, step = 708052 (19.553 sec)
INFO:tensorflow:lr = 8.118832e-07 (19.554 sec)
Reading ../data/test.txt
INFO:tensorflow:global_step/sec: 5.09243
INFO:tensorflow:loss = 5.056995, step = 708152 (19.633 sec)
INFO:tensorflow:lr = 8.1106776e-07 (19.636 sec)
INFO:tensorflow:global_step/sec: 5.23395
INFO:tensorflow:loss = 4.9783554, step = 708252 (19.106 sec)
INFO:tensorflow:lr = 8.1025325e-07 (19.106 sec)
INFO:tensorflow:global_step/sec: 5.22565
INFO:tensorflow:loss = 4.9045014, step = 708352 (19.138 sec)
INFO:tensorflow:lr = 8.0943903e-07 (19.135 sec)
INFO:tensorflow:global_step/sec: 5.22824
INFO:tensorflow:loss = 4.870046, step = 708452 (19.127 sec)
INFO:tensorflow:lr = 8.0862606e-07 (19.127 sec)
INFO:tensorflow:global_step/sec: 5.24089
INFO:tensorflow:loss = 4.797813, step = 708552 (19.079 sec)
INFO:tensorflow:lr = 8.078135e-07 (19.082 sec)
INFO:tensorflow:global_step/sec: 5.21649
INFO:tensorflow:loss = 4.914493, step = 708652 (19.175 sec)
INFO:tensorflow:lr = 8.070022e-07 (19.171 sec)
INFO:tensorflow:global_step/sec: 5.20816
INFO:tensorflow:loss = 4.920562, step = 708752 (19.200 sec)
INFO:tensorflow:lr = 8.0619174e-07 (19.200 sec)
INFO:tensorflow:global_step/sec: 5.21409
INFO:tensorflow:loss = 5.0017037, step = 708852 (19.178 sec)
INFO:tensorflow:lr = 8.0538166e-07 (19.179 sec)
INFO:tensorflow:Saving checkpoints for 708861 into ../model/transformer/model.ckpt.
INFO:tensorflow:global_step/sec: 4.62398
INFO:tensorflow:loss = 5.0028477, step = 708952 (21.627 sec)
INFO:tensorflow:lr = 8.045728e-07 (21.630 sec)
INFO:tensorflow:global_step/sec: 5.22877
INFO:tensorflow:loss = 4.961921, step = 709052 (19.123 sec)
INFO:tensorflow:lr = 8.0376424e-07 (19.120 sec)
INFO:tensorflow:global_step/sec: 5.20364
INFO:tensorflow:loss = 4.9579244, step = 709152 (19.215 sec)
INFO:tensorflow:lr = 8.0295706e-07 (19.218 sec)
INFO:tensorflow:global_step/sec: 5.21707
INFO:tensorflow:loss = 4.9352455, step = 709252 (19.168 sec)
INFO:tensorflow:lr = 8.0215074e-07 (19.166 sec)
INFO:tensorflow:global_step/sec: 5.22147
INFO:tensorflow:loss = 5.0476947, step = 709352 (19.152 sec)
INFO:tensorflow:lr = 8.013446e-07 (19.153 sec)
INFO:tensorflow:global_step/sec: 5.22595
INFO:tensorflow:loss = 4.9090524, step = 709452 (19.135 sec)
INFO:tensorflow:lr = 8.0053985e-07 (19.135 sec)
INFO:tensorflow:global_step/sec: 5.22882
INFO:tensorflow:loss = 5.016692, step = 709552 (19.129 sec)
INFO:tensorflow:lr = 7.9973535e-07 (19.126 sec)
INFO:tensorflow:global_step/sec: 5.2003
INFO:tensorflow:loss = 4.989188, step = 709652 (19.230 sec)
INFO:tensorflow:lr = 7.989322e-07 (19.230 sec)
INFO:tensorflow:global_step/sec: 5.19785
INFO:tensorflow:loss = 4.9917727, step = 709752 (19.235 sec)
INFO:tensorflow:lr = 7.9812986e-07 (19.237 sec)
INFO:tensorflow:global_step/sec: 5.21953
INFO:tensorflow:loss = 4.979067, step = 709852 (19.162 sec)
INFO:tensorflow:lr = 7.973278e-07 (19.161 sec)
INFO:tensorflow:global_step/sec: 5.2162
INFO:tensorflow:loss = 4.8074884, step = 709952 (19.171 sec)
INFO:tensorflow:lr = 7.9652705e-07 (19.170 sec)
INFO:tensorflow:global_step/sec: 5.22236
INFO:tensorflow:loss = 4.868454, step = 710052 (19.149 sec)
INFO:tensorflow:lr = 7.9572663e-07 (19.149 sec)
INFO:tensorflow:global_step/sec: 5.22889
INFO:tensorflow:loss = 4.9518294, step = 710152 (19.124 sec)
INFO:tensorflow:lr = 7.9492753e-07 (19.124 sec)
INFO:tensorflow:global_step/sec: 5.23153
INFO:tensorflow:loss = 5.0123086, step = 710252 (19.111 sec)
INFO:tensorflow:lr = 7.941292e-07 (19.114 sec)
INFO:tensorflow:global_step/sec: 5.20782
INFO:tensorflow:loss = 5.0800343, step = 710352 (19.202 sec)
INFO:tensorflow:lr = 7.933312e-07 (19.199 sec)
INFO:tensorflow:global_step/sec: 5.23241
INFO:tensorflow:loss = 4.9574256, step = 710452 (19.115 sec)
INFO:tensorflow:lr = 7.9253454e-07 (19.114 sec)
INFO:tensorflow:global_step/sec: 5.22858
INFO:tensorflow:loss = 4.906209, step = 710552 (19.127 sec)
INFO:tensorflow:lr = 7.9173805e-07 (19.127 sec)
INFO:tensorflow:global_step/sec: 5.22789
INFO:tensorflow:loss = 4.940554, step = 710652 (19.128 sec)
INFO:tensorflow:lr = 7.909429e-07 (19.128 sec)
INFO:tensorflow:global_step/sec: 5.2321
INFO:tensorflow:loss = 4.9599113, step = 710752 (19.111 sec)
INFO:tensorflow:lr = 7.9014853e-07 (19.111 sec)
INFO:tensorflow:global_step/sec: 5.2323
INFO:tensorflow:loss = 5.017372, step = 710852 (19.115 sec)
INFO:tensorflow:lr = 7.8935454e-07 (19.114 sec)
INFO:tensorflow:global_step/sec: 5.22636
INFO:tensorflow:loss = 4.9868765, step = 710952 (19.134 sec)
INFO:tensorflow:lr = 7.885618e-07 (19.136 sec)
INFO:tensorflow:global_step/sec: 5.24117
INFO:tensorflow:loss = 4.976105, step = 711052 (19.074 sec)
INFO:tensorflow:lr = 7.8776935e-07 (19.076 sec)
INFO:tensorflow:global_step/sec: 5.22561
INFO:tensorflow:loss = 4.935708, step = 711152 (19.136 sec)
INFO:tensorflow:lr = 7.869782e-07 (19.135 sec)
INFO:tensorflow:global_step/sec: 5.22103
INFO:tensorflow:loss = 4.950136, step = 711252 (19.157 sec)
INFO:tensorflow:lr = 7.861879e-07 (19.156 sec)
INFO:tensorflow:Saving checkpoints for 711351 into ../model/transformer/model.ckpt.
INFO:tensorflow:Loss for final step: 5.04247.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-711351
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
------------
minimal test
Q: 你 好
A1: 你 好
A2: 不 好
A3: 你 好 啊

Q: 早 上 好
A1: 早 上 好
A2: 晚 上 好
A3: 早 安

Q: 晚 上 好
A1: 晚 上 好
A2: 晚 安
A3: 晚 上 好 !

Q: 再 见
A1: 哈 哈 哈
A2: 哈 哈 哈 哈
A3: 哈 哈

Q: 好 久 不 见
A1: 好 久 不 见
A2: 是 啊
A3: 是 的

Q: 想 死 你 了
A1: 我 也 想 你
A2: 我 也 是
A3: 我 也 想 你 了

Q: 谢 谢 你
A1: 不 客 气
A2: 不 用 谢
A3: 不 谢

Q: 爱 你
A1: 我 也 爱 你
A2: 么 么 哒
A3: 爱 你

Q: 你 好 厉 害 啊
A1: 哈 哈 哈
A2: 哈 哈
A3: 哈 哈 哈 哈 哈

Q: 你 叫 什 么
A1: 我 叫 你
A2: 我 叫 什 么
A3: 就 叫 你

Q: 你 几 岁 了
A1: 你 猜
A2: 哈 哈
A3: 你 猜 猜

Q: 现 在 几 点
A1: 点
A2: 点 左 右
A3: 点 左 右 吧

Q: 今 天 天 气 怎 么 样
A1: 还 行
A2: 下 雨 了
A3: 还 行 吧

Q: 我 们 现 在 在 哪 里
A1: 在 家
A2: 我 们 学 校
A3: 在 我 们 学 校

Q: 讲 个 笑 话
A1: 哈 哈 哈
A2: 哈 哈
A3: 哈 哈 哈 哈

Q: 你 会 几 种 语 言 呀
A1: 两 种
A2: 不 知 道
A3: 两 种 语 言

Q: 你 觉 得 我 帅 吗
A1: 你 觉 得 呢
A2: 我 觉 得
A3: 不 是

Q: 讨 厌 的 周 一
A1: 哈 哈 哈 哈
A2: 哈 哈 哈
A3: 哈 哈

Q: 好 烦 啊
A1: 怎 么 了
A2: 怎 么 啦
A3: 怎 么 了 ?

Q: 天 气 真 好
A1: 是 的
A2: 是 啊
A3: 哈 哈

Q: 今 天 好 冷
A1: 我 也 是
A2: 今 天 也 是
A3: 我 们 这 里 也 冷

Q: 今 天 好 热
A1: 是 的
A2: 我 也 是
A3: 我 们 这 也 是

Q: 下 雨 了
A1: 下 雨 了
A2: 下 雨 了 ?
A3: 下 了

Q: 风 好 大
A1: 哈 哈
A2: 哈 哈 哈
A3: 哈 哈 哈 哈

Q: 终 于 周 五 了
A1: 哈 哈
A2: 是 的
A3: 哈 哈 哈

Q: 我 想 去 K 歌
A1: 来 啊 来 啊
A2: 去 去 去
A3: 来 吧

Q: 红 烧 肉 吃 了 会 发 胖 吗
A1: 不 会
A2: 不 会 啊
A3: 不 会 的

Q: 你 觉 得 梅 西 厉 害 吗
A1: 我 觉 得
A2: 不 厉 害
A3: 厉 害

------------
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-01-09T05:41:03Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-711351
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Reading ../data/core.txt
INFO:tensorflow:Finished evaluation at 2020-01-09-05:41:07
INFO:tensorflow:Saving dict for global step 711351: global_step = 711351, loss = 3.4199302
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 711351: ../model/transformer/model.ckpt-711351
INFO:tensorflow:Perplexity: 30.567
INFO:tensorflow:Best Perplexity: 29.390
Reading ../data/core.txt
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-711351
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Reading ../data/core.txt
INFO:tensorflow:BLEU-2: 12.1
INFO:tensorflow:Calling model_fn.
[<tf.Variable 'Embedding/fasttext_vectors:0' shape=(5904, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_0/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_1/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_2/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_3/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_4/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/output/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/output/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/layer_5/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/LayerNorm/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Encoder/transformer_encoder/LayerNorm/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/affine_bias:0' shape=(5904,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_0/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_1/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_2/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_3/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_4/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/self_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/query/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/query/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/key/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/key/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/value/kernel:0' shape=(300, 512) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/value/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/output/kernel:0' shape=(512, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/encdec_attention/multihead_attention/output/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/past_poswise_ln/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/past_poswise_ln/gamma:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv1/kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv1/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv2/kernel:0' shape=(1200, 300) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/layer_5/fnn/conv2/bias:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/beta:0' shape=(300,) dtype=float32_ref>,
 <tf.Variable 'Decoder/transformer_decoder/gamma:0' shape=(300,) dtype=float32_ref>]
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model/transformer/model.ckpt-711351
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 711351 into ../model/transformer/model.ckpt.
Reading ../data/train.txt
INFO:tensorflow:loss = 5.0087075, step = 711351
INFO:tensorflow:lr = 7.854061e-07
INFO:tensorflow:global_step/sec: 4.038
INFO:tensorflow:loss = 4.8784485, step = 711451 (24.769 sec)
INFO:tensorflow:lr = 7.8461676e-07 (24.769 sec)
INFO:tensorflow:global_step/sec: 5.13537
INFO:tensorflow:loss = 4.859515, step = 711551 (19.470 sec)
INFO:tensorflow:lr = 7.838288e-07 (19.470 sec)
INFO:tensorflow:global_step/sec: 5.1336
INFO:tensorflow:loss = 5.056298, step = 711651 (19.480 sec)
INFO:tensorflow:lr = 7.830412e-07 (19.484 sec)
INFO:tensorflow:global_step/sec: 5.12848
INFO:tensorflow:loss = 4.9057612, step = 711751 (19.500 sec)
INFO:tensorflow:lr = 7.822548e-07 (19.497 sec)
INFO:tensorflow:global_step/sec: 5.12103
INFO:tensorflow:loss = 4.97365, step = 711851 (19.526 sec)
INFO:tensorflow:lr = 7.814692e-07 (19.526 sec)
INFO:tensorflow:global_step/sec: 5.12718
INFO:tensorflow:loss = 5.033398, step = 711951 (19.505 sec)
INFO:tensorflow:lr = 7.8068393e-07 (19.505 sec)
INFO:tensorflow:global_step/sec: 5.14261
INFO:tensorflow:loss = 4.874139, step = 712051 (19.444 sec)
INFO:tensorflow:lr = 7.7989984e-07 (19.443 sec)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-14-9ed2ab6cde40> in <module>()
     22 
     23 while True:
---> 24   estimator.train(input_fn=lambda: dataset(is_training=True, params=params))
     25 
     26   minimal_test(estimator)

/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
    368 
    369       saving_listeners = _check_listeners_type(saving_listeners)
--> 370       loss = self._train_model(input_fn, hooks, saving_listeners)
    371       logging.info('Loss for final step: %s.', loss)
    372       return self

/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
   1159       return self._train_model_distributed(input_fn, hooks, saving_listeners)
   1160     else:
-> 1161       return self._train_model_default(input_fn, hooks, saving_listeners)
   1162 
   1163   def _train_model_default(self, input_fn, hooks, saving_listeners):

/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_model_default(self, input_fn, hooks, saving_listeners)
   1193       return self._train_with_estimator_spec(estimator_spec, worker_hooks,
   1194                                              hooks, global_step_tensor,
-> 1195                                              saving_listeners)
   1196 
   1197   def _train_model_distributed(self, input_fn, hooks, saving_listeners):

/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_with_estimator_spec(self, estimator_spec, worker_hooks, hooks, global_step_tensor, saving_listeners)
   1492       any_step_done = False
   1493       while not mon_sess.should_stop():
-> 1494         _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
   1495         any_step_done = True
   1496     if not any_step_done:

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
    752         feed_dict=feed_dict,
    753         options=options,
--> 754         run_metadata=run_metadata)
    755 
    756   def run_step_fn(self, step_fn):

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
   1257             feed_dict=feed_dict,
   1258             options=options,
-> 1259             run_metadata=run_metadata)
   1260       except _PREEMPTION_ERRORS as e:
   1261         logging.info(

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py in run(self, *args, **kwargs)
   1343   def run(self, *args, **kwargs):
   1344     try:
-> 1345       return self._sess.run(*args, **kwargs)
   1346     except _PREEMPTION_ERRORS:
   1347       raise

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
   1416         feed_dict=feed_dict,
   1417         options=options,
-> 1418         run_metadata=run_metadata)
   1419 
   1420     for hook in self._hooks:

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py in run(self, *args, **kwargs)
   1174 
   1175   def run(self, *args, **kwargs):
-> 1176     return self._sess.run(*args, **kwargs)
   1177 
   1178   def run_step_fn(self, step_fn, raw_session, run_with_hooks):

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    954     try:
    955       result = self._run(None, fetches, feed_dict, options_ptr,
--> 956                          run_metadata_ptr)
    957       if run_metadata:
    958         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1178     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1179       results = self._do_run(handle, final_targets, final_fetches,
-> 1180                              feed_dict_tensor, options, run_metadata)
   1181     else:
   1182       results = []

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1357     if handle is None:
   1358       return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1359                            run_metadata)
   1360     else:
   1361       return self._do_call(_prun_fn, handle, feeds, fetches)

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in _do_call(self, fn, *args)
   1363   def _do_call(self, fn, *args):
   1364     try:
-> 1365       return fn(*args)
   1366     except errors.OpError as e:
   1367       message = compat.as_text(e.message)

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
   1348       self._extend_graph()
   1349       return self._call_tf_sessionrun(options, feed_dict, fetch_list,
-> 1350                                       target_list, run_metadata)
   1351 
   1352     def _prun_fn(handle, feed_dict, fetch_list):

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
   1441     return tf_session.TF_SessionRun_wrapper(self._session, options, feed_dict,
   1442                                             fetch_list, target_list,
-> 1443                                             run_metadata)
   1444 
   1445   def _call_tf_sessionprun(self, handle, feed_dict, fetch_list):

KeyboardInterrupt: