%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.datasets import imdb
In typical natural language processing we limit the maximum number of words (the vocabulary size) to use; here only the 10,000 most frequent words are kept.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
17465344/17464789 [==============================] - 0s 0us/step
17473536/17464789 [==============================] - 0s 0us/step
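Each review is already encoded as a list of integer word indices. As a quick sanity check, here is a minimal sketch of decoding a review back into words; it assumes the standard Keras IMDB convention that indices 0, 1, 2 are reserved (padding, start, unknown), so real word indices are offset by 3.
word_index = imdb.get_word_index()
index_to_word = {i + 3: w for w, i in word_index.items()}    # undo the index offset
index_to_word.update({0: '<pad>', 1: '<start>', 2: '<unk>'})  # reserved indices
print(' '.join(index_to_word.get(i, '<unk>') for i in x_train[0][:20]))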
print(f'Number of training examples: {len(x_train)}')
print(f'Number of test examples: {len(x_test)}')
Number of training examples: 25000
Number of test examples: 25000
Note that, naturally, each review has a different length.
print(f'Length of the first training example: {len(x_train[0])}')
print(f'Length of the second training example: {len(x_train[1])}')
Length of the first training example: 218
Length of the second training example: 189
print(f'Label of the first example: {y_train[0]} (positive)')
print(f'Label of the second example: {y_train[1]} (negative)')
Label of the first example: 1 (positive)
Label of the second example: 0 (negative)
Although we could work with truly variable-length sequences (seq2seq style), unequal lengths are awkward to compute with, so in practice we fix every sequence to the same length and pad the rest with 0.
x_train = sequence.pad_sequences(x_train, maxlen=100)
x_test = sequence.pad_sequences(x_test, maxlen=100)
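By default pad_sequences pads at the front and, when a sequence is longer than maxlen, keeps only its last maxlen tokens. A tiny made-up example (the numbers are arbitrary, only for illustration):
demo = sequence.pad_sequences([[5, 8, 2], [7, 1, 3, 4, 9, 6]], maxlen=4)
print(demo)
# [[0 5 8 2]   <- shorter sequence is pre-padded with 0
#  [3 4 9 6]]  <- longer sequence keeps only its last 4 tokens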
model = Sequential()
model.add(Embedding(10000, 128))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, None, 128) 1280000 lstm (LSTM) (None, 128) 131584 dense (Dense) (None, 1) 129 ================================================================= Total params: 1,411,713 Trainable params: 1,411,713 Non-trainable params: 0 _________________________________________________________________
model.fit(x_train, y_train, batch_size=32, epochs=10,
          validation_data=(x_test, y_test))
Epoch 1/10
782/782 [==============================] - 22s 24ms/step - loss: 0.4183 - accuracy: 0.8095 - val_loss: 0.3533 - val_accuracy: 0.8436
Epoch 2/10
782/782 [==============================] - 18s 24ms/step - loss: 0.2572 - accuracy: 0.8975 - val_loss: 0.3465 - val_accuracy: 0.8459
Epoch 3/10
782/782 [==============================] - 19s 24ms/step - loss: 0.1825 - accuracy: 0.9292 - val_loss: 0.4315 - val_accuracy: 0.8412
Epoch 4/10
782/782 [==============================] - 18s 23ms/step - loss: 0.1317 - accuracy: 0.9512 - val_loss: 0.4378 - val_accuracy: 0.8267
Epoch 5/10
782/782 [==============================] - 19s 24ms/step - loss: 0.0930 - accuracy: 0.9672 - val_loss: 0.6900 - val_accuracy: 0.8292
Epoch 6/10
782/782 [==============================] - 18s 24ms/step - loss: 0.0794 - accuracy: 0.9730 - val_loss: 0.5700 - val_accuracy: 0.8333
Epoch 7/10
782/782 [==============================] - 19s 24ms/step - loss: 0.0505 - accuracy: 0.9833 - val_loss: 0.6798 - val_accuracy: 0.8370
Epoch 8/10
782/782 [==============================] - 18s 24ms/step - loss: 0.0460 - accuracy: 0.9848 - val_loss: 0.6846 - val_accuracy: 0.8300
Epoch 9/10
782/782 [==============================] - 18s 23ms/step - loss: 0.0302 - accuracy: 0.9912 - val_loss: 0.7452 - val_accuracy: 0.8279
Epoch 10/10
782/782 [==============================] - 18s 23ms/step - loss: 0.0195 - accuracy: 0.9941 - val_loss: 0.8609 - val_accuracy: 0.8259
<tensorflow.python.keras.callbacks.History at 0x7f8b51a9e490>
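Validation accuracy peaks around epoch 2 (about 0.846) while training accuracy keeps climbing toward 0.99 and the validation loss keeps rising, i.e. the model clearly overfits. Plotting the learning curves makes this easy to see; a minimal sketch, assuming the fit call above is changed to keep its return value as history = model.fit(...):
plt.plot(history.history['accuracy'], label='train accuracy')
plt.plot(history.history['val_accuracy'], label='val accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()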
This time the model architecture and the trained weights are saved separately, which makes them more flexible to use later.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
%cd '/content/drive/My Drive/Colab Notebooks'
/content/drive/My Drive/Colab Notebooks
model_json = model.to_json()
with open('imdb_model_architecture.json', 'w') as f:
    f.write(model_json)
model.save_weights('imdb_model_weights.h5')
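To use the saved files later, the architecture is rebuilt from the JSON and the weights are loaded into it; a minimal sketch, assuming the two files written above are in the current working directory:
from tensorflow.keras.models import model_from_json

with open('imdb_model_architecture.json') as f:
    loaded_model = model_from_json(f.read())         # rebuild the architecture
loaded_model.load_weights('imdb_model_weights.h5')   # restore the trained weights
loaded_model.compile(loss='binary_crossentropy', optimizer='adam',
                     metrics=['accuracy'])
score = loaded_model.evaluate(x_test, y_test, verbose=0)  # sanity check
print(f'Restored model test accuracy: {score[1]:.4f}')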