%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
What we mainly need here is the Tokenizer from tf.keras.
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle
!wget --no-check-certificate \
https://raw.githubusercontent.com/yenlung/Python-AI-Book/main/dream.txt \
-O /content/dream.txt
--2022-04-05 03:53:10--  https://raw.githubusercontent.com/yenlung/Python-AI-Book/main/dream.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2656053 (2.5M) [text/plain]
Saving to: ‘/content/dream.txt’

/content/dream.txt  100%[===================>]   2.53M  --.-KB/s    in 0.04s

2022-04-05 03:53:11 (60.0 MB/s) - ‘/content/dream.txt’ saved [2656053/2656053]
f = open('dream.txt', 'r', encoding='utf-8')
lines = f.readlines()
f.close()
# Strip the leading ideographic spaces (U+3000) that indent each paragraph.
text_lines = [x.lstrip('\u3000') for x in lines]
text = ''.join(text_lines)
# char_level=True tokenizes character by character, which suits Chinese text.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([text])
tokenizer.texts_to_sequences(["我打造了一個函數學習機。"])
[[15, 99, 721, 3, 6, 26, 597, 362, 1061, 912, 2]]
tokenizer.sequences_to_texts([[15, 99, 721, 3, 6, 26, 597, 362, 1061, 912, 2]])
['我 打 造 了 一 個 數 學 習 機 。']
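Notice that the decoded sentence above is missing the character 函: characters that never appeared in the fitted corpus are silently dropped by `texts_to_sequences`. If you would rather keep a placeholder for unseen characters, `Tokenizer` accepts an `oov_token`. A minimal sketch (the toy fitting string here is illustrative, not from dream.txt):

```python
from tensorflow.keras.preprocessing.text import Tokenizer

# Character-level tokenizer with an explicit out-of-vocabulary token.
tok = Tokenizer(char_level=True, oov_token='<OOV>')
tok.fit_on_texts(["我打造了一個數學習機。"])

# "函" was never seen during fitting, so instead of being dropped it
# maps to the OOV index (the oov_token is always assigned index 1).
seq = tok.texts_to_sequences(["函數"])[0]
print(seq)  # first element is 1, the <OOV> index
```

With an `oov_token`, the encoded and decoded sequences keep the same length as the input, which matters when you later align characters with model outputs.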
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
%cd "/content/drive/MyDrive/Colab Notebooks/"
/content/drive/MyDrive/Colab Notebooks
f = open('MyTokenizer.pkl', 'wb')
pickle.dump(tokenizer, f)
f.close()
Later, to read our trained tokenizer back with pickle:
f = open('MyTokenizer.pkl', 'rb')
tokenizer = pickle.load(f)
f.close()
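It is worth sanity-checking that a pickled tokenizer really round-trips. A self-contained sketch using `pickle.dumps`/`pickle.loads` (the same mechanism as writing the `.pkl` file, just in memory, and fitting on a toy string rather than the full corpus):

```python
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer

tok = Tokenizer(char_level=True)
tok.fit_on_texts(["我打造了一個函數學習機。"])

# Round-trip the tokenizer through pickle bytes.
restored = pickle.loads(pickle.dumps(tok))

# The restored tokenizer should encode text exactly like the original.
sample = "函數學習機"
assert restored.texts_to_sequences([sample]) == tok.texts_to_sequences([sample])
```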