"""
We use following lines because we are running on Google Colab
If you are running notebook on a local computer, you don't need these
"""
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_classification/imdb/data')
!pip install tf-nightly-2.0-preview
import tensorflow as tf
import numpy as np
from collections import Counter
from pathlib import Path
from tqdm import tqdm
print('TensorFlow Version:', tf.__version__)
Make Data
"""
sort texts (and labels) according to the length of text
"""
def sort_by_len(x, y):
    x, y = np.asarray(x), np.asarray(y)
    idx = sorted(range(len(x)), key=lambda i: len(x[i]))
    return x[idx], y[idx]
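"""
A quick toy run (hypothetical values) makes the effect visible; the real
x_train below is ragged in the same way.
"""
toy_x = np.array([[1, 5, 2], [1, 7], [1, 3, 4, 6, 2]], dtype=object)
toy_y = [0, 1, 0]
sx, sy = sort_by_len(toy_x, toy_y)
print([len(s) for s in sx])  # [2, 3, 5]
print(sy)                    # [1 0 0]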
_word2idx = tf.keras.datasets.imdb.get_word_index()
word2idx = {w: i+3 for w, i in _word2idx.items()}  # shift by 3 to free ids 0-2 for the special tokens
word2idx['<pad>'] = 0
word2idx['<start>'] = 1
word2idx['<unk>'] = 2
idx2word = {i: w for w, i in word2idx.items()}
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data()
x_train, y_train = sort_by_len(x_train, y_train)
x_test, y_test = sort_by_len(x_test, y_test)
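"""
A hedged sanity check on the mapping above: decode one encoded review back to
words. load_data() prepends start_char=1 by default, so the first decoded
token should be '<start>'. The exact text depends on the sort order.
"""
print('label:', y_train[0])
print('text :', ' '.join(idx2word[i] for i in x_train[0]))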
def write_file(f_path, xs, ys):
    with open(f_path, 'w') as f:
        for x, y in zip(xs, ys):
            # [1:] drops the leading <start> token before writing
            f.write(str(y) + '\t' + ' '.join([idx2word[i] for i in x][1:]) + '\n')
write_file('../data/train.txt', x_train, y_train)
write_file('../data/test.txt', x_test, y_test)
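"""
Each output line is "<label>\t<space-joined words>". A quick peek at the
first line of train.txt confirms the format:
"""
with open('../data/train.txt') as f:
    first = f.readline().rstrip()
label, text = first.split('\t')
print('label:', label)
print('text :', text[:80])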
Make Vocabulary
counter = Counter()
with open('../data/train.txt') as f:
    for line in f:
        line = line.rstrip()
        label, words = line.split('\t')
        words = words.split(' ')
        counter.update(words)
words = ['<pad>'] + [w for w, freq in counter.most_common() if freq >= 10]  # keep words seen at least 10 times
print('Vocab Size:', len(words))
Path('../vocab').mkdir(exist_ok=True)
with open('../vocab/word.txt', 'w') as f:
    for w in words:
        f.write(w + '\n')
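"""
The line number in word.txt doubles as the word id. As a minimal sketch (not
required by the rest of this notebook, and assuming a TF 2.x runtime), the
same file can feed a tf.lookup table inside a tf.data pipeline; mapping OOV
words to len(words) matches the extra embedding row reserved in the next step.
"""
init = tf.lookup.TextFileInitializer(
    '../vocab/word.txt',
    key_dtype=tf.string, key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
    value_dtype=tf.int64, value_index=tf.lookup.TextFileIndex.LINE_NUMBER)
table = tf.lookup.StaticHashTable(init, default_value=len(words))  # OOV id
print(table.lookup(tf.constant(['<pad>', 'movie'])).numpy())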
Make Pretrained Embedding
word2idx = {}
with open('../vocab/word.txt') as f:
    for i, line in enumerate(f):
        line = line.rstrip()
        word2idx[line] = i
embedding = np.zeros((len(word2idx)+1, 300))  # + 1 extra row for unknown (OOV) words
with open('../data/glove.840B.300d.txt') as f:
    count = 0
    for i, line in enumerate(f):
        if i % 100000 == 0:
            print('- At line {}'.format(i))
        line = line.rstrip()
        sp = line.split(' ')
        word, vec = sp[0], sp[1:]
        # guard against the rare GloVe-840B rows whose token itself contains spaces
        if word in word2idx and len(vec) == 300:
            count += 1
            embedding[word2idx[word]] = np.asarray(vec, dtype='float32')
print('[%d / %d] words have pre-trained GloVe vectors' % (count, len(word2idx)))
np.save('../vocab/word.npy', embedding)
print('Saved ../vocab/word.npy')
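"""
Downstream, the saved matrix can initialize a Keras Embedding layer. A minimal
sketch, assuming the file written above; freezing the weights (trainable=False)
is one design choice, and fine-tuning them is equally valid.
"""
pretrained = np.load('../vocab/word.npy')
embed = tf.keras.layers.Embedding(
    input_dim=pretrained.shape[0], output_dim=pretrained.shape[1],
    embeddings_initializer=tf.constant_initializer(pretrained),
    trainable=False)
print(embed(tf.constant([[0, 1, 2]])).shape)  # (1, 3, 300)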