from collections import Counter
import os
import random

import numpy as np

from google.colab import drive

# Mount Google Drive and move into the project's data directory.
drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/finch/tensorflow1/free_chat/chinese_gaoq1/data')
# Each line in raw.txt is paired with the following line, so every
# utterance serves both as a target and as the next pair's source.
# Count character frequencies and split the pairs into train / test files.
char_counter = Counter()
with open('raw.txt') as f, open('train.txt', 'w') as f_train, open('test.txt', 'w') as f_test:
    sents = f.readlines()
    for source, target in zip(sents, sents[1:]):
        # Blank lines separate dialogues; skip pairs that span one.
        if source == '\n' or target == '\n':
            continue
        source = source.strip()
        target = target.strip()
        char_counter.update(source)
        char_counter.update(target)
        # Hold out roughly 0.1% of the pairs for testing.
        if random.random() < 1e-3:
            f_test.write(source + '|' + target + '\n')
        else:
            f_train.write(source + '|' + target + '\n')
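# Optional sanity check (a sketch, not part of the original pipeline):
# re-read the two files just written and report the split sizes.
with open('train.txt') as f:
    n_train = sum(1 for _ in f)
with open('test.txt') as f:
    n_test = sum(1 for _ in f)
print('train pairs: {} | test pairs: {}'.format(n_train, n_test))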
# Keep characters seen at least 5 times, preceded by the special tokens.
chars = ['<pad>', '<start>', '<end>'] + [char for char, freq in char_counter.most_common() if freq >= 5]
with open('../vocab/char.txt', 'w') as f:
    for c in chars:
        f.write(c + '\n')
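# Report how large the character vocabulary ended up (incl. special tokens).
print('vocab size: {}'.format(len(chars)))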
# Map each character to its line index in the vocab file.
char2idx = {}
with open('../vocab/char.txt') as f:
    for i, line in enumerate(f):
        char2idx[line.rstrip('\n')] = i
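# Quick check: the special tokens written first should map to ids 0, 1, 2.
print(char2idx['<pad>'], char2idx['<start>'], char2idx['<end>'])  # expect: 0 1 2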
# Build the embedding matrix from pre-trained fastText vectors
# (cc.zh.300.vec). Rows for characters without a pre-trained vector
# stay zero; the extra last row is reserved for unknown characters.
embedding = np.zeros((len(char2idx) + 1, 300))  # + 1 for unknown characters
with open('../vocab/cc.zh.300.vec') as f:
    count = 0
    for i, line in enumerate(f):
        if i == 0:
            continue  # skip the header line (vocab size and dimension)
        if i % 100000 == 0:
            print('- At line {}'.format(i))
        sp = line.rstrip().split(' ')
        word, vec = sp[0], sp[1:]
        if word in char2idx:
            count += 1
            embedding[char2idx[word]] = np.asarray(vec, dtype='float32')
print('[%d / %d] characters have pre-trained vectors' % (count, len(char2idx)))
np.save('../vocab/char.npy', embedding)
print('Saved ../vocab/char.npy')
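# Hedged example: verify the saved matrix loads back with the expected shape.
loaded = np.load('../vocab/char.npy')
assert loaded.shape == (len(char2idx) + 1, 300)
print('char.npy shape:', loaded.shape)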