from google.colab import drive
drive.mount('/content/gdrive')
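# Colab-only setup: mount Google Drive and work from the project's data directory.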
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow1/free_chat/chinese_lccc/data')
from collections import Counter
from pathlib import Path
import json
import numpy as np
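# LCCC-base.json stores multi-turn dialogues under split keys (e.g. 'train');
# each dialogue is a list of utterance strings.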
with open('LCCC-base.json') as f:
    data = json.load(f)
Path('../vocab').mkdir(exist_ok=True)
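# Count character frequencies and record per-side lengths while writing training pairs.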
char_counter = Counter()
src_lens, tgt_lens = [], []
i = 0
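# Write one src<SEP>tgt pair per consecutive turn pair. Utterances appear to be
# whitespace-tokenized, so lowercasing/splitting then joining without spaces
# yields a plain character string. Note the 2,000,000 cap is checked once per
# dialogue, so the final count can slightly exceed it.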
with open('train.txt', 'w') as f_out:
    for dialog in data['train']:
        if i == 2000000:
            break
        if len(dialog) < 2:
            continue
        elif len(dialog) == 2:
            src, tgt = dialog
            src = src.lower().split()
            tgt = tgt.lower().split()
            char_counter.update(src)
            char_counter.update(tgt)
            src_lens.append(len(src))
            tgt_lens.append(len(tgt))
            f_out.write(''.join(src)+'<SEP>'+''.join(tgt)+'\n')
            i += 1
        else:
            for src, tgt in zip(dialog, dialog[1:]):
                src = src.lower().split()
                tgt = tgt.lower().split()
                char_counter.update(src)
                char_counter.update(tgt)
                src_lens.append(len(src))
                tgt_lens.append(len(tgt))
                f_out.write(''.join(src)+'<SEP>'+''.join(tgt)+'\n')
                i += 1
print('Source Average Length', sum(src_lens)/len(src_lens))
print('Target Average Length', sum(tgt_lens)/len(tgt_lens))
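# Build the character vocabulary: special tokens first, then every character
# seen at least 50 times, ordered by frequency.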
chars = ['<pad>', '<start>', '<end>'] + [char for char, freq in char_counter.most_common() if freq >= 50]
print(len(chars), 'Chars')
with open('../vocab/char.txt', 'w') as f:
    for c in chars:
        f.write(c+'\n')
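# Repeat the pairing for the test split (no cap, no frequency counting).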
with open('LCCC-base_test.json') as f:
    data = json.load(f)
with open('test.txt', 'w') as f_out:
    for dialog in data:
        if len(dialog) < 2:
            continue
        elif len(dialog) == 2:
            src, tgt = dialog
            src = src.lower().split()
            tgt = tgt.lower().split()
            f_out.write(''.join(src)+'<SEP>'+''.join(tgt)+'\n')
        else:
            for src, tgt in zip(dialog, dialog[1:]):
                src = src.lower().split()  # lowercase here too, matching the two-turn branch
                tgt = tgt.lower().split()
                f_out.write(''.join(src)+'<SEP>'+''.join(tgt)+'\n')
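# Map each vocabulary character to its row index in the embedding matrix.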
char2idx = {}
with open('../vocab/char.txt') as f:
    for i, line in enumerate(f):
        line = line.rstrip('\n')
        char2idx[line] = i
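# cc.zh.300.vec is the pre-trained 300-d fastText Chinese vector file (fasttext.cc);
# characters without a pre-trained vector keep their zero rows.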
embedding = np.zeros((len(char2idx)+1, 300)) # + 1 for unknown word
with open('../vocab/cc.zh.300.vec') as f:
    count = 0
    for i, line in enumerate(f):
        if i == 0:
            continue  # skip the header line (vocab size and dimension)
        if i % 100000 == 0:
            print('- At line {}'.format(i))
        line = line.rstrip()
        sp = line.split(' ')
        word, vec = sp[0], sp[1:]
        if word in char2idx:
            count += 1
            embedding[char2idx[word]] = np.asarray(vec, dtype='float32')
print("[%d / %d] characters have pre-trained vectors" % (count, len(char2idx)))
np.save('../vocab/char.npy', embedding)
print('Saved ../vocab/char.npy')
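# A minimal usage sketch (an assumption, not part of the original pipeline):
# the saved matrix can seed a Keras Embedding layer downstream, e.g.
#   import tensorflow as tf
#   pretrained = np.load('../vocab/char.npy')
#   embed = tf.keras.layers.Embedding(
#       input_dim=pretrained.shape[0],
#       output_dim=pretrained.shape[1],
#       embeddings_initializer=tf.keras.initializers.Constant(pretrained))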