from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_matching/chinese/data')
from collections import Counter
from pathlib import Path
import csv
import numpy as np
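# First pass over the training data: collect character frequencies and per-sentence lengths.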
lens_1, lens_2 = [], []
counter = Counter()
with open('train.csv') as f:
    for i, line in enumerate(csv.reader(f, delimiter=',')):
        if i == 0:
            continue  # skip the header row
        text1, text2, label = line
        counter.update(list(text1))
        counter.update(list(text2))
        lens_1.append(len(text1))
        lens_2.append(len(text2))
print('Average Length 1:', sum(lens_1) / len(lens_1))
print('Average Length 2:', sum(lens_2) / len(lens_2))
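# Keep characters that occur at least 3 times and write the vocabulary, with <pad> at index 0.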
chars = [w for w, freq in counter.most_common() if freq >= 3]
Path('../vocab').mkdir(exist_ok=True)
with open('../vocab/char.txt', 'w') as f:
    f.write('<pad>'+'\n')
    for c in chars:
        f.write(c+'\n')
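# Rebuild the char -> index mapping by re-reading the vocabulary file.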
char2idx = {}
with open('../vocab/char.txt') as f:
    for i, line in enumerate(f):
        line = line.rstrip('\n')
        char2idx[line] = i
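# Fill the embedding matrix with pre-trained fastText vectors from cc.zh.300.vec;
# rows for characters missing from the .vec file (and the extra unknown-word row) remain zero.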
embedding = np.zeros((len(char2idx)+1, 300)) # + 1 for unknown word
with open('../vocab/cc.zh.300.vec') as f:
    count = 0
    for i, line in enumerate(f):
        if i == 0:
            continue  # first line of a .vec file is "<vocab size> <dimension>", not a vector
        if i % 100000 == 0:
            print('- At line {}'.format(i))
        line = line.rstrip()
        sp = line.split(' ')
        word, vec = sp[0], sp[1:]
        if word in char2idx:
            count += 1
            embedding[char2idx[word]] = np.asarray(vec, dtype='float32')
print("[%d / %d] characters have found pre-trained values"%(count, len(char2idx)))
np.save('../vocab/char.npy', embedding)
print('Saved ../vocab/char.npy')
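# Illustrative sketch, not part of the original pipeline: one way the saved matrix could be
# plugged into a tf.keras Embedding layer downstream (the layer name and trainable flag are
# assumptions, not taken from this repo).
import tensorflow as tf
pretrained = np.load('../vocab/char.npy')
char_embedding = tf.keras.layers.Embedding(
    input_dim=pretrained.shape[0],
    output_dim=pretrained.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(pretrained),
    trainable=True,  # set to False to freeze the pre-trained vectors
    name='char_embedding')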