# Colab setup: mount Google Drive at /content/gdrive so files stored there
# are reachable from this runtime.
from google.colab import drive
drive.mount('/content/gdrive')
import os
# Work from the project's `main` directory so that the relative paths used
# later in this script ('../data/...', '../vocab/...') resolve.
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_classification/clue/main')
import json
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
def get_vocab(f_path):
    """Build an entry-to-index mapping from a one-entry-per-line text file.

    Args:
        f_path: Path to a UTF-8 encoded text file with one entry per line.

    Returns:
        A dict mapping each line (trailing whitespace removed) to its
        zero-based line number. If the same entry occurs on several lines,
        the number of the last occurrence is kept.
    """
    # Pin the encoding: without it open() falls back to the platform default,
    # which can mis-decode non-ASCII entries on non-UTF-8 systems.
    with open(f_path, encoding='utf-8') as f:
        return {line.rstrip(): i for i, line in enumerate(f)}
def _load_split(path, label_map):
    """Read one JSON-lines file and return ``(texts, label_ids)`` as two lists."""
    texts, label_ids = [], []
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(path, encoding='utf-8') as f:
        for raw in f:
            raw = raw.strip()
            if not raw:
                # A blank line (e.g. a trailing newline at end of file) is not
                # a record; skip it instead of letting json.loads fail on ''.
                continue
            record = json.loads(raw)
            # ''.join leaves a str unchanged and concatenates a sequence of
            # string pieces, matching the original ''.join(list(text)).
            texts.append(''.join(record['content']))
            label_ids.append(label_map[record['label']])
    return texts, label_ids


def get_data(train_path='../data/train.txt',
             test_path='../data/test.txt',
             label_map=None):
    """Load the training and test splits.

    Each input file holds one JSON object per line with a ``'content'`` field
    (the text) and a ``'label'`` field (the label name).

    Args:
        train_path: Path of the training file.
        test_path: Path of the test file.
        label_map: Mapping from label name to integer id. When omitted, the
            module-level ``label2idx`` is used, as in the original
            zero-argument call.

    Returns:
        ``((x_train, y_train), (x_test, y_test))`` where each ``x_*`` is a
        list of texts and each ``y_*`` is the list of matching label ids.

    Raises:
        KeyError: If a record lacks ``'content'``/``'label'`` or its label is
            not present in the label mapping.
    """
    if label_map is None:
        # Resolved at call time so the global may be defined after this function.
        label_map = label2idx
    return _load_split(train_path, label_map), _load_split(test_path, label_map)
# Load the label vocabulary first: get_data() looks up `label2idx` to turn
# label names into integer ids.
label2idx = get_vocab('../vocab/label.txt')
(x_train, y_train), (x_test, y_test) = get_data()

# Character-level bag of words: `list` splits each text into its individual
# characters, so every character is one token (unigrams only, raw counts).
count_model = CountVectorizer(binary=False,
                              ngram_range=(1, 1),
                              tokenizer=list)
# fit_transform learns the vocabulary and vectorises the training texts in a
# single pass, instead of fitting and then transforming the same data twice.
train_counts = count_model.fit_transform(x_train)

# TF-IDF weights are learned from the training counts only; the test set is
# transformed with the already-fitted vocabulary and weights.
tfidf_model = TfidfTransformer()
X_train_tfidf = tfidf_model.fit_transform(train_counts)
X_test_tfidf = tfidf_model.transform(count_model.transform(x_test))
# Fit logistic regression on the TF-IDF training matrix, then predict label
# ids for the test matrix.
lr_model = LogisticRegression(solver='lbfgs', max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
y_pred = lr_model.predict(X_test_tfidf)

# Accuracy: mean of the element-wise comparison between predicted and
# reference label ids.
final_acc = (y_pred == y_test).mean()
print("Testing Accuracy: {:.3f}".format(final_acc))

# Per-class report; label ids and display names both come from `label2idx`,
# so they stay aligned.
report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    labels=list(label2idx.values()),
    target_names=list(label2idx),
    digits=3,
)
print('\n' + report)