%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # force CPU
os.environ['DISABLE_V2_BEHAVIOR'] = '1'    # disable V2 behavior - currently required for NER in TF2

from ktrain.text import shallownlp as snlp

# English NER
ner = snlp.NER('en')
text = """
Xuetao Cao was head of the Chinese Academy of Medical Sciences and is
the current president of Nankai University.
"""
ner.predict(text)
ner.predict(text, merge_tokens=False)

# sentence tokenization followed by sentence-level NER
document = """Paul Newman is a great actor.  Tommy Wiseau is not."""
sents = []
for idx, sent in enumerate(snlp.sent_tokenize(document)):
    sents.append(sent)
    print('sentence #%d: %s' % (idx+1, sent))
ner.predict(sents[0])
ner.predict(sents[1])

# Chinese NER
ner = snlp.NER('zh')
# "Cao Xuetao was head of the Chinese Academy of Medical Sciences and is the current president of Nankai University."
ner.predict('曹雪涛曾任中国医学科学院院长,现任南开大学校长。')
# "This is the first sentence, about Dr. Smith.  The second sentence is about Mr. Jones."
document = """这是关于史密斯博士的第一句话。第二句话是关于琼斯先生的。"""
for idx, sent in enumerate(snlp.sent_tokenize(document)):
    print('sentence #%d: %s' % (idx+1, sent))

# Russian NER
ner = snlp.NER('ru')
# "Katerina Tikhonova, the younger daughter of Russian President Vladimir Putin,
#  has been appointed head of a new artificial intelligence institute at MSU."
russian_sentence = """Катерина Тихонова, младшая дочь президента России Владимира Путина, была назначена руководителем нового института искусственного интеллекта в МГУ."""
ner.predict(russian_sentence)

# English text classification (IMDb movie reviews) with NBSVM
datadir = r'/home/amaiya/data/aclImdb'
(x_train, y_train, label_names) = snlp.Classifier.load_texts_from_folder(datadir+'/train',
                                                                         subfolders=['neg', 'pos'])
(x_test, y_test, _) = snlp.Classifier.load_texts_from_folder(datadir+'/test',
                                                             shuffle=False,
                                                             subfolders=['neg', 'pos'])
print('label names: %s' % (label_names))
clf = snlp.Classifier()
clf.create_model('nbsvm', x_train, vec__ngram_range=(1,3), vec__binary=True)
clf = clf.fit(x_train, y_train)
print('validation accuracy: %s%%' % (round(clf.evaluate(x_test, y_test)*100, 2)))
pos_text = 'I loved this movie because it was hilarious.'
neg_text = 'I hated this movie because it was boring.'
print('prediction for "%s": %s (pos)' % (pos_text, clf.predict(pos_text)))
print('prediction for "%s": %s (neg)' % (neg_text, clf.predict(neg_text)))

# Chinese text classification (hotel reviews) with logistic regression
datadir = '/home/amaiya/data/ChnSentiCorp_htl_ba_6000'
(texts, labels, label_names) = snlp.Classifier.load_texts_from_folder(datadir)
print('label names: %s' % (label_names))
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.1, random_state=42)
clf = snlp.Classifier()
clf.create_model('logreg', x_train, vec__ngram_range=(1,3), clf__solver='newton-cg')
clf = clf.fit(x_train, y_train)
print('validation accuracy: %s%%' % (round(clf.evaluate(x_test, y_test)*100, 2)))
pos_text = '我喜欢这家酒店,因为它很干净。'  # I loved this hotel because it was very clean.
neg_text = '我讨厌这家酒店,因为它很吵。'    # I hated this hotel because it was noisy.
print('prediction for "%s": %s' % (pos_text, clf.predict(pos_text)))
print('prediction for "%s": %s' % (neg_text, clf.predict(neg_text)))

# hyperparameter tuning with grid search
# setup data
datadir = r'/home/amaiya/data/aclImdb'
(x_train, y_train, label_names) = snlp.Classifier.load_texts_from_folder(datadir+'/train')
(x_test, y_test, _) = snlp.Classifier.load_texts_from_folder(datadir+'/test', shuffle=False)
# initialize a model to optimize
clf = snlp.Classifier()
clf.create_model('logreg', x_train, clf__solver='newton-cg')
# create parameter space for values of C
parameters = {'clf__C': (1e0, 1e-1, 1e-2)}
# tune
clf.grid_search(parameters, x_train[:5000], y_train[:5000], n_jobs=-1)
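
# NOTE: the grid search above prints the best value of C it finds.  The
# follow-up below is a minimal sketch (not part of the original notebook):
# re-create the model with that value and evaluate on the held-out test set.
# best_C = 0.1 is a placeholder for the value grid_search reports, and passing
# clf__C through create_model is assumed to work like the vec__/clf__ keyword
# arguments used earlier.
best_C = 0.1  # hypothetical: substitute the value reported by grid_search
clf = snlp.Classifier()
clf.create_model('logreg', x_train, clf__solver='newton-cg', clf__C=best_C)
clf = clf.fit(x_train, y_train)
print('validation accuracy: %s%%' % (round(clf.evaluate(x_test, y_test)*100, 2)))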

# multilingual text search
# document1 contains English, Russian ("живи сегодня надейся на завтра" =
# "live today, hope for tomorrow"), and Chinese (合肥微尺度国家物理科学实验室 =
# "Hefei National Laboratory for Physical Sciences at the Microscale") text.
document1 = """
Hello there,

Hope this email finds you well.

Are you available to talk about our meeting?

If so, let us plan to schedule the meeting
at the Hefei National Laboratory for Physical Sciences at the Microscale.

As I always say: живи сегодня надейся на завтра

Sincerely,
John Doe
合肥微尺度国家物理科学实验室
"""

# document2 contains Arabic ("عش اليوم الأمل ليوم غد" = "live today, hope for tomorrow").
document2 = """
This is a random document with Arabic about our meeting.

عش اليوم الأمل ليوم غد

Bye for now.
"""

docs = [document1, document2]

# search for English keywords
snlp.search(['physical sciences', 'meeting', 'Arabic'], docs, keys=['doc1', 'doc2'])

# search for a Chinese phrase
snlp.search('合肥微尺度国家物理科学实验室', docs, keys=['doc1', 'doc2'])

# search for an Arabic phrase; as the prints below show, each result holds
# the document key, the query, and the number of matches in that document
for result in snlp.search('عش اليوم الأمل ليوم غد', docs, keys=['doc1', 'doc2']):
    print('doc id: %s' % (result[0]))
    print('query: %s' % (result[1]))
    print('# of matches in document: %s' % (result[2]))

# search for a Russian phrase
snlp.search('сегодня надейся на завтра', docs, keys=['doc1', 'doc2'])

# extract language-specific substrings from documents
snlp.find_chinese(document1)
snlp.find_russian(document1)
snlp.find_arabic(document2)
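
# NOTE: minimal sketch (not part of the original notebook).  The find_chinese
# and find_russian helpers demonstrated above are assumed to return a possibly
# empty list of matched substrings, so they can be used to crudely route a
# document to one of the languages that has a shallownlp NER model
# ('en', 'zh', 'ru'); anything without Chinese or Cyrillic text falls back to
# English.  The detect_ner_lang name is made up for this example.
def detect_ner_lang(doc):
    """Crude language routing based on character ranges; defaults to English."""
    if snlp.find_chinese(doc):
        return 'zh'
    if snlp.find_russian(doc):
        return 'ru'
    return 'en'

for key, doc in zip(['doc1', 'doc2'], docs):
    lang = detect_ner_lang(doc)
    print('%s routed to the %s NER model' % (key, lang))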