#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # force CPU (no GPU needed)
os.environ['DISABLE_V2_BEHAVIOR'] = '1'    # disable V2 behavior - currently required for NER in TF2


# # **ShallowNLP** Tutorial
#
# The **ShallowNLP** module in *ktrain* is a small collection of text-analytic utilities for analyzing text in English, Chinese, Russian, and other languages. All methods in **ShallowNLP** run on a normal laptop CPU - no GPU is required. It is, therefore, well-suited to those with minimal computational resources and no GPU access.
#
# Let's begin by importing the `shallownlp` module.

# In[2]:


from ktrain.text import shallownlp as snlp


# ## SECTION 1: Ready-to-Use Named-Entity-Recognition
#
# **ShallowNLP** includes pre-trained Named Entity Recognition (NER) models for English, Chinese, and Russian.
#
# ### English NER
#
# Extracting entities from:
# >Xuetao Cao was head of the Chinese Academy of Medical Sciences and is the current president of Nankai University.

# In[3]:


ner = snlp.NER('en')
text = """
Xuetao Cao was head of the Chinese Academy of Medical Sciences and is
the current president of Nankai University.
"""
ner.predict(text)


# The `ner.predict` method automatically merges tokens by entity. To see the unmerged results, set `merge_tokens=False`:

# In[4]:


ner.predict(text, merge_tokens=False)


# The `ner.predict` method typically operates on single sentences, as in the example above. For multi-sentence documents, the sentences can first be extracted with `snlp.sent_tokenize`:

# In[5]:


document = """Paul Newman is a great actor. Tommy Wiseau is not."""
sents = []
for idx, sent in enumerate(snlp.sent_tokenize(document)):
    sents.append(sent)
    print('sentence #%d: %s' % (idx+1, sent))


# In[6]:


ner.predict(sents[0])


# In[7]:


ner.predict(sents[1])


# ### Chinese NER
# Extracting entities from the Chinese translation of:
# >Xuetao Cao was head of the Chinese Academy of Medical Sciences and is the current president of Nankai University.

# In[8]:


ner = snlp.NER('zh')
ner.predict('曹雪涛曾任中国医学科学院院长,现任南开大学校长。')


# Discovered entities with English translations:
# - 曹雪涛 = Cao Xuetao (PER)
# - 中国医学科学院 = Chinese Academy of Medical Sciences (ORG)
# - 南开大学 = Nankai University (ORG)
#
# The `snlp.sent_tokenize` function can also be used with Chinese documents:

# In[9]:


# "This is the first sentence, about Dr. Smith. The second sentence is about Mr. Jones."
document = """这是关于史密斯博士的第一句话。第二句话是关于琼斯先生的。"""
for idx, sent in enumerate(snlp.sent_tokenize(document)):
    print('sentence #%d: %s' % (idx+1, sent))


# ### Russian NER
# Extracting entities from the Russian translation of:
# >Katerina Tikhonova, the youngest daughter of Russian President Vladimir Putin, was appointed head of a new artificial intelligence institute at Moscow State University.

# In[10]:


ner = snlp.NER('ru')
russian_sentence = """Катерина Тихонова, младшая дочь президента России Владимира Путина,
была назначена руководителем нового института искусственного интеллекта в МГУ."""
ner.predict(russian_sentence)


# Discovered entities with English translations:
# - Катерина Тихонова = Katerina Tikhonova (PER)
# - России = Russia (LOC)
# - Владимира Путина = Vladimir Putin (PER)
# - МГУ = Moscow State University (ORG)
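# To wrap up this section, the sketch below chains `snlp.sent_tokenize` and `ner.predict` to tag every sentence of a multi-sentence document and pool the results. It reuses only the calls demonstrated above; treating each prediction as a list of `(entity, tag)` pairs is an assumption about the merged output format.

# In[ ]:


# Run English NER over each sentence of a document and collect all entities.
# Assumption: with merge_tokens=True (the default), each prediction is a list
# of (entity, tag) pairs, so per-sentence results can simply be concatenated.
ner = snlp.NER('en')
document = """Paul Newman is a great actor. Tommy Wiseau is not."""
all_entities = []
for sent in snlp.sent_tokenize(document):
    all_entities.extend(ner.predict(sent))
print(all_entities)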
# ## SECTION 2: Text Classification
#
# **ShallowNLP** makes it easy to build a text classifier with minimal computational resources. It includes the following sklearn-based text classification models: a non-neural version of [NBSVM](https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf), Logistic Regression, and a [linear SVM with SGD training (SGDClassifier)](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html). Logistic Regression is the default classifier. For these examples, we will use NBSVM.
#
# A classifier can be trained with minimal effort for both English and Chinese.
#
# ### English Text Classification
#
# We'll use the IMDb movie review dataset [available here](https://ai.stanford.edu/~amaas/data/sentiment/) to build a sentiment analysis model for English.

# In[11]:


datadir = r'/home/amaiya/data/aclImdb'
(x_train, y_train, label_names) = snlp.Classifier.load_texts_from_folder(datadir+'/train')
(x_test, y_test, _) = snlp.Classifier.load_texts_from_folder(datadir+'/test', shuffle=False)
print('label names: %s' % (label_names))
clf = snlp.Classifier().fit(x_train, y_train, ctype='nbsvm')
print('validation accuracy: %s%%' % (round(clf.evaluate(x_test, y_test)*100, 2)))
pos_text = 'I loved this movie because it was hilarious.'
neg_text = 'I hated this movie because it was boring.'
print('prediction for "%s": %s (pos)' % (pos_text, clf.predict(pos_text)))
print('prediction for "%s": %s (neg)' % (neg_text, clf.predict(neg_text)))


# ### Chinese Text Classification
#
# We'll use the hotel review dataset [available here](https://github.com/Tony607/Chinese_sentiment_analysis/tree/master/data/ChnSentiCorp_htl_ba_6000) to build a sentiment analysis model for Chinese.

# In[12]:


datadir = '/home/amaiya/data/ChnSentiCorp_htl_ba_6000'
(texts, labels, label_names) = snlp.Classifier.load_texts_from_folder(datadir+'/train')
print('label names: %s' % (label_names))
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.1, random_state=42)
clf = snlp.Classifier().fit(x_train, y_train, ctype='nbsvm')
print('validation accuracy: %s%%' % (round(clf.evaluate(x_test, y_test)*100, 2)))
pos_text = '我喜欢这家酒店,因为它很干净。'  # I loved this hotel because it was very clean.
neg_text = '我讨厌这家酒店,因为它很吵。'  # I hated this hotel because it was noisy.
print('prediction for "%s": %s' % (pos_text, clf.predict(pos_text)))
print('prediction for "%s": %s' % (neg_text, clf.predict(neg_text)))


# ### Tuning Hyperparameters of a Text Classifier
#
# The hyperparameters of a particular classifier can be tuned using the `grid_search` method. Let's tune the **C** hyperparameter of a Logistic Regression model to find the best value for this dataset.

# In[14]:


# setup data
datadir = r'/home/amaiya/data/aclImdb'
(x_train, y_train, label_names) = snlp.Classifier.load_texts_from_folder(datadir+'/train')
(x_test, y_test, _) = snlp.Classifier.load_texts_from_folder(datadir+'/test', shuffle=False)

# initialize a model to optimize
clf = snlp.Classifier()
clf.create_model('logreg', x_train)

# create parameter space for values of C
parameters = {'clf__C': (1e0, 1e-1, 1e-2)}

# tune on a 5,000-document sample of the training set
clf.grid_search(parameters, x_train[:5000], y_train[:5000], n_jobs=-1)


# It looks like a value of `1.0` is best.
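# The `clf__C` key follows scikit-learn's nested-parameter naming (parameter `C` of a pipeline step named `clf`), so any grid in that format can be passed to `grid_search`. For instance, a finer sweep around the winning value - a sketch reusing only the calls shown above:

# In[ ]:


# A finer sweep around C=1.0. `clf__C` is the only pipeline parameter
# name documented above, so we vary only it here.
parameters = {'clf__C': (3e0, 1e0, 3e-1, 1e-1)}
clf.grid_search(parameters, x_train[:5000], y_train[:5000], n_jobs=-1)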
# Once we've settled on the best value (here, `C=1.0`), we can re-create the model with that hyperparameter and proceed to train normally:
#
# ```python
# clf.create_model('logreg', x_train, hp_dict={'C':1.0})
# clf.fit(x_train, y_train)
# clf.evaluate(x_test, y_test)
# ```

# ## SECTION 3: Examples of Searching Text
#
# Here we will show some simple searches over multi-language documents.

# In[15]:


document1 = """
Hello there,

Hope this email finds you well.

Are you available to talk about our meeting? If so, let us plan
to schedule the meeting at the Hefei National Laboratory for
Physical Sciences at the Microscale.

As I always say: живи сегодня надейся на завтра

Sincerely,
John Doe
合肥微尺度国家物理科学实验室
"""

document2 = """
This is a random document with Arabic about our meeting.

عش اليوم الأمل ليوم غد

Bye for now.
"""

docs = [document1, document2]


# ### Searching English
#
# The `search` function returns a list of documents that match the query. Each entry shows:
# 1. the ID of the document
# 2. the query (multiple queries can be supplied in a list, if desired)
# 3. the number of word hits in the document

# In[16]:


snlp.search(['physical sciences', 'meeting', 'Arabic'], docs, keys=['doc1', 'doc2'])


# ### Searching Chinese
#
# Chinese documents are searched in the same way; each entry again shows the document ID, the query, and the number of word hits.

# In[17]:


snlp.search('合肥微尺度国家物理科学实验室', docs, keys=['doc1', 'doc2'])


# For Chinese, the number of word hits is the number of words in the query that appear in the document. Seven of the words in the string 合肥微尺度国家物理科学实验室 were found in `doc1`.

# ### Other Searches
#
# The `search` function can also be used with other languages.
#
# #### Arabic

# In[18]:


for result in snlp.search('عش اليوم الأمل ليوم غد', docs, keys=['doc1', 'doc2']):
    print('doc id: %s' % (result[0]))
    print('query: %s' % (result[1]))
    print('# of matches in document: %s' % (result[2]))


# #### Russian

# In[19]:


snlp.search('сегодня надейся на завтра', docs, keys=['doc1', 'doc2'])


# #### Extract Chinese, Russian, or Arabic from mixed-language documents

# In[20]:


snlp.find_chinese(document1)


# In[21]:


snlp.find_russian(document1)


# In[22]:


snlp.find_arabic(document2)
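# Finally, the extraction helpers can be chained with the models from earlier sections. The sketch below pulls the Chinese text out of the mixed-language `document1` and tags it with the Chinese NER model; it assumes `find_chinese` returns the matched substrings as a list of strings.

# In[ ]:


# Chain extraction with NER: extract the Chinese runs from document1, then tag each.
# Assumption: find_chinese returns a list of matched substrings.
chinese_ner = snlp.NER('zh')
for chunk in snlp.find_chinese(document1):
    print(chunk)
    print(chinese_ner.predict(chunk))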