#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os

# restrict training to the first GPU
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# In[2]:


import ktrain
from ktrain import text


# ## STEP 1: Load and Preprocess Data
# 
# The CoNLL2003 NER dataset can be downloaded from [here](https://github.com/amaiya/ktrain/tree/master/ktrain/tests/conll2003).

# In[3]:


# load the training and validation sets, which are in CoNLL2003 format
TDATA = 'data/conll2003/train.txt'
VDATA = 'data/conll2003/valid.txt'
(trn, val, preproc) = text.entities_from_conll2003(TDATA, val_filepath=VDATA)


# ## STEP 2: Define a Model

# In this example notebook, we will build a Bidirectional LSTM model that uses [pretrained BERT word embeddings](https://arxiv.org/abs/1810.04805). By default, `sequence_tagger` will use a pretrained multilingual model (i.e., `bert-base-multilingual-cased`) that supports 104 languages. However, since we are training an English-language model on an English-only dataset, it is better to select the English pretrained BERT model: `bert-base-cased`. Notice that we selected the **cased** model: case matters for English NER, since entities are often capitalized. A full list of available pretrained models can be found [here](https://huggingface.co/transformers/pretrained_models.html). *ktrain* currently supports any `bert-*` model in addition to any `distilbert-*` model. One can also use BERT-based [community-uploaded models](https://huggingface.co/models) that focus on specific domains such as the biomedical or scientific domains (e.g., BioBERT, SciBERT). To use SciBERT, for example, set `bert_model` to `allenai/scibert_scivocab_uncased` (see the optional example cells at the end of this notebook).

# In[4]:


text.print_sequence_taggers()


# In[5]:


model = text.sequence_tagger('bilstm-bert', preproc, bert_model='bert-base-cased')


# From the output above, we see that the model is configured to use both BERT pretrained word embeddings and randomly-initialized word embeddings. Instead of randomly-initialized word vectors, one can also select pretrained fasttext word vectors from [Facebook's fasttext site](https://fasttext.cc/docs/en/crawl-vectors.html) and supply the URL via the `wv_path_or_url` parameter:
# ```python
# wv_path_or_url='https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz'
# ```
# We have not used fasttext word embeddings in this example; only BERT word embeddings are used (see the optional example cell at the end of this notebook for a sketch of how fasttext vectors could be supplied).

# In[6]:


learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=128)


# ## STEP 3: Train and Evaluate Model

# In[7]:


# 2 cycles of 5 epochs each with an initial learning rate of 0.01
learner.fit(0.01, 2, cycle_len=5)


# In[8]:


learner.validate()


# We can use the `view_top_losses` method to inspect the sentences we're getting the most wrong. Here, we can see our model has trouble with titles, which is understandable since they are lumped into a catch-all miscellaneous category.

# In[9]:


learner.view_top_losses(n=1)


# ## Make Predictions on New Data

# In[10]:


# wrap the trained model and its preprocessor so we can tag raw text
predictor = ktrain.get_predictor(learner.model, preproc)


# In[11]:


predictor.predict('As of 2019, Donald Trump is still the President of the United States.')


# In[12]:


predictor.save('/tmp/mypred')


# In[13]:


reloaded_predictor = ktrain.load_predictor('/tmp/mypred')


# In[14]:


reloaded_predictor.predict('Paul Newman is my favorite actor.')


# In[ ]:
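

# ### Optional: supplying fasttext word vectors
# 
# The snippet below is a sketch (not executed in this notebook) of how the `wv_path_or_url` parameter described in STEP 2 could be used to replace the randomly-initialized word embeddings with pretrained English fasttext vectors from Facebook's fasttext site. The vectors are a large download, which is why the call is shown here rather than run above.
# 
# ```python
# # sketch: same bilstm-bert model as above, but with pretrained fasttext
# # word vectors in place of the randomly-initialized word embeddings
# model = text.sequence_tagger('bilstm-bert', preproc,
#                              bert_model='bert-base-cased',
#                              wv_path_or_url='https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz')
# learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=128)
# ```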
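
# ### Optional: using a domain-specific BERT model
# 
# As noted in STEP 2, a BERT-based community-uploaded model can be supplied via `bert_model`. The sketch below shows how SciBERT might be selected instead of `bert-base-cased`; it is not executed here, and a domain-specific model only makes sense when the training data comes from that domain (e.g., scientific text rather than the CoNLL2003 news stories used above).
# 
# ```python
# # sketch: swap in SciBERT word embeddings (useful for scientific text)
# model = text.sequence_tagger('bilstm-bert', preproc,
#                              bert_model='allenai/scibert_scivocab_uncased')
# learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=128)
# ```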
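
# ### Optional: post-processing predictions
# 
# The sketch below assumes `predictor.predict` returns a list of `(token, tag)` pairs in IOB format for a single input string. The helper `extract_entities` is a hypothetical convenience function (not part of ktrain) that keeps only the tokens tagged as entities.
# 
# ```python
# def extract_entities(tagged_tokens):
#     """Keep only (token, tag) pairs whose tag is not the 'O' (outside) tag."""
#     return [(tok, tag) for tok, tag in tagged_tokens if tag != 'O']
# 
# # e.g., only the tokens tagged as part of the PER entity 'Paul Newman'
# # would be expected to remain
# extract_entities(predictor.predict('Paul Newman is my favorite actor.'))
# ```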