#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# In[2]:


import ktrain
from ktrain import text as txt


# ## STEP 1: Load and Preprocess the Dataset
# 
# A Dutch NER dataset can be downloaded from [here](https://www.clips.uantwerpen.be/conll2002/ner/).
# 
# We use the `entities_from_conll2003` function to load and preprocess the data, as the dataset is in the standard **CoNLL** format. (Download the data from the link above to see what the format looks like.)
# 
# See the *ktrain* [sequence-tagging tutorial](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/tutorials/tutorial-06-sequence-tagging.ipynb) for more information on the different ways data can be loaded.

# In[3]:


TDATA = 'data/dutch_ner/ned.train'
VDATA = 'data/dutch_ner/ned.testb'
(trn, val, preproc) = txt.entities_from_conll2003(TDATA, val_filepath=VDATA)


# ## STEP 2: Build the Model
# 
# Next, we will build a Bidirectional LSTM model that employs transformer embeddings such as [BERT word embeddings](https://arxiv.org/abs/1810.04805). By default, the `bilstm-transformer` model uses a pretrained multilingual model (i.e., `bert-base-multilingual-cased`). However, since we are training a Dutch-language model, it is better to select a Dutch pretrained BERT model: `wietsedv/bert-base-dutch-cased`. A full list of available pretrained models can be found [here](https://huggingface.co/transformers/pretrained_models.html). One can also use [community-uploaded models](https://huggingface.co/models) that focus on specific domains such as biomedicine or science (e.g., BioBERT, SciBERT). To use SciBERT, for example, set `transformer_model` to `allenai/scibert_scivocab_uncased`.

# In[4]:


WV_URL = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz'
model = txt.sequence_tagger('bilstm-transformer', preproc,
                            transformer_model='wietsedv/bert-base-dutch-cased',
                            wv_path_or_url=WV_URL)


# In the cell above, notice that we supplied the `wv_path_or_url` argument. This directs *ktrain* to initialize word embeddings with one of the pretrained fasttext word vector sets from [Facebook's fasttext site](https://fasttext.cc/docs/en/crawl-vectors.html). When supplied with a valid URL to a `.vec.gz` file, the word vectors will be automatically downloaded, extracted, and loaded in STEP 2 (download location is `~/ktrain_data`). To disable pretrained word embeddings, set `wv_path_or_url=None`; randomly initialized word embeddings will then be used. Use of pretrained word embeddings will typically boost final accuracy. When combined with a model that employs transformer embeddings (e.g., `bilstm-transformer`), the different word embeddings are stacked together using concatenation.
# 
# Finally, we will wrap our selected model and datasets in a `Learner` object to facilitate training.

# In[5]:


learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=128)


# ## STEP 3: Train the Model
# 
# We will train for 5 epochs and decay the learning rate using cosine annealing. This is equivalent to one cycle with a length of 5 epochs. We will save the weights for each epoch in a checkpoint folder.
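# A good learning rate can be identified beforehand with *ktrain*'s learning-rate finder. The cell below is a minimal sketch added for illustration (it simulates training over a range of learning rates, so it takes a few extra minutes and was not part of the original run).

# In[ ]:


learner.lr_find()  # simulate training while gradually increasing the learning rate
learner.lr_plot()  # plot loss vs. learning rate; pick a value where the loss is still falling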
# We will train with a learning rate of `0.01`, previously identified using our [learning-rate finder](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/tutorials/tutorial-02-tuning-learning-rates.ipynb).

# In[6]:


learner.fit(0.01, 1, cycle_len=5, checkpoint_folder='/tmp/saved_weights')


# In[7]:


learner.plot('lr')


# As shown below, our model achieves an F1-score of 83.04 with only a few minutes of training.

# In[8]:


learner.validate(class_names=preproc.get_classes())


# ## STEP 4: Make Predictions

# In[9]:


predictor = ktrain.get_predictor(learner.model, preproc)


# In[10]:


dutch_text = """Mark Rutte is een Nederlandse politicus die momenteel premier van Nederland is."""
predictor.predict(dutch_text)


# In[11]:


predictor.save('/tmp/my_dutch_nermodel')


# The `predictor` can be re-loaded from disk with `load_predictor`:

# In[13]:


predictor = ktrain.load_predictor('/tmp/my_dutch_nermodel')


# In[15]:


predictor.predict(dutch_text)


# In[ ]:
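# As an added illustration, the cell below pulls out only the predicted entity mentions. It is a minimal sketch, assuming `predict` returns a list of (token, tag) pairs with the IOB-style tags used by the CoNLL-2002 data (e.g., 'B-PER', 'I-PER', 'O').


# drop tokens tagged 'O' (non-entities) and keep the entity mentions
predictions = predictor.predict(dutch_text)
entities = [(token, tag) for token, tag in predictions if tag != 'O']
print(entities)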