#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# # Sentence Pair Classification with *ktrain*
#
# This notebook demonstrates sentence pair classification with *ktrain*.
#
# ## Download a Sentence Pair Classification Dataset
#
# In this notebook, we use the Microsoft Research Paraphrase Corpus (MRPC) to build a model that detects whether two sentences are paraphrases of one another. The MRPC train and test datasets can be downloaded from:
# - [MRPC train dataset](https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt)
# - [MRPC test dataset](https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt)
#
# Once downloaded, we prepare each dataset as a list of sentence-pair tuples.

# In[2]:


import pandas as pd
import csv

TRAIN = 'data/mrpc/msr_paraphrase_train.txt'
TEST = 'data/mrpc/msr_paraphrase_test.txt'

train_df = pd.read_csv(TRAIN, delimiter='\t', quoting=csv.QUOTE_NONE)
test_df = pd.read_csv(TEST, delimiter='\t', quoting=csv.QUOTE_NONE)
x_train = train_df[['#1 String', '#2 String']].values
y_train = train_df['Quality'].values
x_test = test_df[['#1 String', '#2 String']].values
y_test = test_df['Quality'].values

# IMPORTANT: the data format for sentence pair classification is a list of tuples of the form (str, str)
x_train = list(map(tuple, x_train))
x_test = list(map(tuple, x_test))


# In[3]:


print(x_train[0])
print(y_train[0])


# ## Build and Train a `BERT` Model
#
# For demonstration purposes, we train for only 3 epochs.

# In[4]:


import ktrain
from ktrain import text

MODEL_NAME = 'bert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=128, class_names=['not paraphrase', 'paraphrase'])
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)  # lower the batch size if OOM occurs
learner.fit_onecycle(5e-5, 3)


# ## Make Predictions

# In[5]:


predictor = ktrain.get_predictor(learner.model, t)


# Let's select a positive and a negative example from `x_test`.

# In[6]:


y_test[:5]


# In[12]:


positive = x_test[0]   # labeled 1 (paraphrase)
negative = x_test[4]   # labeled 0 (not a paraphrase)


# In[13]:


print('Valid Paraphrase:\n%s' % (positive,))


# In[14]:


print('Invalid Paraphrase:\n%s' % (negative,))


# In[15]:


predictor.predict(positive)


# In[16]:


predictor.predict(negative)


# In[17]:


# predictions can also be made on a batch of sentence pairs
predictor.predict([positive, negative])


# In[18]:


# save the predictor to disk for later deployment
predictor.save('/tmp/mrpc_model')


# In[19]:


# reload the saved predictor
p = ktrain.load_predictor('/tmp/mrpc_model')


# In[20]:


p.predict(positive)


# In[ ]:
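# ## Predict on New Sentence Pairs
#
# As a quick sanity check, the reloaded predictor can also be applied to a hand-written sentence pair
# (the pair below is an illustrative example, not drawn from MRPC). This is a minimal sketch: it assumes
# the usual *ktrain* `Predictor.predict` signature, where passing `return_proba=True` returns class
# probabilities instead of labels.

# In[ ]:


# hypothetical sentence pair for illustration; any (str, str) tuple works
new_pair = ('The company reported a sharp rise in quarterly profits.',
            'Quarterly profits at the company increased significantly.')

print(p.predict(new_pair))                     # predicted class label
print(p.predict(new_pair, return_proba=True))  # class probabilities (assumes the return_proba flag)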