#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# # Text Classification with Hugging Face Transformers in *ktrain*
#
# As of v0.8.x, *ktrain* includes an easy-to-use, thin wrapper around the Hugging Face transformers library for text classification.

# ## Load Data Into Arrays

# In[2]:


categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']
from sklearn.datasets import fetch_20newsgroups
train_b = fetch_20newsgroups(subset='train',
                             categories=categories, shuffle=True, random_state=42)
test_b = fetch_20newsgroups(subset='test',
                            categories=categories, shuffle=True, random_state=42)

print('size of training set: %s' % (len(train_b['data'])))
print('size of validation set: %s' % (len(test_b['data'])))
print('classes: %s' % (train_b.target_names))

x_train = train_b.data
y_train = train_b.target
x_test = test_b.data
y_test = test_b.target


# ## STEP 1: Preprocess Data and Build a Transformer Model
#
# For `MODEL_NAME`, *ktrain* supports both the "official" built-in models [available here](https://huggingface.co/transformers/pretrained_models.html) and the [community-uploaded models available here](https://huggingface.co/models).

# In[3]:


import ktrain
from ktrain import text
MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=500, class_names=train_b.target_names)
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)


# Note that `x_train` and `x_test` are the raw texts, which look like this:
# ```python
# x_train = ['I hate this movie.', 'I like this movie.']
# ```
# The labels are arrays in one of the following forms:
# ```python
# # string labels
# y_train = ['negative', 'positive']
# # integer labels
# y_train = [0, 1]  # labels must start from 0 if in integer format
# # multi or one-hot encoded labels
# y_train = [[1,0], [0,1]]
# ```
# In the latter two cases, you must supply a `class_names` argument to the `Transformer` constructor, which tells *ktrain* how indices map to class names. In this case, `class_names=['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']` because 0=alt.atheism, 1=comp.graphics, etc.

# ## STEP 2 [Optional]: Estimate a Good Learning Rate
#
# Learning rates between `2e-5` and `5e-5` tend to work well with transformer models, based on the original papers from Google. However, we will run the learning-rate finder for two epochs to estimate a good learning rate on this particular dataset.
#
# As shown below, our results are consistent with Google's findings.

# In[5]:


learner.lr_find(show_plot=True, max_epochs=2)


# ## STEP 3: Train Model
#
# Train using a [1cycle learning rate schedule](https://arxiv.org/pdf/1803.09820.pdf).
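# The cell below trains with `fit_onecycle` using a peak learning rate of `8e-5` for four epochs. *ktrain* learners also offer other training routines; the snippet below is a minimal sketch (not run in this notebook), assuming the standard `Learner` API in recent *ktrain* versions, where `autofit` uses a triangular learning rate policy and, if no epoch count is supplied, stops training automatically based on validation metrics.
# ```python
# # sketch only: an alternative to the fit_onecycle call below
# # triangular LR policy; early stopping applies when epochs are not specified
# learner.autofit(8e-5)
# ```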
# In[4]:


learner.fit_onecycle(8e-5, 4)


# ## STEP 4: Evaluate/Inspect Model

# In[5]:


learner.validate(class_names=t.get_classes())


# In[6]:


# the one we got most wrong
learner.view_top_losses(n=1, preproc=t)


# In[7]:


# understandable mistake - this sci.med post talks a lot about computer graphics
print(x_test[521])


# ## STEP 5: Make Predictions on New Data in Deployment

# In[8]:


predictor = ktrain.get_predictor(learner.model, preproc=t)


# In[9]:


predictor.predict('Jesus Christ is the central figure of Christianity.')


# In[10]:


predictor.explain('Jesus Christ is the central figure of Christianity.')


# In[11]:


predictor.save('/tmp/my_20newsgroup_predictor')


# In[12]:


reloaded_predictor = ktrain.load_predictor('/tmp/my_20newsgroup_predictor')


# In[13]:


reloaded_predictor.predict('Jesus Christ is the central figure of Christianity.')


# In[14]:


reloaded_predictor.predict_proba('Jesus Christ is the central figure of Christianity.')


# In[15]:


reloaded_predictor.get_classes()


# In[ ]:
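# As a final note, the predictor methods above also accept lists of documents, which is convenient when scoring batches in deployment. The snippet below is a minimal sketch (not run in this notebook); it assumes `predict` accepts a list of texts and that `predict_proba` returns probabilities in the order given by `get_classes()`. Verify this against your installed version of *ktrain*.
# ```python
# docs = ['I have a question about rendering 3D images.',
#         'My doctor prescribed a new medication.']
#
# # predicted class name for each document
# print(reloaded_predictor.predict(docs))
#
# # class probabilities for the first document, keyed by class name
# probs = reloaded_predictor.predict_proba(docs[0])
# print(dict(zip(reloaded_predictor.get_classes(), probs)))
# ```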