#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"] = "0";


# # Text Classification with Hugging Face Transformers in *ktrain*
# 
# As of v0.8.x, *ktrain* includes an easy-to-use, thin wrapper to the Hugging Face transformers library for text classification.

# ## Load Data Into Arrays

# In[2]:


categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
from sklearn.datasets import fetch_20newsgroups
train_b = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
test_b = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

print('size of training set: %s' % (len(train_b['data'])))
print('size of validation set: %s' % (len(test_b['data'])))
print('classes: %s' % (train_b.target_names))

x_train = train_b.data
y_train = train_b.target
x_test = test_b.data
y_test = test_b.target


# ## STEP 1: Preprocess Data and Build a Transformer Model
# 
# For `MODEL_NAME`, *ktrain* supports both the "official" built-in models [available here](https://huggingface.co/transformers/pretrained_models.html) and the [community-uploaded models available here](https://huggingface.co/models).

# In[3]:


import ktrain
from ktrain import text

MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=500, class_names=train_b.target_names)
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)


# Note that `x_train` and `x_test` are the raw texts, which look like this:
# ```python
# x_train = ['I hate this movie.', 'I like this movie.']
# ```
# The labels are arrays in one of the following forms:
# ```python
# # string labels
# y_train = ['negative', 'positive']
# 
# # integer labels
# y_train = [0, 1]  # labels must start from 0 if in integer format
# 
# # multi or one-hot encoded labels
# y_train = [[1,0], [0,1]]
# ```
# In the latter two cases, you must supply a `class_names` argument to the `Transformer` constructor, which tells *ktrain* how indices map to class names. In this case, `class_names=['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']` because 0=alt.atheism, 1=comp.graphics, etc.

# ## STEP 2 [Optional]: Estimate a Good Learning Rate
# 
# Based on papers from Google, learning rates between `2e-5` and `5e-5` tend to work well with transformer models. However, we will run the learning-rate finder for two epochs to estimate a good learning rate for this particular dataset.
# 
# As shown below, our results are consistent with Google's findings.

# In[5]:


learner.lr_find(show_plot=True, max_epochs=2)


# ## STEP 3: Train Model
# 
# Train using a [1cycle learning rate schedule](https://arxiv.org/pdf/1803.09820.pdf).
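# *ktrain*'s `fit_onecycle` applies this schedule for you. The snippet below is only a rough
# illustrative sketch of the triangular learning-rate shape behind the 1cycle idea (ramp up to a
# peak rate, then back down), not ktrain's internal implementation; the function name and
# parameters (`onecycle_lr`, `total_steps`, `peak_frac`) are hypothetical:
# 
# ```python
# def onecycle_lr(step, total_steps, max_lr=8e-5, peak_frac=0.25):
#     """Rough triangular 1cycle shape: linear warm-up, then linear decay."""
#     peak = max(1, int(total_steps * peak_frac))   # step at which the LR peaks
#     if step <= peak:
#         return max_lr * step / peak                                   # warm-up phase
#     return max_lr * (total_steps - step) / (total_steps - peak)       # decay phase
# 
# # e.g., over 100 steps the LR climbs to 8e-5 by step 25 and decays toward 0 by step 100
# lrs = [onecycle_lr(s, 100) for s in range(1, 101)]
# ```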
# In[4]:


learner.fit_onecycle(8e-5, 4)


# ## STEP 4: Evaluate/Inspect Model

# In[5]:


learner.validate(class_names=t.get_classes())


# In[6]:


# the one we got most wrong
learner.view_top_losses(n=1, preproc=t)


# In[7]:


# understandable mistake - this sci.med post talks a lot about computer graphics
print(x_test[521])


# ## STEP 5: Make Predictions on New Data in Deployment

# In[8]:


predictor = ktrain.get_predictor(learner.model, preproc=t)


# In[9]:


predictor.predict('Jesus Christ is the central figure of Christianity.')


# In[10]:


predictor.explain('Jesus Christ is the central figure of Christianity.')


# In[11]:


predictor.save('/tmp/my_20newsgroup_predictor')


# In[12]:


reloaded_predictor = ktrain.load_predictor('/tmp/my_20newsgroup_predictor')


# In[13]:


reloaded_predictor.predict('Jesus Christ is the central figure of Christianity.')


# In[14]:


reloaded_predictor.predict_proba('Jesus Christ is the central figure of Christianity.')


# In[15]:


reloaded_predictor.get_classes()
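# Since `predict_proba()` returns probabilities in the same index order that `get_classes()` uses
# (the index-to-name mapping described in STEP 1), the two can be paired to inspect the full class
# distribution behind a prediction. This is a minimal illustrative sketch; the variable names are
# arbitrary and not part of the ktrain API:
# 
# ```python
# probs = reloaded_predictor.predict_proba('Jesus Christ is the central figure of Christianity.')
# classes = reloaded_predictor.get_classes()
# 
# # pair each class name with its probability and sort from most to least likely
# ranked = sorted(zip(classes, probs), key=lambda pair: pair[1], reverse=True)
# for name, prob in ranked:
#     print('%-25s %.4f' % (name, prob))
# ```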
# ## Additional Tips and Tricks
# 
# If you have a **transformers** model that has already been trained/fine-tuned, you can easily wrap it in a **ktrain** `Predictor`. The example below loads the pre-fine-tuned [coronabert model](https://huggingface.co/jakelever/coronabert) into **ktrain** to make predictions:
# 
# ```python
# # Import ktrain along with a couple of things from transformers
# import ktrain
# from transformers import TFAutoModelForSequenceClassification
# 
# # Load the model and compile it for ktrain/tf.Keras
# model = TFAutoModelForSequenceClassification.from_pretrained("jakelever/coronabert")
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# 
# # Pull the categories out of the model (or set class_names manually)
# class_names = list(model.config.id2label.values())
# 
# # Set up a ktrain preprocessor (which manages the tokenization) with the labels
# preproc = ktrain.text.Transformer('jakelever/coronabert', maxlen=500, class_names=class_names)
# preproc.preprocess_train_called = True  # needed to suppress warnings about not calling preprocess_train
# 
# # Get the predictor (which takes in the model and tokenizer info)
# predictor = ktrain.get_predictor(model, preproc)
# 
# # Make predictions
# text = ["A genomic region associated with protection against severe COVID-19 is inherited from Neandertals."]
# predictor.predict(text)
# 
# # OUTPUT:
# # [[('Clinical Reports', 0.0003284997),
# #   ('Comment/Editorial', 0.0022700194),
# #   ('Communication', 0.00060458254),
# #   ('Contact Tracing', 0.00027690193),
# #   ('Diagnostics', 0.0003987006),
# #   ('Drug Targets', 0.0008852846),
# #   ('Education', 0.00018228142),
# #   ('Effect on Medical Specialties', 0.00045943243),
# #   ('Forecasting & Modelling', 0.00047854715),
# #   ('Health Policy', 0.00042494797),
# #   ('Healthcare Workers', 6.292213e-05),
# #   ('Imaging', 0.00021008229),
# #   ('Immunology', 0.00072542584),
# #   ('Inequality', 0.0007106358),
# #   ('Infection Reports', 0.00033797201),
# #   ('Long Haul', 0.00034338655),
# #   ('Medical Devices', 0.0002488097),
# #   ('Meta-analysis', 0.00030506376),
# #   ('Misinformation', 0.0012771417),
# #   ('Model Systems & Tools', 0.0020338537),
# #   ('Molecular Biology', 0.9950799),
# #   ('News', 0.00034808667),
# #   ('Non-human', 0.98562455),
# #   ('Non-medical', 0.0005655724),
# #   ('Pediatrics', 0.00042545484),
# #   ('Prevalence', 0.0011711525),
# #   ('Prevention', 0.00043099752),
# #   ('Psychology', 0.00045698017),
# #   ('Recommendations', 0.0004172316),
# #   ('Review', 0.002200645),
# #   ('Risk Factors', 0.00014382145),
# #   ('Surveillance', 0.00081551325),
# #   ('Therapeutics', 0.0010580326),
# #   ('Transmission', 0.0031670583),
# #   ('Vaccines', 0.0011023124)]]
# ```
# 
# Finally, to make predictions with a smaller deployment footprint, you can export the model to ONNX format as described in [this example notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/develop/examples/text/ktrain-ONNX-TFLite-examples.ipynb).

# In[ ]:
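# One more note on the coronabert example above: the model is multi-label (notice that both
# 'Molecular Biology' and 'Non-human' receive high scores), so a simple way to turn the returned
# (label, probability) pairs into final labels is to apply a cutoff. A minimal sketch, assuming the
# output format shown above and a hypothetical threshold of 0.5:
# 
# ```python
# preds = predictor.predict(text)   # one list of (label, probability) pairs per input text
# THRESHOLD = 0.5                   # hypothetical cutoff; tune for your application
# labels = [[name for name, prob in doc if prob >= THRESHOLD] for doc in preds]
# print(labels)                     # e.g. [['Molecular Biology', 'Non-human']]
# ```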