# install what we need if not already installed
#!pip install datasets transformers
# minimal example here should work for either 'pytorch' or 'tensorflow'
framework = 'tensorflow'
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict, Value, ClassLabel, Features
from transformers import DataCollatorWithPadding, AutoTokenizer, pipeline
if framework == 'tensorflow':
    from transformers import TFAutoModelForSequenceClassification, create_optimizer
    import tensorflow as tf
else:
    from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# tokenization helper for datasets.map(); `tokenizer` is created further down,
# it only needs to exist by the time this function is actually called
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)
# downsample data for speed in this example
n_train = 1000
n_test = 1000
# ml inputs
batch_size = 16
learning_rate = 2e-5
num_epochs = 2
weight_decay = 0.01
# load imdb data
data = load_dataset("imdb")
# pull data into pandas dataframes to downsample
df_train = pd.DataFrame.from_dict(data['train']).sample(n_train)
df_test = pd.DataFrame.from_dict(data['test']).sample(n_test)
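# note: going through pandas is just one way to downsample; a sketch of an
# alternative (not run here) keeps everything inside datasets and avoids the
# stray index column dealt with below, e.g.:
# data['train'] = data['train'].shuffle(seed=42).select(range(n_train))
# data['test'] = data['test'].shuffle(seed=42).select(range(n_test))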
# now build back up a DatasetDict based on the downsampled data
# define the features
features = Features({
    "text": Value("string"),
    "label": ClassLabel(num_classes=2, names=['neg', 'pos']),
    "__index_level_0__": Value("string")
})
# recreate the data object using the smaller df's
data = DatasetDict({
    'train': Dataset.from_pandas(df_train, features=features),
    'test': Dataset.from_pandas(df_test, features=features),
})
# remove the index column (Dataset.from_pandas carries over the dataframe's non-default index as '__index_level_0__')
data = data.remove_columns(["__index_level_0__"])
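# alternatively (a sketch, not run here): Dataset.from_pandas accepts a
# preserve_index flag, so something like
#   Dataset.from_pandas(df_train, features=features, preserve_index=False)
# should avoid creating '__index_level_0__' in the first place; in that case
# the features dict above would only need the 'text' and 'label' entries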
# look at data
print(data['train'].features)
print(data['test'].features)
print(data)
WARNING:datasets.builder:Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)}
{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)}
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})
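# the ClassLabel feature keeps the int <-> name mapping for the labels;
# a quick optional check, given the features defined above:
print(data['train'].features['label'].int2str(0))  # 'neg'
print(data['train'].features['label'].int2str(1))  # 'pos'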
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# tokenize the data
tokenized_data = data.map(preprocess_function, batched=True)
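# optional sanity check: map() should have added the tokenizer's outputs as
# new columns alongside the original ones
print(tokenized_data['train'].column_names)  # expect ['text', 'label', 'input_ids', 'attention_mask']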
# train model based on framework
if framework == 'pytorch':
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=weight_decay,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    trainer.train()
elif framework == 'tensorflow':
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
    tf_train_set = tokenized_data["train"].to_tf_dataset(
        columns=["attention_mask", "input_ids", "label"],
        shuffle=True,
        batch_size=batch_size,
        collate_fn=data_collator,
    )
    tf_validation_set = tokenized_data["test"].to_tf_dataset(
        columns=["attention_mask", "input_ids", "label"],
        shuffle=False,
        batch_size=batch_size,
        collate_fn=data_collator,
    )
    batches_per_epoch = len(tokenized_data["train"]) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    optimizer, schedule = create_optimizer(init_lr=learning_rate, num_warmup_steps=0, num_train_steps=total_train_steps)
    model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
    model.compile(optimizer=optimizer)
    model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs)
else:
    raise ValueError('unsupported framework')
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_projector', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
Epoch 1/2
63/63 [==============================] - 56s 726ms/step - loss: 0.5812 - val_loss: 0.3526
Epoch 2/2
63/63 [==============================] - 44s 702ms/step - loss: 0.2713 - val_loss: 0.2748
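# optionally persist the fine-tuned weights and tokenizer for later reuse;
# the directory name here is just an example
model.save_pretrained("./distilbert-imdb")
tokenizer.save_pretrained("./distilbert-imdb")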
# create pipeline for inference
classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0
)
classifier("this is a great movie")
[{'label': 'LABEL_1', 'score': 0.9083077311515808}]
classifier("this is a terrible movie")
[{'label': 'LABEL_0', 'score': 0.7608168125152588}]
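# the generic LABEL_0 / LABEL_1 names come from the model config's default
# id2label mapping; to get the imdb names back you could set id2label/label2id
# before (re)building the pipeline -- a small sketch, untested here:
model.config.id2label = {0: "neg", 1: "pos"}
model.config.label2id = {"neg": 0, "pos": 1}
classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer, device=0)
classifier("this is a great movie")  # should now report the 'pos' label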