# install what we need if not already installed
#!pip install datasets transformers
# minimal example here should work for either 'pytorch' or 'tensorflow'
framework = 'tensorflow'
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict, Value, ClassLabel, Features
from transformers import DataCollatorWithPadding, AutoTokenizer, pipeline
if framework == 'tensorflow':
    from transformers import TFAutoModelForSequenceClassification, create_optimizer
    import tensorflow as tf
else:
    from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# tokenization helper for datasets.map(); `tokenizer` is created further down,
# it only needs to exist by the time this function is actually called
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)
# downsample data for speed in this example
n_train = 1000
n_test = 1000
# ml inputs
batch_size = 16
learning_rate = 2e-5
num_epochs = 2
weight_decay = 0.01
# load imdb data
data = load_dataset("imdb")
# pull data into pandas dataframes to downsample
df_train = pd.DataFrame.from_dict(data['train']).sample(n_train)
df_test = pd.DataFrame.from_dict(data['test']).sample(n_test)
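# note: going through pandas is just one way to downsample; a sketch of an
# alternative (not run here) keeps everything inside datasets and avoids the
# stray index column dealt with below, e.g.:
# data['train'] = data['train'].shuffle(seed=42).select(range(n_train))
# data['test'] = data['test'].shuffle(seed=42).select(range(n_test))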
# now build back up a DatasetDict based on the downsampled data
# define the features
features = Features({
    "text": Value("string"),
    "label": ClassLabel(num_classes=2, names=['neg', 'pos']),
    "__index_level_0__": Value("string")
})
# recreate the data object using the smaller df's
data = DatasetDict({
    'train': Dataset.from_pandas(df_train, features=features),
    'test': Dataset.from_pandas(df_test, features=features),
})
# remove the index column (Dataset.from_pandas carries over the dataframe's non-default index as '__index_level_0__')
data = data.remove_columns(["__index_level_0__"])
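# alternatively (a sketch, not run here): Dataset.from_pandas accepts a
# preserve_index flag, so something like
#   Dataset.from_pandas(df_train, features=features, preserve_index=False)
# should avoid creating '__index_level_0__' in the first place; in that case
# the features dict above would only need the 'text' and 'label' entries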
# look at data
print(data['train'].features)
print(data['test'].features)
print(data)
WARNING:datasets.builder:Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)}
{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)}
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})
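# the ClassLabel feature keeps the int <-> name mapping for the labels;
# a quick optional check, given the features defined above:
print(data['train'].features['label'].int2str(0))  # 'neg'
print(data['train'].features['label'].int2str(1))  # 'pos'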
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# tokenize the data
tokenized_data = data.map(preprocess_function, batched=True)
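# optional sanity check: map() should have added the tokenizer's outputs as
# new columns alongside the original ones
print(tokenized_data['train'].column_names)  # expect ['text', 'label', 'input_ids', 'attention_mask']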
# train model based on framework
if framework == 'pytorch':
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=weight_decay,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    trainer.train()
elif framework == 'tensorflow':
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
    tf_train_set = tokenized_data["train"].to_tf_dataset(
        columns=["attention_mask", "input_ids", "label"],
        shuffle=True,
        batch_size=batch_size,
        collate_fn=data_collator,
    )
    tf_validation_set = tokenized_data["test"].to_tf_dataset(
        columns=["attention_mask", "input_ids", "label"],
        shuffle=False,
        batch_size=batch_size,
        collate_fn=data_collator,
    )
    batches_per_epoch = len(tokenized_data["train"]) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    optimizer, schedule = create_optimizer(init_lr=learning_rate, num_warmup_steps=0, num_train_steps=total_train_steps)
    model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
    model.compile(optimizer=optimizer)
    model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs)
else:
    raise ValueError('unsupported framework')
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_projector', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
Epoch 1/2
63/63 [==============================] - 56s 726ms/step - loss: 0.5812 - val_loss: 0.3526
Epoch 2/2
63/63 [==============================] - 44s 702ms/step - loss: 0.2713 - val_loss: 0.2748
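# optionally persist the fine-tuned weights and tokenizer for later reuse;
# the directory name here is just an example
model.save_pretrained("./distilbert-imdb")
tokenizer.save_pretrained("./distilbert-imdb")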
# create pipeline for inference
classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0
)
classifier("this is a great movie")
[{'label': 'LABEL_1', 'score': 0.9083077311515808}]
classifier("this is a terrible movie")
[{'label': 'LABEL_0', 'score': 0.7608168125152588}]
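# the generic LABEL_0 / LABEL_1 names come from the model config's default
# id2label mapping; to get the imdb names back you could set id2label/label2id
# before (re)building the pipeline -- a small sketch, untested here:
model.config.id2label = {0: "neg", 1: "pos"}
model.config.label2id = {"neg": 0, "pos": 1}
classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer, device=0)
classifier("this is a great movie")  # should now report the 'pos' label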