# !pip install datasets wandb
# utils
import os
import torch
import tqdm
# data
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from transformers import AutoConfig, AutoModel, AutoTokenizer
# model
import torch.nn as nn
# training and evaluation
import wandb
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import accuracy_score, f1_score, classification_report
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
cuda
# custom dataset class
class ReviewDataset(Dataset):
    def __init__(self, tokenizer, data, text_field='text', label_field='label', max_len=512):
        self.tokenizer = tokenizer
        self.data = data
        self.text_field = text_field
        self.label_field = label_field
        self.max_len = max_len

    def __len__(self):
        return len(self.data[self.text_field])

    def __getitem__(self, idx):
        text = self.data[self.text_field][idx]
        target = self.data[self.label_field][idx]
        # encode the text into input ids and an attention mask, padded/truncated to max_len
        encoding = self.tokenizer.encode_plus(
            text=text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
            padding='max_length'
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }
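# quick smoke test for ReviewDataset (a minimal sketch: the sample reviews below are
# made up, and it assumes the 'xlnet-base-cased' tokenizer that is used later on)
tok = AutoTokenizer.from_pretrained('xlnet-base-cased')
sample = {'review_text': ['great book', 'terrible book'], 'sentiment': [1, 0]}
ds = ReviewDataset(tokenizer=tok, data=sample, text_field='review_text',
                   label_field='sentiment', max_len=32)
item = ds[0]
print(item['input_ids'].shape, item['attention_mask'].shape, item['targets'])
# expected: torch.Size([32]) torch.Size([32]) tensor(1)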
class Classifier(nn.Module):
    def __init__(self, model_name, num_classes=2):
        super(Classifier, self).__init__()
        # load the pretrained config and base transformer
        self.config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name)
        self.base = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name)
        # classification head on top of the pooled representation
        self.head = nn.Sequential(
            nn.Linear(in_features=self.config.hidden_size, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=num_classes)
        )

    def forward(self, input_ids, attention_mask=None):
        # the base model's first output is the last-layer hidden states;
        # the hidden state of the first token serves as the pooled sequence representation
        top_layer = self.base(input_ids, attention_mask)[0]
        pooled = top_layer[:, 0]
        logits = self.head(pooled)
        return logits, pooled, top_layer
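# shape sanity check for the Classifier (a sketch; the weights download on first call,
# and the fake batch below is random token ids, not real text)
clf = Classifier(model_name='xlnet-base-cased', num_classes=2)
ids = torch.randint(0, clf.config.vocab_size, (2, 16))  # 2 sequences of 16 tokens
mask = torch.ones_like(ids)
with torch.no_grad():
    logits, pooled, top_layer = clf(ids, mask)
print(logits.shape, pooled.shape, top_layer.shape)
# expected: torch.Size([2, 2]) torch.Size([2, 768]) torch.Size([2, 16, 768])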
class Finetuner(pl.LightningModule):
    def __init__(self, config):
        super(Finetuner, self).__init__()
        # initialize the pretrained model and its tokenizer
        self.config = config
        self.model = Classifier(model_name=self.config['model_name'], num_classes=self.config['num_classes'])
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=self.config['model_name'])

    def forward(self, input_ids, attention_mask=None):
        logits, _, _ = self.model(input_ids, attention_mask)
        return logits

    def configure_optimizers(self):
        return torch.optim.Adam(params=self.parameters(), lr=self.config['lr'])

    def train_dataloader(self):
        # the first 10% of the data is reserved for validation; train on the rest
        data = load_dataset("csv", data_files=self.config['root_dir'] + self.config['source'], split='train[10%:]')
        dataset = ReviewDataset(tokenizer=self.tokenizer, data=data, text_field=self.config['text_field'], label_field=self.config['label_field'], max_len=self.config['max_len'])
        loader = DataLoader(dataset=dataset, batch_size=self.config['batch_size'], shuffle=True)
        return loader

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, targets = batch['input_ids'], batch['attention_mask'], batch['targets']
        logits = self(input_ids, attention_mask)
        loss = F.cross_entropy(logits, targets)
        acc = accuracy_score(targets.cpu(), logits.argmax(dim=1).cpu())
        f1 = f1_score(targets.cpu(), logits.argmax(dim=1).cpu())
        wandb.log({"Loss": loss, "Accuracy": torch.tensor([acc]), "F1": torch.tensor([f1])})
        return {"loss": loss, "accuracy": torch.tensor([acc]), "f1": torch.tensor([f1])}

    def val_dataloader(self):
        # the first 10% of the data is the validation split
        data = load_dataset("csv", data_files=self.config['root_dir'] + self.config['source'], split='train[:10%]')
        dataset = ReviewDataset(tokenizer=self.tokenizer, data=data, text_field=self.config['text_field'], label_field=self.config['label_field'], max_len=self.config['max_len'])
        loader = DataLoader(dataset=dataset, batch_size=self.config['batch_size'], shuffle=False)
        return loader

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, targets = batch['input_ids'], batch['attention_mask'], batch['targets']
        logits = self(input_ids, attention_mask)
        loss = F.cross_entropy(logits, targets)
        acc = accuracy_score(targets.cpu(), logits.argmax(dim=1).cpu())
        f1 = f1_score(targets.cpu(), logits.argmax(dim=1).cpu())
        wandb.log({"Val_loss": loss, "Val_accuracy": torch.tensor([acc]), "Val_f1": torch.tensor([f1])})
        return {"val_loss": loss, "val_accuracy": torch.tensor([acc]), "val_f1": torch.tensor([f1])}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_acc = torch.stack([x['val_accuracy'] for x in outputs]).mean()
        avg_f_score = torch.stack([x['val_f1'] for x in outputs]).mean()
        wandb.log({"Val_loss": avg_loss, "Val_accuracy": avg_acc, "Val_f1": avg_f_score})
        return {'val_loss': avg_loss, 'val_accuracy': avg_acc, "val_f1": avg_f_score}

    def test_dataloader(self):
        # the test data is the same as the validation split
        data = load_dataset("csv", data_files=self.config['root_dir'] + self.config['source'], split='train[:10%]')
        dataset = ReviewDataset(tokenizer=self.tokenizer, data=data, text_field=self.config['text_field'], label_field=self.config['label_field'], max_len=self.config['max_len'])
        loader = DataLoader(dataset=dataset, batch_size=self.config['batch_size'], shuffle=False)
        return loader

    def test_step(self, batch, batch_idx):
        input_ids, attention_mask, targets = batch['input_ids'], batch['attention_mask'], batch['targets']
        logits = self(input_ids, attention_mask)
        loss = F.cross_entropy(logits, targets)
        acc = accuracy_score(targets.cpu(), logits.argmax(dim=1).cpu())
        f1 = f1_score(targets.cpu(), logits.argmax(dim=1).cpu())
        return {"test_loss": loss, "test_accuracy": torch.tensor([acc]), "test_f1": torch.tensor([f1])}

    def test_epoch_end(self, outputs):
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        avg_acc = torch.stack([x['test_accuracy'] for x in outputs]).mean()
        avg_f1 = torch.stack([x['test_f1'] for x in outputs]).mean()
        return {"test_loss": avg_loss, "test_accuracy": avg_acc, "test_f1": avg_f1}
config = {
    # data
    "root_dir": "../input/amazonproductsreview/amazon-review/",
    "source": 'books.csv',
    "targets": ["dvd.csv", "electronics.csv", "kitchen_housewares.csv"],
    "max_len": 512,
    "batch_size": 8,
    "num_classes": 2,
    "text_field": "review_text",
    "label_field": "sentiment",
    # model
    "model_name": 'xlnet-base-cased',
    # training
    "lr": 1e-5,
    "epochs": 20,
    # logger and checkpoints
    "project": "pretrained-model-robustness",
    "run_name": "xlnet",
    "monitor": "val_accuracy",
    "min_delta": 0.001,
    "filepath": "../working/{epoch}-{val_accuracy:4f}",
    "save_dir": "../working/",
}
logger = WandbLogger(
    name=config["run_name"],
    save_dir=config["save_dir"],
    project=config["project"],
    log_model=True,
)
early_stopping = EarlyStopping(
    monitor=config["monitor"],
    min_delta=config["min_delta"],
    patience=5,
)
checkpoints = ModelCheckpoint(
    filepath=config["filepath"],
    monitor=config["monitor"],
    save_top_k=1
)
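# note: `filepath` is the argument taken by the older pytorch_lightning release this
# notebook runs on; in releases >= 1.0 the equivalent (shown only as a sketch here)
# splits it into `dirpath` and `filename`:
# checkpoints = ModelCheckpoint(
#     dirpath="../working/",
#     filename="{epoch}-{val_accuracy:4f}",
#     monitor=config["monitor"],
#     save_top_k=1,
# )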
trainer = pl.Trainer(
    logger=logger,
    gpus=[0],
    checkpoint_callback=checkpoints,
    default_root_dir="../working/",
    max_epochs=config["epochs"],
    callbacks=[early_stopping]
)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
model = Finetuner(config)
trainer.fit(model)
  | Name  | Type       | Params
-------------------------------------
0 | model | Classifier | 116 M
1
trainer.test(model)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.8800, dtype=torch.float64),
 'test_f1': tensor(0.8675, dtype=torch.float64),
 'test_loss': tensor(0.4570, device='cuda:0')}
--------------------------------------------------------------------------------
[{'test_loss': 0.45704737305641174, 'test_accuracy': 0.88, 'test_f1': 0.8674862914862916}]
l = torch.load(f="../working/epoch=6-val_accuracy=0.915000.ckpt")
model.load_state_dict(l['state_dict'])
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-17-5e6a8c1e3fe5> in <module>
----> 1 l = torch.load(f="../working/epoch=10-val_accuracy=0.895000.ckpt")
      2 model.load_state_dict(l['state_dict'])
FileNotFoundError: [Errno 2] No such file or directory: '../working/epoch=10-val_accuracy=0.895000.ckpt'
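# the FileNotFoundError above comes from hard-coding a checkpoint name that changes
# every run (the epoch number and metric value are baked into the filename); a more
# robust sketch (hypothetical, assumes checkpoints live in ../working/) globs for it:
import glob
ckpts = sorted(glob.glob("../working/*.ckpt"))
if ckpts:
    state = torch.load(f=ckpts[-1], map_location=device)  # lexicographically last checkpoint
    model.load_state_dict(state['state_dict'])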
trainer.test(model)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.8950, dtype=torch.float64),
 'test_f1': tensor(0.8734, dtype=torch.float64),
 'test_loss': tensor(0.4835, device='cuda:0')}
--------------------------------------------------------------------------------
[{'test_loss': 0.4835154712200165, 'test_accuracy': 0.895, 'test_f1': 0.8734305694305696}]
def load_data(file, tokenizer):
    data = load_dataset("csv", data_files=config['root_dir'] + file)
    dataset = ReviewDataset(tokenizer=tokenizer, data=data['train'], text_field=config['text_field'], label_field=config['label_field'])
    data_loader = DataLoader(dataset=dataset, batch_size=config['batch_size'], shuffle=False)
    return data_loader

def test_fn(model, loader):
    y_true = []
    y_pred = []
    model.eval()
    with torch.no_grad():  # no gradients needed for evaluation
        for batch in tqdm.tqdm(loader):
            input_ids, attention_mask, targets = batch['input_ids'], batch['attention_mask'], batch['targets']
            logits = model(input_ids.to(device), attention_mask.to(device))
            y_true += targets.tolist()
            y_pred += logits.argmax(dim=1).cpu().tolist()
    return classification_report(y_true, y_pred)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=config['model_name'])
for target in config['targets']:
    loader = load_data(file=target, tokenizer=tokenizer)
    report = test_fn(model.to(device), loader)
    print(f'Target Domain Name: {target}')
    print(report)
    del loader
Target Domain Name: dvd.csv
              precision    recall  f1-score   support

           0       0.87      0.88      0.87       991
           1       0.88      0.86      0.87       987

    accuracy                           0.87      1978
   macro avg       0.87      0.87      0.87      1978
weighted avg       0.87      0.87      0.87      1978
Target Domain Name: electronics.csv
              precision    recall  f1-score   support

           0       0.79      0.93      0.86       996
           1       0.92      0.76      0.83       999

    accuracy                           0.85      1995
   macro avg       0.86      0.85      0.84      1995
weighted avg       0.86      0.85      0.84      1995
Target Domain Name: kitchen_housewares.csv
              precision    recall  f1-score   support

           0       0.88      0.89      0.88      1000
           1       0.89      0.88      0.88       998

    accuracy                           0.88      1998
   macro avg       0.88      0.88      0.88      1998
weighted avg       0.88      0.88      0.88      1998
# test data is same as validation data
data = load_dataset("csv", data_files=config['root_dir']+config['source'], split='train[:10%]')
dataset = ReviewDataset(tokenizer=tokenizer, data=data, text_field=config['text_field'], label_field=config['label_field'], max_len=config['max_len'])
loader = DataLoader(dataset=dataset, batch_size=config['batch_size'], shuffle=False)
report = test_fn(model.to(device), loader)
print(f'Source Domain Name: {config["source"]}')
print(report)
Source Domain Name: books.csv
              precision    recall  f1-score   support

           0       0.89      0.90      0.89        97
           1       0.90      0.89      0.90       102

    accuracy                           0.89       199
   macro avg       0.89      0.89      0.89       199
weighted avg       0.89      0.89      0.89       199
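# hypothetical follow-up sketch: collect a single accuracy number per target domain
# (reusing load_data, the global tokenizer/model/device, and the accuracy_score import
# from the top of the notebook) for an at-a-glance source-vs-target comparison
def domain_accuracy(model, loader):
    y_true, y_pred = [], []
    model.eval()
    with torch.no_grad():
        for batch in loader:
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
            y_true += batch['targets'].tolist()
            y_pred += logits.argmax(dim=1).cpu().tolist()
    return accuracy_score(y_true, y_pred)

scores = {t: domain_accuracy(model.to(device), load_data(file=t, tokenizer=tokenizer))
          for t in config['targets']}
print(scores)  # roughly {'dvd.csv': 0.87, 'electronics.csv': 0.85, 'kitchen_housewares.csv': 0.88}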