# Install dependencies
!pip install transformers -qq
!pip install datasets -qq

import numpy as np
import pandas as pd
import torch
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the pre-trained Polish BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('dkleczek/bert-base-polish-uncased-v1')
tokenizer = BertTokenizerFast.from_pretrained('dkleczek/bert-base-polish-uncased-v1')

# Download the KLEJ CBD (cyberbullying detection) data
!wget -q https://klejbenchmark.com/static/data/klej_cbd.zip
!unzip -q klej_cbd.zip

# Load, clean and shuffle the training data
df = pd.read_csv('train.tsv', delimiter='\t')
df = df.dropna().reset_index(drop=True)
df.columns = ['text', 'label']
df.label = df.label.astype(int)
df = df.sample(frac=1, random_state=42)
df.to_csv('train.csv', index=False)

# Check dataset size and class balance
len(df), len(df[df.label == 1])
df.head()

# 80/20 train/validation split
train_dataset, test_dataset = load_dataset(
    'csv', data_files='train.csv', split=['train[:80%]', 'train[80%:]']
)
# train_dataset[0]

def tokenize(batch):
    # Pad to the longest sequence in the batch and truncate to BERT's 512-token limit
    return tokenizer(batch['text'], padding=True, truncation=True)

# Tokenize each split in a single batch so every example gets the same padded length
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))
# train_dataset[0]

train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
# train_dataset[0]

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    fp16=True,
    warmup_steps=30,
    logging_steps=20,
    weight_decay=0.01,
    evaluation_strategy='steps',  # replaces the removed evaluate_during_training=True flag (very recent releases call it eval_strategy)
    eval_steps=20,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()
trainer.evaluate()

# Inspect the training curves in TensorBoard
%load_ext tensorboard
%tensorboard --logdir logs

# Tokenize the held-out KLEJ test set
test_df = pd.read_csv('test_features.tsv', delimiter='\t')
test_df.columns = ['text']
final_test_dataset = Dataset.from_pandas(test_df)
final_test_dataset = final_test_dataset.map(tokenize, batched=True, batch_size=len(final_test_dataset))
final_test_dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

# Gold labels for the test set come from the original cyberbullying-Polish repository
!wget https://raw.githubusercontent.com/ptaszynski/cyberbullying-Polish/master/task%2001/test_set_clean_only_tags.txt
df_lbls = pd.read_csv('test_set_clean_only_tags.txt', names=['label'])
labels = df_lbls.label.values

# Predict on the test set and score against the gold labels
preds = trainer.predict(final_test_dataset)
outputs = preds.predictions.argmax(axis=1)
precision, recall, f1, _ = precision_recall_fscore_support(labels, outputs, average='binary')
acc = accuracy_score(labels, outputs)
print({
    'accuracy': acc,
    'f1': f1,
    'precision': precision,
    'recall': recall
})
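
# --- Optional: single-sentence inference with the fine-tuned model ---
# A minimal sketch, not part of the original pipeline: it shows how the trained
# model could be saved and queried on one example. The save path
# './klej-cbd-bert', the helper name predict_text and the example sentence are
# illustrative assumptions, not values from the source.
trainer.save_model('./klej-cbd-bert')
tokenizer.save_pretrained('./klej-cbd-bert')

def predict_text(text):
    # Tokenize one example and move the tensors to the model's device
    inputs = tokenizer(text, truncation=True, max_length=512, return_tensors='pt').to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=-1).squeeze()
    # Label 1 corresponds to the harmful/cyberbullying class in this task
    return {'label': int(probs.argmax()), 'p_harmful': float(probs[1])}

predict_text('To jest przykładowy tweet.')  # placeholder Polish sentence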