#!/usr/bin/env python
# coding: utf-8

# # DistilBert Multilabel

# A DistilBERT model fine-tuned for multi-label classification on the Jigsaw
# toxic comment dataset:
#
# - https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge

# In[1]:


# !conda install watermark -c conda-forge --yes


# In[2]:


get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-p torch,transformers,pandas,tqdm')


# In[3]:


import pandas as pd
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel


# # Config

# In[4]:


MAX_LEN = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 1e-05
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(DEVICE)


# # Load and Prepare Dataset

# In[5]:


train_data = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')

label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Collapse the six per-label columns into a single list-valued 'labels' column
train_data['labels'] = train_data[label_columns].apply(lambda x: list(x), axis=1)
train_data.drop(['id'], inplace=True, axis=1)
train_data.drop(label_columns, inplace=True, axis=1)

train_data.head()


# In[6]:


class MultiLabelDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, new_data=False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.comment_text
        self.new_data = new_data
        if not new_data:
            self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Normalize whitespace before tokenizing
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',   # replaces the deprecated pad_to_max_length=True
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        out = {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }

        # Test-time data has no labels
        if not self.new_data:
            out['targets'] = torch.tensor(self.targets[index], dtype=torch.float)

        return out


# In[7]:


# train_size = 1.0 trains on the full dataset for the final submission,
# which leaves the validation split empty; lower it (e.g., 0.8) to hold
# out data for validation.
train_size = 1.0
train_df = train_data.sample(frac=train_size, random_state=123)
val_df = train_data.drop(train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

print("Orig Dataset: {}".format(train_data.shape))
print("Training Dataset: {}".format(train_df.shape))
print("Validation Dataset: {}".format(val_df.shape))

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

training_set = MultiLabelDataset(train_df, tokenizer, MAX_LEN)
val_set = MultiLabelDataset(val_df, tokenizer, MAX_LEN)


# In[8]:


train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 8
                }

val_params = {'batch_size': VALID_BATCH_SIZE,
              'shuffle': False,
              'num_workers': 8
              }

training_loader = DataLoader(training_set, **train_params)
# val_loader = DataLoader(val_set, **val_params)  # uncomment when train_size < 1.0


# # Model

# In[9]:


class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super().__init__()

        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(768, 768),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(768, 6)
        )

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        # token_type_ids is accepted for interface consistency but unused:
        # DistilBERT has no token type embeddings.
        output_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        out = hidden_state[:, 0]  # [CLS] token representation
        out = self.classifier(out)
        return out


model = DistilBERTClass()
model.to(DEVICE);


# In[10]:


optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)


# # Training

# In[11]:


def train(epoch):
    model.train()
    for batch_idx, data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(DEVICE, dtype=torch.long)
        mask = data['mask'].to(DEVICE, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)
        targets = data['targets'].to(DEVICE, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        # Multi-label setup: an independent sigmoid/binary cross-entropy
        # term per label
        loss = torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

        if batch_idx % 5000 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')

        loss.backward()
        optimizer.step()


# In[12]:


for epoch in range(EPOCHS):
    train(epoch)
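

# # Validation (optional)

# A minimal evaluation sketch, not part of the original run: it assumes
# `train_size` above is set below 1.0 so that `val_df` is non-empty, and
# that `val_loader` is uncommented. It computes the competition metric,
# mean column-wise ROC AUC, on the held-out split.

# In[ ]:


from sklearn.metrics import roc_auc_score


def evaluate(loader):
    model.eval()
    all_targets, all_probas = [], []
    with torch.inference_mode():
        for data in loader:
            ids = data['ids'].to(DEVICE, dtype=torch.long)
            mask = data['mask'].to(DEVICE, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            all_probas.append(torch.sigmoid(outputs).cpu())
            all_targets.append(data['targets'])
    targets = torch.cat(all_targets).numpy()
    probas = torch.cat(all_probas).numpy()
    # Macro average over the six labels = mean column-wise ROC AUC
    return roc_auc_score(targets, probas, average='macro')


# evaluate(val_loader)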


# # Generate Test Submissions

# In[14]:


test_data = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
test_data.head()


# In[15]:


test_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN, new_data=True)
test_loader = DataLoader(test_set, **val_params)


# In[16]:


all_test_pred = []

def test():
    model.eval()

    with torch.inference_mode():
        for data in tqdm(test_loader):
            ids = data['ids'].to(DEVICE, dtype=torch.long)
            mask = data['mask'].to(DEVICE, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            probas = torch.sigmoid(outputs)
            all_test_pred.append(probas)

test()


# In[17]:


all_test_pred = torch.cat(all_test_pred)


# In[18]:


submit_df = test_data.copy()
submit_df.drop("comment_text", inplace=True, axis=1)


# In[19]:


label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]


# In[20]:


for i, name in enumerate(label_columns):
    submit_df[name] = all_test_pred[:, i].cpu()

submit_df.head()


# In[21]:


submit_df.to_csv('submission.csv', index=False)


# # Scores

# - Public leaderboard: 0.98515
# - Private leaderboard: 0.98511
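

# # Scoring Individual Comments

# A quick inference sketch; `predict_proba` is an illustrative helper, not
# part of the original notebook. It scores a single raw comment string with
# the fine-tuned model and returns per-label probabilities.

# In[ ]:


def predict_proba(text):
    # Illustrative helper (assumption): tokenize one comment the same way
    # as MultiLabelDataset, run the model, and map sigmoid outputs to labels.
    model.eval()
    inputs = tokenizer(text, max_length=MAX_LEN, padding='max_length',
                       truncation=True, return_tensors='pt')
    with torch.inference_mode():
        logits = model(inputs['input_ids'].to(DEVICE),
                       inputs['attention_mask'].to(DEVICE))
    return dict(zip(label_columns, torch.sigmoid(logits)[0].cpu().tolist()))


# Example: predict_proba("have a nice day") should return low
# probabilities for all six labels.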