#!/usr/bin/env python
# coding: utf-8

# # Sentiment Analysis with Hugging Face

# Hugging Face is an open-source platform and provider of machine learning technologies. You can install their package to access interesting pre-built models and either use them directly or fine-tune them (retrain them on your own dataset, leveraging the knowledge gained during the original training), then host your trained models on the platform so that you can use them later on other devices and apps.
# 
# Please [go to the website and sign in](https://huggingface.co/) to access all the features of the platform.
# 
# [Read more about text classification with Hugging Face](https://huggingface.co/tasks/text-classification)
# 
# Hugging Face models are deep-learning based, so they need a lot of GPU compute power to train. Please use [Colab](https://colab.research.google.com/), another GPU cloud provider, or a local machine with an NVIDIA GPU.

# ## Fine-tuning a Hugging Face text classification model

# Find below a simple example, with just 10 epochs of fine-tuning.
# 
# Read more about the fine-tuning concept [here](https://deeplizard.com/learn/video/5T-iXNNiwIs#:~:text=Fine%2Dtuning%20is%20a%20way,perform%20a%20second%20similar%20task.)

# In[1]:

get_ipython().system('pip install datasets')

# In[2]:

get_ipython().system('pip install transformers')

# In[3]:

get_ipython().system('pip install --upgrade accelerate')

# In[4]:

get_ipython().system('pip install sentencepiece')

# ## Importing Libraries

# In[5]:

import huggingface_hub
import os
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import mean_squared_error

# In[6]:

huggingface_hub.notebook_login()

# Now you are logged in to the Hugging Face Hub.

# In[8]:

# Disable W&B (Weights & Biases) logging
os.environ["WANDB_DISABLED"] = "true"

# In[9]:

# Load the dataset from a GitHub link
url = "https://raw.githubusercontent.com/Azubi-Africa/Career_Accelerator_P5-NLP/master/zindi_challenge/data/Train.csv"
df = pd.read_csv(url)

# Eliminate rows containing NaN values
df = df[~df.isna().any(axis=1)]

# ## Splitting the dataset

# In[10]:

# Split the train data => {train, eval}
train, eval = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# In[11]:

train.head()

# In[12]:

train.info()

# In[13]:

eval.head()

# In[14]:

eval.label.unique()

# In[15]:

print(f"new dataframe shapes: train is {train.shape}, eval is {eval.shape}")

# ## Creating a Hugging Face dataset

# In[16]:

from datasets import DatasetDict, Dataset

train_dataset = Dataset.from_pandas(train[['tweet_id', 'safe_text', 'label', 'agreement']])
eval_dataset = Dataset.from_pandas(eval[['tweet_id', 'safe_text', 'label', 'agreement']])

dataset = DatasetDict({'train': train_dataset, 'eval': eval_dataset})
# Drop the pandas index column added by from_pandas
dataset = dataset.remove_columns('__index_level_0__')
dataset

# ## Preprocessing our data

# In[17]:

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

checkpoint = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# config = AutoConfig.from_pretrained(checkpoint)
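# Note: the `preprocess` helper above is defined but not called anywhere else in this notebook. The cell below is a minimal, optional sketch of how it could be mapped over the `safe_text` column before tokenization; the `clean_text` wrapper is just an illustrative name, not part of the original pipeline.

# In[ ]:

def clean_text(example):
    # Replace user handles and URLs with the '@user' / 'http' placeholders
    return {'safe_text': preprocess(example['safe_text'])}

# Quick check on a single tweet; to clean the whole dataset one could run
# `dataset = dataset.map(clean_text)` before `tokenize_data` is mapped below.
print(preprocess("@WHO says vaccines are safe https://t.co/xyz"))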
# In[18]:

# Map the original labels {-1, 0, 1} to the class ids {0, 1, 2} expected by the model
def transform_labels(label):
    label = label['label']
    num = 0
    if label == -1:   # 'Negative'
        num = 0
    elif label == 0:  # 'Neutral'
        num = 1
    elif label == 1:  # 'Positive'
        num = 2
    return {'labels': num}

def tokenize_data(example):
    return tokenizer(example['safe_text'], padding='max_length')

# Change the tweets to tokens that the model can exploit
dataset = dataset.map(tokenize_data, batched=True)

# Transform the labels and remove the columns that are no longer needed
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# In[19]:

dataset

# ## Training

# In[20]:

# Configure the training parameters like `num_train_epochs`:
# the number of times the model will repeat the training loop over the dataset
training_args = TrainingArguments(
    "test_trainer",
    num_train_epochs=10,
    load_best_model_at_end=True,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    logging_steps=100,
    per_device_train_batch_size=8,
)

# In[21]:

# Load a pretrained model, specifying the number of labels in our dataset for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

# In[22]:

train_dataset = dataset['train'].shuffle(seed=10)
eval_dataset = dataset['eval'].shuffle(seed=10)

# In[23]:

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"rmse": mean_squared_error(labels, predictions, squared=False)}

# In[24]:

trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# In[25]:

trainer.train()

# Don't worry about the error above: it is a `KeyboardInterrupt`, which means I stopped the training manually so that it would not take too long to finish.

# In[26]:

# Launch the final evaluation
trainer.evaluate()

# ## Pushing to Hugging Face

# Some checkpoints of the model are automatically saved locally in `test_trainer/` during training.
# You may also upload the model to the Hugging Face platform... [Read more](https://huggingface.co/docs/hub/models-uploading)

# In[27]:

# Push the model and tokenizer to the Hugging Face Hub
model.push_to_hub("ikoghoemmanuell/finetuned_sentiment_modell")
tokenizer.push_to_hub("ikoghoemmanuell/finetuned_sentiment_modell")
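# Once the push succeeds, the fine-tuned model can be reloaded from the Hub for inference on any machine. The cell below is a minimal sketch using the `transformers` pipeline API with the same repository name as above. Depending on the saved config, the returned label may be the base checkpoint's names ('negative' / 'neutral' / 'positive') or the generic 'LABEL_0'..'LABEL_2'; in both cases the indices 0/1/2 follow the encoding used in `transform_labels`.

# In[ ]:

from transformers import pipeline

# Reload the fine-tuned model and tokenizer from the Hub (assumes the push above succeeded)
sentiment = pipeline(
    "text-classification",
    model="ikoghoemmanuell/finetuned_sentiment_modell",
    tokenizer="ikoghoemmanuell/finetuned_sentiment_modell",
)

# Classify a new tweet; index 0 = Negative, 1 = Neutral, 2 = Positive
print(sentiment("Vaccines are saving lives every day."))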