#!/usr/bin/env python
# coding: utf-8

# # Sentiment Analysis with Hugging Face

# Hugging Face is an open-source platform and provider of machine learning technologies. You can install their package to access interesting pre-built models and either use them directly or fine-tune them (retrain them on your own dataset, leveraging the knowledge gained during the original training), then host your trained models on the platform so that you can use them later on other devices and apps.
# 
# Please [go to the website and sign in](https://huggingface.co/) to access all the features of the platform.
# 
# [Read more about text classification with Hugging Face](https://huggingface.co/tasks/text-classification)
# 
# Hugging Face models are deep-learning based, so they need a lot of GPU compute power to train. Please use [Colab](https://colab.research.google.com/), another GPU cloud provider, or a local machine with an NVIDIA GPU.

# ## Fine-tuning a Hugging Face text classification model

# Find below a simple example, with just 10 epochs of fine-tuning.
# 
# Read more about the fine-tuning concept [here](https://deeplizard.com/learn/video/5T-iXNNiwIs#:~:text=Fine%2Dtuning%20is%20a%20way,perform%20a%20second%20similar%20task.)

# In[1]:

get_ipython().system('pip install datasets')

# In[2]:

get_ipython().system('pip install transformers')

# In[3]:

get_ipython().system('pip install --upgrade accelerate')

# In[4]:

get_ipython().system('pip install sentencepiece')

# ## Importing Libraries

# In[5]:

import huggingface_hub
import os
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import mean_squared_error

# In[6]:

huggingface_hub.notebook_login()

# Now you are logged in to the Hugging Face Hub.

# In[8]:

# Disable W&B (Weights & Biases) logging
os.environ["WANDB_DISABLED"] = "true"

# In[9]:

# Load the dataset from a GitHub link
url = "https://raw.githubusercontent.com/Azubi-Africa/Career_Accelerator_P5-NLP/master/zindi_challenge/data/Train.csv"
df = pd.read_csv(url)

# Eliminate rows containing NaN values
df = df[~df.isna().any(axis=1)]

# ## Splitting the dataset

# In[10]:

# Split the train data => {train, eval}
train, eval = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# In[11]:

train.head()

# In[12]:

train.info()

# In[13]:

eval.head()

# In[14]:

eval.label.unique()

# In[15]:

print(f"new dataframe shapes: train is {train.shape}, eval is {eval.shape}")

# ## Creating a Hugging Face dataset

# In[16]:

from datasets import DatasetDict, Dataset

train_dataset = Dataset.from_pandas(train[['tweet_id', 'safe_text', 'label', 'agreement']])
eval_dataset = Dataset.from_pandas(eval[['tweet_id', 'safe_text', 'label', 'agreement']])

dataset = DatasetDict({'train': train_dataset, 'eval': eval_dataset})
# Drop the pandas index column added by from_pandas
dataset = dataset.remove_columns('__index_level_0__')
dataset

# ## Preprocessing our data

# In[17]:

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

checkpoint = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# config = AutoConfig.from_pretrained(checkpoint)
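# Note: the `preprocess` helper above is defined but not called anywhere else in this notebook. The cell below is a minimal, optional sketch of how it could be mapped over the `safe_text` column before tokenization; the `clean_text` wrapper is just an illustrative name, not part of the original pipeline.

# In[ ]:

def clean_text(example):
    # Replace user handles and URLs with the '@user' / 'http' placeholders
    return {'safe_text': preprocess(example['safe_text'])}

# Quick check on a single tweet; to clean the whole dataset one could run
# `dataset = dataset.map(clean_text)` before `tokenize_data` is mapped below.
print(preprocess("@WHO says vaccines are safe https://t.co/xyz"))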
# In[18]:

# Map the original labels {-1, 0, 1} to the class ids {0, 1, 2} expected by the model
def transform_labels(label):
    label = label['label']
    num = 0
    if label == -1:   # 'Negative'
        num = 0
    elif label == 0:  # 'Neutral'
        num = 1
    elif label == 1:  # 'Positive'
        num = 2
    return {'labels': num}

def tokenize_data(example):
    return tokenizer(example['safe_text'], padding='max_length')

# Change the tweets to tokens that the model can exploit
dataset = dataset.map(tokenize_data, batched=True)

# Transform the labels and remove the columns that are no longer needed
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# In[19]:

dataset

# ## Training

# In[20]:

# Configure the training parameters like `num_train_epochs`:
# the number of times the model will repeat the training loop over the dataset
training_args = TrainingArguments(
    "test_trainer",
    num_train_epochs=10,
    load_best_model_at_end=True,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    logging_steps=100,
    per_device_train_batch_size=8,
)

# In[21]:

# Load a pretrained model, specifying the number of labels in our dataset for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

# In[22]:

train_dataset = dataset['train'].shuffle(seed=10)
eval_dataset = dataset['eval'].shuffle(seed=10)

# In[23]:

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"rmse": mean_squared_error(labels, predictions, squared=False)}

# In[24]:

trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# In[25]:

trainer.train()

# Don't worry about the error above: it is a `KeyboardInterrupt`, which means I stopped the training manually so that it would not take too long to finish.

# In[26]:

# Launch the final evaluation
trainer.evaluate()

# ## Pushing to Hugging Face

# Some checkpoints of the model are automatically saved locally in `test_trainer/` during training.
# You may also upload the model to the Hugging Face platform... [Read more](https://huggingface.co/docs/hub/models-uploading)

# In[27]:

# Push the model and tokenizer to the Hugging Face Hub
model.push_to_hub("ikoghoemmanuell/finetuned_sentiment_modell")
tokenizer.push_to_hub("ikoghoemmanuell/finetuned_sentiment_modell")
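# Once the push succeeds, the fine-tuned model can be reloaded from the Hub for inference on any machine. The cell below is a minimal sketch using the `transformers` pipeline API with the same repository name as above. Depending on the saved config, the returned label may be the base checkpoint's names ('negative' / 'neutral' / 'positive') or the generic 'LABEL_0'..'LABEL_2'; in both cases the indices 0/1/2 follow the encoding used in `transform_labels`.

# In[ ]:

from transformers import pipeline

# Reload the fine-tuned model and tokenizer from the Hub (assumes the push above succeeded)
sentiment = pipeline(
    "text-classification",
    model="ikoghoemmanuell/finetuned_sentiment_modell",
    tokenizer="ikoghoemmanuell/finetuned_sentiment_modell",
)

# Classify a new tweet; index 0 = Negative, 1 = Neutral, 2 = Positive
print(sentiment("Vaccines are saving lives every day."))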