#!/usr/bin/env python
# coding: utf-8

# # Multinomial Naive Bayes

# In[1]:


import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")


# ### Cargamos el conjunto de datos de spam o no spam (ham)
# 
# La idea es poder predecir si un mensaje entrante es spam o no lo es (en caso que no lo sea se le llama *ham*).

# In[2]:


# Load the spam/ham messages from a tab-separated text file. The file has no
# header row, so the two columns are named here: the label ("target") and the
# message body ("text").
dataset = pd.read_csv(
    "./demo_3_dataset/spam_or_ham.txt",
    sep="\t",
    header=None,
    names=["target", "text"],
)
# Preview the first rows (only rendered when run as a notebook cell).
dataset.head()


# In[3]:


# Show one example message per class. The row positions are hard-coded; the
# headings assume row 4 holds a ham message and row 2 a spam message
# (TODO confirm against the data file). Column 1 is the message text.
print('ham:', dataset.iat[4, 1], sep='\n')
print('spam:', dataset.iat[2, 1], sep='\n')


# In[4]:


# Dimensions of the loaded table as (rows, columns); displayed only when run
# as a notebook cell.
dataset.shape


# ### Vectorize
# 
# Transform the input from text into a bag of words matrix ([documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)).

# In[5]:


# Build the bag-of-words representation: learn the vocabulary from every
# message and count token occurrences per message in a single pass.
vectorizer = CountVectorizer()
vectorized_data = vectorizer.fit_transform(dataset.text)


# In[6]:


# Peek at part of the learned vocabulary: the feature names at positions
# 4000 through 4199 (fewer, or none, if the vocabulary is smaller than that).
vectorizer.get_feature_names_out()[4000:4200]


# In[7]:


# Dimensions of the count matrix: (number of messages, vocabulary size).
vectorized_data.shape


# In[8]:


# Dense view of the count matrix, for inspection only. NOTE: this materialises
# every entry (mostly zeros) and can be memory-hungry on a large corpus.
vectorized_data.toarray()


# ### Dividimos los datos en conjunto de entrenamiento y de prueba

# In[9]:


# Hold out 20% of the messages for evaluation. The fixed seed keeps the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_data,
    dataset["target"],
    test_size=0.2,
    random_state=5,
)


# ### Entrenamos el modelo con el conjunto de entrenamiento

# In[10]:


# Create a multinomial Naive Bayes classifier with default settings and fit it
# on the training split. `fit` returns the estimator itself, so `clf` ends up
# bound to the trained model.
clf = MultinomialNB()
clf = clf.fit(X_train, y_train)


# ### Evaluamos el modelo

# Podemos ver algún caso en particular

# Busquemos una entrada spam:

# In[11]:


# First 20 training labels. The index shown alongside each label is the row's
# position in the original dataset, which is how a spam example can be picked.
y_train.head(20)


# Primero comparemos a mano si nuestro clasificador está prediciendo correctamente ese dato del conjunto de entrenamiento

# In[12]:


# Row index (into the full dataset and its count matrix) of the message that
# the following cells inspect.
caso = 3266


# In[13]:


# Compare the classifier's prediction for the selected message with its true
# label. Only the selected row is passed to `predict`, instead of classifying
# the entire corpus and discarding every result but one.
clf.predict(vectorized_data[caso])[0] == dataset.target[caso]


# In[14]:


# Print the selected row's fields (label, then message text) separated by a
# space.
print(*dataset.iloc[caso].tolist())


# Podemos obtener la probabilidad de que haya sido spam o ham:

# In[15]:


# Class probabilities for the selected message. `predict_proba` yields the
# probabilities directly (the exponential of the log-probabilities); columns
# follow the order of `clf.classes_`.
clf.predict_proba(vectorized_data[caso])


# En este caso, la probabilidad de spam es mayor que la de ham. Veamos lo mismo pero para un caso que no era spam:

# In[16]:


# Same inspection for row 5199: print its fields, then its class probabilities
# (columns follow the order of `clf.classes_`).
print(*dataset.iloc[5199].tolist())
print(clf.predict_proba(vectorized_data[5199]))


# En este caso la probabilidad de ham es mayor que la de spam, por eso lo clasifica como ham.
# 
# Veamos algunos casos del conjunto de prueba:

# In[17]:


# Element-wise check of the first 20 test labels against the corresponding
# predictions; True marks a correct classification.
y_test.head(20) == clf.predict(X_test)[:20]


# In[18]:


# Report on a single message: its content, the predicted vs. true label, the
# class probabilities and whether the classification was correct.
# NOTE(review): `caso` indexes the full dataset; whether this row belongs to
# the test split is not checked here — confirm if that matters.
caso = 869

# Classify only the selected row, once, instead of re-running the model over
# the whole corpus for every value that is printed.
row = vectorized_data[caso]
predicted = clf.predict(row)[0]
actual = dataset.target[caso]
# Probability columns follow `clf.classes_`, which scikit-learn keeps sorted;
# with labels "ham"/"spam" that puts ham at index 0 and spam at index 1.
probabilities = clf.predict_proba(row)[0]

print("El caso estudiado es:")
print(*dataset.iloc[caso], "\n")
print(
    f"Clasificó como {predicted} y era {actual}. "
    f"La probabilidad de ser ham fue {probabilities[0]:.4f} "
    f"y la de spam {probabilities[1]:.4f}"
)
if predicted == actual:
    print("Clasificó correctamente!")
else:
    print("Clasificó incorrectamente...")


# In[19]:


# Per-class precision, recall, F1 and support on the held-out test split.
print(
    classification_report(
        y_true=y_test,
        y_pred=clf.predict(X_test),
    )
)


# The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label a negative sample as positive.
# 
# The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.
# 
# The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0.
# 
# The F-beta score weights recall more than precision by a factor of beta. beta == 1.0 means recall and precision are equally important; the `f1-score` column printed by `classification_report` is this beta == 1.0 case.