import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
The idea is to predict whether an incoming message is spam or not (a non-spam message is called ham).
dataset = pd.read_table("./demo_3_dataset/spam_or_ham.txt", header=None, names=["target", "text"])  # tab-separated: label <TAB> message
dataset.head()
|   | target | text |
|---|--------|------|
| 0 | ham | Go until jurong point, crazy.. Available only ... |
| 1 | ham | Ok lar... Joking wif u oni... |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... |
| 3 | ham | U dun say so early hor... U c already then say... |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... |
print('ham:')
print(dataset.iloc[4,1])
print('spam:')
print(dataset.iloc[2,1])
ham:
Nah I don't think he goes to usf, he lives around here though
spam:
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
dataset.shape
(5572, 2)
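The corpus is heavily skewed toward ham (the test split further below has 970 ham messages against only 145 spam), which is worth keeping in mind when reading accuracy figures. A quick way to see the full class balance:
# Class counts for the whole corpus; the skew toward ham matters
# when interpreting accuracy later on.
dataset.target.value_counts()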
Transform the text input into a bag-of-words matrix (see the CountVectorizer documentation).
vectorizer = CountVectorizer()
vectorized_data = vectorizer.fit_transform(dataset["text"])
vectorizer.get_feature_names_out()[4000:4200]
array(['huge', 'hugging', 'hugh', 'hugs', 'huh', 'hui', 'huiming', 'hum', 'humanities', 'humans', 'hun', 'hundred', 'hundreds', 'hungover', 'hungry', 'hunks', 'hunny', 'hunt', 'hunting', 'hurricanes', 'hurried', 'hurry', 'hurt', 'hurting', 'hurts', 'husband', 'hussey', 'hustle', 'hut', 'hv', 'hv9d', 'hvae', 'hw', 'hyde', 'hype', 'hypertension', 'hypotheticalhuagauahahuagahyuhagga', 'iam', 'ias', 'ibh', 'ibhltd', 'ibiza', 'ibm', 'ibn', 'ibored', 'ibuprofens', 'ic', 'iccha', 'ice', 'icic', 'icicibank', 'icky', 'icmb3cktz8r7', 'icon', 'id', 'idc', 'idea', 'ideal', 'ideas', 'identification', 'identifier', 'idew', 'idiot', 'idk', 'idps', 'idu', 'ie', 'if', 'iff', 'ifink', 'ig11', 'ignorant', 'ignore', 'ignoring', 'ihave', 'ijust', 'ikea', 'ikno', 'iknow', 'il', 'ileave', 'ill', 'illness', 'illspeak', 'ilol', 'im', 'image', 'images', 'imagination', 'imagine', 'imat', 'imf', 'img', 'imin', 'imma', 'immed', 'immediately', 'immunisation', 'imp', 'impatient', 'impede', 'implications', 'important', 'importantly', 'imposed', 'impossible', 'imposter', 'impress', 'impressed', 'impression', 'impressively', 'improve', 'improved', 'imprtant', 'in', 'in2', 'inc', 'inch', 'inches', 'incident', 'inclu', 'include', 'includes', 'including', 'inclusive', 'incomm', 'inconsiderate', 'inconvenience', 'inconvenient', 'incorrect', 'increase', 'incredible', 'increments', 'inde', 'indeed', 'independence', 'independently', 'index', 'india', 'indian', 'indians', 'indicate', 'individual', 'indyarocks', 'inever', 'infact', 'infections', 'infernal', 'influx', 'info', 'inform', 'information', 'informed', 'infra', 'infront', 'ing', 'ingredients', 'initiate', 'ink', 'inlude', 'inmind', 'inner', 'innings', 'innocent', 'innu', 'inperialmusic', 'inpersonation', 'inr', 'insects', 'insha', 'inshah', 'inside', 'inspection', 'inst', 'install', 'installation', 'installing', 'instant', 'instantly', 'instead', 'instituitions', 'instructions', 'insurance', 'intelligent', 'intend', 'intention', 'intentions', 'interest', 'interested', 'interesting', 'interflora', 'interfued', 'internal', 'internet', 'interview', 'interviews', 'interviw', 'intha', 'into', 'intrepid'], dtype=object)
vectorized_data.shape
(5572, 8713)
vectorized_data.toarray()
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
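The preview is almost entirely zeros because each message uses only a handful of the 8713 vocabulary words, so the matrix is very sparse. A toy sketch on two made-up sentences (not from the dataset) shows what the counts encode:
# Toy illustration of the bag-of-words encoding: each column is a
# vocabulary word, each cell the number of times it appears.
toy = CountVectorizer()
toy_matrix = toy.fit_transform(["free free prize", "see you at lunch"])
print(toy.get_feature_names_out())  # ['at' 'free' 'lunch' 'prize' 'see' 'you']
print(toy_matrix.toarray())         # [[0 2 0 1 0 0]   <- 'free' counted twice
                                    #  [1 0 1 0 1 1]]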
X_train, X_test, y_train, y_test = train_test_split(vectorized_data, dataset.target, test_size=0.2, random_state=5)
clf = MultinomialNB().fit(X_train, y_train)
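MultinomialNB scores each class as the class log-prior plus the word counts dotted with the per-class log-likelihoods, and predicts the argmax. As a sanity-check sketch (not part of the pipeline above), that decision rule can be reproduced from the fitted attributes:
# Reproduce the classifier's decision by hand:
# score(c) = log P(c) + sum_w count(w) * log P(w|c), predict argmax.
row = X_train[0].toarray()
scores = row @ clf.feature_log_prob_.T + clf.class_log_prior_
print(clf.classes_[scores.argmax()], clf.predict(X_train[0])[0])  # the two should match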
We can look at one case in particular.
Let's find a spam entry:
y_train[:20]
1658     ham
1509     ham
3266    spam
5199     ham
3217    spam
2579     ham
1330     ham
151      ham
209      ham
2066     ham
691      ham
4727     ham
5289     ham
5235     ham
2147     ham
2409     ham
5194     ham
5099     ham
2315     ham
2506     ham
Name: target, dtype: object
First, let's check by hand whether our classifier predicts this training example correctly:
caso = 3266
clf.predict(vectorized_data)[caso] == dataset.target[caso]
True
print(*dataset.iloc[caso])
spam 44 7732584351, Do you want a New Nokia 3510i colour phone DeliveredTomorrow? With 300 free minutes to any mobile + 100 free texts + Free Camcorder reply or call 08000930705.
We can obtain the probability that the message was spam or ham:
np.exp(clf.predict_log_proba(vectorized_data[caso]))
array([[4.31330826e-18, 1.00000000e+00]])
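The columns of predict_log_proba follow clf.classes_, which is sorted alphabetically, so the first column is ham and the second spam:
print(clf.classes_)  # ['ham' 'spam'] -> column 0 is ham, column 1 is spam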
In this case, the probability of spam is greater than that of ham. Now let's do the same for a case that was not spam:
print(*dataset.iloc[5199])
print(np.exp(clf.predict_log_proba(vectorized_data[5199])))
ham Ugh my leg hurts. Musta overdid it on mon.
[[9.99957311e-01 4.26892776e-05]]
In this case the probability of ham is greater than that of spam, which is why the message is classified as ham.
Let's look at some cases from the test set:
clf.predict(X_test)[:20] == y_test[:20]
2095     True
5343     True
564      True
3849     True
3317     True
5277     True
1674     True
3753     True
5507     True
265      True
4413     True
5111     True
4896     True
3161     True
3743     True
2887     True
869     False
4061     True
1072     True
4559     True
Name: target, dtype: bool
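Rather than scanning the comparison by eye, all misclassified test indices can be collected directly (a small sketch):
# Indices of every test message the classifier gets wrong.
wrong = y_test[clf.predict(X_test) != y_test].index
print(wrong)
Index 869 is the single miss in the slice above, so let's inspect it: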
caso = 869
print(f"El caso estudiado es:")
print(*dataset.iloc[caso],"\n")
print(f"Clasificó como {clf.predict(vectorized_data)[caso]} y era {dataset.target[caso]}. La probabilidad de ser ham fue {np.exp(clf.predict_log_proba(vectorized_data)[caso][0]):.4f} y la de spam {np.exp(clf.predict_log_proba(vectorized_data)[caso][1]):.4f}")
if clf.predict(vectorized_data)[caso] == dataset.target[caso]:
print("Clasificó correctamente!")
else:
print("Clasificó incorrectamente...")
The case under study is:
spam Hello. We need some posh birds and chaps to user trial prods for champneys. Can i put you down? I need your address and dob asap. Ta r

Classified as ham and it was spam. The probability of ham was 1.0000 and of spam 0.0000
Classified incorrectly...
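To probe why this message fooled the model, one option (a sketch using the fitted log-likelihoods and the alphabetical class order noted above) is to rank each word's contribution to the log-odds in favour of ham:
# Per-word contribution to the log-odds for ham over spam:
# count(w) * (log P(w|ham) - log P(w|spam)); positive pushes toward ham.
row = vectorized_data[caso].toarray().ravel()
log_odds = clf.feature_log_prob_[0] - clf.feature_log_prob_[1]  # ham minus spam
words = vectorizer.get_feature_names_out()
present = np.nonzero(row)[0]
for i in sorted(present, key=lambda i: -row[i] * log_odds[i])[:10]:
    print(f"{words[i]:>12} {row[i] * log_odds[i]:+.2f}")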
print(classification_report(y_test, clf.predict(X_test)))
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       970
        spam       0.95      0.97      0.96       145

    accuracy                           0.99      1115
   macro avg       0.97      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115
The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label a negative sample as positive.
The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.
The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0.
The F-beta score weights recall more than precision by a factor of beta. beta == 1.0 means recall and precision are equally important.
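To make these definitions concrete, the spam row of the report can be recomputed by hand from the confusion matrix, treating spam as the positive class:
from sklearn.metrics import confusion_matrix

# Raw counts with 'ham' as the negative label and 'spam' as the positive one;
# rows are true labels, columns are predictions.
cm = confusion_matrix(y_test, clf.predict(X_test), labels=["ham", "spam"])
tn, fp, fn, tp = cm.ravel()
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print(f"precision={precision:.2f} recall={recall:.2f} f1={f1:.2f}")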