import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.io as plio
plio.templates  # list the Plotly templates available for styling the figures below
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import joblib
import re
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report,precision_recall_curve, average_precision_score
from warnings import filterwarnings
filterwarnings(action='ignore')
data = pd.read_csv('spam.csv', encoding='ISO-8859-1')
data
 | v1 | v2 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 |
---|---|---|---|---|---|
0 | ham | Go until jurong point, crazy.. Available only ... | NaN | NaN | NaN |
1 | ham | Ok lar... Joking wif u oni... | NaN | NaN | NaN |
2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | NaN | NaN | NaN |
3 | ham | U dun say so early hor... U c already then say... | NaN | NaN | NaN |
4 | ham | Nah I don't think he goes to usf, he lives aro... | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... |
5567 | spam | This is the 2nd time we have tried 2 contact u... | NaN | NaN | NaN |
5568 | ham | Will Ì_ b going to esplanade fr home? | NaN | NaN | NaN |
5569 | ham | Pity, * was in mood for that. So...any other s... | NaN | NaN | NaN |
5570 | ham | The guy did some bitching but I acted like i'd... | NaN | NaN | NaN |
5571 | ham | Rofl. Its true to its name | NaN | NaN | NaN |
5572 rows × 5 columns
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB
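# Optional sanity check (a minimal sketch on the DataFrame loaded above): count duplicate
# messages and missing values per column before dropping the mostly-empty extra columns.
print("Duplicate rows:", data.duplicated().sum())
print("Missing values per column:")
print(data.isna().sum())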
columns_to_drop = ['Unnamed: 2', 'Unnamed: 3','Unnamed: 4']
data.drop(columns=columns_to_drop, inplace=True)
data
 | v1 | v2 |
---|---|---|
0 | ham | Go until jurong point, crazy.. Available only ... |
1 | ham | Ok lar... Joking wif u oni... |
2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... |
3 | ham | U dun say so early hor... U c already then say... |
4 | ham | Nah I don't think he goes to usf, he lives aro... |
... | ... | ... |
5567 | spam | This is the 2nd time we have tried 2 contact u... |
5568 | ham | Will Ì_ b going to esplanade fr home? |
5569 | ham | Pity, * was in mood for that. So...any other s... |
5570 | ham | The guy did some bitching but I acted like i'd... |
5571 | ham | Rofl. Its true to its name |
5572 rows × 2 columns
col_rename = {'v1': 'class', 'v2': 'text'}
data = data.rename(columns=col_rename)
data
#data.to_csv('spam_cleaned.csv')
 | class | text |
---|---|---|
0 | ham | Go until jurong point, crazy.. Available only ... |
1 | ham | Ok lar... Joking wif u oni... |
2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... |
3 | ham | U dun say so early hor... U c already then say... |
4 | ham | Nah I don't think he goes to usf, he lives aro... |
... | ... | ... |
5567 | spam | This is the 2nd time we have tried 2 contact u... |
5568 | ham | Will Ì_ b going to esplanade fr home? |
5569 | ham | Pity, * was in mood for that. So...any other s... |
5570 | ham | The guy did some bitching but I acted like i'd... |
5571 | ham | Rofl. Its true to its name |
5572 rows × 2 columns
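# A minimal alternative sketch: the same cleanup can be done in one step at read time by
# keeping only the first two columns and renaming them immediately. Loaded into a separate
# hypothetical variable (`data_alt`) so the pipeline above is left untouched.
data_alt = pd.read_csv('spam.csv', encoding='ISO-8859-1', usecols=['v1', 'v2'])
data_alt = data_alt.rename(columns={'v1': 'class', 'v2': 'text'})
print(data_alt.shape)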
class_key = data['class'].value_counts().keys()
class_count = data['class'].value_counts().values
labels = [str(val) for val in class_count]
fig = px.bar(x=class_key, y=class_count, text=labels, title='Class vs count', template='plotly', color_discrete_sequence=['darkviolet'])
# Set the labels for the y-axis and x-axis
fig.update_yaxes(title_text='class_count')
fig.update_xaxes(title_text='class_key')
# Show the plot
fig.show()
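# The bar chart shows a clear class imbalance (far more ham than spam). A quick sketch to
# quantify it; for this dataset the split is roughly 87% ham vs 13% spam.
class_share = data['class'].value_counts(normalize=True)
print(class_share)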
# Count the occurrences of top 15 unique strings in the 'text' column
value_counts = data['text'].value_counts().head(15).reset_index()
value_counts.columns = ['Category', 'Count']
# Create a bar plot using Plotly
fig = px.bar(value_counts, x='Category', y='Count')
# Add values to the bars
fig.update_traces(texttemplate='%{y}', textposition='outside')
# Set plot labels and title
fig.update_xaxes(title='text')
fig.update_yaxes(title='Occurrences of the text')
fig.update_layout(title='Most Common Strings in the Text Column')
# Show the plot
fig.show()
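# A complementary sketch: the bars above count identical whole messages, so the most frequent
# individual words may be more informative. `word_cv` and `top_words` are throwaway names used
# only here; get_feature_names_out() assumes scikit-learn >= 1.0.
word_cv = CountVectorizer(stop_words='english')
word_counts = word_cv.fit_transform(data['text']).sum(axis=0)
top_words = sorted(zip(word_cv.get_feature_names_out(), word_counts.tolist()[0]),
                   key=lambda kv: kv[1], reverse=True)[:15]
print(top_words)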
# Changing Class column : ham -> 0, spam -> 1
category_mapping = {'ham': 0, 'spam': 1}
data['class'] = data['class'].map(category_mapping)
data
 | class | text |
---|---|---|
0 | 0 | Go until jurong point, crazy.. Available only ... |
1 | 0 | Ok lar... Joking wif u oni... |
2 | 1 | Free entry in 2 a wkly comp to win FA Cup fina... |
3 | 0 | U dun say so early hor... U c already then say... |
4 | 0 | Nah I don't think he goes to usf, he lives aro... |
... | ... | ... |
5567 | 1 | This is the 2nd time we have tried 2 contact u... |
5568 | 0 | Will Ì_ b going to esplanade fr home? |
5569 | 0 | Pity, * was in mood for that. So...any other s... |
5570 | 0 | The guy did some bitching but I acted like i'd... |
5571 | 0 | Rofl. Its true to its name |
5572 rows × 2 columns
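# Quick sanity check (sketch): .map() silently turns any unexpected label into NaN, so after
# mapping the column should contain only 0s and 1s and no missing values.
assert data['class'].isna().sum() == 0
print(data['class'].value_counts())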
x_train, x_test, y_train, y_test = train_test_split(data['text'], data['class'], test_size=0.3)
#x_train.to_csv('x_train.csv',index = False)
#x_test.to_csv('x_test.csv', index=False)
#y_train.to_csv('y_train.csv', index=False)
#y_test.to_csv('y_test.csv',index=False)
#x_train = pd.read_csv('x_train.csv')
#x_test = pd.read_csv('x_test.csv')
#y_train = pd.read_csv('y_train.csv')
#y_test = pd.read_csv('y_test.csv')
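# Note: without random_state the split (and therefore every metric reported below) changes on
# each run, and without stratify the spam share can drift between train and test. A reproducible,
# stratified alternative (sketch only, not used in the pipeline above):
# x_train, x_test, y_train, y_test = train_test_split(
#     data['text'], data['class'], test_size=0.3, random_state=42, stratify=data['class'])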
cv = CountVectorizer()
x_train_new = cv.fit_transform(x_train)
x_test_new = cv.transform(x_test)
x_train_array = x_train_new.toarray()
x_test_array = x_test_new.toarray()
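# The dense x_train_array / x_test_array copies above are not used by the models below, which
# accept the sparse matrices directly. A quick look (sketch) at what the vectorizer learned:
print("Vocabulary size:", len(cv.vocabulary_))
print("Train matrix shape:", x_train_new.shape, "| Test matrix shape:", x_test_new.shape)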
classifier_1 = MultinomialNB()
classifier_1.fit(x_train_new, y_train)
MultinomialNB()
y_pred_1 = classifier_1.predict(x_test_new)
conf_matrix = confusion_matrix(y_test, y_pred_1)
print("Confusion Matrix:")
print(conf_matrix)
Confusion Matrix:
[[1424    6]
 [  18  224]]
accuracy_1 = accuracy_score(y_test, y_pred_1)
print("Accuracy:", accuracy_1)
Accuracy: 0.9856459330143541
class_report = classification_report(y_test, y_pred_1)
print("Classification Report:")
print(class_report)
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1430
           1       0.97      0.93      0.95       242

    accuracy                           0.99      1672
   macro avg       0.98      0.96      0.97      1672
weighted avg       0.99      0.99      0.99      1672
conf_matrix = confusion_matrix(y_test, y_pred_1)
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
plt.title('Confusion Matrix of performance by Multinomial Naive Bayes classifier')
plt.colorbar()
tick_marks = [0, 1] # Assuming binary classification (e.g., 0 for non-spam and 1 for spam)
plt.xticks(tick_marks, ['Predicted Ham(Non-spam)', 'Predicted Spam'], rotation=45)
plt.yticks(tick_marks, ['Actual Ham(Non-spam)', 'Actual Spam'])
plt.tight_layout()
for i in range(2):
    for j in range(2):
        plt.text(j, i, format(conf_matrix[i, j], 'd'), ha="center", va="center", color="white" if conf_matrix[i, j] > conf_matrix.max() / 2. else "black")
plt.show()
precision, recall, _ = precision_recall_curve(y_test, y_pred_1)
average_precision = average_precision_score(y_test, y_pred_1)
plt.figure()
plt.step(recall, precision, color='b', where='post')
plt.fill_between(recall, precision, alpha=0.2, color='r')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title(f'Precision-Recall curve of performance by Multinomial Naive Bayes classifier (Average-precision = {average_precision:.2f})')
plt.show()
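# The curve above is drawn from hard 0/1 predictions, so it has only a couple of points.
# A smoother curve (sketch) uses the predicted spam probabilities instead; `y_scores_1` is a
# name introduced only here, not part of the pipeline above.
y_scores_1 = classifier_1.predict_proba(x_test_new)[:, 1]
prob_precision, prob_recall, _ = precision_recall_curve(y_test, y_scores_1)
print("Average precision from probabilities:", average_precision_score(y_test, y_scores_1))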
classifier_2 = LogisticRegression()
classifier_2.fit(x_train_new, y_train)
LogisticRegression()
y_pred_2 = classifier_2.predict(x_test_new)
conf_matrix = confusion_matrix(y_test, y_pred_2)
print("Confusion Matrix:")
print(conf_matrix)
Confusion Matrix:
[[1429    1]
 [  32  210]]
accuracy_2 = accuracy_score(y_test, y_pred_2)
print("Accuracy:", accuracy_2)
Accuracy: 0.9802631578947368
class_report = classification_report(y_test, y_pred_2)
print("Classification Report:")
print(class_report)
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1430
           1       1.00      0.87      0.93       242

    accuracy                           0.98      1672
   macro avg       0.99      0.93      0.96      1672
weighted avg       0.98      0.98      0.98      1672
conf_matrix = confusion_matrix(y_test, y_pred_2)
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
plt.title('Confusion Matrix of performance by Logistic Regression classifier')
plt.colorbar()
tick_marks = [0, 1] # Assuming binary classification (e.g., 0 for non-spam and 1 for spam)
plt.xticks(tick_marks, ['Predicted Ham(Non-spam)', 'Predicted Spam'], rotation=45)
plt.yticks(tick_marks, ['Actual Ham(Non-spam)', 'Actual Spam'])
plt.tight_layout()
for i in range(2):
    for j in range(2):
        plt.text(j, i, format(conf_matrix[i, j], 'd'), ha="center", va="center", color="white" if conf_matrix[i, j] > conf_matrix.max() / 2. else "black")
plt.show()
precision, recall, _ = precision_recall_curve(y_test, y_pred_2)
average_precision = average_precision_score(y_test, y_pred_2)
plt.figure()
plt.step(recall, precision, color='b', where='post')
plt.fill_between(recall, precision, alpha=0.2, color='r')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title(f'Precision-Recall curve of performance by Logistic Regression classifier (Average-precision = {average_precision:.2f})')
plt.show()
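# A quick look (sketch) at the tokens that push the logistic regression towards 'spam': the
# largest positive coefficients correspond to the most spam-indicative words. `top_idx` is a
# name introduced only here.
feature_names = cv.get_feature_names_out()
top_idx = np.argsort(classifier_2.coef_[0])[-10:][::-1]
print([(feature_names[i], round(float(classifier_2.coef_[0][i]), 2)) for i in top_idx])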
classifier_3 = DecisionTreeClassifier()
classifier_3.fit(x_train_new, y_train)
DecisionTreeClassifier()
y_pred_3 = classifier_3.predict(x_test_new)
conf_matrix = confusion_matrix(y_test, y_pred_3)
print("Confusion Matrix:")
print(conf_matrix)
Confusion Matrix:
[[1409   21]
 [  37  205]]
accuracy_3 = accuracy_score(y_test, y_pred_3)
print("Accuracy:", accuracy_3)
Accuracy: 0.965311004784689
class_report = classification_report(y_test, y_pred_3)
print("Classification Report:")
print(class_report)
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1430
           1       0.91      0.85      0.88       242

    accuracy                           0.97      1672
   macro avg       0.94      0.92      0.93      1672
weighted avg       0.96      0.97      0.96      1672
conf_matrix = confusion_matrix(y_test, y_pred_3)
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
plt.title('Confusion Matrix of performance by Decision Tree classifier')
plt.colorbar()
tick_marks = [0, 1] # Assuming binary classification (e.g., 0 for non-spam and 1 for spam)
plt.xticks(tick_marks, ['Predicted Ham(Non-spam)', 'Predicted Spam'], rotation=45)
plt.yticks(tick_marks, ['Actual Ham(Non-spam)', 'Actual Spam'])
plt.tight_layout()
for i in range(2):
    for j in range(2):
        plt.text(j, i, format(conf_matrix[i, j], 'd'), ha="center", va="center", color="white" if conf_matrix[i, j] > conf_matrix.max() / 2. else "black")
plt.show()
precision, recall, _ = precision_recall_curve(y_test, y_pred_3)
average_precision = average_precision_score(y_test, y_pred_3)
plt.figure()
plt.step(recall, precision, color='b', where='post')
plt.fill_between(recall, precision, alpha=0.2, color='r')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title(f'Precision-Recall curve of performance by Decision Tree classifier (Average-precision = {average_precision:.2f})')
plt.show()
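# Decision trees grown without a depth limit tend to overfit sparse bag-of-words features.
# A rough sketch of the fitted tree's size, plus a depth-limited variant for comparison
# (`classifier_3b` is a hypothetical name, not part of the pipeline above):
print("Tree depth:", classifier_3.get_depth(), "| leaves:", classifier_3.get_n_leaves())
classifier_3b = DecisionTreeClassifier(max_depth=30, random_state=0).fit(x_train_new, y_train)
print("Depth-limited accuracy:", accuracy_score(y_test, classifier_3b.predict(x_test_new)))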
classifier_4 = RandomForestClassifier()
classifier_4.fit(x_train_new, y_train)
RandomForestClassifier()
y_pred_4 = classifier_4.predict(x_test_new)
conf_matrix = confusion_matrix(y_test, y_pred_4)
print("Confusion Matrix:")
print(conf_matrix)
Confusion Matrix:
[[1428    2]
 [  44  198]]
accuracy_4 = accuracy_score(y_test, y_pred_4)
print("Accuracy:", accuracy_4)
Accuracy: 0.972488038277512
class_report = classification_report(y_test, y_pred_4)
print("Classification Report:")
print(class_report)
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1430
           1       0.99      0.82      0.90       242

    accuracy                           0.97      1672
   macro avg       0.98      0.91      0.94      1672
weighted avg       0.97      0.97      0.97      1672
conf_matrix = confusion_matrix(y_test, y_pred_4)
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
plt.title('Confusion Matrix of performance by Random Forest classifier')
plt.colorbar()
tick_marks = [0, 1] # Assuming binary classification (e.g., 0 for non-spam and 1 for spam)
plt.xticks(tick_marks, ['Predicted Ham(Non-spam)', 'Predicted Spam'], rotation=45)
plt.yticks(tick_marks, ['Actual Ham(Non-spam)', 'Actual Spam'])
plt.tight_layout()
for i in range(2):
    for j in range(2):
        plt.text(j, i, format(conf_matrix[i, j], 'd'), ha="center", va="center", color="white" if conf_matrix[i, j] > conf_matrix.max() / 2. else "black")
plt.show()
precision, recall, _ = precision_recall_curve(y_test, y_pred_4)
average_precision = average_precision_score(y_test, y_pred_4)
plt.figure()
plt.step(recall, precision, color='b', where='post')
plt.fill_between(recall, precision, alpha=0.2, color='r')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title(f'Precision-Recall curve of performance by Random Forest classifier (Average-precision = {average_precision:.2f})')
plt.show()
classifier_5 = SVC()
classifier_5.fit(x_train_new, y_train)
SVC()
y_pred_5 = classifier_5.predict(x_test_new)
accuracy_5 = accuracy_score(y_test, y_pred_5)
print("Accuracy:", accuracy_5)
Accuracy: 0.9742822966507177
conf_matrix = confusion_matrix(y_test, y_pred_5)
print("Confusion Matrix:")
print(conf_matrix)
Confusion Matrix:
[[1429    1]
 [  42  200]]
class_report = classification_report(y_test, y_pred_5)
print("Classification Report:")
print(class_report)
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1430
           1       1.00      0.83      0.90       242

    accuracy                           0.97      1672
   macro avg       0.98      0.91      0.94      1672
weighted avg       0.97      0.97      0.97      1672
conf_matrix = confusion_matrix(y_test, y_pred_5)
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
plt.title('Confusion Matrix of performance by Support Vector Machine classifier')
plt.colorbar()
tick_marks = [0, 1] # Assuming binary classification (e.g., 0 for non-spam and 1 for spam)
plt.xticks(tick_marks, ['Predicted Ham(Non-spam)', 'Predicted Spam'], rotation=45)
plt.yticks(tick_marks, ['Actual Ham(Non-spam)', 'Actual Spam'])
plt.tight_layout()
for i in range(2):
    for j in range(2):
        plt.text(j, i, format(conf_matrix[i, j], 'd'), ha="center", va="center", color="white" if conf_matrix[i, j] > conf_matrix.max() / 2. else "black")
plt.show()
precision, recall, _ = precision_recall_curve(y_test, y_pred_5)
average_precision = average_precision_score(y_test, y_pred_5)
plt.figure()
plt.step(recall, precision, color='b', where='post')
plt.fill_between(recall, precision, alpha=0.2, color='r')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title(f'Precision-Recall curve of performance by Support Vector Machine classifier (Average-precision = {average_precision:.2f})')
plt.show()
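# SVC only exposes predict_proba when constructed with probability=True, but its
# decision_function margins can feed precision_recall_curve directly (sketch);
# `svm_scores` is a name introduced only here.
svm_scores = classifier_5.decision_function(x_test_new)
print("Average precision from decision margins:", average_precision_score(y_test, svm_scores))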
model_accuracies = {
    "Multinomial Naive Bayes Classifier": accuracy_1,
    "Logistic Regression Classifier": accuracy_2,
    "Decision Tree Classifier": accuracy_3,
    "Random Forest Classifier": accuracy_4,
    "Support Vector Machine Classifier": accuracy_5
}
print("1. Multinomial Naive Bayes Classifier : ", accuracy_1)
print("2. Logistic Regression Classifier : ", accuracy_2)
print("3. Decision Tree Classifier : ", accuracy_3)
print("4. Random Forest Classifier : ", accuracy_4)
print("5. Support Vector Machine Classifier : ", accuracy_5)
1. Multinomial Naive Bayes Classifier :  0.9856459330143541
2. Logistic Regression Classifier :  0.9802631578947368
3. Decision Tree Classifier :  0.965311004784689
4. Random Forest Classifier :  0.972488038277512
5. Support Vector Machine Classifier :  0.9742822966507177
best_model_name = max(model_accuracies, key=model_accuracies.get)
best_accuracy = model_accuracies[best_model_name]
print(f"Best Performance Model is {best_model_name} with an accuracy score of {best_accuracy}")
Best Performance Model is MultiNominal Naive Bayes Classifier with an accuracy score of 0.9856459330143541
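# The ranking above comes from a single random train/test split. A sturdier comparison (sketch)
# cross-validates the winning model end to end with a fresh vectorizer per fold, which also
# avoids fitting the vocabulary on test folds. `nb_pipeline` is a name introduced only here.
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
nb_pipeline = make_pipeline(CountVectorizer(), MultinomialNB())
cv_scores = cross_val_score(nb_pipeline, data['text'], data['class'], cv=5)
print("5-fold accuracy: %.4f +/- %.4f" % (cv_scores.mean(), cv_scores.std()))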
joblib.dump(classifier_1, 'multinomial_nb_model.joblib')
['multinomial_nb_model.joblib']
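# The saved model is only usable together with the fitted CountVectorizer: the classify_message()
# helper below reads it from the in-memory `cv` variable. A sketch for persisting it alongside the
# model so inference also works in a fresh session ('count_vectorizer.joblib' is a hypothetical filename):
joblib.dump(cv, 'count_vectorizer.joblib')
# In a new session: cv = joblib.load('count_vectorizer.joblib')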
def model_eval():
    loaded_model = joblib.load('multinomial_nb_model.joblib')

    def classify_message(input_message):
        input_message = re.sub(r'[^a-zA-Z0-9\s]', '', input_message)  # Remove special characters
        input_message = input_message.lower()  # Convert to lowercase
        input_message = [input_message]
        input_message_vectorized = cv.transform(input_message)  # Uses the fitted CountVectorizer `cv`
        prediction = loaded_model.predict(input_message_vectorized)
        return prediction[0]

    # User input
    user_input = input("Enter a message: ")
    # Call the classify_message function to detect spam or non-spam
    result = classify_message(user_input)
    # Display the result to the user
    if result == 1:
        print("Spam detected.")
    else:
        print("The above entered message is a Non-spam (ham) message.")
model_eval()
Enter a message: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
The above entered message is a Non-spam (ham) message.
model_eval()
Enter a message: XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL
Spam detected.
model_eval()
Enter a message: Dear ADVAIT CHAVAN, ⚡ Offer Ends Tonight! Get CAT 2023 Mock Series for just ₹ 999. Grab now: https://cracku.in/group-offer/catmocks 📣 Results speak for themselves
The above entered message is a Non-spam (ham) message.
model_eval()
Enter a message: Hi Advait, Shhhh. There’s a secret we want to share with you. We know who is going to win the Cricket World Cup. And the answer will leave you amazed ‘cause YOU’re going to be the winner! 🤩 Wondering how? 🤔 By taking part in the World’s Biggest Cricket Quiz Festival, you get a chance to bag rewards that are as exciting as Dhoni’s winning six from 2011: Win cash rewards from a prize pool INR 4 Lakhs 💰 - The overall Grand Finale winner wins a whopping INR 1,00,000/- - Runner up bags INR 50,000/- - Second Runner up bags INR 30,000/- - All 3 remaining finalists win INR 5,000 /- each Exclusive discounts on Unstop Pro 💯 Subscriptions to OTT Play* for Quizzer of the Day 🤩 Amazon vouchers for the rest of the leaderboard worth thousands & much more! *T&C apply
Spam detected.