#!/usr/bin/env python
# coding: utf-8
# ## Name : ADVAIT GURUNATH CHAVAN
# ## Contact Number : +91 70214 55852
# ## Mail ID :advaitchavan135@gmail.com
#
#
# ## Oasis Infobyte Data Science Intern
# ## Task 4 : Email Mail Spam Detection with Machine Learning.
#
# ### 1. Importing the necessary dependencies
# In[1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.io as plio
plio.templates
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import joblib
import re
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report,precision_recall_curve, average_precision_score
from warnings import filterwarnings
filterwarnings(action='ignore')
# ### 2. Exploring the dataset
# In[2]:
# Read the raw corpus; the file is decoded as ISO-8859-1 (Latin-1).
data = pd.read_csv('spam.csv', encoding='ISO-8859-1')
# In[3]:
data
# In[4]:
data.info()
# #### Dropping unnecessary columns from the dataset
# In[5]:
# Only 'v1' and 'v2' are used further down, so the three 'Unnamed' columns go.
columns_to_drop = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data = data.drop(columns=columns_to_drop)
# In[6]:
data
# #### Renaming columns v1 -> class and v2 -> text
# In[7]:
col_rename = dict(v1='class', v2='text')
data.rename(columns=col_rename, inplace=True)
# In[8]:
data
#data.to_csv('spam_cleaned.csv')
# In[9]:
# Tally the rows per class once and reuse the result for both chart axes.
class_distribution = data['class'].value_counts()
class_key = class_distribution.index
class_count = class_distribution.to_numpy()
# The bar annotations are the raw counts rendered as strings.
labels = list(map(str, class_count))
fig = px.bar(
    x=class_key,
    y=class_count,
    text=labels,
    title='Class vs count',
    template='plotly',
    color_discrete_sequence=['darkviolet'],
)
# Axis titles
fig.update_yaxes(title_text='class_count')
fig.update_xaxes(title_text='class_key')
# Render the figure
fig.show()
# In[10]:
# Count the occurrences of top 15 unique strings in the 'text' column
# Frequency table of the 15 most repeated messages in the 'text' column,
# shaped as a two-column frame: 'Category' (the message) and 'Count'.
value_counts = (
    data['text']
    .value_counts()
    .head(15)
    .rename_axis('Category')
    .reset_index(name='Count')
)
# One bar per message
fig = px.bar(value_counts, x='Category', y='Count')
# Print each bar's height just above it
fig.update_traces(textposition='outside', texttemplate='%{y}')
# Axis titles and figure title
fig.update_xaxes(title='text')
fig.update_yaxes(title='Occurences of the text')
fig.update_layout(title='Most Common String in the Text Column')
# Render the figure
fig.show()
# #### Encoding the class column
# In[11]:
# Changing Class column : ham -> 0, spam -> 1
# Numeric label encoding used by every classifier below: ham -> 0, spam -> 1.
category_mapping = dict(ham=0, spam=1)
encoded_class = data['class'].map(category_mapping)
data['class'] = encoded_class
# In[12]:
data
# ### 3. Data Modelling (Preparing the training and testing data)
# In[13]:
# Hold out 30% of the messages for testing.
# - random_state pins the shuffle so the metrics, the model comparison and the
#   saved model are reproducible from run to run.
# - stratify keeps the spam/ham ratio the same in both partitions, which
#   matters because the two classes are far from balanced.
x_train, x_test, y_train, y_test = train_test_split(
    data['text'],
    data['class'],
    test_size=0.3,
    random_state=42,
    stratify=data['class'],
)
# In[14]:
#x_train.to_csv('x_train.csv',index = False)
#x_test.to_csv('x_test.csv', index=False)
#y_train.to_csv('y_train.csv', index=False)
#y_test.to_csv('y_test.csv',index=False)
# ### 4. Count-vectorizing x_train and x_test
# #### CountVectorizer is commonly used in text classification tasks, where it helps convert text strings into a format suitable for training classifiers.
# In[15]:
#x_train = pd.read_csv('x_train.csv')
#x_test = pd.read_csv('x_test.csv')
#y_train = pd.read_csv('y_train.csv')
#y_test = pd.read_csv('y_test.csv')
# In[16]:
# Bag-of-words vectorizer with library defaults.
cv = CountVectorizer()
# In[17]:
# The vocabulary is learned from the training messages only ...
x_train_new = cv.fit_transform(x_train)
# In[18]:
# ... and the test messages are projected onto that same vocabulary.
x_test_new = cv.transform(x_test)
# In[19]:
# Dense copies of both sparse matrices. NOTE(review): nothing else visible in
# this file reads them; the classifiers are fitted on the sparse versions.
x_train_array, x_test_array = x_train_new.toarray(), x_test_new.toarray()
# ### 5. Training and testing using Classification Models
#
#
# ### [A] Multinomial Naive Bayes Classifier
# In[20]:
# Fit a Multinomial Naive Bayes model on the sparse bag-of-words features.
classifier_1 = MultinomialNB()
classifier_1.fit(x_train_new, y_train)
# In[21]:
y_pred_1 = classifier_1.predict(x_test_new)
# Probability of the spam class (label 1) for every test message. The
# precision-recall analysis needs these continuous scores: feeding it the
# hard 0/1 predictions collapses the curve to a single operating point.
y_score_1 = classifier_1.predict_proba(x_test_new)[:, 1]
# In[22]:
conf_matrix = confusion_matrix(y_test, y_pred_1)
print("Confusion Matrix:")
print(conf_matrix)
# In[23]:
accuracy_1 = accuracy_score(y_test, y_pred_1)
print("Accuracy:", accuracy_1)
# In[24]:
class_report = classification_report(y_test, y_pred_1)
print("Classification Report:")
print(class_report)
# In[25]:
# Heat-map of the confusion matrix computed above
# (rows = actual class, columns = predicted class).
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
plt.title('Confusion Matrix of performance by Multinomial Naive Bayes classifier')
plt.colorbar()
tick_marks = [0, 1]  # label encoding: 0 = ham (non-spam), 1 = spam
plt.xticks(tick_marks, ['Predicted Ham(Non-spam)', 'Predicted Spam'], rotation=45)
plt.yticks(tick_marks, ['Actual Ham(Non-spam)', 'Actual Spam'])
plt.tight_layout()
# Write the count into each cell; dark cells get white text for contrast.
text_threshold = conf_matrix.max() / 2.
for i in range(2):
    for j in range(2):
        plt.text(j, i, format(conf_matrix[i, j], 'd'), ha="center", va="center",
                 color="white" if conf_matrix[i, j] > text_threshold else "black")
plt.show()
# In[26]:
# Precision-recall curve over all decision thresholds of the spam score.
precision, recall, _ = precision_recall_curve(y_test, y_score_1)
average_precision = average_precision_score(y_test, y_score_1)
plt.figure()
plt.step(recall, precision, color='b', where='post')
# step='post' makes the shaded area follow the step curve drawn above.
plt.fill_between(recall, precision, alpha=0.2, color='r', step='post')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title(f'Precision-Recall curve of performance by Multinomial Naive Bayes classifier (Average-precision = {average_precision:.2f})')
plt.show()
# ### [B] Logistic Regression Classifier
# In[27]:
# Fit a Logistic Regression model on the sparse bag-of-words features.
classifier_2 = LogisticRegression()
classifier_2.fit(x_train_new, y_train)
# In[28]:
y_pred_2 = classifier_2.predict(x_test_new)
# Probability of the spam class (label 1) for every test message. The
# precision-recall analysis needs these continuous scores: feeding it the
# hard 0/1 predictions collapses the curve to a single operating point.
y_score_2 = classifier_2.predict_proba(x_test_new)[:, 1]
# In[29]:
conf_matrix = confusion_matrix(y_test, y_pred_2)
print("Confusion Matrix:")
print(conf_matrix)
# In[30]:
accuracy_2 = accuracy_score(y_test, y_pred_2)
print("Accuracy:", accuracy_2)
# In[31]:
class_report = classification_report(y_test, y_pred_2)
print("Classification Report:")
print(class_report)
# In[32]:
# Heat-map of the confusion matrix computed above
# (rows = actual class, columns = predicted class).
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
plt.title('Confusion Matrix of performance by Logistic Regression classifier')
plt.colorbar()
tick_marks = [0, 1]  # label encoding: 0 = ham (non-spam), 1 = spam
plt.xticks(tick_marks, ['Predicted Ham(Non-spam)', 'Predicted Spam'], rotation=45)
plt.yticks(tick_marks, ['Actual Ham(Non-spam)', 'Actual Spam'])
plt.tight_layout()
# Write the count into each cell; dark cells get white text for contrast.
text_threshold = conf_matrix.max() / 2.
for i in range(2):
    for j in range(2):
        plt.text(j, i, format(conf_matrix[i, j], 'd'), ha="center", va="center",
                 color="white" if conf_matrix[i, j] > text_threshold else "black")
plt.show()
# In[33]:
# Precision-recall curve over all decision thresholds of the spam score.
precision, recall, _ = precision_recall_curve(y_test, y_score_2)
average_precision = average_precision_score(y_test, y_score_2)
plt.figure()
plt.step(recall, precision, color='b', where='post')
# step='post' makes the shaded area follow the step curve drawn above.
plt.fill_between(recall, precision, alpha=0.2, color='r', step='post')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title(f'Precision-Recall curve of performance by Logistic Regression classifier (Average-precision = {average_precision:.2f})')
plt.show()
# ### [C] Decision Tree Classifier
# In[34]:
# Fit a Decision Tree model on the sparse bag-of-words features.
classifier_3 = DecisionTreeClassifier()
classifier_3.fit(x_train_new, y_train)
# In[35]:
y_pred_3 = classifier_3.predict(x_test_new)
# Probability of the spam class (label 1) for every test message. The
# precision-recall analysis needs scores rather than the hard 0/1
# predictions, which collapse the curve to a single operating point.
y_score_3 = classifier_3.predict_proba(x_test_new)[:, 1]
# In[36]:
conf_matrix = confusion_matrix(y_test, y_pred_3)
print("Confusion Matrix:")
print(conf_matrix)
# In[37]:
accuracy_3 = accuracy_score(y_test, y_pred_3)
print("Accuracy:", accuracy_3)
# In[38]:
class_report = classification_report(y_test, y_pred_3)
print("Classification Report:")
print(class_report)
# In[39]:
# Heat-map of the confusion matrix computed above
# (rows = actual class, columns = predicted class).
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
plt.title('Confusion Matrix of performance by Decision Tree classifier')
plt.colorbar()
tick_marks = [0, 1]  # label encoding: 0 = ham (non-spam), 1 = spam
plt.xticks(tick_marks, ['Predicted Ham(Non-spam)', 'Predicted Spam'], rotation=45)
plt.yticks(tick_marks, ['Actual Ham(Non-spam)', 'Actual Spam'])
plt.tight_layout()
# Write the count into each cell; dark cells get white text for contrast.
text_threshold = conf_matrix.max() / 2.
for i in range(2):
    for j in range(2):
        plt.text(j, i, format(conf_matrix[i, j], 'd'), ha="center", va="center",
                 color="white" if conf_matrix[i, j] > text_threshold else "black")
plt.show()
# In[40]:
# Precision-recall curve over all decision thresholds of the spam score.
precision, recall, _ = precision_recall_curve(y_test, y_score_3)
average_precision = average_precision_score(y_test, y_score_3)
plt.figure()
plt.step(recall, precision, color='b', where='post')
# step='post' makes the shaded area follow the step curve drawn above.
plt.fill_between(recall, precision, alpha=0.2, color='r', step='post')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title(f'Precision-Recall curve of performance by Decision Tree classifier (Average-precision = {average_precision:.2f})')
plt.show()
# ### [D] Random Forest Classifier
# In[41]:
# Fit a Random Forest model on the sparse bag-of-words features.
classifier_4 = RandomForestClassifier()
classifier_4.fit(x_train_new, y_train)
# In[42]:
y_pred_4 = classifier_4.predict(x_test_new)
# Probability of the spam class (label 1) for every test message. The
# precision-recall analysis needs these continuous scores: feeding it the
# hard 0/1 predictions collapses the curve to a single operating point.
y_score_4 = classifier_4.predict_proba(x_test_new)[:, 1]
# In[43]:
conf_matrix = confusion_matrix(y_test, y_pred_4)
print("Confusion Matrix:")
print(conf_matrix)
# In[44]:
accuracy_4 = accuracy_score(y_test, y_pred_4)
print("Accuracy:", accuracy_4)
# In[45]:
class_report = classification_report(y_test, y_pred_4)
print("Classification Report:")
print(class_report)
# In[46]:
# Heat-map of the confusion matrix computed above
# (rows = actual class, columns = predicted class).
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
plt.title('Confusion Matrix of performance by Random Forest classifier')
plt.colorbar()
tick_marks = [0, 1]  # label encoding: 0 = ham (non-spam), 1 = spam
plt.xticks(tick_marks, ['Predicted Ham(Non-spam)', 'Predicted Spam'], rotation=45)
plt.yticks(tick_marks, ['Actual Ham(Non-spam)', 'Actual Spam'])
plt.tight_layout()
# Write the count into each cell; dark cells get white text for contrast.
text_threshold = conf_matrix.max() / 2.
for i in range(2):
    for j in range(2):
        plt.text(j, i, format(conf_matrix[i, j], 'd'), ha="center", va="center",
                 color="white" if conf_matrix[i, j] > text_threshold else "black")
plt.show()
# In[47]:
# Precision-recall curve over all decision thresholds of the spam score.
precision, recall, _ = precision_recall_curve(y_test, y_score_4)
average_precision = average_precision_score(y_test, y_score_4)
plt.figure()
plt.step(recall, precision, color='b', where='post')
# step='post' makes the shaded area follow the step curve drawn above.
plt.fill_between(recall, precision, alpha=0.2, color='r', step='post')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title(f'Precision-Recall curve of performance by Random Forest classifier (Average-precision = {average_precision:.2f})')
plt.show()
# ### [E] Support Vector Machine Classifier
# In[48]:
# Fit a Support Vector Machine (library-default settings) on the sparse
# bag-of-words features.
svm = SVC()
classifier_5 = svm
classifier_5.fit(x_train_new, y_train)
# In[49]:
y_pred_5 = classifier_5.predict(x_test_new)
# Signed distance to the separating surface for every test message (larger
# means more spam-like). SVC() is created without probability estimates, so
# decision_function supplies the continuous score the precision-recall
# analysis needs; hard 0/1 predictions would collapse the curve to one point.
y_score_5 = classifier_5.decision_function(x_test_new)
# In[50]:
accuracy_5 = accuracy_score(y_test, y_pred_5)
print("Accuracy:", accuracy_5)
# In[51]:
conf_matrix = confusion_matrix(y_test, y_pred_5)
print("Confusion Matrix:")
print(conf_matrix)
# In[52]:
class_report = classification_report(y_test, y_pred_5)
print("Classification Report:")
print(class_report)
# In[53]:
# Heat-map of the confusion matrix computed above
# (rows = actual class, columns = predicted class).
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
plt.title('Confusion Matrix of performance by Support Vector Machine classifier')
plt.colorbar()
tick_marks = [0, 1]  # label encoding: 0 = ham (non-spam), 1 = spam
plt.xticks(tick_marks, ['Predicted Ham(Non-spam)', 'Predicted Spam'], rotation=45)
plt.yticks(tick_marks, ['Actual Ham(Non-spam)', 'Actual Spam'])
plt.tight_layout()
# Write the count into each cell; dark cells get white text for contrast.
text_threshold = conf_matrix.max() / 2.
for i in range(2):
    for j in range(2):
        plt.text(j, i, format(conf_matrix[i, j], 'd'), ha="center", va="center",
                 color="white" if conf_matrix[i, j] > text_threshold else "black")
plt.show()
# In[54]:
# Precision-recall curve over all decision thresholds of the SVM score.
precision, recall, _ = precision_recall_curve(y_test, y_score_5)
average_precision = average_precision_score(y_test, y_score_5)
plt.figure()
plt.step(recall, precision, color='b', where='post')
# step='post' makes the shaded area follow the step curve drawn above.
plt.fill_between(recall, precision, alpha=0.2, color='r', step='post')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title(f'Precision-Recall curve of performance by Support Vector Machine classifier (Average-precision = {average_precision:.2f})')
plt.show()
# #### Individual Accuracy scores of each model
# In[55]:
# Test-set accuracy of every fitted model, keyed by display name. Insertion
# order is the order in which the models were trained above.
model_accuracies = {}
model_accuracies["MultiNominal Naive Bayes Classifier"] = accuracy_1
model_accuracies["Logistic Regression Classifer"] = accuracy_2
model_accuracies["Decision Tree Classifier"] = accuracy_3
model_accuracies["Random Forest Classfier"] = accuracy_4
model_accuracies["Support Vector Machine Classifier"] = accuracy_5
# In[56]:
# Numbered listing, one line per model.
for rank, (model_label, score) in enumerate(model_accuracies.items(), start=1):
    print(f"{rank}. {model_label} : ", score)
# In[57]:
# Highest accuracy wins; on a tie the model listed first is kept.
best_model_name, best_accuracy = max(model_accuracies.items(), key=lambda item: item[1])
print(f"Best Performance Model is {best_model_name} with an accuracy score of {best_accuracy}")
# ### 6. Saving the best performing Model
# In[58]:
# Persist the fitted Multinomial Naive Bayes model (classifier_1) to disk.
# NOTE(review): this always saves classifier_1, whichever model
# `best_model_name` turned out to be, and the fitted vectorizer `cv` is not
# saved with it, so the file alone cannot classify raw text.
joblib.dump(value=classifier_1, filename='multinomial_nb_model.joblib')
# ### 7. Evaluating the model performance
# In[59]:
def model_eval():
    """Prompt for one message on stdin and print whether it is spam.

    The classifier is loaded from 'multinomial_nb_model.joblib'. The message
    is vectorized with the module-level ``cv`` (the CountVectorizer fitted on
    the training split), so that object must still be in memory when this
    function runs.

    Returns:
        None. The verdict is written to stdout.
    """
    loaded_model = joblib.load('multinomial_nb_model.joblib')

    def classify_message(input_message):
        """Return the predicted label for *input_message* (1 = spam, 0 = ham)."""
        # Hand the raw text straight to the vectorizer. ``cv`` lower-cases and
        # tokenizes it exactly as it did the training messages. Stripping
        # punctuation beforehand glues tokens together ("don't" -> "dont",
        # "www.site.com" -> "wwwsitecom") and yields features that differ
        # from the ones the model was trained on.
        input_message_vectorized = cv.transform([input_message])
        prediction = loaded_model.predict(input_message_vectorized)
        return prediction[0]

    # User input
    user_input = input("Enter a message: ")
    # Detect spam or non-spam
    result = classify_message(user_input)
    # Display the result to the user
    if result == 1:
        print("Spam detected.")
    else:
        print("The above entered message is a Non-spam (ham) message.")
# In[60]:
# Four interactive evaluation rounds; each one prompts for a single message
# and prints its verdict.
for _ in range(4):
    model_eval()
# In[ ]: