#!/usr/bin/env python
# coding: utf-8

# ## Name : ADVAIT GURUNATH CHAVAN
# ## Contact Number : +91 70214 55852
# ## Mail ID :advaitchavan135@gmail.com
#
#
# ## Oasis Infobyte Data Science Intern
# ## Task 4 : Email Mail Spam Detection with Machine Learning.
#
# ### 1. Importing the necessary dependencies

# In[1]:

# Standard library
import re
from warnings import filterwarnings

# Third-party: data handling, plotting, modelling and persistence
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as plio
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Bare attribute access carried over from the notebook cell (a notebook would
# display the value; as a script statement it has no visible effect).
plio.templates

# Silence every warning for the rest of the script. The call is kept after
# all imports so that anything raised while importing is still shown.
filterwarnings(action='ignore')

# ### 2.
# Exploring the dataset

# In[2]:

data = pd.read_csv('spam.csv', encoding='ISO-8859-1')

# In[3]:

data

# In[4]:

data.info()

# #### Dropping unnecessary columns from the dataset

# In[5]:

columns_to_drop = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data = data.drop(columns=columns_to_drop)

# In[6]:

data

# #### Renaming columns v1 -> class and v2 -> text

# In[7]:

col_rename = {'v1': 'class', 'v2': 'text'}
data.rename(columns=col_rename, inplace=True)

# In[8]:

data

# In[9]:

# Bar chart of how many rows carry each label.
class_distribution = data['class'].value_counts()
class_key = class_distribution.keys()
class_count = class_distribution.values
labels = list(map(str, class_count))

fig = px.bar(
    x=class_key,
    y=class_count,
    text=labels,
    title='Class vs count',
    template='plotly',
    color_discrete_sequence=['darkviolet'],
)

# Axis captions
fig.update_yaxes(title_text='class_count')
fig.update_xaxes(title_text='class_key')

fig.show()

# In[10]:

# The 15 most frequent exact message strings in the 'text' column.
top_messages = data['text'].value_counts().head(15)
value_counts = top_messages.reset_index()
value_counts.columns = ['Category', 'Count']

fig = px.bar(value_counts, x='Category', y='Count')

# Print each bar's count just above it.
fig.update_traces(texttemplate='%{y}', textposition='outside')

# Axis captions and chart title
fig.update_xaxes(title='text')
fig.update_yaxes(title='Occurences of the text')
fig.update_layout(title='Most Common String in the Text Column')

fig.show()

# #### Encoding the class column

# In[11]:

# Numeric labels for the classifiers: ham -> 0, spam -> 1
category_mapping = {'ham': 0, 'spam': 1}
data['class'] = data['class'].map(category_mapping)

# In[12]:

data

# ### 3.
# Data Modelling (Preparing the training and testing data)

# In[13]:

# A fixed seed makes the split -- and therefore every score reported below --
# reproducible between runs; stratifying on the label keeps the ham/spam ratio
# the same in the training and the test partition.
x_train, x_test, y_train, y_test = train_test_split(
    data.text,
    data['class'],
    test_size=0.3,
    random_state=42,
    stratify=data['class'],
)

# ### 4. Count-vectorizing x_train and x_test
# #### CountVectorizer is commonly used in text classification tasks, where it helps convert text strings into a format suitable for training classifiers.

# In[16]:

cv = CountVectorizer()

# In[17]:

# The vocabulary is learned from the training messages only ...
x_train_new = cv.fit_transform(x_train)

# In[18]:

# ... and merely applied to the test messages.
x_test_new = cv.transform(x_test)

# NOTE: the dense ``.toarray()`` copies of both matrices were removed. Nothing
# used them (every classifier below is fitted on the sparse matrices) and
# they only cost memory.

# ### 5. Training and testing using Classification Models


def _positive_class_scores(model, features):
    """Return one continuous spam score per row of *features*.

    Uses the probability of class 1 when the fitted model offers
    ``predict_proba`` and falls back to ``decision_function`` otherwise
    (``SVC()`` with default settings has no ``predict_proba``).
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(features)[:, 1]
    return model.decision_function(features)


def _plot_confusion_matrix(matrix, model_label):
    """Draw the 2x2 confusion *matrix* as an annotated heat map."""
    plt.figure(figsize=(8, 6))
    plt.imshow(matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
    plt.title(f'Confusion Matrix of performance by {model_label} classifier')
    plt.colorbar()
    tick_marks = [0, 1]  # binary problem: 0 = ham (non-spam), 1 = spam
    plt.xticks(tick_marks, ['Predicted Ham(Non-spam)', 'Predicted Spam'], rotation=45)
    plt.yticks(tick_marks, ['Actual Ham(Non-spam)', 'Actual Spam'])
    plt.tight_layout()
    # White text on the dark cells, black text on the light ones.
    threshold = matrix.max() / 2.
    for i in range(2):
        for j in range(2):
            plt.text(j, i, format(matrix[i, j], 'd'),
                     ha="center", va="center",
                     color="white" if matrix[i, j] > threshold else "black")
    plt.show()


def _plot_precision_recall(true_labels, scores, model_label):
    """Plot the precision-recall curve obtained from continuous *scores*."""
    precision, recall, _ = precision_recall_curve(true_labels, scores)
    average_precision = average_precision_score(true_labels, scores)
    plt.figure()
    plt.step(recall, precision, color='b', where='post')
    # step='post' makes the shaded area follow the step line drawn above.
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='r')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(f'Precision-Recall curve of performance by {model_label} classifier (Average-precision = {average_precision:.2f})')
    plt.show()


def _evaluate_classifier(model, model_label, train_features, train_labels,
                         test_features, test_labels):
    """Fit *model*, print its test metrics and show both diagnostic plots.

    Parameters:
        model: unfitted scikit-learn classifier; it is fitted in place.
        model_label: human-readable model name used in the plot titles.
        train_features, train_labels: data the model is fitted on.
        test_features, test_labels: held-out data the model is scored on.

    Returns:
        A ``(predictions, accuracy)`` tuple for the test data.
    """
    model.fit(train_features, train_labels)
    predictions = model.predict(test_features)

    # labels=[0, 1] guarantees the 2x2 shape the plotting helper indexes,
    # even if one class happens to be missing from the predictions.
    matrix = confusion_matrix(test_labels, predictions, labels=[0, 1])
    print("Confusion Matrix:")
    print(matrix)

    accuracy = accuracy_score(test_labels, predictions)
    print("Accuracy:", accuracy)

    print("Classification Report:")
    print(classification_report(test_labels, predictions))

    _plot_confusion_matrix(matrix, model_label)

    # The curve must be built from continuous scores: feeding it the hard 0/1
    # predictions leaves a single threshold and a degenerate curve.
    _plot_precision_recall(
        test_labels, _positive_class_scores(model, test_features), model_label)

    return predictions, accuracy


# ### [A] Multinomial Naive Bayes Classifier

# In[20]:

classifier_1 = MultinomialNB()
y_pred_1, accuracy_1 = _evaluate_classifier(
    classifier_1, 'Multinomial Naive Bayes',
    x_train_new, y_train, x_test_new, y_test)

# ### [B] Logistic Regression Classifier

# In[27]:

classifier_2 = LogisticRegression()
y_pred_2, accuracy_2 = _evaluate_classifier(
    classifier_2, 'Logistic Regression',
    x_train_new, y_train, x_test_new, y_test)

# ### [C] Decision Tree Classifier

# In[34]:

classifier_3 = DecisionTreeClassifier()
y_pred_3, accuracy_3 = _evaluate_classifier(
    classifier_3, 'Decision Tree',
    x_train_new, y_train, x_test_new, y_test)

# ### [D] Random Forest Classifier

# In[41]:

classifier_4 = RandomForestClassifier()
y_pred_4, accuracy_4 = _evaluate_classifier(
    classifier_4, 'Random Forest',
    x_train_new, y_train, x_test_new, y_test)

# ### [E] Support Vector Machine Classifier

# In[48]:

svm = SVC()
classifier_5 = svm
y_pred_5, accuracy_5 = _evaluate_classifier(
    classifier_5, 'Support Vector Machine',
    x_train_new, y_train, x_test_new, y_test)

# #### Individual Accuracy scores of each model

# In[55]:

model_accuracies = {
    "MultiNominal Naive Bayes Classifier": accuracy_1,
    "Logistic Regression Classifer": accuracy_2,
    "Decision Tree Classifier": accuracy_3,
    "Random Forest Classfier": accuracy_4,
    "Support Vector Machine Classifier": accuracy_5
}

# In[56]:

# One numbered line per model, in the insertion order of the dict above.
for position, (model_name, model_accuracy) in enumerate(model_accuracies.items(), start=1):
    print(f"{position}. {model_name} : ", model_accuracy)

# In[57]:

best_model_name = max(model_accuracies, key=model_accuracies.get)
best_accuracy = model_accuracies[best_model_name]
print(f"Best Performance Model is {best_model_name} with an accuracy score of {best_accuracy}")

# ### 6. Saving the best performing Model

# In[58]:

# NOTE(review): the Multinomial Naive Bayes model is saved unconditionally,
# whatever best_model_name turned out to be, because model_eval() loads this
# exact file name. The fitted vectorizer `cv` is not persisted, so the saved
# model can only be used while `cv` is still in memory.
joblib.dump(classifier_1, 'multinomial_nb_model.joblib')

# ### 7.
# Evaluating the model performance

# In[59]:

def model_eval(message=None):
    """Classify one message as spam or ham and print the verdict.

    Parameters:
        message: text to classify. When omitted (``None``) the user is
            prompted for it on standard input.

    The classifier is loaded from 'multinomial_nb_model.joblib' and the
    message is vectorized with the module-level ``cv``, so both must be
    available when this is called. Nothing is returned; the result is
    printed.
    """
    loaded_model = joblib.load('multinomial_nb_model.joblib')

    if message is None:
        message = input("Enter a message: ")

    # The message is handed to the vectorizer untouched. `cv` was fitted on
    # the raw training text and does its own lower-casing and tokenizing;
    # stripping punctuation first would glue word pieces together
    # ("don't" -> "dont", "www.site.com" -> "wwwsitecom") into tokens the
    # vocabulary never saw, so inference would not match training.
    features = cv.transform([message])
    prediction = loaded_model.predict(features)[0]

    if prediction == 1:
        print("Spam detected.")
    else:
        print("The above entered message is a Non-spam (ham) message.")

# In[60]:

model_eval()

# In[61]:

model_eval()

# In[64]:

model_eval()

# In[67]:

model_eval()

# In[ ]: