Here we will walk through the stemming and lemmatization procedure for NLP. We will also implement NB classifier as well SVC and Random Forest Classifier to detect spam emails
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import re
from nltk.tokenize import word_tokenize as wt
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
spam = pd.read_csv('spam.csv', encoding='latin-1')
spam.head(10)
v1 | v2 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | |
---|---|---|---|---|---|
0 | ham | Go until jurong point, crazy.. Available only ... | NaN | NaN | NaN |
1 | ham | Ok lar... Joking wif u oni... | NaN | NaN | NaN |
2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | NaN | NaN | NaN |
3 | ham | U dun say so early hor... U c already then say... | NaN | NaN | NaN |
4 | ham | Nah I don't think he goes to usf, he lives aro... | NaN | NaN | NaN |
5 | spam | FreeMsg Hey there darling it's been 3 week's n... | NaN | NaN | NaN |
6 | ham | Even my brother is not like to speak with me. ... | NaN | NaN | NaN |
7 | ham | As per your request 'Melle Melle (Oru Minnamin... | NaN | NaN | NaN |
8 | spam | WINNER!! As a valued network customer you have... | NaN | NaN | NaN |
9 | spam | Had your mobile 11 months or more? U R entitle... | NaN | NaN | NaN |
spam = spam.filter(['v1','v2'], axis=1)
spam.columns = ['label', 'text']
spam
label | text | |
---|---|---|
0 | ham | Go until jurong point, crazy.. Available only ... |
1 | ham | Ok lar... Joking wif u oni... |
2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... |
3 | ham | U dun say so early hor... U c already then say... |
4 | ham | Nah I don't think he goes to usf, he lives aro... |
... | ... | ... |
5567 | spam | This is the 2nd time we have tried 2 contact u... |
5568 | ham | Will Ì_ b going to esplanade fr home? |
5569 | ham | Pity, * was in mood for that. So...any other s... |
5570 | ham | The guy did some bitching but I acted like i'd... |
5571 | ham | Rofl. Its true to its name |
5572 rows × 2 columns
spam_data = spam[spam['label'] == 'spam']
real_data = spam[spam['label'] == 'ham']
all_data_stem = []
spam_data_stem = []
real_data_stem = []
stemmer = PorterStemmer()
for i in range(spam.shape[0]):
sms = spam.iloc[i, 1]
sms = re.sub('[^A-Za-z]', ' ', sms)
sms = sms.lower()
tokenized_sms = wt(sms)
sms_processed = []
for word in tokenized_sms:
if word not in set(stopwords.words('english')):
sms_processed.append(stemmer.stem(word))
sms_text = " ".join(sms_processed)
all_data_stem.append(sms_text)
for i in range(spam_data.shape[0]):
sms = spam_data.iloc[i, 1]
sms = re.sub('[^A-Za-z]', ' ', sms)
sms = sms.lower()
tokenized_sms = wt(sms)
sms_processed = []
for word in tokenized_sms:
if word not in set(stopwords.words('english')):
sms_processed.append(stemmer.stem(word))
sms_text = " ".join(sms_processed)
spam_data_stem.append(sms_text)
for i in range(real_data.shape[0]):
sms = real_data.iloc[i, 1]
sms = re.sub('[^A-Za-z]', ' ', sms)
sms = sms.lower()
tokenized_sms = wt(sms)
sms_processed = []
for word in tokenized_sms:
if word not in set(stopwords.words('english')):
sms_processed.append(stemmer.stem(word))
sms_text = " ".join(sms_processed)
real_data_stem.append(sms_text)
all_data_stem[0]
'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'
all_data_lemma = []
spam_data_lemma = []
real_data_lemma = []
lemmatizer = WordNetLemmatizer()
for i in range(spam.shape[0]):
sms = spam.iloc[i, 1]
sms = re.sub('[^A-Za-z]', ' ', sms)
sms = sms.lower()
tokenized_sms = wt(sms)
sms_processed = []
for word in tokenized_sms:
if word not in set(stopwords.words('english')):
sms_processed.append(lemmatizer.lemmatize(word))
sms_text = " ".join(sms_processed)
all_data_lemma.append(sms_text)
for i in range(spam_data.shape[0]):
sms = spam_data.iloc[i, 1]
sms = re.sub('[^A-Za-z]', ' ', sms)
sms = sms.lower()
tokenized_sms = wt(sms)
sms_processed = []
for word in tokenized_sms:
if word not in set(stopwords.words('english')):
sms_processed.append(lemmatizer.lemmatize(word))
sms_text = " ".join(sms_processed)
spam_data_lemma.append(sms_text)
for i in range(real_data.shape[0]):
sms = real_data.iloc[i, 1]
sms = re.sub('[^A-Za-z]', ' ', sms)
sms = sms.lower()
tokenized_sms = wt(sms)
sms_processed = []
for word in tokenized_sms:
if word not in set(stopwords.words('english')):
sms_processed.append(lemmatizer.lemmatize(word))
sms_text = " ".join(sms_processed)
real_data_lemma.append(sms_text)
all_data_lemma[0]
'go jurong point crazy available bugis n great world la e buffet cine got amore wat'
spam.drop_duplicates(inplace = True)
spam
label | text | |
---|---|---|
0 | ham | Go until jurong point, crazy.. Available only ... |
1 | ham | Ok lar... Joking wif u oni... |
2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... |
3 | ham | U dun say so early hor... U c already then say... |
4 | ham | Nah I don't think he goes to usf, he lives aro... |
... | ... | ... |
5567 | spam | This is the 2nd time we have tried 2 contact u... |
5568 | ham | Will Ì_ b going to esplanade fr home? |
5569 | ham | Pity, * was in mood for that. So...any other s... |
5570 | ham | The guy did some bitching but I acted like i'd... |
5571 | ham | Rofl. Its true to its name |
5169 rows × 2 columns
spam.isnull().sum()
label 0 text 0 dtype: int64
spam['num_label'] = spam['label'].map({'ham': 0, 'spam': 1})
spam.head()
label | text | num_label | |
---|---|---|---|
0 | ham | Go until jurong point, crazy.. Available only ... | 0 |
1 | ham | Ok lar... Joking wif u oni... | 0 |
2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | 1 |
3 | ham | U dun say so early hor... U c already then say... | 0 |
4 | ham | Nah I don't think he goes to usf, he lives aro... | 0 |
spam_words = ' '.join(list(spam[spam['num_label'] == 1]['text']))
spam_wc = WordCloud(width = 600,height = 512).generate(spam_words)
plt.figure(figsize = (12, 8), facecolor = 'k')
plt.imshow(spam_wc)
plt.axis('off')
plt.tight_layout(pad = 0)
plt.show()
real_words = ' '.join(list(spam[spam['num_label'] == 0]['text']))
real_wc = WordCloud(width = 600,height = 512).generate(real_words)
plt.figure(figsize = (12, 8), facecolor = 'k')
plt.imshow(real_wc)
plt.axis('off')
plt.tight_layout(pad = 0)
plt.show()
from nltk import FreqDist
spam_token = nltk.tokenize.word_tokenize(spam_words)
spam_freq = FreqDist(spam_token)
spam_freq
FreqDist({'.': 1004, 'to': 608, '!': 542, ',': 371, 'a': 358, 'you': 189, 'call': 187, 'your': 187, 'or': 185, '&': 178, ...})
spam_freq.most_common(5)
[('.', 1004), ('to', 608), ('!', 542), (',', 371), ('a', 358)]
FreqDist(spam_data_lemma).most_common(5)
[('private account statement show un redeemed point call identifier code expires', 9), ('u secret admirer looking make contact u find r reveal think ur special call', 6), ('urgent trying contact u today draw show prize guaranteed call land line claim valid hr', 5), ('please call customer service representative freephone pm guaranteed cash prize', 4), ('free st week nokia tone ur mob every week txt nokia get txting tell ur mate www getzed co uk pobox w wq norm p tone', 4)]
FreqDist(real_data_lemma).most_common(5)
[('sorry call later', 30), ('ok', 20), ('cant pick phone right pls send message', 12), ('', 8), ('okie', 7)]
text1 = nltk.Text(spam_token)
text1.dispersion_plot(['free','private','account','contact'])
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
matrix = CountVectorizer(max_features=1000)
X = matrix.fit_transform(all_data_lemma).toarray()
y = spam.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
GaussianNB()
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rep = classification_report(y_test, y_pred)
print(rep)
precision recall f1-score support ham 0.98 0.77 0.86 968 spam 0.38 0.92 0.53 147 accuracy 0.79 1115 macro avg 0.68 0.84 0.70 1115 weighted avg 0.90 0.79 0.82 1115
cm
array([[744, 224], [ 12, 135]], dtype=int64)
from sklearn.svm import SVC
classifier = SVC()
classifier.fit(X_train, y_train)
SVC()
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rep = classification_report(y_test, y_pred)
print(rep)
precision recall f1-score support ham 0.98 1.00 0.99 968 spam 0.98 0.86 0.92 147 accuracy 0.98 1115 macro avg 0.98 0.93 0.95 1115 weighted avg 0.98 0.98 0.98 1115
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)
RandomForestClassifier()
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rep = classification_report(y_test, y_pred)
print(rep)
precision recall f1-score support ham 0.98 1.00 0.99 968 spam 1.00 0.90 0.95 147 accuracy 0.99 1115 macro avg 0.99 0.95 0.97 1115 weighted avg 0.99 0.99 0.99 1115