from sklearn.feature_extraction.text import CountVectorizer
import nltk
import numpy as np
import pandas as pd
import re
nltk.download('stopwords')
corpus = ['The sky is blue and beautiful.',
'Love this blue and beautiful sky!',
'The quick brown fox jumps over the lazy dog.',
"A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
'I love green eggs, ham, sausages and bacon!',
'The brown fox is quick and the blue dog is lazy!',
'The sky is very blue and the sky is very beautiful today',
'The dog is lazy but the brown fox is quick!'
]
labels = ['weather', 'weather', 'animals', 'food', 'food', 'animals', 'weather', 'animals']
corpus = np.array(corpus)
df = pd.DataFrame({"text": corpus, "label": labels})
df.head()
| | text | label |
---|---|---|
0 | The sky is blue and beautiful. | weather |
1 | Love this blue and beautiful sky! | weather |
2 | The quick brown fox jumps over the lazy dog. | animals |
3 | A king's breakfast has sausages, ham, bacon, e... | food |
4 | I love green eggs, ham, sausages and bacon! | food |
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
wpt.tokenize(corpus[0])
['The', 'sky', 'is', 'blue', 'and', 'beautiful', '.']
def preprocess(doc):
    # lowercase and trim the document
    doc = doc.lower().strip()
    # tokenize, then drop English stop words
    tokens = wpt.tokenize(doc)
    tokens = [token for token in tokens if token not in stop_words]
    # rejoin into a single space-separated string
    return ' '.join(tokens)
normalize_corpus = np.vectorize(preprocess)
norm_corp = normalize_corpus(corpus)
norm_corp
array(['sky blue beautiful .', 'love blue beautiful sky !', 'quick brown fox jumps lazy dog .', "king ' breakfast sausages , ham , bacon , eggs , toast beans", 'love green eggs , ham , sausages bacon !', 'brown fox quick blue dog lazy !', 'sky blue sky beautiful today', 'dog lazy brown fox quick !'], dtype='<U60')
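Note that punctuation tokens ('.', '!', ',' and the stray "'") survive the stop-word filter, because WordPunctTokenizer emits them as separate tokens and they are not stop words. They never reach the vocabulary below only because CountVectorizer's default token_pattern keeps tokens of two or more word characters. If you prefer to remove them during normalization instead, a small variant of preprocess works — a sketch (the name preprocess_alpha is ours, not part of the original run):
def preprocess_alpha(doc):
    doc = doc.lower().strip()
    # keep only purely alphabetic tokens, which drops '.', '!', ',' and "'"
    tokens = [t for t in wpt.tokenize(doc) if t.isalpha() and t not in stop_words]
    return ' '.join(tokens)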
cv = CountVectorizer(min_df=0., max_df=1.)  # no document-frequency cutoffs: keep every term
cv_matrix = cv.fit_transform(norm_corp).toarray()  # fit_transform returns a sparse matrix; toarray() densifies it, fine for this tiny corpus
cv_matrix
array([[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]])
vocab = cv.get_feature_names()  # on scikit-learn >= 1.2 use get_feature_names_out() instead
vocab
['bacon', 'beans', 'beautiful', 'blue', 'breakfast', 'brown', 'dog', 'eggs', 'fox', 'green', 'ham', 'jumps', 'king', 'lazy', 'love', 'quick', 'sausages', 'sky', 'toast', 'today']
pd.DataFrame(cv_matrix, columns=vocab)
| | bacon | beans | beautiful | blue | breakfast | brown | dog | eggs | fox | green | ham | jumps | king | lazy | love | quick | sausages | sky | toast | today |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
3 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
5 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
6 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 1 |
7 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
cv.transform(['sky is good and beautiful beautiful today']).toarray()
array([[0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]])
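Only terms already in the fitted vocabulary are counted: 'good', 'is' and 'and' are simply ignored, while 'beautiful' is counted twice. A small check that maps the counts back to terms (illustrative, not part of the original run):
new_vec = cv.transform(['sky is good and beautiful beautiful today']).toarray()[0]
# pair each vocabulary term with its count and keep the non-zero entries
{term: count for term, count in zip(vocab, new_vec) if count > 0}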
ngram_range --> controls which n-gram sizes are extracted:
--> (1, 2) creates unigrams and bigrams
--> (2, 2) creates only bigrams
--> (1, 3) creates unigrams, bigrams and trigrams
A quick (1, 2) sketch follows; the cells after it use (2, 2) and then (1, 3).
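With ngram_range=(1, 2) the vocabulary mixes single words and word pairs — a quick sketch (cv12 is ours and was not run in the original notebook):
cv12 = CountVectorizer(ngram_range=(1, 2))
cv12_matrix = cv12.fit_transform(norm_corp)
# vocabulary now holds unigrams such as 'sky' alongside bigrams such as 'sky blue';
# it should contain 49 terms here: the 20 unigrams above plus the 29 bigrams shown below
len(cv12.vocabulary_)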
cv = CountVectorizer(ngram_range=(2, 2))
cv_matrix = cv.fit_transform(norm_corp).toarray()
pd.DataFrame(cv_matrix, columns=cv.get_feature_names())
| | bacon eggs | beautiful sky | beautiful today | blue beautiful | blue dog | blue sky | breakfast sausages | brown fox | dog lazy | eggs ham | eggs toast | fox jumps | fox quick | green eggs | ham bacon | ham sausages | jumps lazy | king breakfast | lazy brown | lazy dog | love blue | love green | quick blue | quick brown | sausages bacon | sausages ham | sky beautiful | sky blue | toast beans |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
5 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
cv1 = CountVectorizer(ngram_range=(1, 3))
cv1_matrix = cv1.fit_transform(norm_corp).toarray()
pd.DataFrame(cv1_matrix, columns=cv1.get_feature_names())
| | bacon | bacon eggs | bacon eggs toast | beans | beautiful | beautiful sky | beautiful today | blue | blue beautiful | blue beautiful sky | blue dog | blue dog lazy | blue sky | blue sky beautiful | breakfast | breakfast sausages | breakfast sausages ham | brown | brown fox | brown fox jumps | brown fox quick | dog | dog lazy | dog lazy brown | eggs | eggs ham | eggs ham sausages | eggs toast | eggs toast beans | fox | fox jumps | fox jumps lazy | fox quick | fox quick blue | green | green eggs | green eggs ham | ham | ham bacon | ham bacon eggs | ham sausages | ham sausages bacon | jumps | jumps lazy | jumps lazy dog | king | king breakfast | king breakfast sausages | lazy | lazy brown | lazy brown fox | lazy dog | love | love blue | love blue beautiful | love green | love green eggs | quick | quick blue | quick blue dog | quick brown | quick brown fox | sausages | sausages bacon | sausages ham | sausages ham bacon | sky | sky beautiful | sky beautiful today | sky blue | sky blue beautiful | sky blue sky | toast | toast beans | today |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 1 |
7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
from sklearn.feature_extraction.text import TfidfVectorizer
TfidfVectorizer builds each document vector from term frequency weighted by inverse document frequency (tf-idf); as with CountVectorizer, terms can be filtered by the document-frequency thresholds min_df and max_df.
tfidf = TfidfVectorizer()
tf_matrix = tfidf.fit_transform(norm_corp).toarray()
tf_matrix
array([[0., 0., 0.6009782, 0.52692542, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.6009782, 0., 0.],
       [0., 0., 0.49316188, 0.43239428, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.57150495, 0., 0., 0.49316188, 0., 0.],
       [0., 0., 0., 0., 0., 0.38036238, 0.38036238, 0., 0.38036238, 0., 0., 0.52594895, 0., 0.38036238, 0., 0.38036238, 0., 0., 0., 0.],
       [0.32116401, 0.38321492, 0., 0., 0.38321492, 0., 0., 0.32116401, 0., 0., 0.32116401, 0., 0.38321492, 0., 0., 0., 0.32116401, 0., 0.38321492, 0.],
       [0.39455357, 0., 0., 0., 0., 0., 0., 0.39455357, 0., 0.47078381, 0.39455357, 0., 0., 0., 0.39455357, 0., 0.39455357, 0., 0., 0.],
       [0., 0., 0., 0.3650479, 0., 0.41635082, 0.41635082, 0., 0.41635082, 0., 0., 0., 0., 0.41635082, 0., 0.41635082, 0., 0., 0., 0.],
       [0., 0., 0.36082605, 0.31636491, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.72165209, 0., 0.49893493],
       [0., 0., 0., 0., 0., 0.4472136, 0.4472136, 0., 0.4472136, 0., 0., 0., 0., 0.4472136, 0., 0.4472136, 0., 0., 0., 0.]])
pd.DataFrame(tf_matrix, columns=tfidf.get_feature_names())
| | bacon | beans | beautiful | blue | breakfast | brown | dog | eggs | fox | green | ham | jumps | king | lazy | love | quick | sausages | sky | toast | today |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000000 | 0.000000 | 0.600978 | 0.526925 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.600978 | 0.000000 | 0.000000 |
1 | 0.000000 | 0.000000 | 0.493162 | 0.432394 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.571505 | 0.000000 | 0.000000 | 0.493162 | 0.000000 | 0.000000 |
2 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.380362 | 0.380362 | 0.000000 | 0.380362 | 0.000000 | 0.000000 | 0.525949 | 0.000000 | 0.380362 | 0.000000 | 0.380362 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
3 | 0.321164 | 0.383215 | 0.000000 | 0.000000 | 0.383215 | 0.000000 | 0.000000 | 0.321164 | 0.000000 | 0.000000 | 0.321164 | 0.000000 | 0.383215 | 0.000000 | 0.000000 | 0.000000 | 0.321164 | 0.000000 | 0.383215 | 0.000000 |
4 | 0.394554 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.394554 | 0.000000 | 0.470784 | 0.394554 | 0.000000 | 0.000000 | 0.000000 | 0.394554 | 0.000000 | 0.394554 | 0.000000 | 0.000000 | 0.000000 |
5 | 0.000000 | 0.000000 | 0.000000 | 0.365048 | 0.000000 | 0.416351 | 0.416351 | 0.000000 | 0.416351 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.416351 | 0.000000 | 0.416351 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
6 | 0.000000 | 0.000000 | 0.360826 | 0.316365 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.721652 | 0.000000 | 0.498935 |
7 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.447214 | 0.447214 | 0.000000 | 0.447214 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.447214 | 0.000000 | 0.447214 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
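The weights above can be reproduced by hand: with its default settings, TfidfVectorizer multiplies each raw term count by a smoothed idf, ln((1 + n) / (1 + df)) + 1, and then L2-normalizes every row. A minimal check for document 0 ('sky blue beautiful'), where 'sky' and 'beautiful' occur in 3 of the 8 documents and 'blue' in 4:
n_docs = 8
idf = lambda df_t: np.log((1 + n_docs) / (1 + df_t)) + 1
raw = np.array([idf(3), idf(4), idf(3)])   # tf is 1 for each of 'sky', 'blue', 'beautiful' in document 0
raw / np.linalg.norm(raw)                  # ≈ [0.6010, 0.5269, 0.6010], matching row 0 above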
len(tfidf.get_feature_names()), len(cv.get_feature_names())  # tfidf holds 20 unigrams; cv was last fit with ngram_range=(2, 2), hence 29 bigrams
(20, 29)