from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
import numpy as np
import pandas as pd

nltk.download('stopwords')

corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
          'I love green eggs, ham, sausages and bacon!',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!']
labels = ['weather', 'weather', 'animals', 'food', 'food',
          'animals', 'weather', 'animals']

corpus = np.array(corpus)
df = pd.DataFrame({'text': corpus, 'label': labels})
df.head()

# Tokenize on word/punctuation boundaries and load the English stop-word list.
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
wpt.tokenize(corpus[0])

def preprocess(doc):
    """Lowercase, tokenize, and strip stop words from a single document."""
    doc = doc.lower().strip()
    tokens = wpt.tokenize(doc)
    filtered = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered)

normalize_corpus = np.vectorize(preprocess)
norm_corp = normalize_corpus(corpus)
norm_corp

# Bag of words: unigram term counts over the normalized corpus.
# min_df=0.0 / max_df=1.0 keep every term, however rare or common.
cv = CountVectorizer(min_df=0.0, max_df=1.0)
cv_matrix = cv.fit_transform(norm_corp).toarray()
cv_matrix

# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
vocab = cv.get_feature_names_out()
vocab
pd.DataFrame(cv_matrix, columns=vocab)

# transform() maps new text onto the *fitted* vocabulary: out-of-vocabulary
# tokens like 'good', 'is', and 'and' are silently dropped, and 'beautiful'
# is counted twice.
cv.transform(['sky is good and beautiful beautiful today']).toarray()

# Bigram-only bag of words.
cv = CountVectorizer(ngram_range=(2, 2))
cv_matrix = cv.fit_transform(norm_corp).toarray()
pd.DataFrame(cv_matrix, columns=cv.get_feature_names_out())

# Unigrams, bigrams, and trigrams together.
cv1 = CountVectorizer(ngram_range=(1, 3))
cv1_matrix = cv1.fit_transform(norm_corp).toarray()
pd.DataFrame(cv1_matrix, columns=cv1.get_feature_names_out())

# TF-IDF weighting (unigrams by default).
tfidf = TfidfVectorizer()
tf_matrix = tfidf.fit_transform(norm_corp).toarray()
tf_matrix
pd.DataFrame(tf_matrix, columns=tfidf.get_feature_names_out())

# Note: cv was reassigned above and now holds the bigram vectorizer, so this
# compares the unigram tf-idf vocabulary size against the bigram vocabulary size.
len(tfidf.get_feature_names_out()), len(cv.get_feature_names_out())
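
# --- A minimal sketch (not part of the original) reproducing TfidfVectorizer's
# default weighting by hand, reusing norm_corp and tf_matrix from above. With
# scikit-learn's defaults (smooth_idf=True, norm='l2', sublinear_tf=False),
# each entry is tf * (ln((1 + n) / (1 + df)) + 1), where n is the number of
# documents and df the document frequency of the term, followed by
# L2-normalizing each row. The helper names below (cv_uni, tf_counts,
# doc_freq, tfidf_manual) are illustrative, not from the original.
cv_uni = CountVectorizer()
tf_counts = cv_uni.fit_transform(norm_corp).toarray().astype(float)

n_docs = tf_counts.shape[0]                      # number of documents
doc_freq = (tf_counts > 0).sum(axis=0)           # documents containing each term
idf = np.log((1 + n_docs) / (1 + doc_freq)) + 1  # smoothed idf

tfidf_manual = tf_counts * idf
tfidf_manual /= np.linalg.norm(tfidf_manual, axis=1, keepdims=True)

np.allclose(tfidf_manual, tf_matrix)             # True: matches TfidfVectorizer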
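
# --- A second sketch, under the same default-parameter assumption: a
# TfidfVectorizer is equivalent to a CountVectorizer followed by a
# TfidfTransformer applied to the count matrix.
from sklearn.feature_extraction.text import TfidfTransformer

tt = TfidfTransformer()                          # norm='l2', smooth_idf=True
tt_matrix = tt.fit_transform(cv_uni.fit_transform(norm_corp)).toarray()
np.allclose(tt_matrix, tf_matrix)                # True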