from sklearn.feature_extraction.text import CountVectorizer
import nltk
import numpy as np
import pandas as pd
import re

# Download the NLTK English stopword list (a one-time step)
nltk.download('stopwords')

corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
          'I love green eggs, ham, sausages and bacon!',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!'    
]

labels = ['weather', 'weather', 'animals', 'food', 'food', 'animals', 'weather', 'animals']

# Wrap the corpus in an array and pair each document with its label
corpus = np.array(corpus)
df = pd.DataFrame({"text": corpus, "label": labels})

df.head()

# Word-and-punctuation tokenizer plus the English stopword list
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
wpt.tokenize(corpus[0])

def preprocess(doc):
    # Lowercase, trim, and strip everything except letters and whitespace
    doc = doc.lower().strip()
    doc = re.sub(r'[^a-z\s]', '', doc)
    # Tokenize and drop English stopwords
    tokens = wpt.tokenize(doc)
    filtered = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered)
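
# Quick sanity check of the cleaner on a single document (illustrative)
preprocess(corpus[0])  # expected: 'sky blue beautiful'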

# np.vectorize lets the cleaner map over the whole corpus array
normalize_corpus = np.vectorize(preprocess)

norm_corp = normalize_corpus(corpus)
norm_corp

# Bag of words: min_df=0. and max_df=1. disable document-frequency filtering,
# so every term in the normalized corpus is kept
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(norm_corp).toarray()

cv_matrix

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
vocab = cv.get_feature_names_out()
vocab

pd.DataFrame(cv_matrix, columns=vocab)

# Transform unseen text against the fitted vocabulary; tokens not seen during
# fitting ('is', 'good', 'and') are silently ignored
cv.transform(['sky is good and beautiful beautiful today']).toarray()
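
# Illustrative check: cv.vocabulary_ maps each fitted term to its column index,
# so we can list which tokens of the new sentence fall outside the vocabulary
[t for t in 'sky is good and beautiful beautiful today'.split() if t not in cv.vocabulary_]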

# Bigram bag of words: each feature is a pair of adjacent tokens
cv = CountVectorizer(ngram_range=(2, 2))

cv_matrix = cv.fit_transform(norm_corp).toarray()

pd.DataFrame(cv_matrix, columns=cv.get_feature_names_out())

# Unigrams, bigrams, and trigrams together
cv1 = CountVectorizer(ngram_range=(1, 3))
cv1_matrix = cv1.fit_transform(norm_corp).toarray()
pd.DataFrame(cv1_matrix, columns=cv1.get_feature_names_out())
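
# Illustrative: how the (1, 3) vocabulary splits by n-gram length
# (grouping by token count is just a quick trick, not part of the API)
from collections import Counter
Counter(len(term.split()) for term in cv1.get_feature_names_out())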

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF re-weights the raw counts: terms frequent within a document score
# higher, terms common across the corpus are down-weighted; rows are
# L2-normalized by default
tfidf = TfidfVectorizer()

tf_matrix = tfidf.fit_transform(norm_corp).toarray()
tf_matrix

pd.DataFrame(tf_matrix, columns=tfidf.get_feature_names_out())
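
# Sanity check (illustrative): TfidfVectorizer is documented as equivalent to
# CountVectorizer followed by TfidfTransformer, given matching defaults
from sklearn.feature_extraction.text import TfidfTransformer

counts = CountVectorizer().fit_transform(norm_corp)
manual_tf_matrix = TfidfTransformer().fit_transform(counts).toarray()
np.allclose(manual_tf_matrix, tf_matrix)  # expected: True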

# Vocabulary sizes differ because tfidf was fit on unigrams while cv was last
# re-fit on bigrams
len(tfidf.get_feature_names_out()), len(cv.get_feature_names_out())