import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
# The four newsgroups used throughout this notebook.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
# Fetch the training split for those categories; shuffling uses a fixed seed
# so the document order is repeatable.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=0,
)
# List the fields available on the loaded dataset object.
twenty_train.keys()
['DESCR', 'data', 'target', 'target_names', 'filenames']
# Category names; the integer labels in `target` index into this list.
twenty_train.target_names
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
# Integer category label of each training document.
twenty_train.target
array([3, 3, 2, ..., 1, 3, 3], dtype=int64)
# Number of file names in the training split.
len(twenty_train.filenames)
2257
# Number of document texts in the training split.
len(twenty_train.data)
2257
The first document, shown below, belongs to the religion category (soc.religion.christian).
# Display the raw text of the first training document.
twenty_train.data[0]
u'From: reedr@cgsvax.claremont.edu\nSubject: Re: DID HE REALLY RISE???\nOrganization: The Claremont Graduate School\nLines: 29\n\nIn article <Apr.9.01.11.16.1993.16937@athos.rutgers.edu>, emery@tc.fluke.COM (John Emery) writes:\n> The one single historic event that has had the biggest impact on the\n> world over the centuries is the resurrection of Jesus. At the same\n> time, it is one of the most hotly contested topics....\n> \n> Did Jesus Christ really rise from the dead? Since the eyewitnesses\n> are no longer living, we have only their written accounts. ...\n> ... Because of the magnitude of significance\n> involved here, either the resurrection is the greatest event in the\n> history of man or the greatest deception played on man.\n> [massive amounts of data deleted]\n\nJohn, \n\nWhile I will not take the time to rebut you point by point, I will suggest\nthree current works which I think will be helpful in your quest to answer\nthis question. John Dominic Crossan (Professor of Religion at De Paul Univ)-\n_The Cross That Spoke_ Harper and Row Pub. 1988, Also his latest work \n_The Historical Jesus - The Life of A Mediterranean Jewish Peasant_ Harper\nand Row Pub. 1991, Also two works of Burton Mack (Professor of New Testament\nat the Claremont Graduate School) _A Myth of Innocence_ Fortress Press 1988,\nAnd his latest book _The Lost Gospel: The Book of Q and Christian Origins_\nHarper and Row, 1992. You might start with Mack\'s book on Q and then \nexamine the others afterward. However I think that once you do that you will\nsee that your "evidence" is not as sturdy as you\'d like. Most of the tired\narguements you stated, assume eyewitness accounts, such is not the case. But\nAnyway look at Mack and Crossan and then get back to us.\n\nrandy\n'
The WordCloud module can build the word cloud of a single document directly by passing its text to a function, or build the word cloud of many documents by passing word frequencies that we compute ourselves.
from wordcloud import WordCloud, STOPWORDS
# Build a private stop-word set (the defaults plus "will") so the shared
# STOPWORDS constant is not modified.
stopwords = STOPWORDS | {"will"}
# Let WordCloud tokenize and count the raw text of the first document itself.
cloud_builder = WordCloud(stopwords=stopwords, max_font_size=50, max_words=30)
wordcloud = cloud_builder.generate(twenty_train.data[0])
plt.imshow(wordcloud)
# Trailing semicolon keeps the notebook from echoing the return value.
plt.axis("off");
The word frequencies represented in the word cloud are listed below.
The top words are Harper, John, Claremont, Jesus, Row, _The, work, and book.
# Words kept for the cloud, each paired with its relative weight.
wordcloud.words_
[(u'Harper', 1.0), (u'John', 1.0), (u'Claremont', 1.0), (u'Jesus', 1.0), (u'Row', 1.0), (u'_The', 1.0), (u'work', 1.0), (u'book', 1.0), (u'Graduate', 0.6666666666666666), (u'edu', 0.6666666666666666), (u'rise', 0.6666666666666666), (u'event', 0.6666666666666666), (u'really', 0.6666666666666666), (u'Pub', 0.6666666666666666), (u'School', 0.6666666666666666), (u'Professor', 0.6666666666666666), (u'resurrection', 0.6666666666666666), (u'think', 0.6666666666666666), (u'point', 0.6666666666666666), (u'one', 0.6666666666666666), (u'Also', 0.6666666666666666), (u'accounts', 0.6666666666666666), (u'Mack', 0.6666666666666666), (u'Crossan', 0.6666666666666666), (u'man', 0.6666666666666666), (u'emery', 0.6666666666666666), (u'greatest', 0.6666666666666666), (u'time', 0.6666666666666666), (u'latest', 0.6666666666666666), (u'played', 0.3333333333333333)]
There are three methods to compute the word frequency: raw counts, term frequency (tf), and tf-idf.
Tf-idf is originally a term weighting scheme developed for information retrieval (as a ranking function for search engines results), that has also found good use in document classification and clustering.
The WordCloud module used above, which builds a word cloud directly from the text of one document, automatically uses the count method to compute the word frequency.
Below, let's look at how to compute the three types of frequency from the same documents and make a word cloud from each.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Count-based frequencies for the first two training documents.
count_vect = CountVectorizer(stop_words='english', min_df=0)
text_counts = count_vect.fit_transform(twenty_train.data[:2])
# Sum over the document axis: one total per vocabulary term.
count_totals = text_counts.toarray().sum(axis=0)
# Pair every term with its total so WordCloud can size words from them.
fre = zip(count_vect.get_feature_names(), count_totals)
cloud_builder = WordCloud(max_font_size=50, max_words=30)
wordcloud = cloud_builder.generate_from_frequencies(fre)
plt.imshow(wordcloud)
plt.axis("off");
# Echo the (term, count) pairs in the notebook.
fre
[(u'00', 1), (u'01', 1), (u'03', 1), (u'10101', 1), (u'11', 2), (u'15441', 1), (u'16', 1), (u'16937', 1), (u'17', 1), (u'1988', 2), (u'1991', 1), (u'1992', 1), (u'1993', 3), (u'27', 1), (u'29', 1), (u'30', 1), (u'35', 2), (u'_a', 1), (u'_the', 3), (u'acad3', 1), (u'accounts', 2), (u'acts', 1), (u'afterward', 1), (u'alaska', 1), (u'amounts', 1), (u'answer', 1), (u'apr', 2), (u'arguements', 1), (u'article', 3), (u'ask', 1), (u'asking', 1), (u'assume', 1), (u'athena', 1), (u'athens', 1), (u'athos', 1), (u'behavior', 1), (u'beliefs', 1), (u'believe', 1), (u'believers', 1), (u'biggest', 1), (u'book', 3), (u'burton', 1), (u'careful', 1), (u'case', 1), (u'celebate', 1), (u'centuries', 1), (u'cgsvax', 1), (u'change', 2), (u'changes', 1), (u'christ', 1), (u'christian', 1), (u'christianity', 1), (u'christians', 1), (u'christiansen', 1), (u'claremont', 3), (u'com', 2), (u'come', 2), (u'commits', 1), (u'community', 2), (u'congregations', 1), (u'consider', 1), (u'considered', 1), (u'contested', 1), (u'cost', 1), (u'cross', 1), (u'crossan', 2), (u'cs', 1), (u'current', 1), (u'dangerous', 1), (u'data', 1), (u'dead', 1), (u'deception', 1), (u'deleted', 1), (u'depart', 1), (u'did', 2), (u'doesn', 1), (u'dominic', 1), (u'don', 1), (u'edu', 6), (u'emery', 2), (u'endorsement', 1), (u'event', 2), (u'evidence', 1), (u'examine', 1), (u'eyewitness', 1), (u'eyewitnesses', 1), (u'faith', 1), (u'filled', 1), (u'fluke', 1), (u'forgive', 1), (u'forth', 1), (u'fortress', 1), (u'fsspr', 1), (u'gay', 1), (u'geneva', 2), (u'georgia', 1), (u'gift', 1), (u'gifts', 1), (u'given', 3), (u'god', 3), (u'gospel', 1), (u'graduate', 2), (u'greatest', 2), (u'hand', 1), (u'happen', 2), (u'harper', 3), (u'hebrews', 1), (u'helpful', 1), (u'historic', 1), (u'historical', 1), (u'history', 1), (u'holy', 2), (u'homosexual', 5), (u'homosexuality', 2), (u'homosexuals', 1), (u'hotly', 1), (u'hudson', 3), (u'ignorant', 2), (u'impact', 1), (u'iniquity', 1), (u'innocence_', 1), (u'involved', 1), (u'issues', 1), 
(u'jesus', 6), (u'jewish', 1), (u'john', 3), (u'jr', 1), (u'just', 1), (u'know', 1), (u'latest', 2), (u'life', 1), (u'lift', 1), (u'like', 1), (u'lines', 2), (u'link', 1), (u'living', 1), (u'lois', 1), (u'loisc', 1), (u'longer', 1), (u'look', 1), (u'lord', 3), (u'lost', 1), (u'lot', 1), (u'lusts', 1), (u'mack', 3), (u'magnitude', 1), (u'man', 2), (u'massive', 1), (u'mediterranean', 1), (u'microsoft', 1), (u'miraculous', 1), (u'morality', 1), (u'morally', 1), (u'myth', 1), (u'nature', 3), (u'new', 1), (u'okay', 1), (u'ones', 1), (u'organization', 2), (u'origins_', 1), (u'partaken', 1), (u'paul', 2), (u'peasant_', 1), (u'people', 3), (u'perfect', 1), (u'played', 1), (u'point', 2), (u'powers', 1), (u'practicing', 2), (u'press', 1), (u'proclaiming', 1), (u'professor', 2), (u'pub', 2), (u'quest', 1), (u'question', 1), (u'randy', 1), (u'really', 2), (u'rebut', 1), (u'reedr', 1), (u'religion', 1), (u'repent', 1), (u'resurrection', 2), (u'rise', 2), (u'row', 3), (u'rutgers', 3), (u'said', 1), (u'sake', 1), (u'say', 1), (u'saying', 1), (u'school', 2), (u'seen', 1), (u'set', 1), (u'sexual', 1), (u'significance', 1), (u'similar', 1), (u'sin', 3), (u'single', 1), (u'soul', 1), (u'spirit', 5), (u'spoke_', 1), (u'start', 1), (u'stated', 1), (u'sturdy', 1), (u'subject', 2), (u'submit', 1), (u'suffered', 1), (u'suggest', 1), (u'tc', 1), (u'tell', 1), (u'testament', 1), (u'think', 4), (u'time', 2), (u'tired', 1), (u'topics', 1), (u'uga', 1), (u'univ', 1), (u'university', 1), (u'violates', 1), (u'visit', 1), (u'weak', 1), (u'work', 1), (u'workers', 1), (u'working', 1), (u'works', 3), (u'world', 2), (u'worth', 1), (u'writes', 2), (u'written', 1), (u'wrote', 1)]
# Words kept for the count-based cloud, each paired with its relative weight.
wordcloud.words_
[(u'edu', 1.0), (u'jesus', 1.0), (u'homosexual', 0.83333333333333337), (u'spirit', 0.83333333333333337), (u'think', 0.66666666666666663), (u'1993', 0.5), (u'_the', 0.5), (u'article', 0.5), (u'book', 0.5), (u'claremont', 0.5), (u'given', 0.5), (u'god', 0.5), (u'harper', 0.5), (u'hudson', 0.5), (u'john', 0.5), (u'lord', 0.5), (u'mack', 0.5), (u'nature', 0.5), (u'people', 0.5), (u'row', 0.5), (u'rutgers', 0.5), (u'sin', 0.5), (u'works', 0.5), (u'11', 0.33333333333333331), (u'1988', 0.33333333333333331), (u'35', 0.33333333333333331), (u'accounts', 0.33333333333333331), (u'apr', 0.33333333333333331), (u'change', 0.33333333333333331), (u'com', 0.33333333333333331)]
Now, we compute the count frequency for two religion documents.
The top words are edu, jesus, homosexual, spirit.
# Term-frequency (tf) weights for the first two training documents:
# the tf-idf vectorizer is used with its idf weighting switched off.
tf_vect = TfidfVectorizer(stop_words='english', min_df=0, use_idf=False)
text_tf = tf_vect.fit_transform(twenty_train.data[:2])
# Sum over the document axis: one total weight per vocabulary term.
tf_totals = text_tf.toarray().sum(axis=0)
fre = zip(tf_vect.get_feature_names(), tf_totals)
cloud_builder = WordCloud(max_font_size=50, max_words=30)
wordcloud = cloud_builder.generate_from_frequencies(fre)
plt.imshow(wordcloud)
plt.axis("off");
# Words kept for the tf-based cloud, with their relative weights.
wordcloud.words_
[(u'jesus', 1.0), (u'edu', 0.98549321657988942), (u'homosexual', 0.79706637478305697), (u'spirit', 0.79706637478305697), (u'think', 0.66666666666666663), (u'_the', 0.52176017513016582), (u'book', 0.52176017513016582), (u'claremont', 0.52176017513016582), (u'harper', 0.52176017513016582), (u'john', 0.52176017513016582), (u'mack', 0.52176017513016582), (u'row', 0.52176017513016582), (u'works', 0.50725339171005535), (u'1993', 0.49274660828994471), (u'article', 0.49274660828994471), (u'rutgers', 0.49274660828994471), (u'given', 0.47823982486983418), (u'god', 0.47823982486983418), (u'hudson', 0.47823982486983418), (u'lord', 0.47823982486983418), (u'nature', 0.47823982486983418), (u'people', 0.47823982486983418), (u'sin', 0.47823982486983418), (u'1988', 0.34784011675344384), (u'accounts', 0.34784011675344384), (u'crossan', 0.34784011675344384), (u'did', 0.34784011675344384), (u'emery', 0.34784011675344384), (u'event', 0.34784011675344384), (u'graduate', 0.34784011675344384)]
Now, we compute the tf frequency for two religion documents.
The top words are jesus, edu, homosexual, spirit.
# tf-idf weights for the first two training documents.
tfidf_vect = TfidfVectorizer(stop_words='english', min_df=0)
text_tfidf = tfidf_vect.fit_transform(twenty_train.data[:2])
# Sum over the document axis: one total weight per vocabulary term.
tfidf_totals = text_tfidf.toarray().sum(axis=0)
fre = zip(tfidf_vect.get_feature_names(), tfidf_totals)
cloud_builder = WordCloud(max_font_size=50, max_words=30)
wordcloud = cloud_builder.generate_from_frequencies(fre)
plt.imshow(wordcloud)
plt.axis("off");
# Words kept for the tf-idf-based cloud, with their relative weights.
wordcloud.words_
[(u'homosexual', 1.0), (u'spirit', 1.0), (u'jesus', 0.88772483552726622), (u'edu', 0.87641985146667689), (u'_the', 0.6476662819346316), (u'book', 0.6476662819346316), (u'claremont', 0.6476662819346316), (u'harper', 0.6476662819346316), (u'john', 0.6476662819346316), (u'mack', 0.6476662819346316), (u'row', 0.6476662819346316), (u'given', 0.59999999999999998), (u'god', 0.59999999999999998), (u'hudson', 0.59999999999999998), (u'lord', 0.59999999999999998), (u'nature', 0.59999999999999998), (u'people', 0.59999999999999998), (u'sin', 0.59999999999999998), (u'think', 0.59181655701817748), (u'works', 0.44951490979392777), (u'1993', 0.43820992573333845), (u'article', 0.43820992573333845), (u'rutgers', 0.43820992573333845), (u'1988', 0.43177752128975438), (u'accounts', 0.43177752128975438), (u'crossan', 0.43177752128975438), (u'did', 0.43177752128975438), (u'emery', 0.43177752128975438), (u'event', 0.43177752128975438), (u'graduate', 0.43177752128975438)]
Now, we compute the tf-idf frequency for two religion documents.
The top words are homosexual, spirit, jesus, edu.
Different methods of computing the word frequency give a different ordering of the top words.
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
# Count features for the whole training split. `count_vect` stays fitted so
# that new documents can be transformed with the same vocabulary later.
count_vect = CountVectorizer(stop_words='english', min_df=0)
x_train_counts = count_vect.fit_transform(twenty_train.data)
# (number of documents, vocabulary size)
x_train_counts.shape
(2257, 35483)
# Term-frequency features for the whole training split (idf weighting off).
tf_feature_vect = TfidfVectorizer(stop_words='english', min_df=0, use_idf=False)
x_train_tf = tf_feature_vect.fit_transform(twenty_train.data)
# (number of documents, vocabulary size)
x_train_tf.shape
(2257, 35483)
# tf-idf features for the whole training split.
tfidf_feature_vect = TfidfVectorizer(stop_words='english', min_df=0)
x_train_tfidf = tfidf_feature_vect.fit_transform(twenty_train.data)
# (number of documents, vocabulary size)
x_train_tfidf.shape
(2257, 35483)
# Stratified 3-fold split over the training labels. The seed is pinned, like
# the other random_state=0 settings in this notebook, so the shuffled folds
# (and therefore the printed scores) are the same on every run.
skf = StratifiedKFold(y=twenty_train.target, n_folds=3, shuffle=True, random_state=0)
# Compare multinomial naive Bayes on the count, tf and tf-idf feature matrices.
features = [x_train_counts, x_train_tf, x_train_tfidf]
for feature in features:
    score = cross_val_score(MultinomialNB(), feature, twenty_train.target, cv=skf)
    # Per-fold scores followed by their mean and standard deviation.
    print('score: %s score_mean: %s score_std: %s' % (score, np.mean(score), np.std(score)))
score: [ 0.97609562 0.97343958 0.97736352] score_mean: 0.975632902625 score_std: 0.00163501389227 score: [ 0.95086321 0.93359894 0.92676431] score_mean: 0.937075488547 score_std: 0.0101408101255 score: [ 0.95883134 0.95484728 0.95339547] score_mean: 0.955691363854 score_std: 0.00229804672314
Using a naive Bayes model with the three types of frequency features to classify the texts shows that the type of frequency affects the accuracy.
# Two unseen snippets to classify.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
# Reuse the already-fitted vectorizer so the columns match the training matrix.
x_new_counts = count_vect.transform(docs_new)
model = MultinomialNB()
model.fit(x_train_counts, twenty_train.target)
predicted = model.predict(x_new_counts)
# Map each predicted integer label back to its newsgroup name.
for text, label_index in zip(docs_new, predicted):
    label_name = twenty_train.target_names[label_index]
    print('%r => %s' % (text, label_name))
'God is love' => soc.religion.christian 'OpenGL on the GPU is fast' => comp.graphics
# Chain vectorization and classification so both steps are fitted together
# on whatever texts the pipeline is given.
pipeline_steps = [
    ('vect', CountVectorizer(stop_words='english', min_df=0)),
    ('model', MultinomialNB()),
]
pipe = Pipeline(pipeline_steps)
Use the training set to run cross-validation and compute the accuracy score.
# Cross-validate the whole pipeline on the raw training texts using the `skf` folds.
cross_val_score(pipe, twenty_train.data, twenty_train.target, cv = skf)
array([ 0.97609562, 0.9747676 , 0.97736352])
# Fetch the test split for the same four categories, shuffled with a fixed seed.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=0,
)
Use the test set to run cross-validation and compute the accuracy score.
# Stratified 3-fold split over the test labels. The seed is pinned, like the
# other random_state=0 settings in this notebook, so the shuffled folds and
# the resulting scores are reproducible.
skf2 = StratifiedKFold(y=twenty_test.target, n_folds=3, shuffle=True, random_state=0)
# Cross-validate the pipeline using only the test-split documents.
cross_val_score(pipe, twenty_test.data, twenty_test.target, cv=skf2)
array([ 0.95816733, 0.96606786, 0.9739479 ])
Fit the model on the training set and evaluate its predictions on the test set.
# Fit the pipeline on the full training split, then score it on held-out test data.
model = pipe.fit(twenty_train.data, twenty_train.target)
predicted = model.predict(twenty_test.data)
accuracy_score(y_true=twenty_test.target, y_pred=predicted)
0.94207723035952062
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier,
AdaBoostClassifier, GradientBoostingClassifier)
# Candidate classifiers, each paired with the short label shown on the plot axis.
sgd = SGDClassifier(random_state=0)
logistic = LogisticRegression(random_state=0)
nb = MultinomialNB()
svc = SVC(kernel='linear')
lin_svc = LinearSVC()
n_estimators = 100
ExtraTree = ExtraTreesClassifier(n_estimators=n_estimators, random_state=0)
clf_list = [sgd, logistic, nb, svc, lin_svc, ExtraTree]
clf_labels = ('sgd', 'logis', 'nb', 'svc', 'lin_svc', 'ETree')

scores_mean = list()
scores_std = list()
# Seed the shuffled folds so every classifier is scored on the same,
# reproducible splits.
skf = StratifiedKFold(y=twenty_train.target, n_folds=3, shuffle=True, random_state=0)
count_vect = CountVectorizer(stop_words='english', min_df=0)
x_train_counts = count_vect.fit_transform(twenty_train.data)
# Densify once, outside the loop: the dense matrix is identical for every
# classifier, so rebuilding it per iteration only wastes time and memory.
x_train_dense = x_train_counts.toarray()
for clf in clf_list:
    this_scores = cross_val_score(clf, x_train_dense, twenty_train.target, cv=skf)
    scores_mean.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))
# Release the large dense copy as soon as the comparison is finished.
del x_train_dense

# Derive the x positions from the classifier list instead of hard-coding its
# length, so the plot stays correct if a classifier is added or removed.
positions = range(len(clf_list))
plt.errorbar(positions, scores_mean, yerr=scores_std, fmt='ob')
plt.xlim(-1, len(clf_list))
plt.xticks(positions, clf_labels)
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.title(u'The accuracy of text classification models');
The good models are multinomial naive Bayes, logistic regression, and the support vector machine with a linear kernel.
Let's look at the combined effect of different feature extraction methods and different classifiers.
scores_mean = list()
scores_std = list()
# Seed the shuffled folds so every combination is scored on the same,
# reproducible splits.
skf = StratifiedKFold(y=twenty_train.target, n_folds=3, shuffle=True, random_state=0)
count_vect = CountVectorizer(stop_words='english', min_df=0)
tf_vect = TfidfVectorizer(stop_words='english', min_df=0, use_idf=False)
tfidf_vect = TfidfVectorizer(stop_words='english', min_df=0)
# Each vectorizer and classifier carries a short tag; tick labels are built
# from these tags so they always follow one "<classifier>_<vectorizer>" scheme.
vect_list = [count_vect, tf_vect, tfidf_vect]
vect_labels = ['c', 'tf', 'ti']
clf_list = [logistic, nb, lin_svc]
clf_labels = ['log', 'nb', 'ls']
combo_labels = list()
for clf, clf_label in zip(clf_list, clf_labels):
    for vect, vect_label in zip(vect_list, vect_labels):
        # Vectorizer and classifier are fitted together inside each fold.
        pipe = Pipeline([('vect', vect),
                         ('model', clf)])
        this_scores = cross_val_score(pipe, twenty_train.data, twenty_train.target, cv=skf)
        scores_mean.append(np.mean(this_scores))
        scores_std.append(np.std(this_scores))
        combo_labels.append('%s_%s' % (clf_label, vect_label))

# Positions come from the number of evaluated combinations rather than a
# hard-coded 9, so the plot follows the two lists above.
positions = range(len(combo_labels))
plt.errorbar(positions, scores_mean, yerr=scores_std, fmt='ob')
plt.xlim(-1, len(combo_labels))
plt.xticks(positions, combo_labels)
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.title(u'The accuracy of text classification models');
Different classifiers favor different methods of extracting features from text.
# Grid search over the count-vectorizer + multinomial naive Bayes pipeline.
nb_model = Pipeline([('vect', count_vect), ('clf', nb)])
# NOTE(review): for an integer min_df, 0 and 1 presumably select the same
# vocabulary (every extracted term occurs in at least one document), so one
# of the two may be redundant -- confirm before trimming the grid.
nb_para = {'vect__ngram_range': [(1, 1), (1, 2)],
           'vect__min_df': (0, 1, 2),
           'clf__alpha': (0.001, 0.01, 0.1, 1)}
nb_grid = GridSearchCV(nb_model, nb_para, cv=skf).fit(twenty_train.data, twenty_train.target)
# Read the winner from the fitted search object instead of re-deriving it
# from the positional tuples in grid_scores_.
score, best_parameters = nb_grid.best_score_, nb_grid.best_params_
print('best score: %s best params: %s' % (score, best_parameters))
best score: 0.979618963226 best params: {'vect__ngram_range': (1, 2), 'vect__min_df': 0, 'clf__alpha': 0.1}
# Grid search over the tf-idf vectorizer + linear SVM pipeline.
svm_model = Pipeline([('vect', tfidf_vect), ('clf', lin_svc)])
# NOTE(review): for an integer min_df, 0 and 1 presumably select the same
# vocabulary (every extracted term occurs in at least one document), so one
# of the two may be redundant -- confirm before trimming the grid.
svm_para = {'vect__ngram_range': [(1, 1), (1, 2)],
            'vect__min_df': (0, 1, 2),
            'clf__C': (1, 10, 100, 1000)}
svm_grid = GridSearchCV(svm_model, svm_para, cv=skf).fit(twenty_train.data, twenty_train.target)
# Read the winner from the fitted search object instead of re-deriving it
# from the positional tuples in grid_scores_.
score, best_parameters = svm_grid.best_score_, svm_grid.best_params_
print('best score: %s best params: %s' % (score, best_parameters))
best score: 0.975631369074 best params: {'vect__ngram_range': (1, 1), 'clf__C': 1, 'vect__min_df': 0}
# Show the pipeline selected by the naive Bayes grid search.
nb_grid.best_estimator_
Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, charset=None, charset_error=None, decode_error=u'strict', dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0, max_features=None, min_df=0, ngram_range=(1, 2), prep...enizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))])
# Score the naive Bayes grid search's selected model on the held-out test split.
nb_pred = nb_grid.predict(twenty_test.data)
accuracy_score(y_true=twenty_test.target, y_pred=nb_pred)
0.94340878828229024
# Hand-built count + naive Bayes pipeline using unigrams and bigrams.
# NOTE(review): the grid search output earlier in this notebook reports
# clf__alpha=0.1 as the best value, while alpha=0.01 is used here --
# confirm which value is intended.
pipeline_steps = [
    ('vect', CountVectorizer(stop_words='english', min_df=0, ngram_range=(1, 2))),
    ('model', MultinomialNB(alpha=0.01)),
]
pipe = Pipeline(pipeline_steps)
# Fit on the training split, then predict and score on the test split.
pipe.fit(twenty_train.data, twenty_train.target)
nb_pred = pipe.predict(twenty_test.data)
accuracy_score(y_true=twenty_test.target, y_pred=nb_pred)
0.93941411451398138
# Per-class precision/recall/F1 and the confusion matrix for the naive Bayes
# predictions on the test split.
predicted = nb_pred
report_text = classification_report(twenty_test.target, predicted,
                                    target_names=twenty_test.target_names)
confusion = confusion_matrix(twenty_test.target, predicted)
print('Classifiction Report:\n %s \n\nConfusion Matrix: \n\n %s' % (report_text, confusion))
Classifiction Report: precision recall f1-score support alt.atheism 0.95 0.93 0.94 319 comp.graphics 0.93 0.95 0.94 389 sci.med 0.94 0.91 0.93 396 soc.religion.christian 0.94 0.96 0.95 398 avg / total 0.94 0.94 0.94 1502 Confusion Matrix: [[298 4 5 12] [ 6 368 12 3] [ 5 19 361 11] [ 5 3 6 384]]
# Score the SVM grid search's selected model on the held-out test split.
svm_pred = svm_grid.predict(twenty_test.data)
accuracy_score(y_true=twenty_test.target, y_pred=svm_pred)
0.92876165113182418
# Per-class precision/recall/F1 and the confusion matrix for the SVM
# predictions on the test split.
predicted = svm_pred
report_text = classification_report(twenty_test.target, predicted,
                                    target_names=twenty_test.target_names)
confusion = confusion_matrix(twenty_test.target, predicted)
print('Classifiction Report:\n %s \n\nConfusion Matrix: \n\n %s' % (report_text, confusion))
Classifiction Report: precision recall f1-score support alt.atheism 0.96 0.83 0.89 319 comp.graphics 0.91 0.98 0.94 389 sci.med 0.95 0.93 0.94 396 soc.religion.christian 0.91 0.96 0.93 398 avg / total 0.93 0.93 0.93 1502 Confusion Matrix: [[264 10 11 34] [ 1 381 5 2] [ 4 22 367 3] [ 5 7 3 383]]
The combination of the count method for extracting features from text and the multinomial naive Bayes model achieves an accuracy score of 0.939 on the twenty newsgroups test set.
The combination of the tf-idf method for extracting features from text and a support vector machine with a linear kernel achieves an accuracy score of 0.929 on the twenty newsgroups test set.
For these documents, the combination of the count method and the multinomial naive Bayes model performs better than the other combinations.