%matplotlib inline

import csv
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Note: cross_validation and grid_search are the pre-0.20 scikit-learn modules;
# in later releases the same functionality lives in sklearn.model_selection.
from sklearn import cross_validation
from sklearn import datasets
from sklearn import decomposition
from sklearn import ensemble
from sklearn import feature_extraction
from sklearn import feature_selection
from sklearn import grid_search
from sklearn import metrics
from sklearn import naive_bayes
from sklearn import pipeline
from sklearn import tree

pd.set_option('display.max_columns', None)

# Load the SMS Spam Collection (tab-separated: label, message text).
_DATA_DIR = 'data'
_SPAM_DATA_PATH = os.path.join(_DATA_DIR, 'SMSSpamCollection')
df = pd.read_csv(_SPAM_DATA_PATH, sep='\t', header=None, names=['Label', 'Text'])
df.head()

# Encode 'ham' as 1 and 'spam' as 0, then hold out 20% of the data for testing.
y = (df.Label == 'ham').values.astype(int)
X = df.Text.values
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=0)

# Baseline pipeline: token counts -> tf-idf weighting -> Bernoulli naive Bayes.
pipe = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
    ('bernoulli', naive_bayes.BernoulliNB()),
])
pipe.fit(X_train, y_train)

# scikit-learn's metrics expect (y_true, y_pred).
metrics.accuracy_score(y_test, pipe.predict(X_test))
sns.heatmap(metrics.confusion_matrix(y_test, pipe.predict(X_test)), annot=True, fmt='');
print(metrics.classification_report(y_test, pipe.predict(X_test)))

# Hyperparameter distributions for a randomized search over the whole pipeline.
params = dict(
    vect__max_df=[0.5, 1.0],
    vect__max_features=[None, 10000, 200000],
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=[True, False],
    tfidf__norm=['l1', 'l2'],
    bernoulli__alpha=[0, .5, 1],
    bernoulli__binarize=[None, .1, .5],
    bernoulli__fit_prior=[True, False],
)

n_iter_search = 100
random_search = grid_search.RandomizedSearchCV(pipe, param_distributions=params,
                                               n_iter=n_iter_search)
random_search.fit(X_train, y_train)
random_search.best_estimator_
random_search.grid_scores_

# Repeat the search, but rank candidates by F1 instead of accuracy.
random_search = grid_search.RandomizedSearchCV(pipe, param_distributions=params,
                                               n_iter=n_iter_search, scoring='f1')
random_search.fit(X_train, y_train)
random_search.best_estimator_
random_search.grid_scores_

# Refit a pipeline with the chosen settings and evaluate it on the held-out test set.
best_pipe = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer(ngram_range=(1, 1), max_df=1.0,
                                                     max_features=20000)),
    ('tfidf', feature_extraction.text.TfidfTransformer(use_idf=True, norm='l2')),
    ('bernoulli', naive_bayes.BernoulliNB(binarize=0.1, alpha=.5, fit_prior=True)),
])
best_pipe.fit(X_train, y_train)
sns.heatmap(metrics.confusion_matrix(y_test, best_pipe.predict(X_test)), annot=True, fmt='');
print(metrics.classification_report(y_test, best_pipe.predict(X_test)))
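As a quick sanity check (not part of the original notebook), the fitted pipeline can score raw strings directly, since the CountVectorizer step handles tokenization; the two messages below are made-up examples, and under the label encoding above 1 means 'ham' and 0 means 'spam'.

# Hypothetical example messages for illustration only.
new_messages = [
    'Are we still meeting for lunch tomorrow?',
    'WINNER!! Claim your FREE prize now, reply CLAIM to this number',
]
best_pipe.predict(new_messages)  # expected shape: one 0/1 label per message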