%matplotlib inline import csv import itertools import matplotlib.pyplot as plt import numpy as np import os from sklearn import cross_validation from sklearn import ensemble from sklearn.feature_extraction import text from sklearn import feature_extraction from sklearn import feature_selection from sklearn import linear_model from sklearn import metrics from sklearn import naive_bayes from sklearn import svm from sklearn import tree plt.style.use('fivethirtyeight') _DATA_DIR = 'data' _NYT_DATA_PATH = os.path.join(_DATA_DIR, 'nyt_title_data.csv') _PLT_LEGEND_OPTIONS = dict(loc="upper center", bbox_to_anchor=(0.5, -0.15), fancybox=True, shadow=True, ncol=3) colors = [ii.strip() for ii in '#30a2da, #fc4f30, #e5ae38, #6d904f, #8b8b8b'.split(',')] colors += ['#' + ii.strip() for ii in '348ABD, A60628, 7A68A6, 467821,D55E00, CC79A7, 56B4E9, 009E73, F0E442, 0072B2'.split(',')] markers = itertools.cycle(["o", "D"]) colors = itertools.cycle(colors) def cv(X, y, clf, nfeats, clfname, scoring=metrics.accuracy_score, n_folds=10): stratified_k_fold = cross_validation.StratifiedKFold(y, n_folds=n_folds) accuracy, ii = 0., 0 for train, test in stratified_k_fold: ii += 1 X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test] clf.fit(X_train, y_train) y_pred = clf.predict(X_test) score = scoring(y_test, y_pred) accuracy += score accuracy /= float(n_folds) return accuracy def plot_accuracies(accuracies, xvals, legends): fig = plt.figure(figsize=(16, 12)) ax = fig.add_subplot(111) for ii in range(0, accuracies.shape[0]): ax.plot(xvals, accuracies[ii, :], color=next(colors), marker=next(markers), label=legends[ii]) plt.xlabel("Number of Features") plt.ylabel("Accuracy") plt.title("Accuracy vs Number of Features") ax.set_xscale("log") box = ax.get_position() ax.set_position([box.x0, box.y0 + box.height * 0.3, box.width, box.height * 0.7]) ax.legend(**_PLT_LEGEND_OPTIONS) plt.show() def estimator_name(clf): return type(clf).__name__ def select_model(X, y, scoring=metrics.accuracy_score): n_features = np.array([10, 100, 500, 1000, 5000, 10000, 20000, 50000, 100000]) clfs = [ naive_bayes.BernoulliNB(), naive_bayes.MultinomialNB(), naive_bayes.GaussianNB(), tree.DecisionTreeClassifier(), ensemble.RandomForestClassifier(n_estimators=10), svm.LinearSVC(random_state=0), linear_model.LogisticRegression(), linear_model.SGDClassifier(), linear_model.RidgeClassifier(), ] classifier_names = map(estimator_name, clfs) feature_selection_methods = [feature_selection.f_classif] accuracies = np.zeros((len(clfs), len(n_features), len(feature_selection_methods))) for kk in range(len(feature_selection_methods)): X_feature_selected = X.copy().toarray() for jj in range(len(n_features)): for ii in range(len(clfs)): accuracies[ii, jj, kk] = cv(X_feature_selected, y, clfs[ii], n_features[jj], classifier_names[ii], scoring=scoring) for k in range(len(feature_selection_methods)): for i in range(len(clfs)): print "%22s " % classifier_names[i], for j in range(accuracies.shape[1]): print "%5.3f" % accuracies[i, j, k], print plot_accuracies(accuracies[:, :, k], n_features, classifier_names) with open(_NYT_DATA_PATH) as nyt: nyt_data = [] nyt_labels = [] csv_reader = csv.reader(nyt) for line in csv_reader: nyt_labels.append(int(line[0])) nyt_data.append(line[1]) X = np.array([''.join(el) for el in nyt_data]) y = np.array([el for el in nyt_labels]) vectorizer = text.TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english', strip_accents='unicode', norm='l2') example = unicode(X[5]) print("Example string: {}".format(example)) print("Preprocessed string: {}".format(vectorizer.build_preprocessor()(example))) print("Tokenized string: {}".format(str(vectorizer.build_tokenizer()(example)))) print("N-gram data string: {}".format(str(vectorizer.build_analyzer()(example)))) X X = vectorizer.fit_transform(X) X select_model(X, y) select_model(X, y, scoring=metrics.f1_score) select_model(X, y, scoring=metrics.precision_score) select_model(X, y, scoring=metrics.recall_score)