%matplotlib inline import csv import matplotlib.pyplot as plt import numpy as np import os from sklearn import cross_validation from sklearn import ensemble from sklearn.feature_extraction import text from sklearn import feature_extraction from sklearn import feature_selection from sklearn import linear_model from sklearn import metrics from sklearn import naive_bayes from sklearn import pipeline from sklearn import svm from sklearn import tree from sklearn import externals _DATA_DIR = 'data' _NYT_DATA_PATH = os.path.join(_DATA_DIR, 'nyt_title_data.csv') _SERIALIZATION_DIR = 'serializations' _SERIALIZED_PIPELINE_NAME = 'pipe.pickle' _SERIALIZATION_PATH = os.path.join(_SERIALIZATION_DIR, _SERIALIZED_PIPELINE_NAME) with open(_NYT_DATA_PATH) as nyt: nyt_data = [] nyt_labels = [] csv_reader = csv.reader(nyt) for line in csv_reader: nyt_labels.append(int(line[0])) nyt_data.append(line[1]) X = np.array([''.join(el) for el in nyt_data]) y = np.array([el for el in nyt_labels]) X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y) vectorizer = text.TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english', strip_accents='unicode', norm='l2') pipe = pipeline.Pipeline([("vectorizer", vectorizer), ("svm", linear_model.RidgeClassifier())]) pipe.fit(X_train, y_train) if not os.path.exists(_SERIALIZATION_DIR): os.makedirs(_SERIALIZATION_DIR) externals.joblib.dump(pipe, _SERIALIZATION_PATH) pipe = externals.joblib.load(_SERIALIZATION_PATH) pipe y_test = pipe.predict(X_test) y_test