%pylab inline
from pandas import *
import numpy as np
from sklearn.grid_search import GridSearchCV
import sklearn.cross_validation as cv
import sklearn.metrics as metrics
from sklearn.svm import l1_min_c
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression
import scipy.linalg as la
from math import pi
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from patsy import dmatrix
import re
import os
import math
from tdm_df import tdm_df

# Noisy sine-wave data: a straight-line model is clearly a poor fit.
sin_data = DataFrame({'x': np.linspace(0, 1, 101)})
sin_data['y'] = np.sin(2 * pi * sin_data['x']) + np.random.normal(0, 0.1, 101)

sin_linpred = ols('y ~ x', data = sin_data).fit().predict()

plt.plot(sin_data['x'], sin_data['y'], 'k.')
plt.plot(sin_data['x'], sin_linpred, 'r-', label = 'Linear Model Prediction')
plt.title('Sine wave with Gaussian noise')
plt.legend(loc = 'upper left')

# Polynomial fits of increasing degree, using patsy's orthogonal
# polynomial coding to build the design matrix.
x = sin_data['x']
y = sin_data['y']

Xpoly = dmatrix('C(x, Poly)')

Xpoly1  = Xpoly[:, :2]
Xpoly3  = Xpoly[:, :4]
Xpoly5  = Xpoly[:, :6]
Xpoly25 = Xpoly[:, :26]

polypred1  = sm.OLS(y, Xpoly1).fit().predict()
polypred3  = sm.OLS(y, Xpoly3).fit().predict()
polypred5  = sm.OLS(y, Xpoly5).fit().predict()
polypred25 = sm.OLS(y, Xpoly25).fit().predict()

fig, ax = plt.subplots(2, 2, sharex = True, sharey = True,
                       figsize = (10, 8))
fig.subplots_adjust(hspace = 0.0, wspace = 0.0)
fig.suptitle('Polynomial fits to noisy sine data', fontsize = 16.0)

# Iterate through panels (a), model predictions (p), and the polynomial
# degree of the model (d). Plot the data, the predictions, and label
# the graph to indicate the degree used.
for a, p, d in zip(ax.ravel(),
                   [polypred1, polypred3, polypred5, polypred25],
                   ['1', '3', '5', '25']):
    a.plot(x, y, '.', color = 'steelblue', alpha = 0.5)
    a.plot(x, p)
    a.text(.5, .95, 'D = ' + d, fontsize = 12,
           verticalalignment = 'top', horizontalalignment = 'center',
           transform = a.transAxes)
    a.grid()

# Alternate axes that have tick labels to avoid overlap.
plt.setp(fig.axes[2].get_yaxis().get_ticklabels(), visible = False)
plt.setp(fig.axes[3].get_yaxis(), ticks_position = 'right')
plt.setp(fig.axes[1].get_xaxis(), ticks_position = 'top')
plt.setp(fig.axes[3].get_xaxis().get_ticklabels(), visible = False)

def poly(x, degree):
    '''
    Generate orthonormal polynomial basis functions from a vector,
    like R's poly(). QR-decompose the matrix of centered powers of x,
    then rescale each column of Q * diag(R) to unit norm.
    '''
    xbar = np.mean(x)
    X = np.power.outer(x - xbar, np.arange(0, degree + 1))
    Q, R = la.qr(X, mode = 'economic')
    diagind = np.subtract.outer(np.arange(R.shape[0]),
                                np.arange(R.shape[1])) == 0
    z = R * diagind
    Qz = np.dot(Q, z)
    norm2 = (Qz**2).sum(axis = 0)
    Z = Qz / np.sqrt(norm2)
    Z = Z[:, 1:]   # drop the constant column
    return Z

# Random train/test split indices (50/50).
np.random.seed(1)
rand_ind = np.arange(100)
np.random.shuffle(rand_ind)
train_ind = rand_ind[:50]
test_ind = rand_ind[50:]

# Cross-validated lasso on a degree-10 polynomial basis.
lasso_model = LassoCV(cv = 15, copy_X = True, normalize = True)
lasso_fit = lasso_model.fit(Xpoly[:, 1:11], y)
lasso_r2 = lasso_model.score(Xpoly[:, 1:11], y)   # in-sample R^2

# Plot the average MSE across folds.
plt.plot(-np.log(lasso_fit.alphas_),
         np.sqrt(lasso_fit.mse_path_).mean(axis = 1))
plt.ylabel('RMSE (avg. across folds)')
plt.xlabel(r'$-\log(\lambda)$')
# Indicate the lasso parameter that minimizes the average MSE across folds.
plt.axvline(-np.log(lasso_fit.alpha_), color = 'red')

print 'Deg. Coefficient'
print Series(np.r_[lasso_fit.intercept_, lasso_fit.coef_])

plt.plot(x, y, '.')
plt.plot(np.sort(x), lasso_fit.predict(Xpoly[:, 1:11])[np.argsort(x)],
         '-r', label = 'Training data lasso fit')
plt.legend(loc = 'upper right')
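# Sanity-check sketch for the poly() helper defined above (illustrative,
# not part of the original notebook): like R's poly(), its columns should
# be orthonormal, so Z'Z should be approximately the identity matrix.
Zchk = poly(x.values, 5)
print np.allclose(np.dot(Zchk.T, Zchk), np.eye(5))   # expect True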
# O'Reilly's 100 best-selling books, with sales ranks and descriptions.
ranks_df = read_csv('data/oreilly.csv')
ranks_df.columns = ['ipfamily', 'title', 'isbn', 'rank', 'long_desc']
print ranks_df.head()

# Reverse the ranks, so that 100 is the best seller, and 1 is the worst.
ranks = ranks_df['rank']
rank_rev_map = {i: j for i, j in zip(ranks, ranks[::-1])}
ranks = ranks.replace(rank_rev_map)

docs = ranks_df['long_desc']
print docs[0]

# Strip HTML tags from the descriptions.
tag_pattern = r'<.*?>'
docs_clean = docs.str.replace(tag_pattern, '')
print docs_clean[0]

# Build a term-document matrix, dropping R's standard stopwords.
rsw = read_csv('../ch3/r_stopwords.csv', squeeze = True).tolist()
desc_tdm = tdm_df(docs_clean, stopwords = rsw)

# Cross-validated lasso regression of rank on the TDM. The RMSE paths
# show the model does no better than an intercept-only model.
lasso_model = LassoCV(cv = 10)
lasso_fit = lasso_model.fit(desc_tdm.values, ranks.values)

plt.plot(-np.log(lasso_fit.alphas_), np.sqrt(lasso_fit.mse_path_),
         alpha = .5)
plt.plot(-np.log(lasso_fit.alphas_),
         np.sqrt(lasso_fit.mse_path_).mean(axis = 1), lw = 2,
         color = 'black')
plt.ylim(0, 60)
plt.xlim(0, np.max(-np.log(lasso_fit.alphas_)))
plt.title('Lasso regression RMSE')
plt.xlabel(r'$-\log(\lambda)$')
plt.ylabel('RMSE (and avg. across folds)')

# Create the TDM.
desc_tdm = tdm_df(docs_clean, stopwords = rsw)

# Make the rank variable binary, indicating Top 50 status.
top50 = 1 * (ranks_df['rank'] <= 50)

# Compute the minimum value of the L1 penalty parameter. Below this
# value, all model parameters will be shrunk to zero.
min_l1_C = l1_min_c(desc_tdm.values, top50.values)

# Create a space of penalty parameters to test.
cs = min_l1_C * np.logspace(0, 3, 10)

# Create a dictionary whose keys are the candidate values of C. Each
# entry will hold the test error rates from each CV trial for that
# value of C.
cdict = {c: [] for c in cs}

# Add the target variable to the data, so it gets shuffled and split along
# with the predictor data.
desc_tdm['Top50'] = top50

# Set up the L1-penalized logistic model.
logit_model = LogisticRegression(C = 1.0, penalty = 'l1', tol = 1e-6)

def train_test_split(df, train_fraction = .5, shuffle = True,
                     preserve_index = True, seed = None):
    '''
    Split a dataframe into training and test sets by randomly assigning
    its observations to one set or the other.

    Parameters
    ----------
    df: a DataFrame
    train_fraction: the fraction of `df` to be assigned to the training
        set (1 - train_fraction will go to the test set).
    shuffle: if True, randomly shuffle the rows before splitting.
    preserve_index: if True, the split dataframes keep their index values
        from `df`. If False, each set gets a new integer index of
        arange(# rows).
    seed: the random number generator seed for row shuffling.
    '''
    if seed is not None:
        np.random.seed(seed)
    nrows = df.shape[0]
    split_point = int(train_fraction * nrows)
    rows = range(nrows)
    if shuffle:
        np.random.shuffle(rows)
    train_rows = rows[:split_point]
    test_rows = rows[split_point:]
    train_index = df.index[train_rows]
    test_index = df.index[test_rows]
    train_df = df.ix[train_index, :]
    test_df = df.ix[test_index, :]
    if not preserve_index:
        train_df.index = np.arange(train_df.shape[0])
        test_df.index = np.arange(test_df.shape[0])
    return train_df, test_df
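# A minimal usage sketch of train_test_split (the demo_* names are
# illustrative only): split a small frame 50/50 and check that the two
# pieces partition the rows.
demo_df = DataFrame({'a': np.arange(10)})
demo_train, demo_test = train_test_split(demo_df, train_fraction = .5,
                                         seed = 0)
print demo_train.shape[0], demo_test.shape[0]             # expect 5 5
print len(set(demo_train.index) & set(demo_test.index))   # expect 0 (disjoint)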
np.random.seed(4)
for i in xrange(50):
    # Split the data.
    train_docs, test_docs = train_test_split(desc_tdm, .8)
    trainy, testy = train_docs.pop('Top50'), test_docs.pop('Top50')
    trainX, testX = train_docs, test_docs
    # Fit the model for each candidate penalty parameter, C, then compute
    # the error rate in the test data and add it to the dictionary entry
    # for that candidate.
    for c in cdict:
        logit_model.set_params(C = c)
        logit_fit = logit_model.fit(trainX.values, trainy.values)
        predy = logit_fit.predict(testX.values)
        error_rate = np.mean(predy != testy)
        cdict[c].append(error_rate)

# Average test error rate for each candidate C across the 50 trials.
error_path = DataFrame(cdict).mean()
error_path.plot(style = 'o-k', label = 'Error rate')
error_path.cummin().plot(style = 'r-', label = 'Lower envelope')
plt.xlabel('C (regularization parameter)')
plt.ylabel('Prediction error rate')
plt.legend(loc = 'upper right')

# If you run this block, you have to re-run the blocks starting from the
# one that begins with the comment '# Create the TDM'. Otherwise the
# attempt to pop Top50 will throw an error (you can't pop it twice).
min_error_c = error_path.idxmin()
logit_model_best = LogisticRegression(C = min_error_c, penalty = 'l1')
ally = desc_tdm.pop('Top50')
allX = desc_tdm
logit_fit_best = logit_model_best.fit(allX.values, ally.values)

# Terms with positive coefficients (associated with Top-50 status).
keep_terms = desc_tdm.columns[np.where(logit_fit_best.coef_ > 0)[1]]
print Series(keep_terms)

trainX, testX, trainy, testy = cv.train_test_split(allX, ally,
                                                   test_size = 0.2)

# Dictionary holding the parameters to grid-search over, and the arrays
# of values to use in the search.
c_grid = [{'C': cs}]

# No. of CV folds.
n_cv_folds = 20

clf = GridSearchCV(LogisticRegression(C = 1.0, penalty = 'l1'), c_grid,
                   score_func = metrics.zero_one_score, cv = n_cv_folds)
clf.fit(trainX, trainy)
clf.best_params_, 1.0 - clf.best_score_

# Average error rate across folds for each C, with +/- 1 standard error.
rates = np.array([1.0 - x[1] for x in clf.grid_scores_])
stds = np.array([np.std(1.0 - x[2]) / math.sqrt(n_cv_folds)
                 for x in clf.grid_scores_])

plt.fill_between(cs, rates - stds, rates + stds,
                 color = 'steelblue', alpha = .4)
plt.plot(cs, rates, 'o-k', label = 'Avg. error rate across folds')
plt.xlabel('C (regularization parameter)')
plt.ylabel('Avg. error rate (and +/- 1 s.e.)')
plt.legend(loc = 'best')
plt.gca().grid()

print metrics.classification_report(testy, clf.predict(testX))
print '      Predicted'
print '        Class'
print DataFrame(metrics.confusion_matrix(testy, clf.predict(testX)))
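# Cross-check sketch (illustrative, not from the original notebook): the
# overall test error rate implied by the confusion matrix is the
# off-diagonal mass, i.e. 1 - trace(cm) / cm.sum().
cm = metrics.confusion_matrix(testy, clf.predict(testX))
print 'Test error rate: %.3f' % (1.0 - np.trace(cm) / float(cm.sum()))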