%pylab inline
from pandas import *
import numpy as np
from sklearn.grid_search import GridSearchCV
import sklearn.cross_validation as cv
import sklearn.metrics as metrics
from sklearn.svm import l1_min_c
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression
import scipy.linalg as la
from math import pi
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from patsy import dmatrix
import re
import os
import math
from tdm_df import tdm_df

# Noisy sine-wave data: a straight-line model is clearly a poor fit.
sin_data = DataFrame({'x': np.linspace(0, 1, 101)})
sin_data['y'] = np.sin(2 * pi * sin_data['x']) + np.random.normal(0, 0.1, 101)

sin_linpred = ols('y ~ x', data = sin_data).fit().predict()

plt.plot(sin_data['x'], sin_data['y'], 'k.')
plt.plot(sin_data['x'], sin_linpred, 'r-', label = 'Linear Model Prediction')
plt.title('Sine wave with Gaussian noise')
plt.legend(loc = 'upper left')

# Polynomial fits of increasing degree, using patsy's orthogonal
# polynomial coding to build the design matrix.
x = sin_data['x']
y = sin_data['y']

Xpoly = dmatrix('C(x, Poly)')

Xpoly1  = Xpoly[:, :2]
Xpoly3  = Xpoly[:, :4]
Xpoly5  = Xpoly[:, :6]
Xpoly25 = Xpoly[:, :26]

polypred1  = sm.OLS(y, Xpoly1).fit().predict()
polypred3  = sm.OLS(y, Xpoly3).fit().predict()
polypred5  = sm.OLS(y, Xpoly5).fit().predict()
polypred25 = sm.OLS(y, Xpoly25).fit().predict()

fig, ax = plt.subplots(2, 2, sharex = True, sharey = True,
                       figsize = (10, 8))
fig.subplots_adjust(hspace = 0.0, wspace = 0.0)
fig.suptitle('Polynomial fits to noisy sine data', fontsize = 16.0)

# Iterate through panels (a), model predictions (p), and the polynomial
# degree of the model (d). Plot the data, the predictions, and label
# the graph to indicate the degree used.
for a, p, d in zip(ax.ravel(),
                   [polypred1, polypred3, polypred5, polypred25],
                   ['1', '3', '5', '25']):
    a.plot(x, y, '.', color = 'steelblue', alpha = 0.5)
    a.plot(x, p)
    a.text(.5, .95, 'D = ' + d, fontsize = 12,
           verticalalignment = 'top', horizontalalignment = 'center',
           transform = a.transAxes)
    a.grid()

# Alternate axes that have tick labels to avoid overlap.
plt.setp(fig.axes[2].get_yaxis().get_ticklabels(), visible = False)
plt.setp(fig.axes[3].get_yaxis(), ticks_position = 'right')
plt.setp(fig.axes[1].get_xaxis(), ticks_position = 'top')
plt.setp(fig.axes[3].get_xaxis().get_ticklabels(), visible = False)

def poly(x, degree):
    '''
    Generate orthonormal polynomial basis functions from a vector,
    like R's poly(). QR-decompose the matrix of centered powers of x,
    then rescale each column of Q * diag(R) to unit norm.
    '''
    xbar = np.mean(x)
    X = np.power.outer(x - xbar, np.arange(0, degree + 1))
    Q, R = la.qr(X, mode = 'economic')
    diagind = np.subtract.outer(np.arange(R.shape[0]),
                                np.arange(R.shape[1])) == 0
    z = R * diagind
    Qz = np.dot(Q, z)
    norm2 = (Qz**2).sum(axis = 0)
    Z = Qz / np.sqrt(norm2)
    Z = Z[:, 1:]   # drop the constant column
    return Z

# Random train/test split indices (50/50).
np.random.seed(1)
rand_ind = np.arange(100)
np.random.shuffle(rand_ind)
train_ind = rand_ind[:50]
test_ind = rand_ind[50:]

# Cross-validated lasso on a degree-10 polynomial basis.
lasso_model = LassoCV(cv = 15, copy_X = True, normalize = True)
lasso_fit = lasso_model.fit(Xpoly[:, 1:11], y)
lasso_r2 = lasso_model.score(Xpoly[:, 1:11], y)   # in-sample R^2

# Plot the average MSE across folds.
plt.plot(-np.log(lasso_fit.alphas_),
         np.sqrt(lasso_fit.mse_path_).mean(axis = 1))
plt.ylabel('RMSE (avg. across folds)')
plt.xlabel(r'$-\log(\lambda)$')
# Indicate the lasso parameter that minimizes the average MSE across folds.
plt.axvline(-np.log(lasso_fit.alpha_), color = 'red')

print 'Deg. Coefficient'
print Series(np.r_[lasso_fit.intercept_, lasso_fit.coef_])

plt.plot(x, y, '.')
plt.plot(np.sort(x), lasso_fit.predict(Xpoly[:, 1:11])[np.argsort(x)],
         '-r', label = 'Training data lasso fit')
plt.legend(loc = 'upper right')
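# Sanity-check sketch for the poly() helper defined above (illustrative,
# not part of the original notebook): like R's poly(), its columns should
# be orthonormal, so Z'Z should be approximately the identity matrix.
Zchk = poly(x.values, 5)
print np.allclose(np.dot(Zchk.T, Zchk), np.eye(5))   # expect True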
# O'Reilly's 100 best-selling books, with sales ranks and descriptions.
ranks_df = read_csv('data/oreilly.csv')
ranks_df.columns = ['ipfamily', 'title', 'isbn', 'rank', 'long_desc']
print ranks_df.head()

# Reverse the ranks, so that 100 is the best seller, and 1 is the worst.
ranks = ranks_df['rank']
rank_rev_map = {i: j for i, j in zip(ranks, ranks[::-1])}
ranks = ranks.replace(rank_rev_map)

docs = ranks_df['long_desc']
print docs[0]

# Strip HTML tags from the descriptions.
tag_pattern = r'<.*?>'
docs_clean = docs.str.replace(tag_pattern, '')
print docs_clean[0]

# Build a term-document matrix, dropping R's standard stopwords.
rsw = read_csv('../ch3/r_stopwords.csv', squeeze = True).tolist()
desc_tdm = tdm_df(docs_clean, stopwords = rsw)

# Cross-validated lasso regression of rank on the TDM. The RMSE paths
# show the model does no better than an intercept-only model.
lasso_model = LassoCV(cv = 10)
lasso_fit = lasso_model.fit(desc_tdm.values, ranks.values)

plt.plot(-np.log(lasso_fit.alphas_), np.sqrt(lasso_fit.mse_path_),
         alpha = .5)
plt.plot(-np.log(lasso_fit.alphas_),
         np.sqrt(lasso_fit.mse_path_).mean(axis = 1), lw = 2,
         color = 'black')
plt.ylim(0, 60)
plt.xlim(0, np.max(-np.log(lasso_fit.alphas_)))
plt.title('Lasso regression RMSE')
plt.xlabel(r'$-\log(\lambda)$')
plt.ylabel('RMSE (and avg. across folds)')

# Create the TDM.
desc_tdm = tdm_df(docs_clean, stopwords = rsw)

# Make the rank variable binary, indicating Top 50 status.
top50 = 1 * (ranks_df['rank'] <= 50)

# Compute the minimum value of the L1 penalty parameter. Below this
# value, all model parameters will be shrunk to zero.
min_l1_C = l1_min_c(desc_tdm.values, top50.values)

# Create a space of penalty parameters to test.
cs = min_l1_C * np.logspace(0, 3, 10)

# Create a dictionary whose keys are the candidate values of C. Each
# entry will hold the test error rates from each CV trial for that
# value of C.
cdict = {c: [] for c in cs}

# Add the target variable to the data, so it gets shuffled and split along
# with the predictor data.
desc_tdm['Top50'] = top50

# Set up the L1-penalized logistic model.
logit_model = LogisticRegression(C = 1.0, penalty = 'l1', tol = 1e-6)

def train_test_split(df, train_fraction = .5, shuffle = True,
                     preserve_index = True, seed = None):
    '''
    Split a dataframe into training and test sets by randomly assigning
    its observations to one set or the other.

    Parameters
    ----------
    df: a DataFrame
    train_fraction: the fraction of `df` to be assigned to the training
        set (1 - train_fraction will go to the test set).
    shuffle: if True, randomly shuffle the rows before splitting.
    preserve_index: if True, the split dataframes keep their index values
        from `df`. If False, each set gets a new integer index of
        arange(# rows).
    seed: the random number generator seed for row shuffling.
    '''
    if seed is not None:
        np.random.seed(seed)
    nrows = df.shape[0]
    split_point = int(train_fraction * nrows)
    rows = range(nrows)
    if shuffle:
        np.random.shuffle(rows)
    train_rows = rows[:split_point]
    test_rows = rows[split_point:]
    train_index = df.index[train_rows]
    test_index = df.index[test_rows]
    train_df = df.ix[train_index, :]
    test_df = df.ix[test_index, :]
    if not preserve_index:
        train_df.index = np.arange(train_df.shape[0])
        test_df.index = np.arange(test_df.shape[0])
    return train_df, test_df
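# A minimal usage sketch of train_test_split (the demo_* names are
# illustrative only): split a small frame 50/50 and check that the two
# pieces partition the rows.
demo_df = DataFrame({'a': np.arange(10)})
demo_train, demo_test = train_test_split(demo_df, train_fraction = .5,
                                         seed = 0)
print demo_train.shape[0], demo_test.shape[0]             # expect 5 5
print len(set(demo_train.index) & set(demo_test.index))   # expect 0 (disjoint)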
np.random.seed(4)
for i in xrange(50):
    # Split the data.
    train_docs, test_docs = train_test_split(desc_tdm, .8)
    trainy, testy = train_docs.pop('Top50'), test_docs.pop('Top50')
    trainX, testX = train_docs, test_docs
    # Fit the model for each candidate penalty parameter, C, then compute
    # the error rate in the test data and add it to the dictionary entry
    # for that candidate.
    for c in cdict:
        logit_model.set_params(C = c)
        logit_fit = logit_model.fit(trainX.values, trainy.values)
        predy = logit_fit.predict(testX.values)
        error_rate = np.mean(predy != testy)
        cdict[c].append(error_rate)

# Average test error rate for each candidate C across the 50 trials.
error_path = DataFrame(cdict).mean()
error_path.plot(style = 'o-k', label = 'Error rate')
error_path.cummin().plot(style = 'r-', label = 'Lower envelope')
plt.xlabel('C (regularization parameter)')
plt.ylabel('Prediction error rate')
plt.legend(loc = 'upper right')

# If you run this block, you have to re-run the blocks starting from the
# one that begins with the comment '# Create the TDM'. Otherwise the
# attempt to pop Top50 will throw an error (you can't pop it twice).
min_error_c = error_path.idxmin()
logit_model_best = LogisticRegression(C = min_error_c, penalty = 'l1')
ally = desc_tdm.pop('Top50')
allX = desc_tdm
logit_fit_best = logit_model_best.fit(allX.values, ally.values)

# Terms with positive coefficients (associated with Top-50 status).
keep_terms = desc_tdm.columns[np.where(logit_fit_best.coef_ > 0)[1]]
print Series(keep_terms)

trainX, testX, trainy, testy = cv.train_test_split(allX, ally,
                                                   test_size = 0.2)

# Dictionary holding the parameters to grid-search over, and the arrays
# of values to use in the search.
c_grid = [{'C': cs}]

# No. of CV folds.
n_cv_folds = 20

clf = GridSearchCV(LogisticRegression(C = 1.0, penalty = 'l1'), c_grid,
                   score_func = metrics.zero_one_score, cv = n_cv_folds)
clf.fit(trainX, trainy)
clf.best_params_, 1.0 - clf.best_score_

# Average error rate across folds for each C, with +/- 1 standard error.
rates = np.array([1.0 - x[1] for x in clf.grid_scores_])
stds = np.array([np.std(1.0 - x[2]) / math.sqrt(n_cv_folds)
                 for x in clf.grid_scores_])

plt.fill_between(cs, rates - stds, rates + stds,
                 color = 'steelblue', alpha = .4)
plt.plot(cs, rates, 'o-k', label = 'Avg. error rate across folds')
plt.xlabel('C (regularization parameter)')
plt.ylabel('Avg. error rate (and +/- 1 s.e.)')
plt.legend(loc = 'best')
plt.gca().grid()

print metrics.classification_report(testy, clf.predict(testX))
print '      Predicted'
print '        Class'
print DataFrame(metrics.confusion_matrix(testy, clf.predict(testX)))
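# Cross-check sketch (illustrative, not from the original notebook): the
# overall test error rate implied by the confusion matrix is the
# off-diagonal mass, i.e. 1 - trace(cm) / cm.sum().
cm = metrics.confusion_matrix(testy, clf.predict(testX))
print 'Test error rate: %.3f' % (1.0 - np.trace(cm) / float(cm.sum()))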