#!/usr/bin/env python
# coding: utf-8

# This notebook is a demonstration of two types of feature engineering methods:
#
# 1. Polynomial feature expansion
# 2. Binning (discretizing) a continuous feature
#
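# As a quick preview of what these two transformations look like (a minimal sketch,
# not one of the notebook's own cells; the names `demo` and `demo_binned` are
# illustrative):

import numpy as np
import pandas as pd

demo = pd.DataFrame({'x': np.linspace(0, 1, 6)})

# 1. Polynomial expansion: derive x^2, x^3, ... as new columns.
demo['x2'] = demo['x']**2
demo['x3'] = demo['x']**3

# 2. Binning: cut x into intervals and one-hot encode the interval membership.
demo_binned = pd.get_dummies(pd.cut(demo['x'], bins=3), prefix='bin')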
# Now that we've created the functions, let's generate some data. Again, the goal is to
# generate an X-Y relationship with noise, but where we know the underlying data
# generating distribution.
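# (For reference, a minimal sketch of what genY and makePolyFeat might look like. The
# real definitions live in an earlier cell; these bodies are assumptions inferred from
# how the functions are called below.)

def genY(x, e, betas):
    '''Generate y as a polynomial in x (coefficients in betas) plus noise e;
    return both columns as a DataFrame. Assumed behavior.'''
    y = sum(b * x**p for p, b in enumerate(betas)) + e
    return pd.DataFrame({'x': x, 'y': y})

def makePolyFeat(dat, deg):
    '''Add polynomial features x2..x<deg> as new columns. Assumed behavior.'''
    for p in range(2, deg + 1):
        dat['x{}'.format(p)] = dat['x']**p
    return dat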
# In[2]:


betas = [0, 4, -3.5, 1]   # coefficients of the data generating polynomial
n = 200                   # number of samples
sig = 2.2                 # noise standard deviation
sp = 20                   # train/test split point

x_init = np.random.uniform(0, 1, n)
e_init = np.random.normal(0, sig, n)
dat = genY(x_init, e_init, betas)
dat = makePolyFeat(dat, 6)


# Now we want to see the effect of fitting polynomial curves of different degrees to our
# noisy data set. Ultimately, we want to illustrate how model specification (and feature
# engineering) affects the bias-variance tradeoff.
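# Recall the standard decomposition of expected squared error at a point $x$ (stated here
# for reference; it is not derived in this notebook):
#
# $$E\big[(y - \hat{f}(x))^2\big] = \mathrm{Bias}\big[\hat{f}(x)\big]^2 + \mathrm{Var}\big[\hat{f}(x)\big] + \sigma^2$$
#
# Underfit (low-degree) models are dominated by the bias term, overfit (high-degree)
# models by the variance term; the irreducible $\sigma^2$ corresponds to `sig**2` above.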
# In[3]:


def PlotLinDeg(X_train, y_train, X_test, y_test, i, t):
    '''
    Fits a linear regression on the simulated data, then:
    1. plots train and test data (blue) against the fitted values (red)
    2. reports train and test mean squared error in the subplot titles
    '''
    regr = linear_model.LinearRegression(fit_intercept=True)
    regr.fit(X_train, y_train)
    y_hat = regr.predict(X_train)
    y_hat_test = regr.predict(X_test)
    ss_train = ((y_train - y_hat)**2).mean()
    ss_test = ((y_test - y_hat_test)**2).mean()
    # Plot train X vs. actual and predicted y_train
    plt.subplot(2, 3, i)
    plt.plot(X_train['x'], y_train, 'b.')
    plt.plot(X_train['x'], y_hat, 'r.')
    plt.title('{}\n Train MSE={}'.format(t, round(ss_train, 4)))
    plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    # Plot test X vs. actual and predicted y_test
    plt.subplot(2, 3, i + 3)
    plt.plot(X_test['x'], y_test, 'b.')
    plt.plot(X_test['x'], y_hat_test, 'r.')
    plt.title('Test MSE={}'.format(round(ss_test, 4)))
    plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)


def PlotLinBin(X_train, y_train, X_test, y_test, i, t, x, x_t):
    '''
    Same as PlotLinDeg, but for binned (indicator) features: the extra arguments
    x and x_t carry the raw x values used for plotting, since X_train and X_test
    contain only the bin indicators.
    '''
    regr = linear_model.LinearRegression(fit_intercept=True)
    regr.fit(X_train, y_train)
    y_hat = regr.predict(X_train)
    y_hat_test = regr.predict(X_test)
    ss_train = ((y_train - y_hat)**2).mean()
    ss_test = ((y_test - y_hat_test)**2).mean()
    # Plot train x vs. actual and predicted y_train
    plt.subplot(2, 3, i)
    plt.plot(x, y_train, 'b.')
    plt.plot(x, y_hat, 'r.')
    plt.title('{}\n Train MSE={}'.format(t, round(ss_train, 4)))
    plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    # Plot test x vs. actual and predicted y_test
    plt.subplot(2, 3, i + 3)
    plt.plot(x_t, y_test, 'b.')
    plt.plot(x_t, y_hat_test, 'r.')
    plt.title('Test MSE={}'.format(round(ss_test, 4)))
    plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)


# In[4]:


sp = 20
f1 = ['x']
f3 = ['x', 'x2', 'x3']
f6 = ['x', 'x2', 'x3', 'x4', 'x5', 'x6']

fig = plt.figure(figsize=(12, 10))
j = 1
for feats in [f1, f3, f6]:
    PlotLinDeg(dat[feats][:sp], dat['y'][:sp], dat[feats][sp:], dat['y'][sp:],
               j, 'Degree ' + str(len(feats)))
    j += 1
fig.tight_layout()
plt.show()


# The above plot is a great illustration of the bias-variance tradeoff. The top row shows an
# nth-degree model fit to a sparse training set of 20 points. The bottom row shows the same
# fitted model against the test data (with actuals in blue and predictions in red). We can
# see multiple things here:
#