# fake BMI data
import numpy.random as random
def generate_fake_bmi_data(N=100, noise=2, seed=2):
    """Generate a reproducible synthetic (age, height, BMI, weight) data set.

    Ages are uniform on [15, 80); heights ~ N(1.75, 0.15) metres; BMIs
    ~ N(30, 5).  Weights follow the BMI identity weight = height^2 * BMI,
    perturbed by zero-mean Gaussian noise scaled by `noise`.

    Returns a tuple (ages, heights, bmis, weights) of length-N arrays.
    """
    random.seed(seed)
    # Draw order matters for reproducibility: ages, heights, BMIs, noise.
    age_arr = random.uniform(low=15, high=80, size=N)
    height_arr = random.normal(1.75, 0.15, size=N)
    bmi_arr = random.normal(30, 5, size=N)
    weight_arr = height_arr * height_arr * bmi_arr + noise * random.normal(0, 1, N)
    return (age_arr, height_arr, bmi_arr, weight_arr)
def compute_weights(x_train, y_train):
    """Ordinary least-squares weights via the Moore-Penrose pseudo-inverse.

    x_train: (n_features, N) design matrix, one column per sample.
    y_train: (N,) or (N, 1) targets.
    Returns w such that x_train.T.dot(w) approximates y_train in the
    least-squares sense.
    """
    design = x_train.transpose()  # (N, n_features): samples as rows
    return np.dot(np.linalg.pinv(design), y_train)
def compute_ridge_weights(x_train, y_train, l):
    """Closed-form ridge regression: w = (l*I + X X^T)^{-1} X y.

    x_train: (n_features, N) design matrix, one column per sample.
    y_train: (N,) or (N, 1) targets.
    l: regularization strength (l = 0 reduces to ordinary least squares).
    """
    n_feat = x_train.shape[0]
    gram = np.dot(x_train, x_train.transpose())      # X X^T, (n_feat, n_feat)
    regularized = l * np.eye(n_feat) + gram
    # Same evaluation order as the normal equations: ((inv) X) y
    return np.dot(np.dot(np.linalg.inv(regularized), x_train), y_train)
def compute_mse(w, x_test, y_test):
    """Mean squared error of the linear model w on (x_test, y_test).

    w: (n_features,) or (n_features, 1) weights.
    x_test: (n_features, N) feature matrix; predictions are x_test.T.dot(w).
    y_test: (N,) or (N, 1) targets, matching the shape of w.

    Returns (mse, y_pred).
    """
    y_pred = np.dot(x_test.transpose(), w)
    residual = y_pred - y_test
    mse = np.dot(residual.transpose(), residual) / y_pred.size
    # Column-vector inputs yield a (1,1) matrix; unwrap it to a scalar.
    if mse.ndim == 2:
        mse = mse[0][0]
    return mse, y_pred
# generate higher order features: all monomials x1^i * x2^j * x3^k with
# i + j + k = order, for every order from 1 up to max_order
def gen_high_order_features(x1, x2, x3, max_order=2):
    """Build a polynomial feature matrix from three base feature arrays.

    Row 0 is the constant feature (all ones); subsequent rows are the
    monomials x1**i * x2**j * x3**k for each total degree 1..max_order.

    This fixes two defects of the earlier sympy-based version:
    - `p = p*p` squared the polynomial every pass, so the generated
      degrees were 1, 2, 4, 8, ... instead of 1, 2, 3, ... (for
      max_order=3 it emitted degree-4 monomials and never degree 3);
    - the `sympy` module itself was never imported here (only names from
      sympy.abc), so `sympy.poly` relied on an import made elsewhere.

    Returns an array of shape (n_features, x1.size).
    """
    from itertools import product

    mat = [np.ones(x1.size)]
    for order in range(1, max_order + 1):
        # every exponent triple (i, j, k) with i + j + k == order
        for i, j, k in product(range(order + 1), repeat=3):
            if i + j + k == order:
                mat.append(x1 ** i * x2 ** j * x3 ** k)
    return np.array(mat)
def bmi_train(ages, heights, bmis, weights, order):
    """Fit least-squares weights on polynomial features of degree `order`.

    ages/heights/bmis: length-N feature arrays; weights: length-N targets.
    Returns (ws, mse, y): fitted weight vector, training MSE, and the
    model's predictions on the training set.
    """
    n_samples = ages.size
    x_train = gen_high_order_features(ages, heights, bmis, order)
    y_train = weights.reshape(n_samples, 1)
    ws = compute_weights(x_train, y_train)
    mse, y_pred = compute_mse(ws, x_train, y_train)
    return (ws, mse, y_pred)
def bmi_test(agest, heightst, bmist, weightst, ws, order=None):
    """Evaluate fitted weights `ws` on a held-out data set.

    agest/heightst/bmist: length-N_test feature arrays.
    weightst: length-N_test ground-truth targets.
    ws: weight vector fitted by bmi_train with the same feature degree.
    order: polynomial feature degree; must match the one used for `ws`.
        Defaults to the module-level `order` variable for backward
        compatibility — the original implicitly read it as a global set
        by the calling loop.

    Returns (mse_test, y_pred).  NOTE: the original returned the ground
    truth `y_test` (which the caller already owns) instead of the
    predictions; this now returns the model's predictions, mirroring
    bmi_train.
    """
    if order is None:
        # Legacy fallback: pick up the notebook-level loop variable.
        order = globals()["order"]
    n_test = agest.size
    x_test = gen_high_order_features(agest, heightst, bmist, order)
    y_test = weightst.reshape(n_test, 1)
    mse_test, y_pred = compute_mse(ws, x_test, y_test)
    return (mse_test, y_pred)
import numpy as np

# Experiment 1: fit weight ~ polynomial(age, height, BMI) for degrees
# 1..max_order and compare training vs. testing MSE.
N=30
N_test=20
max_order=3
ages,heights,bmis,weights=generate_fake_bmi_data(N,2,1)
# NOTE(review): noise=2 is still applied on the next line — only the seed
# differs from the training draw, despite the original "no noise when
# testing" remark.
agest,heightst,bmist,weightst=generate_fake_bmi_data(N_test,2,5)
#x_train=np.concatenate((np.ones(N),heights,bmis,ages),axis=0).reshape(4,N)
mse_train_list=[]
mse_test_list=[]
for order in range(1,max_order+1):
    # bmi_test reads the loop variable `order` as a module-level global.
    (ws,mse,y_train)=bmi_train(ages,heights,bmis,weights,order)
    (mse_test,y_test)=bmi_test(agest,heightst,bmist,weightst,ws)
    mse_train_list.append(mse)
    mse_test_list.append(mse_test)
print(np.array(mse_train_list))
print(np.array(mse_test_list))
# NOTE(review): plt is imported only in a later cell (matplotlib.pyplot);
# this cell works only if that cell ran first in the notebook session.
line1,=plt.plot(range(1,max_order+1),mse_train_list,label='Training error')
line2,=plt.plot(range(1,max_order+1),mse_test_list,label='Testing error')
plt.xlabel('Maximum degree of features')
plt.ylabel('MSE')
plt.legend(handles=[line1,line2])
#plt.savefig("bmi0.pdf")
[ 6.62858971 1.00510313 0.3199968 ] [ 8.64919034 4.7262191 7.70046893]
<matplotlib.legend.Legend at 0x7f9debd33990>
# general polynomial data
import numpy as np
def gen_poly_data(N, isRandom=True, seed=1):
    """Sample N points from the curve y = (x - 3)^2 on [0, 10].

    isRandom=True: x drawn uniformly at random (seeded) and y perturbed
    by N(0, 2) Gaussian noise.
    isRandom=False: x evenly spaced, y noise-free.

    Returns (x, y).
    """
    if not isRandom:
        xs = np.linspace(0, 10, N)
        return (xs, (xs - 3) ** 2)
    random.seed(seed)
    xs = random.uniform(0, 10, N)
    noisy = (xs - 3) ** 2 + random.normal(0, 2, N)
    return (xs, noisy)
def gen_poly_features(x, max_order=3):
    """Vandermonde-style feature matrix for a 1-D sample array.

    Row 0 is all ones (bias); row k is x raised element-wise to the k-th
    power, for k = 1..max_order.

    Returns an array of shape (max_order + 1, x.size).
    """
    power = np.ones(x.size)
    rows = []
    for _ in range(max_order + 1):
        rows.append(power)
        power = power * x  # next power, built by repeated multiplication
    return np.array(rows)
%matplotlib inline
import matplotlib.pyplot as plt
# Experiment 2: fit polynomials of increasing degree to 10 noisy samples of
# y = (x-3)^2 and compare train/test MSE — an overfitting illustration.
max_order=9
mse_train_list=[]
mse_test_list=[]
for order in range(1,max_order+1):
    # 10 noisy training points (fixed seed) vs. 100 noise-free test points.
    x,y=gen_poly_data(10,True,3)
    x_train = gen_poly_features(x,order)
    y_train = y
    xt,yt=gen_poly_data(100,False)
    x_test = gen_poly_features(xt,order)
    y_test = yt
    ws=compute_weights(x_train,y_train)
    mse,_=compute_mse(ws,x_train,y_train)
    print(mse)
    mse_train_list.append(mse)
    mse_test,ye=compute_mse(ws,x_test,y_test)
    print(mse_test)
    mse_test_list.append(mse_test)
    # Plot this degree's fit against the true curve, then clear the axes
    # so each iteration starts from a blank figure.
    line1,=plt.plot(xt,ye,label="Estimated curve")
    plt.plot(x,y,'x')
    line2,=plt.plot(xt,yt,'r',label="Original curve")
    plt.legend(handles=[line1,line2])
    # plt.savefig('curve_fit'+str(order)+'.pdf') # save figure
    plt.cla()
p_order=6
# Show only the first p_order degrees — higher degrees blow up the test MSE
# and would dwarf the rest of the plot.
line1,=plt.plot(range(1,p_order+1),mse_train_list[0:p_order],label="Training error")
line2,=plt.plot(range(1,p_order+1),mse_test_list[0:p_order],label="Testing error")
plt.legend(handles=[line1,line2])
plt.xlabel('Max degree')
plt.ylabel('MSE')
plt.ylim([0,10])
# plt.savefig('curve_fit_err.pdf') # save figure
64.0703368074 58.1449568138 1.76608000787 2.24223953871 1.76602144107 2.24943481913 1.74206072203 2.80766073752 0.734274250627 44.2056154586 0.575730627164 228.218092695 0.575420075101 194.306626484 0.55113926434 624.354036092 1.96999204674e-10 41349.6855465
%matplotlib inline
from sklearn import linear_model
import matplotlib.pyplot as plt
# Experiment 3: compare Lasso (sklearn) against the closed-form ridge
# regression above for a degree-9 polynomial fit, over several alphas.
order = 9
# Settings needed for matplotlib to render LaTeX labels (disabled):
# plt.clf()
# plt.rc('text', usetex=True)
# plt.rc('font', family='serif')
for alpha in [0.15,0.3,0.5,1,2,4,8]:
    x,y=gen_poly_data(10,True,3)
    x_train = gen_poly_features(x,order)
    y_train = y
    xt,yt=gen_poly_data(100,False)
    x_test = gen_poly_features(xt,order)
    y_test = yt
    # sklearn expects samples as rows, hence the transpose of the
    # (features x N) matrices used elsewhere in this file.
    clf = linear_model.Lasso(alpha=alpha)
    clf.fit(x_train.T,y_train)
    # Ridge weights from the closed-form normal equations.
    ws=compute_ridge_weights(x_train,y_train,alpha)
    mse_test,yer=compute_mse(ws,x_test,y_test)
    # clfr = linear_model.Ridge(alpha=alpha)
    # clfr.fit(x_train.T,y_train)
    ye = clf.predict(x_test.T)
    # yer = clf.predict(x_test.T)
    plt.clf()
    line1,=plt.plot(xt,ye,label="Lasso")
    line3,=plt.plot(xt,yer,label="Ridge regression")
    plt.plot(x,y,'x')
    line2,=plt.plot(xt,yt,'r',label="Original curve")
    plt.legend(handles=[line1,line3,line2])
    # plt.title(r'\lambda = '+str(alpha)) # matplotlib appears to have bug to parse latex input
    plt.title('lambda = '+str(alpha))
    # plt.savefig('regularized_fit_'+str(alpha)+'.pdf') # save figure
/home/phsamuel/anaconda3/envs/conda2/lib/python2.7/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems. ConvergenceWarning)