# Exploratory analysis: fit an RBF-kernel SVR on the sklearn diabetes dataset
# via an exhaustive grid search; the rest of the file reconstructs the
# decision function by hand from the fitted support vectors.
#
# NOTE(review): this file was flattened from a Jupyter notebook. The `%%time`
# cell magic that followed the train/test split is not valid plain Python and
# has been removed.
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from scipy.spatial.distance import cdist
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

mpl.rcParams['figure.figsize'] = [12, 12]

from warnings import filterwarnings
filterwarnings('ignore')  # NOTE(review): silences ALL warnings globally — deliberate for notebook use, but hides real problems

data = load_diabetes()
data['data'].shape, data['target'].shape, data['feature_names']  # notebook cell echo; no effect as a script

X = pd.DataFrame(data['data'], columns=data['feature_names'])
y = pd.DataFrame(data['target'], columns=['target'])
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.25, random_state=66)

# Hyper-parameter search: 10 x 10 x 10 log-spaced grid over epsilon, C and
# gamma with 3-fold CV. Features are standardized inside the pipeline so the
# scaler is fit on each CV training fold only (no leakage).
params = {'svr__epsilon': np.logspace(-4, 4, 10),
          'svr__C': np.logspace(-4, 4, 10),
          'svr__gamma': np.logspace(-4, 4, 10),
          }
pipeline = make_pipeline(StandardScaler(), SVR())
model = GridSearchCV(pipeline, params, cv=3)
# .values.ravel(): SVR expects a 1-D target; passing the (n, 1) DataFrame
# raised a DataConversionWarning (previously hidden by filterwarnings above).
model.fit(Xtrain, ytrain.values.ravel())


def k_rbf(x, y, g):
    """RBF kernel k(x, y) = exp(-g * ||x - y||^2)."""
    return np.exp(-g * np.linalg.norm(x - y) ** 2.)
# --- Sanity check: rebuild the SVR prediction by hand ----------------------
# Pull the fitted SVR and the fitted scaler out of the best pipeline, compute
# the kernel matrix between support vectors and (scaled) training rows, and
# compare dual_coef_ @ K + intercept_ against model.predict(). The scatter
# plot should be the identity line.
mm = model.best_estimator_.steps[-1][-1]
x_sc = model.best_estimator_.steps[0][-1].transform(Xtrain)
C = cdist(mm.support_vectors_, x_sc, lambda a,b: k_rbf(a,b, mm.gamma))
yc=(mm.dual_coef_ @ C).T
ycalc = model.predict(Xtrain)
plt.plot(ycalc, yc+mm.intercept_, 'o')
plt.axis('square') # SVR fully understood  (translated from Japanese)
plt.show()


def calc_contribs(svr_model, x_tr):
    """Per-feature contribution of each row of *x_tr* to the SVR output.

    Parameters: *svr_model* is a fitted sklearn SVR with a numeric gamma
    (the grid search above only tries numeric gammas); *x_tr* is an already
    scaled (n_samples, n_features) array.
    Returns an (n_samples, n_features) array.
    """
    def coeff(a, b):
        "coefficient for the rbf kernel"
        # NOTE(review): the analytic gradient of exp(-g*||sv - x||^2) w.r.t.
        # x_j carries a factor 2g*(sv_j - x_j); the extra multiplication by
        # b (= x_j, the scaled feature value) makes this look like a
        # gradient-times-input attribution. Confirm the intended formula —
        # cannot be verified from this file alone.
        return (-2.*b) * (a-b)
    # Kernel values between every support vector and every input row,
    # pre-multiplied by gamma (part of the derivative's constant factor).
    C = cdist(svr_model.support_vectors_, x_tr, lambda a,b: k_rbf(a,b, svr_model.gamma)) * svr_model.gamma
    # A[i, j, m] = coeff(sv_i[j], x_m[j]), built one feature column at a time.
    A = np.zeros((svr_model.support_.shape[0], svr_model.support_vectors_.shape[1], x_tr.shape[0]))
    for j in range(x_tr.shape[1]):
        D = cdist(svr_model.support_vectors_[:,j].reshape(-1,1), x_tr[:,j].reshape(-1,1), metric=lambda a,b: coeff(a,b) )
        A[:,j,:] = D
    # Weight by the dual coefficients and sum over support vectors:
    # (n_sv,1,1) * (n_sv,n_feat,n_samp) -> sum axis 0 -> (n_feat,n_samp) -> .T
    contribs_ = (svr_model.dual_coef_.T.reshape(-1,1,1) * (A * C[:,None,:])).sum(axis=0).T
    return contribs_


# Contributions for both splits, computed in the scaler's feature space.
contrib = {}
for dtype, xinput in zip(['train','test'], [Xtrain, Xtest]):
    # NOTE(review): the local is named x_train even for the test split.
    x_train = model.best_estimator_.steps[0][-1].transform(xinput)
    contrib[dtype] = calc_contribs(mm, x_train)

# Overlayed train/test histograms of each feature's contribution, sharing
# the bin edges computed from the training split.
for i_var in range(contrib['train'].shape[1]):
    fig = plt.figure()
    _,bins,_ = plt.hist(contrib['train'][:, i_var], bins=10, alpha=.7)
    plt.hist(contrib['test'][:, i_var], bins=bins, alpha=.3)
    plt.xlabel(data['feature_names'][i_var])
    plt.show()

cons = {}
for dname in ['train', 'test']:
    # Row-normalisation (unit L2 norm per sample) was tried and left disabled.
    cons[dname] = contrib[dname] #np.divide(contrib[dname], np.linalg.norm(contrib[dname],axis=1).reshape(-1,1))

ytrain.min(),ytrain.max(),ytest.min(), ytest.max()  # notebook cell echo; no effect as a script

# Stacked per-feature contributions for the first 10 test samples.
fig=plt.figure()
n_trains = Xtrain.shape[0]  # NOTE(review): unused below
n_tests = Xtest.shape[0]  # NOTE(review): unused below
plt.stackplot(range(1,1+10), *cons['test'][:10,:].T)# quite artistic!
plt.show()