#!/usr/bin/env python # coding: utf-8 # **7장 – 앙상블 학습과 랜덤 포레스트** # _이 노트북은 7장에 있는 모든 샘플 코드와 연습문제 해답을 가지고 있습니다._ # # #

# # 설정 # 먼저 몇 개의 모듈을 임포트합니다. 맷플롯립 그래프를 인라인으로 출력하도록 만들고 그림을 저장하는 함수를 준비합니다. 또한 파이썬 버전이 3.5 이상인지 확인합니다(파이썬 2.x에서도 동작하지만 곧 지원이 중단되므로 파이썬 3을 사용하는 것이 좋습니다). 사이킷런 버전이 0.20 이상인지도 확인합니다. # In[1]: # 파이썬 ≥3.5 필수 import sys assert sys.version_info >= (3, 5) # 사이킷런 ≥0.20 필수 import sklearn assert sklearn.__version__ >= "0.20" # 공통 모듈 임포트 import numpy as np import os # 노트북 실행 결과를 동일하게 유지하기 위해 np.random.seed(42) # 깔끔한 그래프 출력을 위해 get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib as mpl import matplotlib.pyplot as plt mpl.rc('axes', labelsize=14) mpl.rc('xtick', labelsize=12) mpl.rc('ytick', labelsize=12) # 그림을 저장할 위치 PROJECT_ROOT_DIR = "." CHAPTER_ID = "ensembles" IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID) os.makedirs(IMAGES_PATH, exist_ok=True) def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300): path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension) print("그림 저장:", fig_id) if tight_layout: plt.tight_layout() plt.savefig(path, format=fig_extension, dpi=resolution) # # 투표기반 분류기 # In[2]: heads_proba = 0.51 coin_tosses = (np.random.rand(10000, 10) < heads_proba).astype(np.int32) cumulative_heads_ratio = np.cumsum(coin_tosses, axis=0) / np.arange(1, 10001).reshape(-1, 1) # **<그림 7-3. 큰 수의 법칙> 생성 코드** # In[3]: plt.figure(figsize=(8,3.5)) plt.plot(cumulative_heads_ratio) plt.plot([0, 10000], [0.51, 0.51], "k--", linewidth=2, label="51%") plt.plot([0, 10000], [0.5, 0.5], "k-", label="50%") plt.xlabel("Number of coin tosses") plt.ylabel("Heads ratio") plt.legend(loc="lower right") plt.axis([0, 10000, 0.42, 0.58]) save_fig("law_of_large_numbers_plot") plt.show() # moons 데이터셋을 사용해 보죠: # In[4]: from sklearn.model_selection import train_test_split from sklearn.datasets import make_moons X, y = make_moons(n_samples=500, noise=0.30, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) # **노트**: 향후 버전을 위해 사이킷런에서 기본 값이 될 `solver="lbfgs"`, `n_estimators=100`, `gamma="scale"`로 지정합니다. # **코드 예제:** # In[5]: from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import VotingClassifier from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC log_clf = LogisticRegression(solver="lbfgs", random_state=42) rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42) svm_clf = SVC(gamma="scale", random_state=42) voting_clf = VotingClassifier( estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)], voting='hard') # In[6]: voting_clf.fit(X_train, y_train) # In[7]: from sklearn.metrics import accuracy_score for clf in (log_clf, rnd_clf, svm_clf, voting_clf): clf.fit(X_train, y_train) y_pred = clf.predict(X_test) print(clf.__class__.__name__, accuracy_score(y_test, y_pred)) # **노트**: 사이킷런 알고리즘이 이따금 업데이트되기 때문에 이 노트북의 결과가 책과 조금 다를 수 있습니다. # 간접 투표: # In[8]: log_clf = LogisticRegression(solver="lbfgs", random_state=42) rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42) svm_clf = SVC(gamma="scale", probability=True, random_state=42) voting_clf = VotingClassifier( estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)], voting='soft') voting_clf.fit(X_train, y_train) # In[9]: from sklearn.metrics import accuracy_score for clf in (log_clf, rnd_clf, svm_clf, voting_clf): clf.fit(X_train, y_train) y_pred = clf.predict(X_test) print(clf.__class__.__name__, accuracy_score(y_test, y_pred)) # # 배깅과 페이스팅 # ## 사이킷런의 배깅과 페이스팅 # In[10]: from sklearn.ensemble import BaggingClassifier from sklearn.tree import DecisionTreeClassifier bag_clf = BaggingClassifier( DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, random_state=42) bag_clf.fit(X_train, y_train) y_pred = bag_clf.predict(X_test) # In[11]: from sklearn.metrics import accuracy_score print(accuracy_score(y_test, y_pred)) # In[12]: tree_clf = DecisionTreeClassifier(random_state=42) tree_clf.fit(X_train, y_train) y_pred_tree = tree_clf.predict(X_test) print(accuracy_score(y_test, y_pred_tree)) # **<그림 7-5. 단일 경정 트리(왼쪽)와 500개 트리로 만든 배깅 앙상블(오른쪽) 비교> 생성 코드** # In[13]: from matplotlib.colors import ListedColormap def plot_decision_boundary(clf, X, y, axes=[-1.5, 2.45, -1, 1.5], alpha=0.5, contour=True): x1s = np.linspace(axes[0], axes[1], 100) x2s = np.linspace(axes[2], axes[3], 100) x1, x2 = np.meshgrid(x1s, x2s) X_new = np.c_[x1.ravel(), x2.ravel()] y_pred = clf.predict(X_new).reshape(x1.shape) custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0']) plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap) if contour: custom_cmap2 = ListedColormap(['#7d7d58','#4c4c7f','#507d50']) plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8) plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo", alpha=alpha) plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs", alpha=alpha) plt.axis(axes) plt.xlabel(r"$x_1$", fontsize=18) plt.ylabel(r"$x_2$", fontsize=18, rotation=0) # In[14]: fig, axes = plt.subplots(ncols=2, figsize=(10,4), sharey=True) plt.sca(axes[0]) plot_decision_boundary(tree_clf, X, y) plt.title("Decision Tree", fontsize=14) plt.sca(axes[1]) plot_decision_boundary(bag_clf, X, y) plt.title("Decision Trees with Bagging", fontsize=14) plt.ylabel("") save_fig("decision_tree_without_and_with_bagging_plot") plt.show() # ## OOB 평가 # In[15]: bag_clf = BaggingClassifier( DecisionTreeClassifier(), n_estimators=500, bootstrap=True, oob_score=True, random_state=40) bag_clf.fit(X_train, y_train) bag_clf.oob_score_ # In[16]: bag_clf.oob_decision_function_ # In[17]: from sklearn.metrics import accuracy_score y_pred = bag_clf.predict(X_test) accuracy_score(y_test, y_pred) # # 랜덤 포레스트 # In[18]: from sklearn.ensemble import RandomForestClassifier rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42) rnd_clf.fit(X_train, y_train) y_pred_rf = rnd_clf.predict(X_test) # 랜덤 포레스트는 결정 트리의 배깅과 비슷합니다: # In[19]: bag_clf = BaggingClassifier( DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes=16), n_estimators=500, random_state=42) # In[20]: bag_clf.fit(X_train, y_train) y_pred = bag_clf.predict(X_test) # In[21]: np.sum(y_pred == y_pred_rf) / len(y_pred) # 거의 에측이 동일합니다. # ## 특성 중요도 # In[22]: from sklearn.datasets import load_iris iris = load_iris() rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42) rnd_clf.fit(iris["data"], iris["target"]) for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_): print(name, score) # In[23]: rnd_clf.feature_importances_ # 다음 그림은 15개 결정 트리의 결정 경계를 중첩한 것입니다. 여기서 볼 수 있듯이 개별 결정 트리는 불완전하지만 앙상블되면 매우 좋은 결정 경계를 만듭니다: # In[24]: plt.figure(figsize=(6, 4)) for i in range(15): tree_clf = DecisionTreeClassifier(max_leaf_nodes=16, random_state=42 + i) indices_with_replacement = np.random.randint(0, len(X_train), len(X_train)) tree_clf.fit(X_train[indices_with_replacement], y_train[indices_with_replacement]) plot_decision_boundary(tree_clf, X, y, axes=[-1.5, 2.45, -1, 1.5], alpha=0.02, contour=False) plt.show() # **<그림 7-6. (랜덤 포레스트 분류기에서 얻은) MNIST 픽셀 중요도> 생성 코드** # In[25]: from sklearn.datasets import fetch_openml mnist = fetch_openml('mnist_784', version=1) mnist.target = mnist.target.astype(np.uint8) # In[26]: rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42) rnd_clf.fit(mnist["data"], mnist["target"]) # In[27]: def plot_digit(data): image = data.reshape(28, 28) plt.imshow(image, cmap = mpl.cm.hot, interpolation="nearest") plt.axis("off") # In[28]: plot_digit(rnd_clf.feature_importances_) cbar = plt.colorbar(ticks=[rnd_clf.feature_importances_.min(), rnd_clf.feature_importances_.max()]) cbar.ax.set_yticklabels(['Not important', 'Very important']) save_fig("mnist_feature_importance_plot") plt.show() # # 부스팅 # # ## 에이다부스트 # In[29]: from sklearn.ensemble import AdaBoostClassifier ada_clf = AdaBoostClassifier( DecisionTreeClassifier(max_depth=1), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5, random_state=42) ada_clf.fit(X_train, y_train) # In[30]: plot_decision_boundary(ada_clf, X, y) # **<그림 7-8. 연속된 예측기의 결정 경계> 생성 코드** # In[31]: m = len(X_train) fig, axes = plt.subplots(ncols=2, figsize=(10,4), sharey=True) for subplot, learning_rate in ((0, 1), (1, 0.5)): sample_weights = np.ones(m) / m plt.sca(axes[subplot]) for i in range(5): svm_clf = SVC(kernel="rbf", C=0.2, gamma=0.6, random_state=42) svm_clf.fit(X_train, y_train, sample_weight=sample_weights * m) y_pred = svm_clf.predict(X_train) r = sample_weights[y_pred != y_train].sum() / sample_weights.sum() # equation 7-1 alpha = learning_rate * np.log((1 - r) / r) # equation 7-2 sample_weights[y_pred != y_train] *= np.exp(alpha) # equation 7-3 sample_weights /= sample_weights.sum() # normalization step plot_decision_boundary(svm_clf, X, y, alpha=0.2) plt.title("learning_rate = {}".format(learning_rate), fontsize=16) if subplot == 0: plt.text(-0.7, -0.65, "1", fontsize=14) plt.text(-0.6, -0.10, "2", fontsize=14) plt.text(-0.5, 0.10, "3", fontsize=14) plt.text(-0.4, 0.55, "4", fontsize=14) plt.text(-0.3, 0.90, "5", fontsize=14) else: plt.ylabel("") save_fig("boosting_plot") plt.show() # # 그레이디언트 부스팅 # 간단한 이차식 형태의 데이터셋을 만들어 보죠: # In[32]: np.random.seed(42) X = np.random.rand(100, 1) - 0.5 y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100) # 이제 이 데이터셋에 결정 트리 회귀 모델을 훈련시킵니다: # In[33]: from sklearn.tree import DecisionTreeRegressor tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42) tree_reg1.fit(X, y) # In[34]: y2 = y - tree_reg1.predict(X) tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42) tree_reg2.fit(X, y2) # In[35]: y3 = y2 - tree_reg2.predict(X) tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42) tree_reg3.fit(X, y3) # In[36]: X_new = np.array([[0.8]]) # In[37]: y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3)) # In[38]: y_pred # **<그림 7-9> 생성 코드** # In[39]: def plot_predictions(regressors, X, y, axes, label=None, style="r-", data_style="b.", data_label=None): x1 = np.linspace(axes[0], axes[1], 500) y_pred = sum(regressor.predict(x1.reshape(-1, 1)) for regressor in regressors) plt.plot(X[:, 0], y, data_style, label=data_label) plt.plot(x1, y_pred, style, linewidth=2, label=label) if label or data_label: plt.legend(loc="upper center", fontsize=16) plt.axis(axes) # In[40]: plt.figure(figsize=(11,11)) plt.subplot(321) plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h_1(x_1)$", style="g-", data_label="Training set") plt.ylabel("$y$", fontsize=16, rotation=0) plt.title("Residuals and tree predictions", fontsize=16) plt.subplot(322) plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1)$", data_label="Training set") plt.ylabel("$y$", fontsize=16, rotation=0) plt.title("Ensemble predictions", fontsize=16) plt.subplot(323) plot_predictions([tree_reg2], X, y2, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_2(x_1)$", style="g-", data_style="k+", data_label="Residuals") plt.ylabel("$y - h_1(x_1)$", fontsize=16) plt.subplot(324) plot_predictions([tree_reg1, tree_reg2], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1)$") plt.ylabel("$y$", fontsize=16, rotation=0) plt.subplot(325) plot_predictions([tree_reg3], X, y3, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_3(x_1)$", style="g-", data_style="k+") plt.ylabel("$y - h_1(x_1) - h_2(x_1)$", fontsize=16) plt.xlabel("$x_1$", fontsize=16) plt.subplot(326) plot_predictions([tree_reg1, tree_reg2, tree_reg3], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1) + h_3(x_1)$") plt.xlabel("$x_1$", fontsize=16) plt.ylabel("$y$", fontsize=16, rotation=0) save_fig("gradient_boosting_plot") plt.show() # 이제 그레이디언트 부스팅 회귀 모델을 사용해 보죠: # In[41]: from sklearn.ensemble import GradientBoostingRegressor gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42) gbrt.fit(X, y) # **<그림 7-10. 예측기가 부족한 경우(왼쪽)과 너무 많은 경우(오른쪽)의 GBRT 앙상블> 생성 코드** # In[42]: gbrt_slow = GradientBoostingRegressor(max_depth=2, n_estimators=200, learning_rate=0.1, random_state=42) gbrt_slow.fit(X, y) # In[43]: fig, axes = plt.subplots(ncols=2, figsize=(10,4), sharey=True) plt.sca(axes[0]) plot_predictions([gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="Ensemble predictions") plt.title("learning_rate={}, n_estimators={}".format(gbrt.learning_rate, gbrt.n_estimators), fontsize=14) plt.xlabel("$x_1$", fontsize=16) plt.ylabel("$y$", fontsize=16, rotation=0) plt.sca(axes[1]) plot_predictions([gbrt_slow], X, y, axes=[-0.5, 0.5, -0.1, 0.8]) plt.title("learning_rate={}, n_estimators={}".format(gbrt_slow.learning_rate, gbrt_slow.n_estimators), fontsize=14) plt.xlabel("$x_1$", fontsize=16) save_fig("gbrt_learning_rate_plot") plt.show() # **조기 종료를 사용한 그래디언트 부스팅** # In[44]: import numpy as np from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49) gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42) gbrt.fit(X_train, y_train) errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)] bst_n_estimators = np.argmin(errors) + 1 gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators, random_state=42) gbrt_best.fit(X_train, y_train) # **<그림 7-11. 조기 종료를 사용하여 트리 수 튜닝> 생성 코드** # In[45]: min_error = np.min(errors) # In[46]: plt.figure(figsize=(10, 4)) plt.subplot(121) plt.plot(np.arange(1, len(errors) + 1), errors, "b.-") plt.plot([bst_n_estimators, bst_n_estimators], [0, min_error], "k--") plt.plot([0, 120], [min_error, min_error], "k--") plt.plot(bst_n_estimators, min_error, "ko") plt.text(bst_n_estimators, min_error*1.2, "Minimum", ha="center", fontsize=14) plt.axis([0, 120, 0, 0.01]) plt.xlabel("Number of trees") plt.ylabel("Error", fontsize=16) plt.title("Validation error", fontsize=14) plt.subplot(122) plot_predictions([gbrt_best], X, y, axes=[-0.5, 0.5, -0.1, 0.8]) plt.title("Best model (%d trees)" % bst_n_estimators, fontsize=14) plt.ylabel("$y$", fontsize=16, rotation=0) plt.xlabel("$x_1$", fontsize=16) save_fig("early_stopping_gbrt_plot") plt.show() # 어느 정도 유예를 갖는 조기 종료(5 에포크 동안 향상되지 않을 때만 훈련 중지): # In[47]: gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42) min_val_error = float("inf") error_going_up = 0 for n_estimators in range(1, 120): gbrt.n_estimators = n_estimators gbrt.fit(X_train, y_train) y_pred = gbrt.predict(X_val) val_error = mean_squared_error(y_val, y_pred) if val_error < min_val_error: min_val_error = val_error error_going_up = 0 else: error_going_up += 1 if error_going_up == 5: break # early stopping # In[48]: print(gbrt.n_estimators) # In[49]: print("Minimum validation MSE:", min_val_error) # **XGBoost 사용하기** # In[50]: try: import xgboost except ImportError as ex: print("에러: xgboost 라이브러리 설치되지 않았습니다.") xgboost = None # In[51]: if xgboost is not None: # 책에 없음 xgb_reg = xgboost.XGBRegressor(random_state=42) xgb_reg.fit(X_train, y_train) y_pred = xgb_reg.predict(X_val) val_error = mean_squared_error(y_val, y_pred) # 책에 없음 print("Validation MSE:", val_error) # 책에 없음 # In[52]: if xgboost is not None: # 책에 없음 xgb_reg.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=2) y_pred = xgb_reg.predict(X_val) val_error = mean_squared_error(y_val, y_pred) # 책에 없음 print("Validation MSE:", val_error) # 책에 없음 # In[53]: get_ipython().run_line_magic('timeit', 'xgboost.XGBRegressor().fit(X_train, y_train) if xgboost is not None else None') # In[54]: get_ipython().run_line_magic('timeit', 'GradientBoostingRegressor().fit(X_train, y_train)') # # 연습문제 해답 # ## 1. to 7. # 부록 A 참조. # ## 8. 투표 기반 분류기 # 문제: _MNIST 데이터를 불러들여 훈련 세트, 검증 세트, 테스트 세트로 나눕니다(예를 들면 훈련에 40,000개 샘플, 검증에 10,000개 샘플, 테스트에 10,000개 샘플)._ # MNIST 데이터셋은 앞에서 로드했습니다. # In[55]: from sklearn.model_selection import train_test_split # In[56]: X_train_val, X_test, y_train_val, y_test = train_test_split( mnist.data, mnist.target, test_size=10000, random_state=42) X_train, X_val, y_train, y_val = train_test_split( X_train_val, y_train_val, test_size=10000, random_state=42) # 문제: _그런 다음 랜덤 포레스트 분류기, 엑스트라 트리 분류기, SVM 같은 여러 종류의 분류기를 훈련시킵니다._ # In[57]: from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier from sklearn.svm import LinearSVC from sklearn.neural_network import MLPClassifier # In[58]: random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42) extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=42) svm_clf = LinearSVC(max_iter=100, tol=20, random_state=42) mlp_clf = MLPClassifier(random_state=42) # In[59]: estimators = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf] for estimator in estimators: print("Training the", estimator) estimator.fit(X_train, y_train) # In[60]: [estimator.score(X_val, y_val) for estimator in estimators] # 선형 SVM이 다른 분류기보다 성능이 많이 떨어집니다. 그러나 투표 기반 분류기의 성능을 향상시킬 수 있으므로 그대로 두겠습니다. # 문제: _그리고 검증 세트에서 개개의 분류기보다 더 높은 성능을 내도록 이들을 간접 또는 직접 투표 분류기를 사용하는 앙상블로 연결해보세요._ # In[61]: from sklearn.ensemble import VotingClassifier # In[62]: named_estimators = [ ("random_forest_clf", random_forest_clf), ("extra_trees_clf", extra_trees_clf), ("svm_clf", svm_clf), ("mlp_clf", mlp_clf), ] # In[63]: voting_clf = VotingClassifier(named_estimators) # In[64]: voting_clf.fit(X_train, y_train) # In[65]: voting_clf.score(X_val, y_val) # In[66]: [estimator.score(X_val, y_val) for estimator in voting_clf.estimators_] # SVM 모델을 제거해서 성능이 향상되는지 확인해 보죠. 다음과 같이 `set_params()`를 사용하여 `None`으로 지정하면 특정 예측기를 제외시킬 수 있습니다: # In[67]: voting_clf.set_params(svm_clf=None) # 예측기 목록이 업데이트되었습니다: # In[68]: voting_clf.estimators # 하지만 훈련된 예측기 목록은 업데이트되지 않습니다: # In[69]: voting_clf.estimators_ # `VotingClassifier`를 다시 훈련시키거나 그냥 훈련된 예측기 목록에서 SVM 모델을 제거할 수 있습니다: # In[70]: del voting_clf.estimators_[2] # `VotingClassifier`를 다시 평가해 보죠: # In[71]: voting_clf.score(X_val, y_val) # 훨씬 나아졌네요! SVM 모델이 성능을 저하시켰습니다. 이제 간접 투표 분류기를 사용해 보죠. 분류기를 다시 훈련시킬 필요는 없고 `voting`을 `"soft"`로 지정하면 됩니다: # In[72]: voting_clf.voting = "soft" # In[73]: voting_clf.score(X_val, y_val) # 이 경우는 직접 투표 방식이 낫네요. # _앙상블을 얻고 나면 테스트 세트로 확인해보세요. 개개의 분류기와 비교해서 성능이 얼마나 향상되나요?_ # In[74]: voting_clf.voting = "hard" voting_clf.score(X_test, y_test) # In[75]: [estimator.score(X_test, y_test) for estimator in voting_clf.estimators_] # 여기서는 투표 기반 분류기가 최선의 모델의 오차율을 아주 조금만 감소시킵니다. # ## 9. 스태킹 앙상블 # 문제: _이전 연습문제의 각 분류기를 실행해서 검증 세트에서 예측을 만들고 그 결과로 새로운 훈련 세트를 만들어보세요. 각 훈련 샘플은 하나의 이미지에 대한 전체 분류기의 예측을 담은 벡터고 타깃은 이미지의 클래스입니다. 새로운 훈련 세트에 분류기 하나를 훈련시켜 보세요._ # In[76]: X_val_predictions = np.empty((len(X_val), len(estimators)), dtype=np.float32) for index, estimator in enumerate(estimators): X_val_predictions[:, index] = estimator.predict(X_val) # In[77]: X_val_predictions # In[78]: rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42) rnd_forest_blender.fit(X_val_predictions, y_val) # In[79]: rnd_forest_blender.oob_score_ # 이 블렌더를 세밀하게 튜닝하거나 다른 종류의 블렌더(예를 들어, `MLPClassifier`)를 시도해 볼 수 있습니다. 그런 늘 하던대로 다음 교차 검증을 사용해 가장 좋은 것을 선택합니다. # 문제: _축하합니다. 방금 블렌더를 훈련시켰습니다. 그리고 이 분류기를 모아서 스태킹 앙상블을 구성했습니다. 이제 테스트 세트에 앙상블을 평가해보세요. 테스트 세트의 각 이미지에 대해 모든 분류기로 예측을 만들고 앙상블의 예측 결과를 만들기 위해 블렌더에 그 예측을 주입합니다. 앞서 만든 투표 분류기와 비교하면 어떤가요?_ # In[80]: X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32) for index, estimator in enumerate(estimators): X_test_predictions[:, index] = estimator.predict(X_test) # In[81]: y_pred = rnd_forest_blender.predict(X_test_predictions) # In[82]: from sklearn.metrics import accuracy_score # In[83]: accuracy_score(y_test, y_pred) # 이 스태킹 앙상블은 앞서 만든 투표 기반 분류기만큼 성능을 내지는 못합니다. 최선의 개별 분류기만큼 뛰어나지는 않습니다.