Chapter 7 – Ensemble Learning and Random Forests
This notebook contains all the sample code and solutions to the exercises in Chapter 7.
First, let's import a few common modules, make Matplotlib plot figures inline, and prepare a function to save the figures. We also check that Python 3.5 or later is installed (even though Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥ 0.20.
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# To make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
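To illustrate the law of large numbers, the following simulates 10,000 series of 10 coin tosses with a slightly biased coin that has a 51% chance of landing on heads: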
heads_proba = 0.51
coin_tosses = (np.random.rand(10000, 10) < heads_proba).astype(np.int32)
cumulative_heads_ratio = np.cumsum(coin_tosses, axis=0) / np.arange(1, 10001).reshape(-1, 1)
Code to generate Figure 7-3. The law of large numbers
plt.figure(figsize=(8,3.5))
plt.plot(cumulative_heads_ratio)
plt.plot([0, 10000], [0.51, 0.51], "k--", linewidth=2, label="51%")
plt.plot([0, 10000], [0.5, 0.5], "k-", label="50%")
plt.xlabel("Number of coin tosses")
plt.ylabel("Heads ratio")
plt.legend(loc="lower right")
plt.axis([0, 10000, 0.42, 0.58])
save_fig("law_of_large_numbers_plot")
plt.show()
Saving figure law_of_large_numbers_plot
Let's use the moons dataset:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
Note: we set solver="lbfgs", n_estimators=100, and gamma="scale" explicitly, since these will be the default values in upcoming Scikit-Learn versions.
Code example:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", random_state=42)
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')
voting_clf.fit(X_train, y_train)
VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)), ('rf', RandomForestClassifier(random_state=42)), ('svc', SVC(random_state=42))])
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.912
Note: the results in this notebook may differ slightly from those in the book, since Scikit-Learn algorithms sometimes get tweaked between versions.
Soft voting:
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", probability=True, random_state=42)
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
voting_clf.fit(X_train, y_train)
VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)), ('rf', RandomForestClassifier(random_state=42)), ('svc', SVC(probability=True, random_state=42))], voting='soft')
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.92
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))
0.904
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_tree))
0.856
Code to generate Figure 7-5. A single Decision Tree (left) versus a bagging ensemble of 500 trees (right)
from matplotlib.colors import ListedColormap
def plot_decision_boundary(clf, X, y, axes=[-1.5, 2.45, -1, 1.5], alpha=0.5, contour=True):
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if contour:
        custom_cmap2 = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)
    plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo", alpha=alpha)
    plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs", alpha=alpha)
    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)
fig, axes = plt.subplots(ncols=2, figsize=(10,4), sharey=True)
plt.sca(axes[0])
plot_decision_boundary(tree_clf, X, y)
plt.title("Decision Tree", fontsize=14)
plt.sca(axes[1])
plot_decision_boundary(bag_clf, X, y)
plt.title("Decision Trees with Bagging", fontsize=14)
plt.ylabel("")
save_fig("decision_tree_without_and_with_bagging_plot")
plt.show()
Saving figure decision_tree_without_and_with_bagging_plot
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, oob_score=True, random_state=40)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_
0.8986666666666666
bag_clf.oob_decision_function_
array([[0.32275132, 0.67724868],
       [0.34117647, 0.65882353],
       [1.        , 0.        ],
       ...,
       [0.03108808, 0.96891192],
       [0.58854167, 0.41145833]])
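For each training instance, the OOB decision function returns the class probabilities (because the base estimator has a predict_proba() method), computed using only the predictors that did not see that instance during training.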
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)
0.912
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)
A Random Forest is similar to a bag of decision trees:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes=16),
    n_estimators=500, random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
np.sum(y_pred == y_pred_rf) / len(y_pred)  # almost identical predictions
1.0
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)
sepal length (cm) 0.11249225099876375
sepal width (cm) 0.02311928828251033
petal length (cm) 0.4410304643639577
petal width (cm) 0.4233579963547682
rnd_clf.feature_importances_
array([0.11249225, 0.02311929, 0.44103046, 0.423358 ])
The following figure overlays the decision boundaries of 15 decision trees. As you can see, even though each decision tree is imperfect, the ensemble defines a pretty good decision boundary:
plt.figure(figsize=(6, 4))

for i in range(15):
    tree_clf = DecisionTreeClassifier(max_leaf_nodes=16, random_state=42 + i)
    indices_with_replacement = np.random.randint(0, len(X_train), len(X_train))
    tree_clf.fit(X_train[indices_with_replacement], y_train[indices_with_replacement])
    plot_decision_boundary(tree_clf, X, y, axes=[-1.5, 2.45, -1, 1.5], alpha=0.02, contour=False)

plt.show()
Code to generate Figure 7-6. MNIST pixel importance (according to a Random Forest classifier)
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)
mnist.target = mnist.target.astype(np.uint8)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rnd_clf.fit(mnist["data"], mnist["target"])
RandomForestClassifier(random_state=42)
def plot_digit(data):
    image = data.reshape(28, 28)
    plt.imshow(image, cmap=mpl.cm.hot,
               interpolation="nearest")
    plt.axis("off")
plot_digit(rnd_clf.feature_importances_)
cbar = plt.colorbar(ticks=[rnd_clf.feature_importances_.min(), rnd_clf.feature_importances_.max()])
cbar.ax.set_yticklabels(['Not important', 'Very important'])
save_fig("mnist_feature_importance_plot")
plt.show()
Saving figure mnist_feature_importance_plot
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5, random_state=42)
ada_clf.fit(X_train, y_train)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), learning_rate=0.5, n_estimators=200, random_state=42)
plot_decision_boundary(ada_clf, X, y)
Code to generate Figure 7-8. Decision boundaries of consecutive predictors
m = len(X_train)
fig, axes = plt.subplots(ncols=2, figsize=(10,4), sharey=True)
for subplot, learning_rate in ((0, 1), (1, 0.5)):
    sample_weights = np.ones(m) / m
    plt.sca(axes[subplot])
    for i in range(5):
        svm_clf = SVC(kernel="rbf", C=0.2, gamma=0.6, random_state=42)
        svm_clf.fit(X_train, y_train, sample_weight=sample_weights * m)
        y_pred = svm_clf.predict(X_train)

        r = sample_weights[y_pred != y_train].sum() / sample_weights.sum()  # equation 7-1
        alpha = learning_rate * np.log((1 - r) / r)                         # equation 7-2
        sample_weights[y_pred != y_train] *= np.exp(alpha)                  # equation 7-3
        sample_weights /= sample_weights.sum()                              # normalization step

        plot_decision_boundary(svm_clf, X, y, alpha=0.2)
        plt.title("learning_rate = {}".format(learning_rate), fontsize=16)
    if subplot == 0:
        plt.text(-0.7, -0.65, "1", fontsize=14)
        plt.text(-0.6, -0.10, "2", fontsize=14)
        plt.text(-0.5, 0.10, "3", fontsize=14)
        plt.text(-0.4, 0.55, "4", fontsize=14)
        plt.text(-0.3, 0.90, "5", fontsize=14)
    else:
        plt.ylabel("")
save_fig("boosting_plot")
plt.show()
Saving figure boosting_plot
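For reference, the three commented lines in the loop above implement the AdaBoost equations from the book. The weighted error rate of the $j^{\text{th}}$ predictor (Equation 7-1):

$$r_j = \frac{\sum\limits_{\substack{i=1\\ \hat{y}_j^{(i)} \ne y^{(i)}}}^{m} w^{(i)}}{\sum\limits_{i=1}^{m} w^{(i)}}$$

The predictor weight, where $\eta$ is the learning rate (Equation 7-2):

$$\alpha_j = \eta \log\frac{1 - r_j}{r_j}$$

And the weight update rule (Equation 7-3), for $i = 1, 2, \ldots, m$:

$$w^{(i)} \leftarrow \begin{cases} w^{(i)} & \text{if } \hat{y}_j^{(i)} = y^{(i)}\\ w^{(i)}\exp(\alpha_j) & \text{if } \hat{y}_j^{(i)} \ne y^{(i)} \end{cases}$$

after which all the weights are normalized (divided by $\sum_{i=1}^{m} w^{(i)}$).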
Let's create a simple quadratic dataset:
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)
Now let's train a decision tree regressor on this dataset:
from sklearn.tree import DecisionTreeRegressor
tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)
DecisionTreeRegressor(max_depth=2, random_state=42)
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg2.fit(X, y2)
DecisionTreeRegressor(max_depth=2, random_state=42)
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg3.fit(X, y3)
DecisionTreeRegressor(max_depth=2, random_state=42)
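Now that we have an ensemble containing three trees, it can make predictions on a new instance simply by adding up the predictions of all the trees: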
X_new = np.array([[0.8]])
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))
y_pred
array([0.75026781])
Code to generate Figure 7-9
def plot_predictions(regressors, X, y, axes, label=None, style="r-", data_style="b.", data_label=None):
    x1 = np.linspace(axes[0], axes[1], 500)
    y_pred = sum(regressor.predict(x1.reshape(-1, 1)) for regressor in regressors)
    plt.plot(X[:, 0], y, data_style, label=data_label)
    plt.plot(x1, y_pred, style, linewidth=2, label=label)
    if label or data_label:
        plt.legend(loc="upper center", fontsize=16)
    plt.axis(axes)
plt.figure(figsize=(11,11))
plt.subplot(321)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h_1(x_1)$", style="g-", data_label="Training set")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Residuals and tree predictions", fontsize=16)
plt.subplot(322)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1)$", data_label="Training set")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Ensemble predictions", fontsize=16)
plt.subplot(323)
plot_predictions([tree_reg2], X, y2, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_2(x_1)$", style="g-", data_style="k+", data_label="Residuals")
plt.ylabel("$y - h_1(x_1)$", fontsize=16)
plt.subplot(324)
plot_predictions([tree_reg1, tree_reg2], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1)$")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.subplot(325)
plot_predictions([tree_reg3], X, y3, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_3(x_1)$", style="g-", data_style="k+")
plt.ylabel("$y - h_1(x_1) - h_2(x_1)$", fontsize=16)
plt.xlabel("$x_1$", fontsize=16)
plt.subplot(326)
plot_predictions([tree_reg1, tree_reg2, tree_reg3], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1) + h_3(x_1)$")
plt.xlabel("$x_1$", fontsize=16)
plt.ylabel("$y$", fontsize=16, rotation=0)
save_fig("gradient_boosting_plot")
plt.show()
Saving figure gradient_boosting_plot
Now let's try a gradient boosting regressor:
from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42)
gbrt.fit(X, y)
GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3, random_state=42)
Code to generate Figure 7-10. GBRT ensembles with not enough predictors (left) and too many (right)
gbrt_slow = GradientBoostingRegressor(max_depth=2, n_estimators=200, learning_rate=0.1, random_state=42)
gbrt_slow.fit(X, y)
GradientBoostingRegressor(max_depth=2, n_estimators=200, random_state=42)
fig, axes = plt.subplots(ncols=2, figsize=(10,4), sharey=True)
plt.sca(axes[0])
plot_predictions([gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="Ensemble predictions")
plt.title("learning_rate={}, n_estimators={}".format(gbrt.learning_rate, gbrt.n_estimators), fontsize=14)
plt.xlabel("$x_1$", fontsize=16)
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.sca(axes[1])
plot_predictions([gbrt_slow], X, y, axes=[-0.5, 0.5, -0.1, 0.8])
plt.title("learning_rate={}, n_estimators={}".format(gbrt_slow.learning_rate, gbrt_slow.n_estimators), fontsize=14)
plt.xlabel("$x_1$", fontsize=16)
save_fig("gbrt_learning_rate_plot")
plt.show()
Saving figure gbrt_learning_rate_plot
Gradient Boosting with Early Stopping
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)
gbrt.fit(X_train, y_train)
errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors) + 1  # +1 since errors[0] corresponds to an ensemble of 1 tree
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators, random_state=42)
gbrt_best.fit(X_train, y_train)
GradientBoostingRegressor(max_depth=2, n_estimators=56, random_state=42)
Code to generate Figure 7-11. Tuning the number of trees using early stopping
min_error = np.min(errors)
plt.figure(figsize=(10, 4))
plt.subplot(121)
plt.plot(np.arange(1, len(errors) + 1), errors, "b.-")
plt.plot([bst_n_estimators, bst_n_estimators], [0, min_error], "k--")
plt.plot([0, 120], [min_error, min_error], "k--")
plt.plot(bst_n_estimators, min_error, "ko")
plt.text(bst_n_estimators, min_error*1.2, "Minimum", ha="center", fontsize=14)
plt.axis([0, 120, 0, 0.01])
plt.xlabel("Number of trees")
plt.ylabel("Error", fontsize=16)
plt.title("Validation error", fontsize=14)
plt.subplot(122)
plot_predictions([gbrt_best], X, y, axes=[-0.5, 0.5, -0.1, 0.8])
plt.title("Best model (%d trees)" % bst_n_estimators, fontsize=14)
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.xlabel("$x_1$", fontsize=16)
save_fig("early_stopping_gbrt_plot")
plt.show()
Saving figure early_stopping_gbrt_plot
Early stopping with some patience (the training stops only after 5 epochs without improvement):
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42)
min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break  # early stopping
print(gbrt.n_estimators)
61
print("Minimum validation MSE:", min_val_error)
Minimum validation MSE: 0.002712853325235463
Using XGBoost
try:
    import xgboost
except ImportError as ex:
    print("Error: the xgboost library is not installed.")
    xgboost = None
if xgboost is not None:  # not shown in the book
    xgb_reg = xgboost.XGBRegressor(random_state=42)
    xgb_reg.fit(X_train, y_train)
    y_pred = xgb_reg.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)  # not shown in the book
    print("Validation MSE:", val_error)            # not shown in the book
Validation MSE: 0.004000408205406276
if xgboost is not None:  # not shown in the book
    xgb_reg.fit(X_train, y_train,
                eval_set=[(X_val, y_val)], early_stopping_rounds=2)
    y_pred = xgb_reg.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)  # not shown in the book
    print("Validation MSE:", val_error)            # not shown in the book
[0] validation_0-rmse:0.22834
[1] validation_0-rmse:0.16224
[2] validation_0-rmse:0.11843
[3] validation_0-rmse:0.08760
[4] validation_0-rmse:0.06848
[5] validation_0-rmse:0.05709
[6] validation_0-rmse:0.05297
[7] validation_0-rmse:0.05129
[8] validation_0-rmse:0.05155
[9] validation_0-rmse:0.05211
Validation MSE: 0.002630868681577655
%timeit xgboost.XGBRegressor().fit(X_train, y_train) if xgboost is not None else None
1min 12s ± 2.2 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit GradientBoostingRegressor().fit(X_train, y_train)
44.4 ms ± 8.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
See Appendix A.
Exercise: Load the MNIST data and split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing, which is what the code below does).
The MNIST dataset was loaded earlier.
from sklearn.model_selection import train_test_split
X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, test_size=10000, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42)
Exercise: Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM.
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
svm_clf = LinearSVC(max_iter=100, tol=20, random_state=42)
mlp_clf = MLPClassifier(random_state=42)
estimators = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf]
for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train, y_train)
Training the RandomForestClassifier(random_state=42)
Training the ExtraTreesClassifier(random_state=42)
Training the LinearSVC(max_iter=100, random_state=42, tol=20)
Training the MLPClassifier(random_state=42)
[estimator.score(X_val, y_val) for estimator in estimators]
[0.9692, 0.9715, 0.859, 0.9666]
The linear SVM is far outperformed by the other classifiers. However, let's keep it for now, since it may improve the voting classifier's performance.
Exercise: Next, try to combine them into an ensemble that outperforms each individual classifier on the validation set, using soft or hard voting.
from sklearn.ensemble import VotingClassifier
named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]
voting_clf = VotingClassifier(named_estimators)
voting_clf.fit(X_train, y_train)
VotingClassifier(estimators=[('random_forest_clf', RandomForestClassifier(random_state=42)), ('extra_trees_clf', ExtraTreesClassifier(random_state=42)), ('svm_clf', LinearSVC(max_iter=100, random_state=42, tol=20)), ('mlp_clf', MLPClassifier(random_state=42))])
voting_clf.score(X_val, y_val)
0.9708
[estimator.score(X_val, y_val) for estimator in voting_clf.estimators_]
[0.9692, 0.9715, 0.859, 0.9666]
Let's remove the SVM to see if performance improves. It is possible to exclude an estimator by setting it to None using set_params(), like this:
voting_clf.set_params(svm_clf=None)
VotingClassifier(estimators=[('random_forest_clf', RandomForestClassifier(random_state=42)), ('extra_trees_clf', ExtraTreesClassifier(random_state=42)), ('svm_clf', None), ('mlp_clf', MLPClassifier(random_state=42))])
This updated the list of estimators:
voting_clf.estimators
[('random_forest_clf', RandomForestClassifier(random_state=42)), ('extra_trees_clf', ExtraTreesClassifier(random_state=42)), ('svm_clf', None), ('mlp_clf', MLPClassifier(random_state=42))]
However, it did not update the list of trained estimators:
voting_clf.estimators_
[RandomForestClassifier(random_state=42), ExtraTreesClassifier(random_state=42), LinearSVC(max_iter=100, random_state=42, tol=20), MLPClassifier(random_state=42)]
So we can either fit the VotingClassifier again, or just remove the SVM model from the list of trained estimators:
del voting_clf.estimators_[2]
Now let's evaluate the VotingClassifier again:
voting_clf.score(X_val, y_val)
0.9741
A bit better! The SVM model was hurting performance. Now let's try using a soft voting classifier. We do not actually need to retrain the classifier, we can just set voting to "soft":
voting_clf.voting = "soft"
voting_clf.score(X_val, y_val)
0.972
In this case, hard voting wins.
Exercise: Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers?
voting_clf.voting = "hard"
voting_clf.score(X_test, y_test)
0.971
[estimator.score(X_test, y_test) for estimator in voting_clf.estimators_]
[0.9645, 0.9691, 0.9643]
In this case, the voting classifier only very slightly reduced the error rate of the best model (from about 3.1% down to 2.9%).
Exercise: Run the individual classifiers from the previous exercise to make predictions on the validation set, and create a new training set with the resulting predictions: each training instance is a vector containing the set of predictions from all your classifiers for an image, and the target is the image's class. Train a classifier on this new training set.
X_val_predictions = np.empty((len(X_val), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_val_predictions[:, index] = estimator.predict(X_val)
X_val_predictions
array([[5., 5., 5., 5.],
       [8., 8., 8., 8.],
       [2., 2., 3., 2.],
       ...,
       [7., 7., 7., 7.],
       [6., 6., 6., 6.],
       [7., 7., 7., 7.]], dtype=float32)
rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
rnd_forest_blender.fit(X_val_predictions, y_val)
RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
rnd_forest_blender.oob_score_
0.9707
You could fine-tune this blender or try other types of blenders (e.g., an MLPClassifier), then select the best one using cross-validation, as always.
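For instance, here is a minimal sketch (not in the original notebook) of comparing two candidate blenders with 3-fold cross-validation on the predictions-as-features training set built above; the candidates and the cv value are illustrative:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

# Compare candidate blenders on the new training set
# (X_val_predictions, y_val) created in the previous cells
for blender in (RandomForestClassifier(n_estimators=200, random_state=42),
                MLPClassifier(random_state=42)):
    scores = cross_val_score(blender, X_val_predictions, y_val, cv=3)
    print(blender.__class__.__name__, scores.mean())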
Exercise: Congratulations, you have just trained a blender, and together with the classifiers it forms a stacking ensemble! Now evaluate the ensemble on the test set. For each image in the test set, make predictions with all your classifiers, then feed the predictions to the blender to get the ensemble's predictions. How does it compare to the voting classifier you trained earlier?
X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)
y_pred = rnd_forest_blender.predict(X_test_predictions)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)
0.9695
This stacking ensemble does not perform quite as well as the voting classifier we trained earlier, and it is only about as good as the best individual classifier.
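Note: since version 0.22, Scikit-Learn includes a StackingClassifier class that automates this process (it trains the blender on out-of-fold predictions obtained through cross-validation). Here is a minimal sketch, assuming Scikit-Learn ≥ 0.22; the choice of final_estimator and cv is illustrative, not the approach used above:
from sklearn.ensemble import StackingClassifier

# Reuse the named_estimators list defined earlier; the blender is trained
# on out-of-fold predictions produced by 3-fold cross-validation
stacking_clf = StackingClassifier(
    estimators=named_estimators,
    final_estimator=RandomForestClassifier(n_estimators=200, random_state=42),
    cv=3)
# stacking_clf.fit(X_train, y_train)  # warning: this can take quite a while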