아래 링크를 통해 이 노트북을 주피터 노트북 뷰어(nbviewer.org)로 보거나 구글 코랩(colab.research.google.com)에서 실행할 수 있습니다.
주피터 노트북 뷰어로 보기 | 구글 코랩(Colab)에서 실행하기 |
이 노트북은 맷플롯립 그래프에 한글을 쓰기 위해 나눔 폰트를 사용합니다. 코랩의 경우 다음 셀에서 나눔 폰트를 직접 설치합니다.
# 노트북이 코랩에서 실행 중인지 체크합니다.
import os
import sys
if 'google.colab' in sys.modules:
# 사이킷런 최신 버전을 설치합니다.
!pip install -q --upgrade scikit-learn
if not os.path.isdir('mglearn'):
# mglearn을 다운받고 압축을 풉니다.
!wget -q -O mglearn.tar.gz https://bit.ly/mglearn-tar-gz
!tar -xzf mglearn.tar.gz
# 나눔 폰트를 설치합니다.
!sudo apt-get -qq -y install fonts-nanum
import matplotlib.font_manager as fm
font_files = fm.findSystemFonts(fontpaths=['/usr/share/fonts/truetype/nanum'])
for fpath in font_files:
fm.fontManager.addfont(fpath)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.8/10.8 MB 66.7 MB/s eta 0:00:00 debconf: unable to initialize frontend: Dialog debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.) debconf: falling back to frontend: Readline debconf: unable to initialize frontend: Readline debconf: (This frontend requires a controlling tty.) debconf: falling back to frontend: Teletype dpkg-preconfigure: unable to re-open stdin: Selecting previously unselected package fonts-nanum. (Reading database ... 120893 files and directories currently installed.) Preparing to unpack .../fonts-nanum_20200506-1_all.deb ... Unpacking fonts-nanum (20200506-1) ... Setting up fonts-nanum (20200506-1) ... Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...
import sklearn
from preamble import *
import matplotlib
# 나눔 폰트를 사용합니다.
matplotlib.rc('font', family='NanumBarunGothic')
matplotlib.rcParams['axes.unicode_minus'] = False
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# 데이터 적재와 분할
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, random_state=0)
# 훈련 데이터의 최솟값, 최댓값을 계산합니다
scaler = MinMaxScaler().fit(X_train)
# 훈련 데이터의 스케일을 조정합니다
X_train_scaled = scaler.transform(X_train)
svm = SVC()
# 스케일 조정된 훈련데이터에 SVM을 학습시킵니다
svm.fit(X_train_scaled, y_train)
# 테스트 데이터의 스케일을 조정하고 점수를 계산합니다
X_test_scaled = scaler.transform(X_test)
print("테스트 점수: {:.2f}".format(svm.score(X_test_scaled, y_test)))
테스트 점수: 0.97
from sklearn.model_selection import GridSearchCV
# 이 코드는 예를 위한 것입니다. 실제로 사용하지 마세요.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=5)
grid.fit(X_train_scaled, y_train)
print("최상의 교차 검증 정확도: {:.2f}".format(grid.best_score_))
print("테스트 점수: {:.2f}".format(grid.score(X_test_scaled, y_test)))
print("최적의 매개변수: ", grid.best_params_)
최상의 교차 검증 정확도: 0.98 테스트 점수: 0.97 최적의 매개변수: {'C': 1, 'gamma': 1}
mglearn.plots.plot_improper_processing()
from sklearn.pipeline import Pipeline
pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])
pipe.fit(X_train, y_train)
Pipeline(steps=[('scaler', MinMaxScaler()), ('svm', SVC())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('scaler', MinMaxScaler()), ('svm', SVC())])
MinMaxScaler()
SVC()
print("테스트 점수: {:.2f}".format(pipe.score(X_test, y_test)))
테스트 점수: 0.97
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print("최상의 교차 검증 정확도: {:.2f}".format(grid.best_score_))
print("테스트 세트 점수: {:.2f}".format(grid.score(X_test, y_test)))
print("최적의 매개변수:", grid.best_params_)
최상의 교차 검증 정확도: 0.98 테스트 세트 점수: 0.97 최적의 매개변수: {'svm__C': 1, 'svm__gamma': 1}
mglearn.plots.plot_proper_processing()
rnd = np.random.RandomState(seed=0)
X = rnd.normal(size=(100, 10000))
y = rnd.normal(size=(100,))
from sklearn.feature_selection import SelectPercentile, f_regression
select = SelectPercentile(score_func=f_regression, percentile=5).fit(X, y)
X_selected = select.transform(X)
print("X_selected.shape:", X_selected.shape)
X_selected.shape: (100, 500)
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
print("교차 검증 점수 (릿지): {:.2f}".format(
np.mean(cross_val_score(Ridge(), X_selected, y, cv=5))))
교차 검증 점수 (릿지): 0.91
pipe = Pipeline([("select", SelectPercentile(score_func=f_regression,
percentile=5)),
("ridge", Ridge())])
print("교차 검증 점수 (파이프라인): {:.2f}".format(
np.mean(cross_val_score(pipe, X, y, cv=5))))
교차 검증 점수 (파이프라인): -0.25
def fit(self, X, y):
X_transformed = X
for name, estimator in self.steps[:-1]:
# 마지막 단계를 빼고 fit과 transform을 반복합니다
X_transformed = estimator.fit_transform(X_transformed, y)
# 마지막 단계 fit을 호출합니다
self.steps[-1][1].fit(X_transformed, y)
return self
def predict(self, X):
X_transformed = X
for step in self.steps[:-1]:
# 마지막 단계를 빼고 transform을 반복합니다
X_transformed = step[1].transform(X_transformed)
# 마지막 단계 predict을 호출합니다
return self.steps[-1][1].predict(X_transformed)
from sklearn import set_config
set_config(display='diagram')
pipe
Pipeline(steps=[('select', SelectPercentile(percentile=5, score_func=<function f_regression at 0x7c5d2a0d3130>)), ('ridge', Ridge())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('select', SelectPercentile(percentile=5, score_func=<function f_regression at 0x7c5d2a0d3130>)), ('ridge', Ridge())])
SelectPercentile(percentile=5, score_func=<function f_regression at 0x7c5d2a0d3130>)
Ridge()
make_pipleline
을 사용한 파이프라인 생성¶from sklearn.pipeline import make_pipeline
# 표준적인 방법
pipe_long = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC(C=100))])
# 간소화된 방법
pipe_short = make_pipeline(MinMaxScaler(), SVC(C=100))
print("파이프라인 단계:\n", pipe_short.steps)
파이프라인 단계: [('minmaxscaler', MinMaxScaler()), ('svc', SVC(C=100))]
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
pipe = make_pipeline(StandardScaler(), PCA(n_components=2), StandardScaler())
print("파이프라인 단계:\n", pipe.steps)
파이프라인 단계: [('standardscaler-1', StandardScaler()), ('pca', PCA(n_components=2)), ('standardscaler-2', StandardScaler())]
# cancer 데이터셋에 앞서 만든 파이프라인을 적용합니다
pipe.fit(cancer.data)
# "pca" 단계의 두 개 주성분을 추출합니다
components = pipe.named_steps["pca"].components_
print("components.shape:", components.shape)
components.shape: (2, 30)
from sklearn.linear_model import LogisticRegression
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
param_grid = {'logisticregression__C': [0.01, 0.1, 1, 10, 100]}
X_train, X_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, random_state=4)
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=Pipeline(steps=[('standardscaler', StandardScaler()), ('logisticregression', LogisticRegression(max_iter=1000))]), param_grid={'logisticregression__C': [0.01, 0.1, 1, 10, 100]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GridSearchCV(cv=5, estimator=Pipeline(steps=[('standardscaler', StandardScaler()), ('logisticregression', LogisticRegression(max_iter=1000))]), param_grid={'logisticregression__C': [0.01, 0.1, 1, 10, 100]})
Pipeline(steps=[('standardscaler', StandardScaler()), ('logisticregression', LogisticRegression(max_iter=1000))])
StandardScaler()
LogisticRegression(max_iter=1000)
print("최상의 모델:\n", grid.best_estimator_)
최상의 모델: Pipeline(steps=[('standardscaler', StandardScaler()), ('logisticregression', LogisticRegression(C=1, max_iter=1000))])
print("로지스틱 회귀 단계:\n",
grid.best_estimator_.named_steps["logisticregression"])
로지스틱 회귀 단계: LogisticRegression(C=1, max_iter=1000)
print("로지스틱 회귀 계수:\n",
grid.best_estimator_.named_steps["logisticregression"].coef_)
로지스틱 회귀 계수: [[-0.436 -0.343 -0.408 -0.534 -0.15 0.61 -0.726 -0.785 0.039 0.275 -1.298 0.049 -0.673 -0.934 -0.139 0.45 -0.13 -0.101 0.434 0.716 -1.091 -1.095 -0.852 -1.064 -0.743 0.073 -0.823 -0.653 -0.644 -0.42 ]]
# 보스턴 주택 데이터셋이 1.2 버전에서 삭제되므로 직접 다운로드합니다.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
X_train, X_test, y_train, y_test = train_test_split(data, target,
random_state=0)
from sklearn.preprocessing import PolynomialFeatures
pipe = make_pipeline(
StandardScaler(),
PolynomialFeatures(),
Ridge())
param_grid = {'polynomialfeatures__degree': [1, 2, 3],
'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=Pipeline(steps=[('standardscaler', StandardScaler()), ('polynomialfeatures', PolynomialFeatures()), ('ridge', Ridge())]), n_jobs=-1, param_grid={'polynomialfeatures__degree': [1, 2, 3], 'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GridSearchCV(cv=5, estimator=Pipeline(steps=[('standardscaler', StandardScaler()), ('polynomialfeatures', PolynomialFeatures()), ('ridge', Ridge())]), n_jobs=-1, param_grid={'polynomialfeatures__degree': [1, 2, 3], 'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]})
Pipeline(steps=[('standardscaler', StandardScaler()), ('polynomialfeatures', PolynomialFeatures()), ('ridge', Ridge())])
StandardScaler()
PolynomialFeatures()
Ridge()
mglearn.tools.heatmap(grid.cv_results_['mean_test_score'].reshape(3, -1),
xlabel="ridge__alpha", ylabel="polynomialfeatures__degree",
xticklabels=param_grid['ridge__alpha'],
yticklabels=param_grid['polynomialfeatures__degree'], vmin=0)
plt.show() # 책에는 없음
print("최적의 매개변수:", grid.best_params_)
최적의 매개변수: {'polynomialfeatures__degree': 2, 'ridge__alpha': 10}
print("테스트 세트 점수: {:.2f}".format(grid.score(X_test, y_test)))
테스트 세트 점수: 0.77
param_grid = {'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
pipe = make_pipeline(StandardScaler(), Ridge())
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
print("다항 특성이 없을 때 점수: {:.2f}".format(grid.score(X_test, y_test)))
다항 특성이 없을 때 점수: 0.63
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', SVC())])
from sklearn.ensemble import RandomForestClassifier
param_grid = [
{'classifier': [SVC()], 'preprocessing': [StandardScaler()],
'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},
{'classifier': [RandomForestClassifier(n_estimators=100)],
'preprocessing': [None], 'classifier__max_features': [1, 2, 3]}]
X_train, X_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, random_state=0)
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
print("최적의 매개변수:\n{}\n".format(grid.best_params_))
print("최상의 교차 검증 점수: {:.2f}".format(grid.best_score_))
print("테스트 세트 점수: {:.2f}".format(grid.score(X_test, y_test)))
최적의 매개변수: {'classifier': SVC(C=10, gamma=0.01), 'classifier__C': 10, 'classifier__gamma': 0.01, 'preprocessing': StandardScaler()} 최상의 교차 검증 점수: 0.99 테스트 세트 점수: 0.98
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', SVC())],
memory="cache_folder")