%load_ext watermark
%watermark -v -p numpy,scipy,sklearn,pandas,matplotlib
CPython 3.7.3 IPython 7.5.0 numpy 1.16.3 scipy 1.2.1 sklearn 0.21.1 pandas 0.24.2 matplotlib 3.0.3
2장 – 머신러닝 프로젝트의 처음부터 끝까지
머신러닝 주택 회사에 오신 것을 환영합니다! 여러분이 해야 할 일은 캘리포니아 인구조사 데이터를 사용해 이 지역의 주택 가격 모델을 만드는 것입니다.
이 노트북은 2장에 있는 모든 샘플 코드와 연습문제 해답을 가지고 있습니다.
노트: 이 주피터 노트북의 결과가 책에 있는 것과 조금 다를 수 있습니다. 대부분은 훈련 알고리즘들이 가지고 있는 무작위성 때문입니다. 가능하면 노트북의 결과를 동일하게 유지하려고 하지만 모든 플랫폼에서 동일한 출력을 낸다고 보장하긴 어렵습니다. 어떤 데이터 구조(가령 딕셔너리)는 아이템의 순서가 일정하지 않습니다. 마지막으로 몇 가지 사소한 버그 수정(해당 부분에 설명을 추가했습니다) 때문에 결과가 조금 달라졌습니다. 하지만 책에서 제시한 설명은 유효합니다.
파이썬 2와 3을 모두 지원합니다. 공통 모듈을 임포트하고 맷플롯립 그림이 노트북 안에 포함되도록 설정하고 생성한 그림을 저장하기 위한 함수를 준비합니다:
# 파이썬 2와 파이썬 3 지원
from __future__ import division, print_function, unicode_literals
# 공통
import numpy as np
import os
# 일관된 출력을 위해 유사난수 초기화
np.random.seed(42)
# 맷플롯립 설정
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
# 한글출력
matplotlib.rc('font', family='NanumBarunGothic')
plt.rcParams['axes.unicode_minus'] = False
# 그림을 저장할 폴더
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        Base file name (without extension) of the saved figure.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str, default "png"
        File extension, also passed to ``savefig`` as the output format.
    resolution : int, default 300
        Value passed to ``savefig`` as ``dpi``.
    """
    # savefig() does not create missing directories, so make sure the target
    # folder exists. isdir()/makedirs() is used instead of exist_ok=True
    # because this notebook also targets Python 2.
    if not os.path.isdir(IMAGES_PATH):
        os.makedirs(IMAGES_PATH)
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
import os
import tarfile
from six.moves import urllib
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into *housing_path*.

    Parameters
    ----------
    housing_url : str
        URL of the ``housing.tgz`` archive to download.
    housing_path : str
        Directory that receives both the downloaded archive and its
        extracted contents; it is created when it does not exist.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only point housing_url at a source you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
fetch_housing_data()
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in *housing_path* into a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
housing = load_housing_data()
housing.head()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|---|
0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
housing.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20640 entries, 0 to 20639 Data columns (total 10 columns): longitude 20640 non-null float64 latitude 20640 non-null float64 housing_median_age 20640 non-null float64 total_rooms 20640 non-null float64 total_bedrooms 20433 non-null float64 population 20640 non-null float64 households 20640 non-null float64 median_income 20640 non-null float64 median_house_value 20640 non-null float64 ocean_proximity 20640 non-null object dtypes: float64(9), object(1) memory usage: 1.6+ MB
housing["ocean_proximity"].value_counts()
<1H OCEAN 9136 INLAND 6551 NEAR OCEAN 2658 NEAR BAY 2290 ISLAND 5 Name: ocean_proximity, dtype: int64
housing.describe()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | |
---|---|---|---|---|---|---|---|---|---|
count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20433.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
mean | -119.569704 | 35.631861 | 28.639486 | 2635.763081 | 537.870553 | 1425.476744 | 499.539680 | 3.870671 | 206855.816909 |
std | 2.003532 | 2.135952 | 12.585558 | 2181.615252 | 421.385070 | 1132.462122 | 382.329753 | 1.899822 | 115395.615874 |
min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 |
25% | -121.800000 | 33.930000 | 18.000000 | 1447.750000 | 296.000000 | 787.000000 | 280.000000 | 2.563400 | 119600.000000 |
50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.534800 | 179700.000000 |
75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743250 | 264725.000000 |
max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 |
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
save_fig("attribute_histogram_plots")
plt.show()
# 일관된 출력을 위해 유사난수 초기화
np.random.seed(42)
import numpy as np
# 예시를 위해서 만든 것입니다. 사이킷런에는 train_test_split() 함수가 있습니다.
def split_train_test(data, test_ratio):
    """Randomly split *data* into a training part and a test part.

    The rows are shuffled with ``np.random.permutation``; the first
    ``int(len(data) * test_ratio)`` shuffled positions form the test set and
    the remaining positions form the training set.

    Returns
    -------
    tuple
        ``(train_rows, test_rows)``, both selected from *data* with ``iloc``.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    return data.iloc[order[n_test:]], data.iloc[order[:n_test]]
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")
16512 train + 4128 test
from zlib import crc32
def test_set_check(identifier, test_ratio):
    """Decide from an identifier's CRC32 whether it belongs to the test set.

    The identifier is converted to ``np.int64``, its CRC32 is masked to an
    unsigned 32-bit value, and the result is compared with
    ``test_ratio * 2**32``.

    Returns
    -------
    bool
        True when the masked checksum is below the threshold.
    """
    checksum = crc32(np.int64(identifier)) & 0xffffffff
    threshold = test_ratio * 2**32
    return checksum < threshold
def split_train_test_by_id(data, test_ratio, id_column):
    """Split *data* into train and test rows based on an identifier column.

    ``test_set_check`` is applied to every value of ``data[id_column]``; rows
    for which it returns true go to the test set, all others to the training
    set.

    Returns
    -------
    tuple
        ``(train_rows, test_rows)``.
    """
    def belongs_to_test(row_id):
        return test_set_check(row_id, test_ratio)

    test_mask = data[id_column].apply(belongs_to_test)
    train_rows = data.loc[~test_mask]
    test_rows = data.loc[test_mask]
    return train_rows, test_rows
위의 test_set_check()
함수는 파이썬 2와 파이썬 3에서 모두 작동되고 다음의 hashlib를 사용한 구현보다 훨씬 빠릅니다.
import hashlib
def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    """Decide from a hash digest whether an identifier is in the test set.

    The identifier is converted to ``np.int64`` and hashed with *hash*
    (MD5 by default). The last byte of the digest is compared with
    ``256 * test_ratio``.

    Returns
    -------
    bool
        True when the last digest byte is below the threshold.
    """
    digest = bytearray(hash(np.int64(identifier)).digest())
    last_byte = digest[-1]
    return last_byte < 256 * test_ratio
# This version of test_set_check() also supports Python 2: wrapping the digest
# in bytearray() makes indexing return an int on both Python 2 and Python 3.
def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    """Return True when the identifier's hash places it in the test set.

    The identifier is cast to ``np.int64`` and hashed with *hash* (MD5 by
    default); the final digest byte is then compared with
    ``256 * test_ratio``.
    """
    hasher = hash(np.int64(identifier))
    digest_bytes = bytearray(hasher.digest())
    return digest_bytes[-1] < 256 * test_ratio
housing_with_id = housing.reset_index() # `index` 열이 추가된 데이터프레임이 반환됩니다.
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
test_set.head()
index | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
8 | 8 | -122.26 | 37.84 | 42.0 | 2555.0 | 665.0 | 1206.0 | 595.0 | 2.0804 | 226700.0 | NEAR BAY | -122222.16 |
10 | 10 | -122.26 | 37.85 | 52.0 | 2202.0 | 434.0 | 910.0 | 402.0 | 3.2031 | 281500.0 | NEAR BAY | -122222.15 |
11 | 11 | -122.26 | 37.85 | 52.0 | 3503.0 | 752.0 | 1504.0 | 734.0 | 3.2705 | 241800.0 | NEAR BAY | -122222.15 |
12 | 12 | -122.26 | 37.85 | 52.0 | 2491.0 | 474.0 | 1098.0 | 468.0 | 3.0750 | 213500.0 | NEAR BAY | -122222.15 |
13 | 13 | -122.26 | 37.84 | 52.0 | 696.0 | 191.0 | 345.0 | 174.0 | 2.6736 | 191300.0 | NEAR BAY | -122222.16 |
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
test_set.head()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|---|
20046 | -119.01 | 36.06 | 25.0 | 1505.0 | NaN | 1392.0 | 359.0 | 1.6812 | 47700.0 | INLAND |
3024 | -119.46 | 35.14 | 30.0 | 2943.0 | NaN | 1565.0 | 584.0 | 2.5313 | 45800.0 | INLAND |
15663 | -122.44 | 37.80 | 52.0 | 3830.0 | NaN | 1310.0 | 963.0 | 3.4801 | 500001.0 | NEAR BAY |
20484 | -118.72 | 34.28 | 17.0 | 3051.0 | NaN | 1705.0 | 495.0 | 5.7376 | 218600.0 | <1H OCEAN |
9814 | -121.93 | 36.62 | 34.0 | 2351.0 | NaN | 1063.0 | 428.0 | 3.7250 | 278000.0 | NEAR OCEAN |
housing["median_income"].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f474cc131d0>
# Divide by 1.5 to limit the number of income categories.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Label every category of 5 and above as 5. The result is assigned back to
# the column instead of calling where(..., inplace=True) on the column
# selection: an in-place call on a selected column is chained mutation and
# does not update the DataFrame under pandas copy-on-write.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)
housing["income_cat"].value_counts()
3.0 7236 2.0 6581 4.0 3639 5.0 2362 1.0 822 Name: income_cat, dtype: int64
housing["income_cat"].hist()
save_fig('income_category_hist')
from sklearn.model_selection import StratifiedShuffleSplit
# Single stratified 80/20 split, stratified on the income category.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional row indices, so select with iloc. Label-based
    # loc only gives the same rows while the frame keeps its default
    # 0..n-1 index.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
strat_test_set["income_cat"].value_counts() / len(strat_test_set)
3.0 0.350533 2.0 0.318798 4.0 0.176357 5.0 0.114583 1.0 0.039729 Name: income_cat, dtype: float64
housing["income_cat"].value_counts() / len(housing)
3.0 0.350581 2.0 0.318847 4.0 0.176308 5.0 0.114438 1.0 0.039826 Name: income_cat, dtype: float64
def income_cat_proportions(data):
    """Return the share of rows of *data* in each ``income_cat`` value.

    The counts from ``value_counts()`` are divided by the total number of
    rows in *data*.
    """
    category_counts = data["income_cat"].value_counts()
    n_rows = len(data)
    return category_counts / n_rows
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
compare_props = pd.DataFrame({
"Overall": income_cat_proportions(housing),
"Stratified": income_cat_proportions(strat_test_set),
"Random": income_cat_proportions(test_set),
}).sort_index()
compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100
compare_props
Overall | Stratified | Random | Rand. %error | Strat. %error | |
---|---|---|---|---|---|
1.0 | 0.039826 | 0.039729 | 0.040213 | 0.973236 | -0.243309 |
2.0 | 0.318847 | 0.318798 | 0.324370 | 1.732260 | -0.015195 |
3.0 | 0.350581 | 0.350533 | 0.358527 | 2.266446 | -0.013820 |
4.0 | 0.176308 | 0.176357 | 0.167393 | -5.056334 | 0.027480 |
5.0 | 0.114438 | 0.114583 | 0.109496 | -4.318374 | 0.127011 |
for set_ in (strat_train_set, strat_test_set):
set_.drop("income_cat", axis=1, inplace=True)
housing = strat_train_set.copy()
ax = housing.plot(kind="scatter", x="longitude", y="latitude")
ax.set(xlabel='경도', ylabel='위도')
save_fig("bad_visualization_plot")
ax = housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
ax.set(xlabel='경도', ylabel='위도')
save_fig("better_visualization_plot")
sharex=False
매개변수는 x-축의 값과 범례를 표시하지 못하는 버그를 수정합니다. 이는 임시 방편입니다(https://github.com/pandas-dev/pandas/issues/10611 참조). 수정 사항을 알려준 Wilmer Arellano에게 감사합니다.
ax = housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
s=housing["population"]/100, label="인구", figsize=(10,7),
c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
sharex=False)
ax.set(xlabel='경도', ylabel='위도')
plt.legend()
save_fig("housing_prices_scatterplot")
import matplotlib.image as mpimg
# Load the California map image from the chapter's image folder. The path is
# assembled with os.path.join instead of string concatenation.
california_img = mpimg.imread(
    os.path.join(PROJECT_ROOT_DIR, "images", "end_to_end_project",
                 "california.png"))
# Scatter plot of the districts: marker size follows population, marker color
# follows the median house value. The automatic colorbar is disabled because
# one is added manually below.
ax = housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(10,7),
                  s=housing['population']/100, label="인구",
                  c="median_house_value", cmap=plt.get_cmap("jet"),
                  colorbar=False, alpha=0.4,
                  )
# Draw the map, semi-transparent, over the given longitude/latitude extent.
plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5)
plt.ylabel("위도", fontsize=14)
plt.xlabel("경도", fontsize=14)

# Build 11 evenly spaced price labels between the minimum and maximum value.
prices = housing["median_house_value"]
tick_values = np.linspace(prices.min(), prices.max(), 11)
cbar = plt.colorbar()
# NOTE(review): the 11 labels are applied to whatever ticks the colorbar chose
# by default; confirm the tick count and positions match the labels.
cbar.ax.set_yticklabels(["$%dk"%(round(v/1000)) for v in tick_values], fontsize=14)
cbar.set_label('중간 주택 가격', fontsize=16)

plt.legend(fontsize=16)
save_fig("california_housing_prices_plot")
plt.show()
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
median_house_value 1.000000 median_income 0.687160 total_rooms 0.135097 housing_median_age 0.114110 households 0.064506 total_bedrooms 0.047689 population -0.026920 longitude -0.047432 latitude -0.142724 Name: median_house_value, dtype: float64
from pandas.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms",
"housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
save_fig("scatter_matrix_plot")
housing.plot(kind="scatter", x="median_income", y="median_house_value",
alpha=0.1)
plt.axis([0, 16, 0, 550000])
save_fig("income_vs_house_value_scatterplot")
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
median_house_value 1.000000 median_income 0.687160 rooms_per_household 0.146285 total_rooms 0.135097 housing_median_age 0.114110 households 0.064506 total_bedrooms 0.047689 population_per_household -0.021985 population -0.026920 longitude -0.047432 latitude -0.142724 bedrooms_per_room -0.259984 Name: median_house_value, dtype: float64
housing.plot(kind="scatter", x="rooms_per_household", y="median_house_value",
alpha=0.2)
plt.axis([0, 5, 0, 520000])
plt.show()
housing.describe()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | rooms_per_household | bedrooms_per_room | population_per_household | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 16512.000000 | 16512.000000 | 16512.000000 | 16512.000000 | 16354.000000 | 16512.000000 | 16512.000000 | 16512.000000 | 16512.000000 | 16512.000000 | 16354.000000 | 16512.000000 |
mean | -119.575834 | 35.639577 | 28.653101 | 2622.728319 | 534.973890 | 1419.790819 | 497.060380 | 3.875589 | 206990.920724 | 5.440341 | 0.212878 | 3.096437 |
std | 2.001860 | 2.138058 | 12.574726 | 2138.458419 | 412.699041 | 1115.686241 | 375.720845 | 1.904950 | 115703.014830 | 2.611712 | 0.057379 | 11.584826 |
min | -124.350000 | 32.540000 | 1.000000 | 6.000000 | 2.000000 | 3.000000 | 2.000000 | 0.499900 | 14999.000000 | 1.130435 | 0.100000 | 0.692308 |
25% | -121.800000 | 33.940000 | 18.000000 | 1443.000000 | 295.000000 | 784.000000 | 279.000000 | 2.566775 | 119800.000000 | 4.442040 | 0.175304 | 2.431287 |
50% | -118.510000 | 34.260000 | 29.000000 | 2119.500000 | 433.000000 | 1164.000000 | 408.000000 | 3.540900 | 179500.000000 | 5.232284 | 0.203031 | 2.817653 |
75% | -118.010000 | 37.720000 | 37.000000 | 3141.000000 | 644.000000 | 1719.250000 | 602.000000 | 4.744475 | 263900.000000 | 6.056361 | 0.239831 | 3.281420 |
max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6210.000000 | 35682.000000 | 5358.000000 | 15.000100 | 500001.000000 | 141.909091 | 1.000000 | 1243.333333 |
housing = strat_train_set.drop("median_house_value", axis=1) # 훈련 세트를 위해 레이블 삭제
housing_labels = strat_train_set["median_house_value"].copy()
sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
sample_incomplete_rows
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|
4629 | -118.30 | 34.07 | 18.0 | 3759.0 | NaN | 3296.0 | 1462.0 | 2.2708 | <1H OCEAN |
6068 | -117.86 | 34.01 | 16.0 | 4632.0 | NaN | 3038.0 | 727.0 | 5.1762 | <1H OCEAN |
17923 | -121.97 | 37.35 | 30.0 | 1955.0 | NaN | 999.0 | 386.0 | 4.6328 | <1H OCEAN |
13656 | -117.30 | 34.05 | 6.0 | 2155.0 | NaN | 1039.0 | 391.0 | 1.6675 | INLAND |
19252 | -122.79 | 38.48 | 7.0 | 6837.0 | NaN | 3468.0 | 1405.0 | 3.1662 | <1H OCEAN |
sample_incomplete_rows.dropna(subset=["total_bedrooms"]) # 옵션 1
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | ocean_proximity |
---|
sample_incomplete_rows.drop("total_bedrooms", axis=1) # 옵션 2
longitude | latitude | housing_median_age | total_rooms | population | households | median_income | ocean_proximity | |
---|---|---|---|---|---|---|---|---|
4629 | -118.30 | 34.07 | 18.0 | 3759.0 | 3296.0 | 1462.0 | 2.2708 | <1H OCEAN |
6068 | -117.86 | 34.01 | 16.0 | 4632.0 | 3038.0 | 727.0 | 5.1762 | <1H OCEAN |
17923 | -121.97 | 37.35 | 30.0 | 1955.0 | 999.0 | 386.0 | 4.6328 | <1H OCEAN |
13656 | -117.30 | 34.05 | 6.0 | 2155.0 | 1039.0 | 391.0 | 1.6675 | INLAND |
19252 | -122.79 | 38.48 | 7.0 | 6837.0 | 3468.0 | 1405.0 | 3.1662 | <1H OCEAN |
median = housing["total_bedrooms"].median()
# Option 3: fill the missing total_bedrooms values with the median. The filled
# frame is rebound instead of calling fillna(..., inplace=True) on the column
# selection, which is chained mutation and does not update the DataFrame under
# pandas copy-on-write.
sample_incomplete_rows = sample_incomplete_rows.fillna({"total_bedrooms": median})
sample_incomplete_rows
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|
4629 | -118.30 | 34.07 | 18.0 | 3759.0 | 433.0 | 3296.0 | 1462.0 | 2.2708 | <1H OCEAN |
6068 | -117.86 | 34.01 | 16.0 | 4632.0 | 433.0 | 3038.0 | 727.0 | 5.1762 | <1H OCEAN |
17923 | -121.97 | 37.35 | 30.0 | 1955.0 | 433.0 | 999.0 | 386.0 | 4.6328 | <1H OCEAN |
13656 | -117.30 | 34.05 | 6.0 | 2155.0 | 433.0 | 1039.0 | 391.0 | 1.6675 | INLAND |
19252 | -122.79 | 38.48 | 7.0 | 6837.0 | 433.0 | 3468.0 | 1405.0 | 3.1662 | <1H OCEAN |
sklearn.preprocessing.Imputer
클래스는 사이킷런 0.20 버전에서 사용 중지 경고가 발생하고 0.22 버전에서 삭제될 예정입니다. 대신 추가된 sklearn.impute.SimpleImputer
클래스를 사용합니다.
#from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
중간값이 수치형 특성에서만 계산될 수 있기 때문에 텍스트 특성을 삭제합니다:
housing_num = housing.drop('ocean_proximity', axis=1)
# 다른 방법: housing_num = housing.select_dtypes(include=[np.number])
imputer.fit(housing_num)
SimpleImputer(add_indicator=False, copy=True, fill_value=None, missing_values=nan, strategy='median', verbose=0)
imputer.statistics_
array([-118.51 , 34.26 , 29. , 2119.5 , 433. , 1164. , 408. , 3.5409])
각 특성의 중간 값이 수동으로 계산한 것과 같은지 확인해 보세요:
housing_num.median().values
array([-118.51 , 34.26 , 29. , 2119.5 , 433. , 1164. , 408. , 3.5409])
훈련 세트 변환:
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
index = list(housing.index.values))
housing_tr.loc[sample_incomplete_rows.index.values]
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | |
---|---|---|---|---|---|---|---|---|
4629 | -118.30 | 34.07 | 18.0 | 3759.0 | 433.0 | 3296.0 | 1462.0 | 2.2708 |
6068 | -117.86 | 34.01 | 16.0 | 4632.0 | 433.0 | 3038.0 | 727.0 | 5.1762 |
17923 | -121.97 | 37.35 | 30.0 | 1955.0 | 433.0 | 999.0 | 386.0 | 4.6328 |
13656 | -117.30 | 34.05 | 6.0 | 2155.0 | 433.0 | 1039.0 | 391.0 | 1.6675 |
19252 | -122.79 | 38.48 | 7.0 | 6837.0 | 433.0 | 3468.0 | 1405.0 | 3.1662 |
imputer.strategy
'median'
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
housing_tr.head()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | |
---|---|---|---|---|---|---|---|---|
0 | -121.89 | 37.29 | 38.0 | 1568.0 | 351.0 | 710.0 | 339.0 | 2.7042 |
1 | -121.93 | 37.05 | 14.0 | 679.0 | 108.0 | 306.0 | 113.0 | 6.4214 |
2 | -117.20 | 32.77 | 31.0 | 1952.0 | 471.0 | 936.0 | 462.0 | 2.8621 |
3 | -119.61 | 36.31 | 25.0 | 1847.0 | 371.0 | 1460.0 | 353.0 | 1.8839 |
4 | -118.59 | 34.23 | 17.0 | 6592.0 | 1525.0 | 4459.0 | 1463.0 | 3.0347 |
이제 범주형 입력 특성인 ocean_proximity
을 전처리합니다:
housing_cat = housing['ocean_proximity']
housing_cat.head(10)
17606 <1H OCEAN 18632 <1H OCEAN 14650 NEAR OCEAN 3230 INLAND 3555 <1H OCEAN 19480 INLAND 8879 <1H OCEAN 13685 INLAND 4937 <1H OCEAN 4861 <1H OCEAN Name: ocean_proximity, dtype: object
판다스의 factorize()
메소드는 문자열 범주형 특성을 머신러닝 알고리즘이 다루기 쉬운 숫자 범주형 특성으로 변환시켜 줍니다:
housing_cat_encoded, housing_categories = housing_cat.factorize()
housing_cat_encoded[:10]
array([0, 0, 1, 2, 0, 2, 0, 2, 0, 0])
housing_categories
Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object')
OneHotEncoder
를 사용하여 범주형 값을 원-핫 벡터로 변경합니다:
사이킷런 0.20 버전에서 OneHotEncoder의 동작 방식이 변경되었습니다. 종전에는 0~최댓값 사이의 정수를 카테고리로 인식했지만 앞으로는 정수나 문자열에 상관없이 고유한 값만을 카테고리로 인식합니다. 경고 메시지를 피하기 위해 categories
매개변수를 auto
로 설정합니다.
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(categories='auto')
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot
<16512x5 sparse matrix of type '<class 'numpy.float64'>' with 16512 stored elements in Compressed Sparse Row format>
OneHotEncoder
는 기본적으로 희소 행렬을 반환합니다. 필요하면 밀집 배열로 변환할 수 있습니다:
housing_cat_1hot.toarray()
array([[1., 0., 0., 0., 0.], [1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.], ..., [0., 0., 1., 0., 0.], [1., 0., 0., 0., 0.], [0., 0., 0., 1., 0.]])
# [PR #9151](https://github.com/scikit-learn/scikit-learn/pull/9151)에서 가져온 CategoricalEncoder 클래스의 정의.
# 이 클래스는 사이킷런 0.20에 포함될 예정입니다.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical features as a numeric array.

    The input to this transformer should be a matrix of integers or strings,
    denoting the values taken on by categorical (discrete) features.
    The features can be encoded using a one-hot aka one-of-K scheme
    (``encoding='onehot'``, the default) or converted to ordinal integers
    (``encoding='ordinal'``).

    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.

    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.

    Parameters
    ----------
    encoding : str, 'onehot', 'onehot-dense' or 'ordinal'
        The type of encoding to use (default is 'onehot'):

        - 'onehot': encode the features using a one-hot aka one-of-K scheme
          (or also called 'dummy' encoding). This creates a binary column for
          each category and returns a sparse matrix.
        - 'onehot-dense': the same as 'onehot' but returns a dense array
          instead of a sparse matrix.
        - 'ordinal': encode the features as ordinal integers. This results in
          a single column of integers (0 to n_categories - 1) per feature.

    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories are sorted before encoding the data
          (used categories can be found in the ``categories_`` attribute).

    dtype : number type, default np.float64
        Desired dtype of output.

    handle_unknown : 'error' (default) or 'ignore'
        Whether to raise an error or ignore if an unknown categorical feature
        is present during transform (default is to raise). When this
        parameter is set to 'ignore' and an unknown category is encountered
        during transform, the resulting one-hot encoded columns for this
        feature will be all zeros.
        Ignoring unknown categories is not supported for
        ``encoding='ordinal'``.

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting. When
        categories were specified manually, this holds the sorted categories
        (in order corresponding with output of `transform`).

    Examples
    --------
    Given a dataset with three features and two samples, we let the encoder
    find the maximum value per feature and transform the data to a binary
    one-hot encoding.

    >>> from sklearn.preprocessing import CategoricalEncoder
    >>> enc = CategoricalEncoder(handle_unknown='ignore')
    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
    ... # doctest: +ELLIPSIS
    CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
              encoding='onehot', handle_unknown='ignore')
    >>> enc.transform([[0, 1, 1], [1, 0, 4]]).toarray()
    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.],
           [ 0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]])

    See also
    --------
    sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
      integer ordinal features. The ``OneHotEncoder`` assumes that input
      features take on values in the range ``[0, max(feature)]`` instead of
      using the unique values.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
      encoding of dictionary items or strings.
    """

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``encoding`` or ``handle_unknown`` has an unsupported value, if
            ``handle_unknown='ignore'`` is combined with
            ``encoding='ordinal'``, or if explicit ``categories`` were given,
            ``handle_unknown='error'`` and X contains a value outside them.
        """
        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            # Report the offending ``encoding`` value; the message previously
            # interpolated ``handle_unknown`` by mistake.
            raise ValueError(template % self.encoding)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        # The builtin ``object`` replaces the ``np.object`` alias, which newer
        # NumPy releases no longer provide; the dtype is the same.
        X = check_array(X, dtype=object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        # One LabelEncoder per input column.
        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                # Use the user-supplied categories, sorted, as the classes.
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        """Transform X using the encoding selected at construction time.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input.

        Raises
        ------
        ValueError
            If X contains a category not seen during fit and
            ``handle_unknown='error'``.
        """
        # Builtin ``object``/``int``/``bool`` replace the ``np.object``,
        # ``np.int`` and ``np.bool`` aliases, which newer NumPy releases no
        # longer provide; the dtypes are the same.
        X = check_array(X, accept_sparse='csc', dtype=object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        # One-hot output: each feature owns a contiguous range of columns
        # whose start is the cumulative number of categories before it.
        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out
CategoricalEncoder
는 하나 이상의 특성을 가진 2D 배열을 기대합니다. 따라서 housing_cat
을 2D 배열로 바꾸어 주어야 합니다:
#from sklearn.preprocessing import CategoricalEncoder # Scikit-Learn 0.20에서 추가 예정
cat_encoder = CategoricalEncoder()
housing_cat_reshaped = housing_cat.values.reshape(-1, 1)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot
<16512x5 sparse matrix of type '<class 'numpy.float64'>' with 16512 stored elements in Compressed Sparse Row format>
사이킷런 0.20 개발 브랜치에 있던 CategoricalEncoder
는 새로운 OneHotEncoder
와 OrdinalEncoder
로 나뉘었습니다. OneHotEncoder
로 문자열로 된 범주형 변수도 변환할 수 있습니다:
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder(categories='auto')
housing_cat_reshaped = housing_cat.values.reshape(-1, 1)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot
<16512x5 sparse matrix of type '<class 'numpy.float64'>' with 16512 stored elements in Compressed Sparse Row format>
기본 인코딩은 원-핫 벡터이고 희소 행렬로 반환됩니다. toarray()
메소드를 사용하여 밀집 배열로 바꿀 수 있습니다:
housing_cat_1hot.toarray()
array([[1., 0., 0., 0., 0.], [1., 0., 0., 0., 0.], [0., 0., 0., 0., 1.], ..., [0., 1., 0., 0., 0.], [1., 0., 0., 0., 0.], [0., 0., 0., 1., 0.]])
또는 encoding 매개변수를 "onehot-dense"
로 지정하여 희소 행렬대신 밀집 행렬을 얻을 수 있습니다. 0.20 버전의 OneHotEncoder
는 sparse=False
옵션을 주어 밀집 행렬을 얻을 수 있습니다:
# cat_encoder = CategoricalEncoder(encoding="onehot-dense")
cat_encoder = OneHotEncoder(categories='auto', sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot
array([[1., 0., 0., 0., 0.], [1., 0., 0., 0., 0.], [0., 0., 0., 0., 1.], ..., [0., 1., 0., 0., 0.], [1., 0., 0., 0., 0.], [0., 0., 0., 1., 0.]])
cat_encoder.categories_
[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'], dtype=object)]
housing_cat = housing[['ocean_proximity']]
housing_cat.head(10)
ocean_proximity | |
---|---|
17606 | <1H OCEAN |
18632 | <1H OCEAN |
14650 | NEAR OCEAN |
3230 | INLAND |
3555 | <1H OCEAN |
19480 | INLAND |
8879 | <1H OCEAN |
13685 | INLAND |
4937 | <1H OCEAN |
4861 | <1H OCEAN |
주의: 번역서는 판다스의 Series.factorize()
메서드를 사용하여 문자열 범주형 특성을 정수로 인코딩합니다. 사이킷런 0.20에 추가될 OrdinalEncoder
클래스(PR #10521)는 입력 특성(레이블 y
가 아니라 X
)을 위해 설계되었고 파이프라인(나중에 이 노트북에서 나옵니다)과 잘 작동되기 때문에 더 좋은 방법입니다. 지금은 future_encoders.py
파일에서 임포트하지만 사이킷런 0.20 버전이 릴리스되면 sklearn.preprocessing
에서 바로 임포팅할 수 있습니다.
0.20 버전 릴리스에 맞추어 sklearn.preprocessing
에서 임포트합니다.
# from future_encoders import OrdinalEncoder
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]
array([[0.], [0.], [4.], [1.], [0.], [1.], [0.], [1.], [0.], [0.]])
ordinal_encoder.categories_
[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'], dtype=object)]
주의: 번역서는 CategoricalEncoder
를 사용하여 각 범주형 값을 원-핫 벡터로 변경합니다. 이 클래스는 OrdinalEncoder
와 새로운 OneHotEncoder
로 리팩토링되었습니다. 지금은 OneHotEncoder
가 정수형 범주 입력만 다룰 수 있지만 사이킷런 0.20에서는 문자열 범주 입력도 다룰 수 있을 것입니다(PR #10521). 지금은 future_encoders.py
파일에서 임포트하지만 사이킷런 0.20 버전이 릴리스되면 sklearn.preprocessing
에서 바로 임포팅할 수 있습니다.
0.20 버전 릴리스에 맞추어 sklearn.preprocessing
에서 임포트합니다(사실 우리는 이미 위에서 0.20 버전의 OneHotEncoder
를 사용했습니다).
# from future_encoders import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder(categories='auto')
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot
<16512x5 sparse matrix of type '<class 'numpy.float64'>' with 16512 stored elements in Compressed Sparse Row format>
기본적으로 OneHotEncoder
클래스는 희소 행렬을 반환하지만 필요하면 toarray()
메서드를 호출하여 밀집 배열로 바꿀 수 있습니다:
housing_cat_1hot.toarray()
array([[1., 0., 0., 0., 0.], [1., 0., 0., 0., 0.], [0., 0., 0., 0., 1.], ..., [0., 1., 0., 0., 0.], [1., 0., 0., 0., 0.], [0., 0., 0., 1., 0.]])
또는 OneHotEncoder
객체를 만들 때 sparse=False
로 지정하면 됩니다:
cat_encoder = OneHotEncoder(categories='auto', sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot
array([[1., 0., 0., 0., 0.], [1., 0., 0., 0., 0.], [0., 0., 0., 0., 1.], ..., [0., 1., 0., 0., 0.], [1., 0., 0., 0., 0.], [0., 0., 0., 1., 0.]])
cat_encoder.categories_
[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'], dtype=object)]
추가 특성을 위해 나만의 변환기를 만들겠습니다:
from sklearn.base import BaseEstimator, TransformerMixin
# 컬럼 인덱스
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    Two columns are always appended: rooms per household and population per
    household. A third column, bedrooms per room, is appended when
    ``add_bedrooms_per_room`` is true. The source columns are located through
    the module-level indices ``rooms_ix``, ``bedrooms_ix``, ``population_ix``
    and ``household_ix``.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return the transformer unchanged; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return X with the ratio columns appended on the right."""
        households = X[:, household_ix]
        extra_columns = [
            X[:, rooms_ix] / households,
            X[:, population_ix] / households,
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        # Indexing np.c_ with a tuple is the same as listing the items inline.
        return np.c_[tuple([X] + extra_columns)]
# Add the derived ratio columns (without bedrooms_per_room) to the raw values,
# then wrap the result in a DataFrame so the two new columns get names.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
housing_extra_attribs = pd.DataFrame(
housing_extra_attribs,
columns=list(housing.columns)+["rooms_per_household", "population_per_household"])
housing_extra_attribs.head()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | ocean_proximity | rooms_per_household | population_per_household | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | -121.89 | 37.29 | 38 | 1568 | 351 | 710 | 339 | 2.7042 | <1H OCEAN | 4.62537 | 2.0944 |
1 | -121.93 | 37.05 | 14 | 679 | 108 | 306 | 113 | 6.4214 | <1H OCEAN | 6.00885 | 2.70796 |
2 | -117.2 | 32.77 | 31 | 1952 | 471 | 936 | 462 | 2.8621 | NEAR OCEAN | 4.22511 | 2.02597 |
3 | -119.61 | 36.31 | 25 | 1847 | 371 | 1460 | 353 | 1.8839 | INLAND | 5.23229 | 4.13598 |
4 | -118.59 | 34.23 | 17 | 6592 | 1525 | 4459 | 1463 | 3.0347 | <1H OCEAN | 4.50581 | 3.04785 |
수치 특성을 전처리하기 위한 파이프라인을 만듭니다(0.20 버전에 새로 추가된 SimpleImputer
클래스로 변경합니다):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Numerical preprocessing chain: median imputation, then the combined
# attributes, then StandardScaler as the final step.
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
# Fit every step on the numerical columns and return the transformed array.
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr
array([[-1.15604281, 0.77194962, 0.74333089, ..., -0.31205452, -0.08649871, 0.15531753], [-1.17602483, 0.6596948 , -1.1653172 , ..., 0.21768338, -0.03353391, -0.83628902], [ 1.18684903, -1.34218285, 0.18664186, ..., -0.46531516, -0.09240499, 0.4222004 ], ..., [ 1.58648943, -0.72478134, -1.56295222, ..., 0.3469342 , -0.03055414, -0.52177644], [ 0.78221312, -0.85106801, 0.18664186, ..., 0.02499488, 0.06150916, -0.30340741], [-1.43579109, 0.99645926, 1.85670895, ..., -0.22852947, -0.09586294, 0.10180567]])
사이킷런의 0.20 버전에 포함될 ColumnTransformer
를 사용하면 책의 예제에서처럼 DataFrameSelector
와 FeatureUnion
을 사용하지 않고 간단히 전체 파이프라인을 만들 수 있습니다. 아직 사이킷런 0.20 버전이 릴리스되기 전이므로 여기서는 future_encoders.py에 ColumnTransformer
를 넣어 놓고 사용합니다.
사이킷런 0.20 버전에 추가된 sklearn.compose.ColumnTransformer
로 코드를 변경합니다.
# from future_encoders import ColumnTransformer
from sklearn.compose import ColumnTransformer
# Column names handed to each branch of the ColumnTransformer.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
# Numerical columns go through num_pipeline; the categorical column is
# one-hot encoded. Both outputs are combined by the ColumnTransformer.
full_pipeline = ColumnTransformer([
("num", num_pipeline, num_attribs),
("cat", OneHotEncoder(categories='auto'), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
array([[-1.15604281, 0.77194962, 0.74333089, ..., 0. , 0. , 0. ], [-1.17602483, 0.6596948 , -1.1653172 , ..., 0. , 0. , 0. ], [ 1.18684903, -1.34218285, 0.18664186, ..., 0. , 0. , 1. ], ..., [ 1.58648943, -0.72478134, -1.56295222, ..., 0. , 0. , 0. ], [ 0.78221312, -0.85106801, 0.18664186, ..., 0. , 0. , 0. ], [-1.43579109, 0.99645926, 1.85670895, ..., 0. , 1. , 0. ]])
판다스 DataFrame 컬럼의 일부를 선택하는 변환기를 만듭니다:
from sklearn.base import BaseEstimator, TransformerMixin
# Scikit-learn cannot use a DataFrame directly, so this class selects the
# numerical or categorical columns.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick the configured columns out of a DataFrame and return their values."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Nothing to learn; the selection is fully determined by the names.
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names].values``."""
        selected_columns = X[self.attribute_names]
        return selected_columns.values
하나의 큰 파이프라인에 이들을 모두 결합하여 수치형과 범주형 특성을 전처리합니다:
0.20 버전에 추가된 SimpleImputer
를 사용합니다.
# Column names handled by the numerical and categorical branches.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Numerical branch: select the columns, impute with the median, add the
# combined attributes, then apply StandardScaler.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch. The book's original version used
#     ('cat_encoder', CategoricalEncoder(encoding="onehot-dense'))
# but CategoricalEncoder is not imported anywhere in this notebook (it was
# refactored into OrdinalEncoder / OneHotEncoder), so building that pipeline
# raised NameError and was overwritten immediately anyway. Only the dense
# OneHotEncoder replacement is defined.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder(categories='auto', sparse=False)),
])
사이킷런 0.20 버전에 추가된 ColumnTransformer
로 만든 full_pipeline
을 사용합니다:
# The FeatureUnion-based assembly is kept commented out for reference only;
# the ColumnTransformer below is what actually runs.
# from sklearn.pipeline import FeatureUnion
# full_pipeline = FeatureUnion(transformer_list=[
# ("num_pipeline", num_pipeline),
# ("cat_pipeline", cat_pipeline),
# ])
# The "cat_encoder" name matters: the fitted encoder is looked up later via
# full_pipeline.named_transformers_["cat_encoder"].
full_pipeline = ColumnTransformer([
("num_pipeline", num_pipeline, num_attribs),
("cat_encoder", OneHotEncoder(categories='auto'), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
array([[-1.15604281, 0.77194962, 0.74333089, ..., 0. , 0. , 0. ], [-1.17602483, 0.6596948 , -1.1653172 , ..., 0. , 0. , 0. ], [ 1.18684903, -1.34218285, 0.18664186, ..., 0. , 0. , 1. ], ..., [ 1.58648943, -0.72478134, -1.56295222, ..., 0. , 0. , 0. ], [ 0.78221312, -0.85106801, 0.18664186, ..., 0. , 0. , 0. ], [-1.43579109, 0.99645926, 1.85670895, ..., 0. , 1. , 0. ]])
housing_prepared.shape
(16512, 16)
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# Apply the full pipeline to a few training samples.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
# transform, not fit_transform: reuse the pipeline that was already fitted.
some_data_prepared = full_pipeline.transform(some_data)
print("예측:", lin_reg.predict(some_data_prepared))
예측: [210644.60459286 317768.80697211 210956.43331178 59218.98886849 189747.55849879]
실제 값과 비교합니다:
print("레이블:", list(some_labels))
레이블: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]
some_data_prepared
array([[-1.15604281, 0.77194962, 0.74333089, -0.49323393, -0.44543821, -0.63621141, -0.42069842, -0.61493744, -0.31205452, -0.08649871, 0.15531753, 1. , 0. , 0. , 0. , 0. ], [-1.17602483, 0.6596948 , -1.1653172 , -0.90896655, -1.0369278 , -0.99833135, -1.02222705, 1.33645936, 0.21768338, -0.03353391, -0.83628902, 1. , 0. , 0. , 0. , 0. ], [ 1.18684903, -1.34218285, 0.18664186, -0.31365989, -0.15334458, -0.43363936, -0.0933178 , -0.5320456 , -0.46531516, -0.09240499, 0.4222004 , 0. , 0. , 0. , 0. , 1. ], [-0.01706767, 0.31357576, -0.29052016, -0.36276217, -0.39675594, 0.03604096, -0.38343559, -1.04556555, -0.07966124, 0.08973561, -0.19645314, 0. , 1. , 0. , 0. , 0. ], [ 0.49247384, -0.65929936, -0.92673619, 1.85619316, 2.41221109, 2.72415407, 2.57097492, -0.44143679, -0.35783383, -0.00419445, 0.2699277 , 1. , 0. , 0. , 0. , 0. ]])
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
68628.19819848923
from sklearn.metrics import mean_absolute_error
lin_mae = mean_absolute_error(housing_labels, housing_predictions)
lin_mae
49439.89599001897
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=42, splitter='best')
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
0.0
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree on the prepared training data.
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
scoring="neg_mean_squared_error", cv=10)
# The scoring is the negated MSE, so flip the sign before taking the root.
tree_rmse_scores = np.sqrt(-scores)
def display_scores(scores):
    """Print the scores, then their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned.
    """
    print("점수:", scores)
    # Each statistic is computed right before it is printed.
    for label, method_name in (("평균:", "mean"), ("표준편차:", "std")):
        print(label, getattr(scores, method_name)())
display_scores(tree_rmse_scores)
점수: [70194.33680785 66855.16363941 72432.58244769 70758.73896782 71115.88230639 75585.14172901 70262.86139133 70273.6325285 75366.87952553 71231.65726027] 평균: 71407.68766037929 표준편차: 2439.4345041191004
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)
점수: [66782.73843989 66960.118071 70347.95244419 74739.57052552 68031.13388938 71193.84183426 64969.63056405 68281.61137997 71552.91566558 67665.10082067] 평균: 69052.46136345083 표준편차: 2731.674001798349
사이킷런 0.22 버전에서 랜덤 포레스트의 n_estimators
기본값이 10에서 100으로 변경됩니다. 0.20 버전에서 n_estimators
값을 지정하지 않을 경우 이에 대한 경고 메시지가 나옵니다. 경고 메시지를 피하기 위해 명시적으로 n_estimators
를 10으로 설정합니다.
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=42, verbose=0, warm_start=False)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
21933.31414779769
from sklearn.model_selection import cross_val_score
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
점수: [51646.44545909 48940.60114882 53050.86323649 54408.98730149 50922.14870785 56482.50703987 51864.52025526 49760.85037653 55434.21627933 53326.10093303] 평균: 52583.72407377466 표준편차: 2298.353351147122
scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores)).describe()
count 10.000000 mean 69052.461363 std 2879.437224 min 64969.630564 25% 67136.363758 50% 68156.372635 75% 70982.369487 max 74739.570526 dtype: float64
from sklearn.svm import SVR
svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse
111094.6308539982
from sklearn.model_selection import GridSearchCV
param_grid = [
# Try 12 (=3×4) hyperparameter combinations.
{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
# Then try 6 (=2×3) combinations with bootstrap set to False.
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor(random_state=42)
# Training on five folds gives (12+6)*5=90 training runs in total.
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error',
return_train_score=True, n_jobs=-1)
grid_search.fit(housing_prepared, housing_labels)
GridSearchCV(cv=5, error_score='raise-deprecating', estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None, oob_score=False, random_state=42, verbose=0, warm_start=False), iid='warn', n_jobs=-1, param_grid=[{'max_features': [2, 4, 6, 8], 'n_estimators': [3, 10, 30]}, {'bootstrap': [False], 'max_features': [2, 3, 4], 'n_estimators': [3, 10]}], pre_dispatch='2*n_jobs', refit=True, return_train_score=True, scoring='neg_mean_squared_error', verbose=0)
최상의 파라미터 조합:
grid_search.best_params_
{'max_features': 8, 'n_estimators': 30}
grid_search.best_estimator_
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None, oob_score=False, random_state=42, verbose=0, warm_start=False)
그리드서치에서 테스트한 하이퍼파라미터 조합의 점수를 확인합니다:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
print(np.sqrt(-mean_score), params)
63669.05791727153 {'max_features': 2, 'n_estimators': 3} 55627.16171305252 {'max_features': 2, 'n_estimators': 10} 53384.57867637289 {'max_features': 2, 'n_estimators': 30} 60965.99185930139 {'max_features': 4, 'n_estimators': 3} 52740.98248528835 {'max_features': 4, 'n_estimators': 10} 50377.344409590376 {'max_features': 4, 'n_estimators': 30} 58663.84733372485 {'max_features': 6, 'n_estimators': 3} 52006.15355973719 {'max_features': 6, 'n_estimators': 10} 50146.465964159885 {'max_features': 6, 'n_estimators': 30} 57869.25504027614 {'max_features': 8, 'n_estimators': 3} 51711.09443660957 {'max_features': 8, 'n_estimators': 10} 49682.25345942335 {'max_features': 8, 'n_estimators': 30} 62895.088889905004 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3} 54658.14484390074 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10} 59470.399594730654 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3} 52725.01091081235 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10} 57490.612956065226 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3} 51009.51445842374 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}
pd.DataFrame(grid_search.cv_results_)
mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_max_features | param_n_estimators | param_bootstrap | params | split0_test_score | split1_test_score | ... | mean_test_score | std_test_score | rank_test_score | split0_train_score | split1_train_score | split2_train_score | split3_train_score | split4_train_score | mean_train_score | std_train_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.050272 | 0.000983 | 0.002671 | 0.000031 | 2 | 3 | NaN | {'max_features': 2, 'n_estimators': 3} | -3.837622e+09 | -4.147108e+09 | ... | -4.053749e+09 | 1.519609e+08 | 18 | -1.064113e+09 | -1.105142e+09 | -1.116550e+09 | -1.112342e+09 | -1.129650e+09 | -1.105559e+09 | 2.220402e+07 |
1 | 0.155895 | 0.002054 | 0.007374 | 0.000096 | 2 | 10 | NaN | {'max_features': 2, 'n_estimators': 10} | -3.047771e+09 | -3.254861e+09 | ... | -3.094381e+09 | 1.327046e+08 | 11 | -5.927175e+08 | -5.870952e+08 | -5.776964e+08 | -5.716332e+08 | -5.802501e+08 | -5.818785e+08 | 7.345821e+06 |
2 | 0.466790 | 0.003222 | 0.020763 | 0.000376 | 2 | 30 | NaN | {'max_features': 2, 'n_estimators': 30} | -2.689185e+09 | -3.021086e+09 | ... | -2.849913e+09 | 1.626879e+08 | 9 | -4.381089e+08 | -4.391272e+08 | -4.371702e+08 | -4.376955e+08 | -4.452654e+08 | -4.394734e+08 | 2.966320e+06 |
3 | 0.080077 | 0.000765 | 0.002662 | 0.000017 | 4 | 3 | NaN | {'max_features': 4, 'n_estimators': 3} | -3.730181e+09 | -3.786886e+09 | ... | -3.716852e+09 | 1.631421e+08 | 16 | -9.865163e+08 | -1.012565e+09 | -9.169425e+08 | -1.037400e+09 | -9.707739e+08 | -9.848396e+08 | 4.084607e+07 |
4 | 0.260610 | 0.002850 | 0.007349 | 0.000147 | 4 | 10 | NaN | {'max_features': 4, 'n_estimators': 10} | -2.666283e+09 | -2.784511e+09 | ... | -2.781611e+09 | 1.268562e+08 | 8 | -5.097115e+08 | -5.162820e+08 | -4.962893e+08 | -5.436192e+08 | -5.160297e+08 | -5.163863e+08 | 1.542862e+07 |
5 | 0.771269 | 0.001967 | 0.020760 | 0.000226 | 4 | 30 | NaN | {'max_features': 4, 'n_estimators': 30} | -2.387153e+09 | -2.588448e+09 | ... | -2.537877e+09 | 1.214603e+08 | 3 | -3.838835e+08 | -3.880268e+08 | -3.790867e+08 | -4.040957e+08 | -3.845520e+08 | -3.879289e+08 | 8.571233e+06 |
6 | 0.109181 | 0.003021 | 0.002627 | 0.000022 | 6 | 3 | NaN | {'max_features': 6, 'n_estimators': 3} | -3.119657e+09 | -3.586319e+09 | ... | -3.441447e+09 | 1.893141e+08 | 14 | -9.245343e+08 | -8.886939e+08 | -9.353135e+08 | -9.009801e+08 | -8.624664e+08 | -9.023976e+08 | 2.591445e+07 |
7 | 0.359064 | 0.002993 | 0.007470 | 0.000265 | 6 | 10 | NaN | {'max_features': 6, 'n_estimators': 10} | -2.549663e+09 | -2.782039e+09 | ... | -2.704640e+09 | 1.471542e+08 | 6 | -4.980344e+08 | -5.045869e+08 | -4.994664e+08 | -4.990325e+08 | -5.055542e+08 | -5.013349e+08 | 3.100456e+06 |
8 | 1.085904 | 0.007037 | 0.020348 | 0.000221 | 6 | 30 | NaN | {'max_features': 6, 'n_estimators': 30} | -2.370010e+09 | -2.583638e+09 | ... | -2.514668e+09 | 1.285063e+08 | 2 | -3.838538e+08 | -3.804711e+08 | -3.805218e+08 | -3.856095e+08 | -3.901917e+08 | -3.841296e+08 | 3.617057e+06 |
9 | 0.140850 | 0.003441 | 0.002625 | 0.000012 | 8 | 3 | NaN | {'max_features': 8, 'n_estimators': 3} | -3.353504e+09 | -3.348552e+09 | ... | -3.348851e+09 | 1.241864e+08 | 13 | -9.228123e+08 | -8.553031e+08 | -8.603321e+08 | -8.881964e+08 | -9.151287e+08 | -8.883545e+08 | 2.750227e+07 |
10 | 0.464359 | 0.003905 | 0.007256 | 0.000095 | 8 | 10 | NaN | {'max_features': 8, 'n_estimators': 10} | -2.571970e+09 | -2.718994e+09 | ... | -2.674037e+09 | 1.392720e+08 | 5 | -4.932416e+08 | -4.815238e+08 | -4.730979e+08 | -5.155367e+08 | -4.985555e+08 | -4.923911e+08 | 1.459294e+07 |
11 | 1.401487 | 0.007872 | 0.020617 | 0.000496 | 8 | 30 | NaN | {'max_features': 8, 'n_estimators': 30} | -2.357390e+09 | -2.546640e+09 | ... | -2.468326e+09 | 1.091647e+08 | 1 | -3.841658e+08 | -3.744500e+08 | -3.773239e+08 | -3.882250e+08 | -3.810005e+08 | -3.810330e+08 | 4.871017e+06 |
12 | 0.074540 | 0.001222 | 0.003115 | 0.000243 | 2 | 3 | False | {'bootstrap': False, 'max_features': 2, 'n_est... | -3.785816e+09 | -4.166012e+09 | ... | -3.955792e+09 | 1.900966e+08 | 17 | -0.000000e+00 | -0.000000e+00 | -0.000000e+00 | -0.000000e+00 | -0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
13 | 0.246445 | 0.001551 | 0.008540 | 0.000139 | 2 | 10 | False | {'bootstrap': False, 'max_features': 2, 'n_est... | -2.810721e+09 | -3.107789e+09 | ... | -2.987513e+09 | 1.539231e+08 | 10 | -6.056477e-02 | -0.000000e+00 | -0.000000e+00 | -0.000000e+00 | -2.967449e+00 | -6.056027e-01 | 1.181156e+00 |
14 | 0.100478 | 0.002083 | 0.003043 | 0.000110 | 3 | 3 | False | {'bootstrap': False, 'max_features': 3, 'n_est... | -3.618324e+09 | -3.441527e+09 | ... | -3.536728e+09 | 7.795196e+07 | 15 | -0.000000e+00 | -0.000000e+00 | -0.000000e+00 | -0.000000e+00 | -6.072840e+01 | -1.214568e+01 | 2.429136e+01 |
15 | 0.329762 | 0.002839 | 0.008933 | 0.000553 | 3 | 10 | False | {'bootstrap': False, 'max_features': 3, 'n_est... | -2.757999e+09 | -2.851737e+09 | ... | -2.779927e+09 | 6.286611e+07 | 7 | -2.089484e+01 | -0.000000e+00 | -0.000000e+00 | -0.000000e+00 | -5.465556e+00 | -5.272080e+00 | 8.093117e+00 |
16 | 0.125876 | 0.002710 | 0.003012 | 0.000109 | 4 | 3 | False | {'bootstrap': False, 'max_features': 4, 'n_est... | -3.134040e+09 | -3.559375e+09 | ... | -3.305171e+09 | 1.879203e+08 | 12 | -0.000000e+00 | -0.000000e+00 | -0.000000e+00 | -0.000000e+00 | -0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
17 | 0.410446 | 0.007215 | 0.008174 | 0.000232 | 4 | 10 | False | {'bootstrap': False, 'max_features': 4, 'n_est... | -2.525578e+09 | -2.710011e+09 | ... | -2.601971e+09 | 1.088031e+08 | 4 | -0.000000e+00 | -1.514119e-02 | -0.000000e+00 | -0.000000e+00 | -0.000000e+00 | -3.028238e-03 | 6.056477e-03 |
18 rows × 23 columns
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
# Draw hyperparameters from integer distributions instead of a fixed grid.
# NOTE(review): scipy.stats.randint excludes `high`, so max_features never
# reaches 8 here, although the grid search above tried 8 — confirm this is intended.
param_distribs = {
'n_estimators': randint(low=1, high=200),
'max_features': randint(low=1, high=8),
}
forest_reg = RandomForestRegressor(random_state=42)
# n_iter=10 sampled candidates, each evaluated with 5-fold cross-validation.
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
n_iter=10, cv=5, scoring='neg_mean_squared_error',
random_state=42, n_jobs=-1)
rnd_search.fit(housing_prepared, housing_labels)
RandomizedSearchCV(cv=5, error_score='raise-deprecating', estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None, oob_score=False, random_sta... warm_start=False), iid='warn', n_iter=10, n_jobs=-1, param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f474dc82320>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f474dc82dd8>}, pre_dispatch='2*n_jobs', random_state=42, refit=True, return_train_score=False, scoring='neg_mean_squared_error', verbose=0)
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
print(np.sqrt(-mean_score), params)
49150.657232934034 {'max_features': 7, 'n_estimators': 180} 51389.85295710133 {'max_features': 5, 'n_estimators': 15} 50796.12045980556 {'max_features': 3, 'n_estimators': 72} 50835.09932039744 {'max_features': 5, 'n_estimators': 21} 49280.90117886215 {'max_features': 7, 'n_estimators': 122} 50774.86679035961 {'max_features': 3, 'n_estimators': 75} 50682.75001237282 {'max_features': 3, 'n_estimators': 88} 49608.94061293652 {'max_features': 5, 'n_estimators': 100} 50473.57642831875 {'max_features': 3, 'n_estimators': 150} 64429.763804893395 {'max_features': 5, 'n_estimators': 2}
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
array([7.33442355e-02, 6.29090705e-02, 4.11437985e-02, 1.46726854e-02, 1.41064835e-02, 1.48742809e-02, 1.42575993e-02, 3.66158981e-01, 5.64191792e-02, 1.08792957e-01, 5.33510773e-02, 1.03114883e-02, 1.64780994e-01, 6.02803867e-05, 1.96041560e-03, 2.85647464e-03])
사이킷런 0.20 버전의 ColumnTransformer
를 사용했기 때문에 full_pipeline
에서 cat_encoder
를 가져옵니다. 즉 cat_pipeline
을 사용하지 않았습니다:
# Labels for the three columns that CombinedAttributesAdder appends.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
# cat_encoder = cat_pipeline.named_steps["cat_encoder"]
# Fetch the fitted one-hot encoder from the ColumnTransformer by its name.
cat_encoder = full_pipeline.named_transformers_["cat_encoder"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
# Numerical names, then the added ratios, then the one-hot categories; this
# list is zipped positionally against feature_importances.
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)
[(0.36615898061813423, 'median_income'), (0.16478099356159054, 'INLAND'), (0.10879295677551575, 'pop_per_hhold'), (0.07334423551601243, 'longitude'), (0.06290907048262032, 'latitude'), (0.056419179181954014, 'rooms_per_hhold'), (0.053351077347675815, 'bedrooms_per_room'), (0.04114379847872964, 'housing_median_age'), (0.014874280890402769, 'population'), (0.014672685420543239, 'total_rooms'), (0.014257599323407808, 'households'), (0.014106483453584104, 'total_bedrooms'), (0.010311488326303788, '<1H OCEAN'), (0.0028564746373201584, 'NEAR OCEAN'), (0.0019604155994780706, 'NEAR BAY'), (6.0280386727366e-05, 'ISLAND')]
# Evaluate the best estimator found by the grid search on the test set.
final_model = grid_search.best_estimator_
# Split the test set into features and the median_house_value target.
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
# transform only: the already-fitted pipeline is reused, not refitted.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse
47730.22690385927
테스트 RMSE에 대한 95% 신뢰 구간을 계산할 수 있습니다:
from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
# mean and m are reused by the manual t-score / z-score computations below.
mean = squared_errors.mean()
m = len(squared_errors)
# t-interval around the mean squared error (m - 1 degrees of freedom);
# the square root converts its bounds into RMSE bounds.
np.sqrt(stats.t.interval(confidence, m - 1,
loc=np.mean(squared_errors),
scale=stats.sem(squared_errors)))
array([45685.10470776, 49691.25001878])
다음과 같이 수동으로 계산할 수도 있습니다:
tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)
tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)
(45685.10470776014, 49691.25001877871)
또는 t 점수 대신 z 점수를 사용할 수도 있습니다:
zscore = stats.norm.ppf((1 + confidence) / 2)
zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)
(45685.717918136594, 49690.68623889426)
full_pipeline_with_predictor = Pipeline([
("preparation", full_pipeline),
("linear", LinearRegression())
])
full_pipeline_with_predictor.fit(housing, housing_labels)
full_pipeline_with_predictor.predict(some_data)
array([210644.60459286, 317768.80697211, 210956.43331178, 59218.98886849, 189747.55849879])
my_model = full_pipeline_with_predictor

# sklearn.externals.joblib is deprecated in scikit-learn 0.21 and removed in
# 0.23, so import the standalone joblib package directly. The fallback keeps
# the cell working on old installs where only the bundled copy is available.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Serialize the whole fitted pipeline (preprocessing + predictor) to disk.
joblib.dump(my_model, "my_model.pkl") # DIFF
#...
# NOTE: joblib.load unpickles the file; only load files you created yourself.
my_model_loaded = joblib.load("my_model.pkl") # DIFF
/home/haesun/anaconda3/envs/handson-ml/lib/python3.7/site-packages/sklearn/externals/joblib/__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+. warnings.warn(msg, category=DeprecationWarning)
RandomizedSearchCV
을 위한 Scipy 분포 함수
from scipy.stats import geom, expon
geom_distrib=geom(0.5).rvs(10000, random_state=42)
expon_distrib=expon(scale=1).rvs(10000, random_state=42)
plt.hist(geom_distrib, bins=50)
plt.show()
plt.hist(expon_distrib, bins=50)
plt.show()
질문: 서포트 벡터 머신 회귀(sklearn.svm.SVR)를 kernel=“linear”(하이퍼파라미터 C를 바꿔가며)나 kernel=“rbf”(하이퍼파라미터 C와 gamma를 바꿔가며) 등의 다양한 하이퍼파라미터 설정으로 시도해보세요. 지금은 이 하이퍼파라미터가 무엇을 의미하는지 너무 신경 쓰지 마세요. 최상의 SVR 모델은 무엇인가요?
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 8 values of C for the linear kernel, and 7 values of C
# crossed with 6 values of gamma for the RBF kernel (8 + 42 = 50 candidates).
param_grid = [
    {
        'kernel': ['linear'],
        'C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0, 10000.0, 30000.0],
    },
    {
        'kernel': ['rbf'],
        'C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0],
        'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
    },
]

svm_reg = SVR()
# Exhaustive search with 5-fold cross-validation, scored by negative MSE,
# run sequentially (n_jobs=1) with per-fit progress output (verbose=2).
grid_search = GridSearchCV(
    estimator=svm_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=1,
    verbose=2,
)
grid_search.fit(housing_prepared, housing_labels)
Fitting 5 folds for each of 50 candidates, totalling 250 fits [CV] C=10.0, kernel=linear ...........................................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] ............................ C=10.0, kernel=linear, total= 4.3s [CV] C=10.0, kernel=linear ...........................................
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 4.3s remaining: 0.0s
[CV] ............................ C=10.0, kernel=linear, total= 4.4s [CV] C=10.0, kernel=linear ........................................... [CV] ............................ C=10.0, kernel=linear, total= 4.3s [CV] C=10.0, kernel=linear ........................................... [CV] ............................ C=10.0, kernel=linear, total= 4.3s [CV] C=10.0, kernel=linear ........................................... [CV] ............................ C=10.0, kernel=linear, total= 4.3s [CV] C=30.0, kernel=linear ........................................... [CV] ............................ C=30.0, kernel=linear, total= 4.3s [CV] C=30.0, kernel=linear ........................................... [CV] ............................ C=30.0, kernel=linear, total= 4.3s [CV] C=30.0, kernel=linear ........................................... [CV] ............................ C=30.0, kernel=linear, total= 4.4s [CV] C=30.0, kernel=linear ........................................... [CV] ............................ C=30.0, kernel=linear, total= 4.4s [CV] C=30.0, kernel=linear ........................................... [CV] ............................ C=30.0, kernel=linear, total= 4.3s [CV] C=100.0, kernel=linear .......................................... [CV] ........................... C=100.0, kernel=linear, total= 4.3s [CV] C=100.0, kernel=linear .......................................... [CV] ........................... C=100.0, kernel=linear, total= 4.3s [CV] C=100.0, kernel=linear .......................................... [CV] ........................... C=100.0, kernel=linear, total= 4.4s [CV] C=100.0, kernel=linear .......................................... [CV] ........................... C=100.0, kernel=linear, total= 4.3s [CV] C=100.0, kernel=linear .......................................... [CV] ........................... C=100.0, kernel=linear, total= 4.2s [CV] C=300.0, kernel=linear .......................................... [CV] ........................... 
C=300.0, kernel=linear, total= 4.3s [CV] C=300.0, kernel=linear .......................................... [CV] ........................... C=300.0, kernel=linear, total= 4.3s [CV] C=300.0, kernel=linear .......................................... [CV] ........................... C=300.0, kernel=linear, total= 4.4s [CV] C=300.0, kernel=linear .......................................... [CV] ........................... C=300.0, kernel=linear, total= 4.4s [CV] C=300.0, kernel=linear .......................................... [CV] ........................... C=300.0, kernel=linear, total= 4.3s [CV] C=1000.0, kernel=linear ......................................... [CV] .......................... C=1000.0, kernel=linear, total= 4.5s [CV] C=1000.0, kernel=linear ......................................... [CV] .......................... C=1000.0, kernel=linear, total= 4.5s [CV] C=1000.0, kernel=linear ......................................... [CV] .......................... C=1000.0, kernel=linear, total= 4.5s [CV] C=1000.0, kernel=linear ......................................... [CV] .......................... C=1000.0, kernel=linear, total= 4.5s [CV] C=1000.0, kernel=linear ......................................... [CV] .......................... C=1000.0, kernel=linear, total= 4.4s [CV] C=3000.0, kernel=linear ......................................... [CV] .......................... C=3000.0, kernel=linear, total= 4.8s [CV] C=3000.0, kernel=linear ......................................... [CV] .......................... C=3000.0, kernel=linear, total= 4.8s [CV] C=3000.0, kernel=linear ......................................... [CV] .......................... C=3000.0, kernel=linear, total= 4.9s [CV] C=3000.0, kernel=linear ......................................... [CV] .......................... C=3000.0, kernel=linear, total= 4.9s [CV] C=3000.0, kernel=linear ......................................... [CV] .......................... 
C=3000.0, kernel=linear, total= 4.7s [CV] C=10000.0, kernel=linear ........................................ [CV] ......................... C=10000.0, kernel=linear, total= 6.3s [CV] C=10000.0, kernel=linear ........................................ [CV] ......................... C=10000.0, kernel=linear, total= 6.4s [CV] C=10000.0, kernel=linear ........................................ [CV] ......................... C=10000.0, kernel=linear, total= 6.5s [CV] C=10000.0, kernel=linear ........................................ [CV] ......................... C=10000.0, kernel=linear, total= 6.0s [CV] C=10000.0, kernel=linear ........................................ [CV] ......................... C=10000.0, kernel=linear, total= 5.8s [CV] C=30000.0, kernel=linear ........................................ [CV] ......................... C=30000.0, kernel=linear, total= 9.9s [CV] C=30000.0, kernel=linear ........................................ [CV] ......................... C=30000.0, kernel=linear, total= 10.1s [CV] C=30000.0, kernel=linear ........................................ [CV] ......................... C=30000.0, kernel=linear, total= 10.5s [CV] C=30000.0, kernel=linear ........................................ [CV] ......................... C=30000.0, kernel=linear, total= 10.1s [CV] C=30000.0, kernel=linear ........................................ [CV] ......................... C=30000.0, kernel=linear, total= 9.1s [CV] C=1.0, gamma=0.01, kernel=rbf ................................... [CV] .................... C=1.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=1.0, gamma=0.01, kernel=rbf ................................... [CV] .................... C=1.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=1.0, gamma=0.01, kernel=rbf ................................... [CV] .................... C=1.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=1.0, gamma=0.01, kernel=rbf ................................... [CV] .................... 
C=1.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=1.0, gamma=0.01, kernel=rbf ................................... [CV] .................... C=1.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=1.0, gamma=0.03, kernel=rbf ................................... [CV] .................... C=1.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=1.0, gamma=0.03, kernel=rbf ................................... [CV] .................... C=1.0, gamma=0.03, kernel=rbf, total= 8.9s [CV] C=1.0, gamma=0.03, kernel=rbf ................................... [CV] .................... C=1.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=1.0, gamma=0.03, kernel=rbf ................................... [CV] .................... C=1.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=1.0, gamma=0.03, kernel=rbf ................................... [CV] .................... C=1.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=1.0, gamma=0.1, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=0.1, kernel=rbf, total= 8.8s [CV] C=1.0, gamma=0.1, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=0.1, kernel=rbf, total= 8.8s [CV] C=1.0, gamma=0.1, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=0.1, kernel=rbf, total= 8.8s [CV] C=1.0, gamma=0.1, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=0.1, kernel=rbf, total= 8.8s [CV] C=1.0, gamma=0.1, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=0.1, kernel=rbf, total= 8.8s [CV] C=1.0, gamma=0.3, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=1.0, gamma=0.3, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=1.0, gamma=0.3, kernel=rbf .................................... [CV] ..................... 
C=1.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=1.0, gamma=0.3, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=1.0, gamma=0.3, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=1.0, gamma=1.0, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=1.0, kernel=rbf, total= 8.4s [CV] C=1.0, gamma=1.0, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=1.0, kernel=rbf, total= 8.4s [CV] C=1.0, gamma=1.0, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=1.0, kernel=rbf, total= 8.4s [CV] C=1.0, gamma=1.0, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=1.0, kernel=rbf, total= 8.4s [CV] C=1.0, gamma=1.0, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=1.0, gamma=3.0, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=1.0, gamma=3.0, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=1.0, gamma=3.0, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=1.0, gamma=3.0, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=1.0, gamma=3.0, kernel=rbf .................................... [CV] ..................... C=1.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=3.0, gamma=0.01, kernel=rbf ................................... [CV] .................... C=3.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=3.0, gamma=0.01, kernel=rbf ................................... [CV] .................... 
C=3.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=3.0, gamma=0.01, kernel=rbf ................................... [CV] .................... C=3.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=3.0, gamma=0.01, kernel=rbf ................................... [CV] .................... C=3.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=3.0, gamma=0.01, kernel=rbf ................................... [CV] .................... C=3.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=3.0, gamma=0.03, kernel=rbf ................................... [CV] .................... C=3.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=3.0, gamma=0.03, kernel=rbf ................................... [CV] .................... C=3.0, gamma=0.03, kernel=rbf, total= 8.9s [CV] C=3.0, gamma=0.03, kernel=rbf ................................... [CV] .................... C=3.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=3.0, gamma=0.03, kernel=rbf ................................... [CV] .................... C=3.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=3.0, gamma=0.03, kernel=rbf ................................... [CV] .................... C=3.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=3.0, gamma=0.1, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=0.1, kernel=rbf, total= 8.7s [CV] C=3.0, gamma=0.1, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=0.1, kernel=rbf, total= 8.8s [CV] C=3.0, gamma=0.1, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=0.1, kernel=rbf, total= 8.7s [CV] C=3.0, gamma=0.1, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=0.1, kernel=rbf, total= 8.8s [CV] C=3.0, gamma=0.1, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=0.1, kernel=rbf, total= 8.8s [CV] C=3.0, gamma=0.3, kernel=rbf .................................... [CV] ..................... 
C=3.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=3.0, gamma=0.3, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=3.0, gamma=0.3, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=3.0, gamma=0.3, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=3.0, gamma=0.3, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=3.0, gamma=1.0, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=3.0, gamma=1.0, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=3.0, gamma=1.0, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=3.0, gamma=1.0, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=1.0, kernel=rbf, total= 8.4s [CV] C=3.0, gamma=1.0, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=3.0, gamma=3.0, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=3.0, kernel=rbf, total= 9.0s [CV] C=3.0, gamma=3.0, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=3.0, gamma=3.0, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=3.0, gamma=3.0, kernel=rbf .................................... [CV] ..................... C=3.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=3.0, gamma=3.0, kernel=rbf .................................... [CV] ..................... 
C=3.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=10.0, gamma=0.01, kernel=rbf .................................. [CV] ................... C=10.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=10.0, gamma=0.01, kernel=rbf .................................. [CV] ................... C=10.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=10.0, gamma=0.01, kernel=rbf .................................. [CV] ................... C=10.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=10.0, gamma=0.01, kernel=rbf .................................. [CV] ................... C=10.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=10.0, gamma=0.01, kernel=rbf .................................. [CV] ................... C=10.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=10.0, gamma=0.03, kernel=rbf .................................. [CV] ................... C=10.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=10.0, gamma=0.03, kernel=rbf .................................. [CV] ................... C=10.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=10.0, gamma=0.03, kernel=rbf .................................. [CV] ................... C=10.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=10.0, gamma=0.03, kernel=rbf .................................. [CV] ................... C=10.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=10.0, gamma=0.03, kernel=rbf .................................. [CV] ................... C=10.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=10.0, gamma=0.1, kernel=rbf ................................... [CV] .................... C=10.0, gamma=0.1, kernel=rbf, total= 8.7s [CV] C=10.0, gamma=0.1, kernel=rbf ................................... [CV] .................... C=10.0, gamma=0.1, kernel=rbf, total= 8.7s [CV] C=10.0, gamma=0.1, kernel=rbf ................................... [CV] .................... C=10.0, gamma=0.1, kernel=rbf, total= 8.7s [CV] C=10.0, gamma=0.1, kernel=rbf ................................... [CV] .................... 
C=10.0, gamma=0.1, kernel=rbf, total= 8.7s [CV] C=10.0, gamma=0.1, kernel=rbf ................................... [CV] .................... C=10.0, gamma=0.1, kernel=rbf, total= 8.7s [CV] C=10.0, gamma=0.3, kernel=rbf ................................... [CV] .................... C=10.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=10.0, gamma=0.3, kernel=rbf ................................... [CV] .................... C=10.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=10.0, gamma=0.3, kernel=rbf ................................... [CV] .................... C=10.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=10.0, gamma=0.3, kernel=rbf ................................... [CV] .................... C=10.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=10.0, gamma=0.3, kernel=rbf ................................... [CV] .................... C=10.0, gamma=0.3, kernel=rbf, total= 8.6s [CV] C=10.0, gamma=1.0, kernel=rbf ................................... [CV] .................... C=10.0, gamma=1.0, kernel=rbf, total= 8.4s [CV] C=10.0, gamma=1.0, kernel=rbf ................................... [CV] .................... C=10.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=10.0, gamma=1.0, kernel=rbf ................................... [CV] .................... C=10.0, gamma=1.0, kernel=rbf, total= 8.4s [CV] C=10.0, gamma=1.0, kernel=rbf ................................... [CV] .................... C=10.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=10.0, gamma=1.0, kernel=rbf ................................... [CV] .................... C=10.0, gamma=1.0, kernel=rbf, total= 8.4s [CV] C=10.0, gamma=3.0, kernel=rbf ................................... [CV] .................... C=10.0, gamma=3.0, kernel=rbf, total= 9.0s [CV] C=10.0, gamma=3.0, kernel=rbf ................................... [CV] .................... C=10.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=10.0, gamma=3.0, kernel=rbf ................................... [CV] .................... 
C=10.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=10.0, gamma=3.0, kernel=rbf ................................... [CV] .................... C=10.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=10.0, gamma=3.0, kernel=rbf ................................... [CV] .................... C=10.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=30.0, gamma=0.01, kernel=rbf .................................. [CV] ................... C=30.0, gamma=0.01, kernel=rbf, total= 8.8s [CV] C=30.0, gamma=0.01, kernel=rbf .................................. [CV] ................... C=30.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=30.0, gamma=0.01, kernel=rbf .................................. [CV] ................... C=30.0, gamma=0.01, kernel=rbf, total= 8.8s [CV] C=30.0, gamma=0.01, kernel=rbf .................................. [CV] ................... C=30.0, gamma=0.01, kernel=rbf, total= 8.8s [CV] C=30.0, gamma=0.01, kernel=rbf .................................. [CV] ................... C=30.0, gamma=0.01, kernel=rbf, total= 8.9s [CV] C=30.0, gamma=0.03, kernel=rbf .................................. [CV] ................... C=30.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=30.0, gamma=0.03, kernel=rbf .................................. [CV] ................... C=30.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=30.0, gamma=0.03, kernel=rbf .................................. [CV] ................... C=30.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=30.0, gamma=0.03, kernel=rbf .................................. [CV] ................... C=30.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=30.0, gamma=0.03, kernel=rbf .................................. [CV] ................... C=30.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=30.0, gamma=0.1, kernel=rbf ................................... [CV] .................... C=30.0, gamma=0.1, kernel=rbf, total= 8.7s [CV] C=30.0, gamma=0.1, kernel=rbf ................................... [CV] .................... 
C=30.0, gamma=0.1, kernel=rbf, total= 8.7s [CV] C=30.0, gamma=0.1, kernel=rbf ................................... [CV] .................... C=30.0, gamma=0.1, kernel=rbf, total= 8.7s [CV] C=30.0, gamma=0.1, kernel=rbf ................................... [CV] .................... C=30.0, gamma=0.1, kernel=rbf, total= 8.7s [CV] C=30.0, gamma=0.1, kernel=rbf ................................... [CV] .................... C=30.0, gamma=0.1, kernel=rbf, total= 8.7s [CV] C=30.0, gamma=0.3, kernel=rbf ................................... [CV] .................... C=30.0, gamma=0.3, kernel=rbf, total= 8.5s [CV] C=30.0, gamma=0.3, kernel=rbf ................................... [CV] .................... C=30.0, gamma=0.3, kernel=rbf, total= 8.5s [CV] C=30.0, gamma=0.3, kernel=rbf ................................... [CV] .................... C=30.0, gamma=0.3, kernel=rbf, total= 8.5s [CV] C=30.0, gamma=0.3, kernel=rbf ................................... [CV] .................... C=30.0, gamma=0.3, kernel=rbf, total= 8.5s [CV] C=30.0, gamma=0.3, kernel=rbf ................................... [CV] .................... C=30.0, gamma=0.3, kernel=rbf, total= 8.5s [CV] C=30.0, gamma=1.0, kernel=rbf ................................... [CV] .................... C=30.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=30.0, gamma=1.0, kernel=rbf ................................... [CV] .................... C=30.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=30.0, gamma=1.0, kernel=rbf ................................... [CV] .................... C=30.0, gamma=1.0, kernel=rbf, total= 8.4s [CV] C=30.0, gamma=1.0, kernel=rbf ................................... [CV] .................... C=30.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=30.0, gamma=1.0, kernel=rbf ................................... [CV] .................... C=30.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=30.0, gamma=3.0, kernel=rbf ................................... [CV] .................... 
C=30.0, gamma=3.0, kernel=rbf, total= 9.0s [CV] C=30.0, gamma=3.0, kernel=rbf ................................... [CV] .................... C=30.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=30.0, gamma=3.0, kernel=rbf ................................... [CV] .................... C=30.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=30.0, gamma=3.0, kernel=rbf ................................... [CV] .................... C=30.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=30.0, gamma=3.0, kernel=rbf ................................... [CV] .................... C=30.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=100.0, gamma=0.01, kernel=rbf ................................. [CV] .................. C=100.0, gamma=0.01, kernel=rbf, total= 8.8s [CV] C=100.0, gamma=0.01, kernel=rbf ................................. [CV] .................. C=100.0, gamma=0.01, kernel=rbf, total= 8.8s [CV] C=100.0, gamma=0.01, kernel=rbf ................................. [CV] .................. C=100.0, gamma=0.01, kernel=rbf, total= 8.8s [CV] C=100.0, gamma=0.01, kernel=rbf ................................. [CV] .................. C=100.0, gamma=0.01, kernel=rbf, total= 8.8s [CV] C=100.0, gamma=0.01, kernel=rbf ................................. [CV] .................. C=100.0, gamma=0.01, kernel=rbf, total= 8.8s [CV] C=100.0, gamma=0.03, kernel=rbf ................................. [CV] .................. C=100.0, gamma=0.03, kernel=rbf, total= 8.7s [CV] C=100.0, gamma=0.03, kernel=rbf ................................. [CV] .................. C=100.0, gamma=0.03, kernel=rbf, total= 8.7s [CV] C=100.0, gamma=0.03, kernel=rbf ................................. [CV] .................. C=100.0, gamma=0.03, kernel=rbf, total= 8.7s [CV] C=100.0, gamma=0.03, kernel=rbf ................................. [CV] .................. C=100.0, gamma=0.03, kernel=rbf, total= 8.7s [CV] C=100.0, gamma=0.03, kernel=rbf ................................. [CV] .................. 
C=100.0, gamma=0.03, kernel=rbf, total= 8.7s [CV] C=100.0, gamma=0.1, kernel=rbf .................................. [CV] ................... C=100.0, gamma=0.1, kernel=rbf, total= 8.5s [CV] C=100.0, gamma=0.1, kernel=rbf .................................. [CV] ................... C=100.0, gamma=0.1, kernel=rbf, total= 8.6s [CV] C=100.0, gamma=0.1, kernel=rbf .................................. [CV] ................... C=100.0, gamma=0.1, kernel=rbf, total= 8.6s [CV] C=100.0, gamma=0.1, kernel=rbf .................................. [CV] ................... C=100.0, gamma=0.1, kernel=rbf, total= 8.6s [CV] C=100.0, gamma=0.1, kernel=rbf .................................. [CV] ................... C=100.0, gamma=0.1, kernel=rbf, total= 8.6s [CV] C=100.0, gamma=0.3, kernel=rbf .................................. [CV] ................... C=100.0, gamma=0.3, kernel=rbf, total= 8.5s [CV] C=100.0, gamma=0.3, kernel=rbf .................................. [CV] ................... C=100.0, gamma=0.3, kernel=rbf, total= 8.5s [CV] C=100.0, gamma=0.3, kernel=rbf .................................. [CV] ................... C=100.0, gamma=0.3, kernel=rbf, total= 8.5s [CV] C=100.0, gamma=0.3, kernel=rbf .................................. [CV] ................... C=100.0, gamma=0.3, kernel=rbf, total= 8.5s [CV] C=100.0, gamma=0.3, kernel=rbf .................................. [CV] ................... C=100.0, gamma=0.3, kernel=rbf, total= 8.5s [CV] C=100.0, gamma=1.0, kernel=rbf .................................. [CV] ................... C=100.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=100.0, gamma=1.0, kernel=rbf .................................. [CV] ................... C=100.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=100.0, gamma=1.0, kernel=rbf .................................. [CV] ................... C=100.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=100.0, gamma=1.0, kernel=rbf .................................. [CV] ................... 
C=100.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=100.0, gamma=1.0, kernel=rbf .................................. [CV] ................... C=100.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=100.0, gamma=3.0, kernel=rbf .................................. [CV] ................... C=100.0, gamma=3.0, kernel=rbf, total= 9.0s [CV] C=100.0, gamma=3.0, kernel=rbf .................................. [CV] ................... C=100.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=100.0, gamma=3.0, kernel=rbf .................................. [CV] ................... C=100.0, gamma=3.0, kernel=rbf, total= 9.0s [CV] C=100.0, gamma=3.0, kernel=rbf .................................. [CV] ................... C=100.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=100.0, gamma=3.0, kernel=rbf .................................. [CV] ................... C=100.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=300.0, gamma=0.01, kernel=rbf ................................. [CV] .................. C=300.0, gamma=0.01, kernel=rbf, total= 8.7s [CV] C=300.0, gamma=0.01, kernel=rbf ................................. [CV] .................. C=300.0, gamma=0.01, kernel=rbf, total= 8.7s [CV] C=300.0, gamma=0.01, kernel=rbf ................................. [CV] .................. C=300.0, gamma=0.01, kernel=rbf, total= 8.7s [CV] C=300.0, gamma=0.01, kernel=rbf ................................. [CV] .................. C=300.0, gamma=0.01, kernel=rbf, total= 8.7s [CV] C=300.0, gamma=0.01, kernel=rbf ................................. [CV] .................. C=300.0, gamma=0.01, kernel=rbf, total= 8.7s [CV] C=300.0, gamma=0.03, kernel=rbf ................................. [CV] .................. C=300.0, gamma=0.03, kernel=rbf, total= 8.5s [CV] C=300.0, gamma=0.03, kernel=rbf ................................. [CV] .................. C=300.0, gamma=0.03, kernel=rbf, total= 8.5s [CV] C=300.0, gamma=0.03, kernel=rbf ................................. [CV] .................. 
C=300.0, gamma=0.03, kernel=rbf, total= 8.5s [CV] C=300.0, gamma=0.03, kernel=rbf ................................. [CV] .................. C=300.0, gamma=0.03, kernel=rbf, total= 8.5s [CV] C=300.0, gamma=0.03, kernel=rbf ................................. [CV] .................. C=300.0, gamma=0.03, kernel=rbf, total= 8.5s [CV] C=300.0, gamma=0.1, kernel=rbf .................................. [CV] ................... C=300.0, gamma=0.1, kernel=rbf, total= 8.4s [CV] C=300.0, gamma=0.1, kernel=rbf .................................. [CV] ................... C=300.0, gamma=0.1, kernel=rbf, total= 8.4s [CV] C=300.0, gamma=0.1, kernel=rbf .................................. [CV] ................... C=300.0, gamma=0.1, kernel=rbf, total= 8.4s [CV] C=300.0, gamma=0.1, kernel=rbf .................................. [CV] ................... C=300.0, gamma=0.1, kernel=rbf, total= 8.4s [CV] C=300.0, gamma=0.1, kernel=rbf .................................. [CV] ................... C=300.0, gamma=0.1, kernel=rbf, total= 8.4s [CV] C=300.0, gamma=0.3, kernel=rbf .................................. [CV] ................... C=300.0, gamma=0.3, kernel=rbf, total= 8.4s [CV] C=300.0, gamma=0.3, kernel=rbf .................................. [CV] ................... C=300.0, gamma=0.3, kernel=rbf, total= 8.4s [CV] C=300.0, gamma=0.3, kernel=rbf .................................. [CV] ................... C=300.0, gamma=0.3, kernel=rbf, total= 8.4s [CV] C=300.0, gamma=0.3, kernel=rbf .................................. [CV] ................... C=300.0, gamma=0.3, kernel=rbf, total= 8.3s [CV] C=300.0, gamma=0.3, kernel=rbf .................................. [CV] ................... C=300.0, gamma=0.3, kernel=rbf, total= 8.4s [CV] C=300.0, gamma=1.0, kernel=rbf .................................. [CV] ................... C=300.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=300.0, gamma=1.0, kernel=rbf .................................. [CV] ................... 
C=300.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=300.0, gamma=1.0, kernel=rbf .................................. [CV] ................... C=300.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=300.0, gamma=1.0, kernel=rbf .................................. [CV] ................... C=300.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=300.0, gamma=1.0, kernel=rbf .................................. [CV] ................... C=300.0, gamma=1.0, kernel=rbf, total= 8.3s [CV] C=300.0, gamma=3.0, kernel=rbf .................................. [CV] ................... C=300.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=300.0, gamma=3.0, kernel=rbf .................................. [CV] ................... C=300.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=300.0, gamma=3.0, kernel=rbf .................................. [CV] ................... C=300.0, gamma=3.0, kernel=rbf, total= 9.0s [CV] C=300.0, gamma=3.0, kernel=rbf .................................. [CV] ................... C=300.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=300.0, gamma=3.0, kernel=rbf .................................. [CV] ................... C=300.0, gamma=3.0, kernel=rbf, total= 9.1s [CV] C=1000.0, gamma=0.01, kernel=rbf ................................ [CV] ................. C=1000.0, gamma=0.01, kernel=rbf, total= 8.4s [CV] C=1000.0, gamma=0.01, kernel=rbf ................................ [CV] ................. C=1000.0, gamma=0.01, kernel=rbf, total= 8.5s [CV] C=1000.0, gamma=0.01, kernel=rbf ................................ [CV] ................. C=1000.0, gamma=0.01, kernel=rbf, total= 8.4s [CV] C=1000.0, gamma=0.01, kernel=rbf ................................ [CV] ................. C=1000.0, gamma=0.01, kernel=rbf, total= 8.4s [CV] C=1000.0, gamma=0.01, kernel=rbf ................................ [CV] ................. C=1000.0, gamma=0.01, kernel=rbf, total= 8.6s [CV] C=1000.0, gamma=0.03, kernel=rbf ................................ [CV] ................. 
C=1000.0, gamma=0.03, kernel=rbf, total= 8.6s [CV] C=1000.0, gamma=0.03, kernel=rbf ................................ [CV] ................. C=1000.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=1000.0, gamma=0.03, kernel=rbf ................................ [CV] ................. C=1000.0, gamma=0.03, kernel=rbf, total= 8.9s [CV] C=1000.0, gamma=0.03, kernel=rbf ................................ [CV] ................. C=1000.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=1000.0, gamma=0.03, kernel=rbf ................................ [CV] ................. C=1000.0, gamma=0.03, kernel=rbf, total= 8.8s [CV] C=1000.0, gamma=0.1, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=0.1, kernel=rbf, total= 8.7s [CV] C=1000.0, gamma=0.1, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=0.1, kernel=rbf, total= 8.6s [CV] C=1000.0, gamma=0.1, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=0.1, kernel=rbf, total= 8.9s [CV] C=1000.0, gamma=0.1, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=0.1, kernel=rbf, total= 8.9s [CV] C=1000.0, gamma=0.1, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=0.1, kernel=rbf, total= 9.6s [CV] C=1000.0, gamma=0.3, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=0.3, kernel=rbf, total= 9.5s [CV] C=1000.0, gamma=0.3, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=0.3, kernel=rbf, total= 9.1s [CV] C=1000.0, gamma=0.3, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=0.3, kernel=rbf, total= 9.1s [CV] C=1000.0, gamma=0.3, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=0.3, kernel=rbf, total= 9.1s [CV] C=1000.0, gamma=0.3, kernel=rbf ................................. [CV] .................. 
C=1000.0, gamma=0.3, kernel=rbf, total= 9.1s [CV] C=1000.0, gamma=1.0, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=1.0, kernel=rbf, total= 9.1s [CV] C=1000.0, gamma=1.0, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=1.0, kernel=rbf, total= 9.1s [CV] C=1000.0, gamma=1.0, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=1.0, kernel=rbf, total= 9.1s [CV] C=1000.0, gamma=1.0, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=1.0, kernel=rbf, total= 9.1s [CV] C=1000.0, gamma=1.0, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=1.0, kernel=rbf, total= 9.1s [CV] C=1000.0, gamma=3.0, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=3.0, kernel=rbf, total= 9.9s [CV] C=1000.0, gamma=3.0, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=3.0, kernel=rbf, total= 9.9s [CV] C=1000.0, gamma=3.0, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=3.0, kernel=rbf, total= 9.9s [CV] C=1000.0, gamma=3.0, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=3.0, kernel=rbf, total= 9.9s [CV] C=1000.0, gamma=3.0, kernel=rbf ................................. [CV] .................. C=1000.0, gamma=3.0, kernel=rbf, total= 9.9s
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 34.2min finished
GridSearchCV(cv=5, error_score='raise-deprecating', estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False), iid='warn', n_jobs=1, param_grid=[{'C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0, 10000.0, 30000.0], 'kernel': ['linear']}, {'C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0], 'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0], 'kernel': ['rbf']}], pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring='neg_mean_squared_error', verbose=2)
최상 모델의 (5-폴드 교차 검증으로 평가한) 점수는 다음과 같습니다:
# The grid search was run with scoring='neg_mean_squared_error', so the best
# score it reports is a negated mean squared error (higher, i.e. closer to
# zero, is better).
negative_mse = grid_search.best_score_
# Flip the sign back to a positive MSE and take the square root to get the RMSE.
rmse = np.sqrt(-negative_mse)
# Bare expression: the notebook displays the value as the cell output.
rmse
70363.90313964167
이는 `RandomForestRegressor`보다 훨씬 나쁜 결과입니다(RMSE는 작을수록 좋습니다). 최상의 하이퍼파라미터를 확인해 보겠습니다:
# Display the hyperparameter combination that obtained the best
# cross-validated score in the grid search.
grid_search.best_params_
{'C': 30000.0, 'kernel': 'linear'}
선형 커널이 RBF 커널보다 성능이 나은 것 같습니다. `C`는 테스트한 값 중에서 최댓값이 선택되었습니다. 따라서 (작은 값들은 지우고) 더 큰 값의 `C`로 그리드 서치를 다시 실행해 보아야 합니다. 아마도 더 큰 값의 `C`에서 성능이 높아질 것입니다.
질문: GridSearchCV를 RandomizedSearchCV로 바꿔보세요.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal
# For expon(), reciprocal() and the other probability distributions, see
# https://docs.scipy.org/doc/scipy/reference/stats.html
# Note: gamma is ignored when the kernel parameter is "linear".
param_distribs = {
    'kernel': ['linear', 'rbf'],
    'C': reciprocal(20, 200000),
    'gamma': expon(scale=1.0),
}

# Randomized search: 50 sampled candidates, each evaluated with 5-fold
# cross-validation and scored by negated MSE; random_state fixes the sampling.
svm_reg = SVR()
rnd_search = RandomizedSearchCV(
    estimator=svm_reg,
    param_distributions=param_distribs,
    n_iter=50,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=1,
    random_state=42,
)
rnd_search.fit(housing_prepared, housing_labels)
Fitting 5 folds for each of 50 candidates, totalling 250 fits [CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear, total= 5.0s [CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 5.0s remaining: 0.0s
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear, total= 5.0s [CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ...... [CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear, total= 5.0s [CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ...... [CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear, total= 5.0s [CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ...... [CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear, total= 5.0s [CV] C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf ...... [CV] C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf, total= 10.8s [CV] C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf ...... [CV] C=24.17508294611391, gamma=3.503557475158312, kernel=rbf, total= 10.3s [CV] C=24.17508294611391, gamma=3.503557475158312, kernel=rbf ........ [CV] C=24.17508294611391, gamma=3.503557475158312, kernel=rbf, total= 10.3s [CV] C=24.17508294611391, gamma=3.503557475158312, kernel=rbf ........ [CV] C=24.17508294611391, gamma=3.503557475158312, kernel=rbf, total= 10.3s [CV] C=24.17508294611391, gamma=3.503557475158312, kernel=rbf ........ [CV] C=24.17508294611391, gamma=3.503557475158312, kernel=rbf, total= 10.3s [CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf ... [CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf, total= 9.2s [CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf ... [CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf, total= 9.2s [CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf ... [CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf, total= 9.2s [CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf ... [CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf, total= 9.2s [CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf ... 
[CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf, total= 9.2s [CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf ...... [CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf, total= 9.3s [CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf ...... [CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf, total= 9.3s [CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf ...... [CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf, total= 9.2s [CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf ...... [CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf, total= 9.3s [CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf ...... [CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf, total= 9.3s [CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear . [CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear, total= 4.9s [CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear . [CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear, total= 4.9s [CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear . [CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear, total= 4.9s [CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear . [CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear, total= 4.9s [CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear . [CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear, total= 4.9s [CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf ...... [CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf, total= 9.1s [CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf ...... [CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf, total= 9.1s [CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf ...... 
[CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf, total= 9.1s [CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf ...... [CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf, total= 9.0s [CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf ...... [CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf, total= 9.0s [CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf ..... [CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf, total= 20.7s [CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf ..... [CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf, total= 21.6s [CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf ..... [CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf, total= 24.8s [CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf ..... [CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf, total= 20.3s [CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf ..... [CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf, total= 22.4s [CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ... [CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, total= 10.3s [CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ... [CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, total= 10.8s [CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ... [CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, total= 11.4s [CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ... [CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, total= 10.4s [CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ... [CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, total= 9.3s [CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear .... 
[CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear, total= 42.8s [CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear .... [CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear, total= 35.3s [CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear .... [CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear, total= 41.3s [CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear .... [CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear, total= 35.9s [CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear .... [CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear, total= 28.6s [CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear ... [CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear, total= 5.9s [CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear ... [CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear, total= 6.1s [CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear ... [CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear, total= 6.2s [CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear ... [CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear, total= 5.9s [CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear ... [CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear, total= 6.0s [CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf ........ [CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf, total= 9.8s [CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf ........ [CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf, total= 9.8s [CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf ........ [CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf, total= 9.8s [CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf ........ 
[CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf, total= 9.8s [CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf ........ [CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf, total= 9.8s [CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear .... [CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear, total= 12.1s [CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear .... [CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear, total= 11.9s [CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear .... [CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear, total= 12.5s [CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear .... [CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear, total= 12.1s [CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear .... [CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear, total= 11.0s [CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf ....... [CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf, total= 9.3s [CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf ....... [CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf, total= 9.3s [CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf ....... [CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf, total= 9.3s [CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf ....... [CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf, total= 9.2s [CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf ....... [CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf, total= 9.3s [CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear .... [CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear, total= 4.9s [CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear .... 
[CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear, total= 4.9s [CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear .... [CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear, total= 4.9s [CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear .... [CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear, total= 4.9s [CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear .... [CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear, total= 4.8s [CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf ....... [CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf, total= 9.2s [CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf ....... [CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf, total= 9.2s [CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf ....... [CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf, total= 9.2s [CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf ....... [CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf, total= 9.2s [CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf ....... [CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf, total= 9.2s [CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf ...... [CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf, total= 9.1s [CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf ...... [CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf, total= 9.1s [CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf ...... [CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf, total= 9.2s [CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf ...... [CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf, total= 9.1s [CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf ...... 
[CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf, total= 9.1s [CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear .... [CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear, total= 4.9s [CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear .... [CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear, total= 4.9s [CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear .... [CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear, total= 4.9s [CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear .... [CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear, total= 4.9s [CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear .... [CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear, total= 4.8s [CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf ...... [CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf, total= 2.2min [CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf ...... [CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf, total= 1.6min [CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf ...... [CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf, total= 1.5min [CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf ...... [CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf, total= 1.9min [CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf ...... [CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf, total= 1.9min [CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear .... [CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear, total= 5.0s [CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear .... [CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear, total= 5.0s [CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear .... 
[CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear, total= 5.0s [CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear .... [CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear, total= 5.0s [CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear .... [CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear, total= 5.0s [CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear ... [CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear, total= 23.5s [CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear ... [CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear, total= 23.0s [CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear ... [CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear, total= 36.9s [CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear ... [CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear, total= 24.3s [CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear ... [CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear, total= 20.1s [CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear .... [CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear, total= 5.5s [CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear .... [CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear, total= 5.6s [CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear .... [CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear, total= 5.4s [CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear .... [CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear, total= 5.6s [CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear .... [CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear, total= 5.3s [CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear .... 
[CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear, total= 5.1s [CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear .... [CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear, total= 5.0s [CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear .... [CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear, total= 5.1s [CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear .... [CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear, total= 5.1s [CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear .... [CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear, total= 5.0s [CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf ....... [CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf, total= 10.3s [CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf ....... [CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf, total= 10.3s [CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf ....... [CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf, total= 10.3s [CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf ....... [CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf, total= 10.3s [CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf ....... [CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf, total= 10.3s [CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear ... [CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear, total= 5.5s [CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear ... [CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear, total= 5.8s [CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear ... [CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear, total= 5.8s [CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear ... 
[CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear, total= 5.5s [CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear ... [CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear, total= 5.5s [CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear ... [CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear, total= 4.8s [CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear ... [CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear, total= 4.9s [CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear ... [CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear, total= 4.9s [CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear ... [CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear, total= 4.9s [CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear ... [CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear, total= 4.8s [CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf ..... [CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf, total= 9.6s [CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf ..... [CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf, total= 9.6s [CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf ..... [CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf, total= 9.6s [CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf ..... [CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf, total= 9.6s [CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf ..... [CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf, total= 9.5s [CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf ...... [CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf, total= 9.5s [CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf ...... 
[CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf, total= 9.4s [CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf ...... [CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf, total= 9.5s [CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf ...... [CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf, total= 9.4s [CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf ...... [CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf, total= 9.5s [CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear ... [CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear, total= 7.9s [CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear ... [CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear, total= 8.4s [CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear ... [CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear, total= 8.5s [CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear ... [CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear, total= 8.7s [CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear ... [CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear, total= 7.5s [CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf ....... [CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf, total= 41.1s [CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf ....... [CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf, total= 39.4s [CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf ....... [CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf, total= 50.7s [CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf ....... [CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf, total= 52.8s [CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf ....... 
[CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf, total= 43.7s [CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf ....... [CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf, total= 17.0s [CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf ....... [CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf, total= 17.2s [CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf ....... [CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf, total= 16.9s [CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf ....... [CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf, total= 18.8s [CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf ....... [CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf, total= 17.9s [CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf ....... [CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf, total= 9.3s [CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf ....... [CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf, total= 9.3s [CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf ....... [CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf, total= 9.3s [CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf ....... [CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf, total= 9.3s [CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf ....... [CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf, total= 9.3s [CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf ....... [CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf, total= 9.1s [CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf ....... [CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf, total= 9.1s [CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf ....... 
[CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf, total= 9.1s [CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf ....... [CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf, total= 9.1s [CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf ....... [CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf, total= 9.1s [CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear ... [CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear, total= 4.9s [CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear ... [CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear, total= 4.9s [CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear ... [CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear, total= 5.0s [CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear ... [CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear, total= 5.0s [CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear ... [CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear, total= 4.9s [CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear .... [CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear, total= 4.9s [CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear .... [CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear, total= 4.9s [CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear .... [CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear, total= 5.2s [CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear .... [CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear, total= 5.9s [CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear .... [CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear, total= 5.7s [CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf ...... 
[CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf, total= 9.8s [CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf ...... [CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf, total= 9.7s [CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf ...... [CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf, total= 9.6s [CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf ...... [CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf, total= 9.3s [CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf ...... [CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf, total= 9.3s [CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear .... [CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, total= 5.3s [CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear .... [CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, total= 5.3s [CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear .... [CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, total= 5.3s [CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear .... [CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, total= 5.3s [CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear .... [CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, total= 5.2s [CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear ..... [CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear, total= 4.9s [CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear ..... [CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear, total= 4.9s [CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear ..... [CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear, total= 4.9s [CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear ..... 
[CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear, total= 4.9s [CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear ..... [CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear, total= 4.8s [CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf ...... [CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf, total= 9.2s [CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf ...... [CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf, total= 9.2s [CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf ...... [CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf, total= 9.2s [CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf ...... [CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf, total= 9.2s [CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf ...... [CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf, total= 9.2s [CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear ... [CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear, total= 5.9s [CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear ... [CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear, total= 5.7s [CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear ... [CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear, total= 5.8s [CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear ... [CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear, total= 5.7s [CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear ... [CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear, total= 5.5s [CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear .... [CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear, total= 4.8s [CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear .... 
[CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear, total= 4.8s [CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear .... [CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear, total= 4.9s [CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear .... [CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear, total= 4.9s [CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear .... [CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear, total= 4.8s [CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear .... [CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear, total= 4.9s [CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear .... [CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear, total= 4.8s [CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear .... [CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear, total= 4.9s [CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear .... [CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear, total= 4.9s [CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear .... [CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear, total= 4.8s [CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf ...... [CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf, total= 9.2s [CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf ...... [CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf, total= 9.2s [CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf ...... [CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf, total= 9.2s [CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf ...... [CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf, total= 9.2s [CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf ...... 
[CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf, total= 9.2s [CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear .... [CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear, total= 6.2s [CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear .... [CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear, total= 6.2s [CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear .... [CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear, total= 6.4s [CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear .... [CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear, total= 6.3s [CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear .... [CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear, total= 6.0s [CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf ....... [CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf, total= 31.8s [CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf ....... [CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf, total= 36.3s [CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf ....... [CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf, total= 34.2s [CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf ....... [CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf, total= 34.8s [CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf ....... [CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf, total= 32.6s [CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf ........ [CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf, total= 9.4s [CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf ........ [CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf, total= 9.4s [CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf ........ 
[CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf, total= 9.4s [CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf ........ [CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf, total= 9.4s [CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf ........ [CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf, total= 9.4s [CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear ...... [CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear, total= 12.1s [CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear ...... [CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear, total= 11.8s [CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear ...... [CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear, total= 10.9s [CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear ...... [CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear, total= 12.4s [CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear ...... [CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear, total= 11.2s [CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear .... [CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear, total= 29.3s [CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear .... [CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear, total= 21.7s [CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear .... [CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear, total= 32.3s [CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear .... [CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear, total= 24.0s [CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear .... [CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear, total= 18.3s
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 53.2min finished
RandomizedSearchCV(cv=5, error_score='raise-deprecating', estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False), iid='warn', n_iter=50, n_jobs=1, param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f474838d278>, 'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f474838d128>, 'kernel': ['linear', 'rbf']}, pre_dispatch='2*n_jobs', random_state=42, refit=True, return_train_score=False, scoring='neg_mean_squared_error', verbose=2)
최상 모델의 (5-폴드 교차 검증으로 평가한) 점수는 다음과 같습니다:
# The search was scored with 'neg_mean_squared_error', so best_score_ is a
# negated MSE; flip the sign before taking the square root to get the RMSE.
negative_mse = rnd_search.best_score_
rmse = np.sqrt(-negative_mse)
# Bare expression so the notebook displays the value.
rmse
54767.99053704409
이제 RandomForestRegressor
의 성능에 훨씬 가까워졌습니다(하지만 아직 차이가 납니다). 최상의 하이퍼파라미터를 확인해 보겠습니다:
# Show the hyperparameter combination selected by the randomized search.
rnd_search.best_params_
{'C': 157055.10989448498, 'gamma': 0.26497040005002437, 'kernel': 'rbf'}
이번에는 RBF 커널에 대해 최적의 하이퍼파라미터 조합을 찾았습니다. 보통 랜덤서치가 같은 시간 안에 그리드서치보다 더 좋은 하이퍼파라미터를 찾습니다.
여기서 사용된 scale=1.0
인 지수 분포를 살펴보겠습니다. 일부 샘플은 1.0보다 아주 크거나 작습니다. 하지만 로그 분포를 보면 대부분의 값이 exp(-2)와 exp(+2), 즉 0.1과 7.4 사이에 집중되어 있음을 알 수 있습니다.
# Draw 10,000 reproducible samples from an exponential distribution with
# scale=1.0 and compare the raw histogram with the histogram of their logs.
expon_distrib = expon(scale=1.0)
samples = expon_distrib.rvs(size=10000, random_state=42)

plt.figure(figsize=(10, 4))

# Left panel: the samples themselves.
plt.subplot(1, 2, 1)
plt.hist(samples, bins=50)
plt.title("Exponential distribution (scale=1.0)")

# Right panel: natural log of the same samples.
plt.subplot(1, 2, 2)
plt.hist(np.log(samples), bins=50)
plt.title("Log of this distribution")

plt.show()
C
에 사용된 분포는 매우 다릅니다. 샘플의 스케일(로그 값)이 주어진 범위 안에서 균등 분포로 선택됩니다. 그래서 오른쪽 로그 분포가 거의 일정하게 나타납니다. 이런 분포는 원하는 스케일이 정확히 무엇인지 모를 때 사용하면 좋습니다:
# Draw 10,000 reproducible samples from a reciprocal distribution bounded by
# 20 and 200000, then compare the raw histogram with the histogram of the logs.
reciprocal_distrib = reciprocal(20, 200000)
samples = reciprocal_distrib.rvs(10000, random_state=42)
plt.figure(figsize=(10, 4))
# Left panel: the samples themselves.
plt.subplot(121)
# The title reports the actual bounds; the previous "(scale=1.0)" label was
# carried over from the exponential plot and did not describe this distribution.
plt.title("Reciprocal distribution (a=20, b=200000)")
plt.hist(samples, bins=50)
# Right panel: natural log of the same samples.
plt.subplot(122)
plt.title("Log of this distribution")
plt.hist(np.log(samples), bins=50)
plt.show()
reciprocal() 함수는 하이퍼파라미터의 스케일에 대해 전혀 감을 잡을 수 없을 때 사용합니다(오른쪽 그래프에서 볼 수 있듯이 주어진 범위 안에서 모든 스케일이 균등하게 나타납니다). 반면 지수 분포는 하이퍼파라미터의 스케일을 (어느 정도) 알고 있을 때 사용하는 것이 좋습니다.
질문: 가장 중요한 특성을 선택하는 변환기를 준비 파이프라인에 추가해보세요.
from sklearn.base import BaseEstimator, TransformerMixin
def indices_of_top_k(arr, k):
    """Return the indices of the k largest values in arr, sorted ascending.

    Parameters
    ----------
    arr : array-like
        One-dimensional sequence of scores.
    k : int
        Number of top-scoring positions to return.

    Returns
    -------
    numpy.ndarray
        Integer indices of the k largest entries, in increasing index order
        (not ordered by score). Empty when k is 0.

    Raises
    ------
    ValueError
        If k is negative. A k larger than len(arr) is rejected by
        np.argpartition itself.
    """
    if k < 0:
        raise ValueError("k must be non-negative, got {}".format(k))
    if k == 0:
        # -0 == 0, so the `[-k:]` slice below would keep every index
        # instead of none; handle the empty selection explicitly.
        return np.array([], dtype=np.intp)
    scores = np.asarray(arr)
    # argpartition places the k largest entries in the last k slots without
    # fully sorting; np.sort then orders the selected indices.
    return np.sort(np.argpartition(scores, -k)[-k:])
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the k highest-scoring feature columns.

    The importance scores are supplied by the caller; this class does not
    compute them.

    Parameters
    ----------
    feature_importances : array-like
        One importance score per input column.
    k : int
        Number of top-scoring columns to keep.
    """

    def __init__(self, feature_importances, k):
        # Constructor arguments are stored unmodified under the same names.
        self.k = k
        self.feature_importances = feature_importances

    def fit(self, X, y=None):
        """Record the indices of the k largest importances; X and y are unused."""
        top_indices = indices_of_top_k(self.feature_importances, self.k)
        self.feature_indices_ = top_indices
        return self

    def transform(self, X):
        """Return the columns of X at the indices recorded by fit()."""
        selected_columns = self.feature_indices_
        return X[:, selected_columns]
노트: 이 특성 선택 클래스는 이미 어떤 식으로든 특성 중요도를 계산했다고 가정합니다(가령 RandomForestRegressor
을 사용하여). TopFeatureSelector
의 fit()
메서드에서 직접 계산할 수도 있지만 (캐싱을 사용하지 않을 경우) 이렇게 하면 그리드서치나 랜덤서치의 모든 하이퍼파라미터 조합에 대해 계산이 일어나기 때문에 매우 느려집니다.
선택할 특성의 개수를 지정합니다:
# Number of top-importance features to keep in the cells below.
k = 5
최상의 k개 특성의 인덱스를 확인해 보겠습니다:
# Column indices of the k largest feature importances (ascending index order).
top_k_feature_indices = indices_of_top_k(feature_importances, k)
# Bare expression so the notebook displays the indices.
top_k_feature_indices
array([ 0, 1, 7, 9, 12])
# Map the selected indices back to their attribute names.
np.array(attributes)[top_k_feature_indices]
array(['longitude', 'latitude', 'median_income', 'pop_per_hhold', 'INLAND'], dtype='<U18')
최상의 k개 특성이 맞는지 다시 확인합니다:
# Cross-check: pair each importance with its attribute name, sort from
# largest to smallest, and show the first k pairs.
sorted(zip(feature_importances, attributes), reverse=True)[:k]
[(0.36615898061813423, 'median_income'), (0.16478099356159054, 'INLAND'), (0.10879295677551575, 'pop_per_hhold'), (0.07334423551601243, 'longitude'), (0.06290907048262032, 'latitude')]
좋습니다. 이제 이전에 정의한 준비 파이프라인과 특성 선택기를 추가한 새로운 파이프라인을 만듭니다:
# Chain the existing preparation pipeline with the top-k feature selector.
preparation_and_feature_selection_pipeline = Pipeline([
('preparation', full_pipeline),
('feature_selection', TopFeatureSelector(feature_importances, k))
])
# Fit both steps on `housing` and keep the transformed (k-column) output.
housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)
처음 3개 샘플의 특성을 확인해 보겠습니다:
# Display the first three rows produced by the new pipeline.
housing_prepared_top_k_features[0:3]
array([[-1.15604281, 0.77194962, -0.61493744, -0.08649871, 0. ], [-1.17602483, 0.6596948 , 1.33645936, -0.03353391, 0. ], [ 1.18684903, -1.34218285, -0.5320456 , -0.09240499, 0. ]])
최상의 k개 특성이 맞는지 다시 확인합니다:
# Same three rows taken directly from the previously prepared data, restricted
# to the top-k columns, for comparison with the pipeline output.
housing_prepared[0:3, top_k_feature_indices]
array([[-1.15604281, 0.77194962, -0.61493744, -0.08649871, 0. ], [-1.17602483, 0.6596948 , 1.33645936, -0.03353391, 0. ], [ 1.18684903, -1.34218285, -0.5320456 , -0.09240499, 0. ]])
성공입니다! :)
질문: 전체 데이터 준비 과정과 최종 예측을 하나의 파이프라인으로 만들어보세요.
# Single pipeline: data preparation -> top-k feature selection -> SVR built
# from the hyperparameters chosen by the randomized search.
prepare_select_and_predict_pipeline = Pipeline([
('preparation', full_pipeline),
('feature_selection', TopFeatureSelector(feature_importances, k)),
('svm_reg', SVR(**rnd_search.best_params_))
])
# Fit every step on the raw `housing` data and its labels.
prepare_select_and_predict_pipeline.fit(housing, housing_labels)
Pipeline(memory=None, steps=[('preparation', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3, transformer_weights=None, transformers=[('num_pipeline', Pipeline(memory=None, steps=[('selector', DataFrameSelector(attribute_names=['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_... 1.41064835e-02, 1.48742809e-02, 1.42575993e-02, 3.66158981e-01, 5.64191792e-02, 1.08792957e-01, 5.33510773e-02, 1.03114883e-02, 1.64780994e-01, 6.02803867e-05, 1.96041560e-03, 2.85647464e-03]), k=5)), ('svm_reg', SVR(C=157055.10989448498, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.26497040005002437, kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))], verbose=False)
몇 개의 샘플에 전체 파이프라인을 적용해 보겠습니다:
# Take the first four rows and their labels as a quick smoke test.
some_data = housing.iloc[:4]
some_labels = housing_labels.iloc[:4]
# Print the pipeline's predictions next to the true labels.
print("예측:\t", prepare_select_and_predict_pipeline.predict(some_data))
print("레이블:\t\t", list(some_labels))
예측: [203214.28978849 371846.88152572 173295.65441612 47328.3970888 ] 레이블: [286600.0, 340600.0, 196900.0, 46300.0]
전체 파이프라인이 잘 작동하는 것 같습니다. 물론 예측 성능이 아주 좋지는 않습니다. SVR
보다 RandomForestRegressor
가 더 나은 것 같습니다.
질문: GridSearchCV
를 사용해 준비 단계의 옵션을 자동으로 탐색해보세요.
사이킷런 0.20 버전에서 GridSearchCV
의 n_jobs
매개변수를 -1로 했을 때 에러가 발생하는 경우가 있습니다(https://github.com/scikit-learn/scikit-learn/issues/12250). 에러가 해결되기 전까지 매개변수를 1로 설정합니다.
# Search over preparation options: the imputer strategy of the numeric
# sub-pipeline and the number of features kept (1 up to the number of
# importances). Keys use the `step__param` naming of the nested pipeline.
param_grid = [
{'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
'feature_selection__k': list(range(1, len(feature_importances) + 1))}
]
# 5-fold cross-validation scored by negated MSE; n_jobs=1 runs the fits
# sequentially (see the note above about the n_jobs=-1 issue).
grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,
scoring='neg_mean_squared_error', verbose=2, n_jobs=1)
grid_search_prep.fit(housing, housing_labels)
Fitting 5 folds for each of 48 candidates, totalling 240 fits [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean, total= 6.3s [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 6.3s remaining: 0.0s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean, total= 6.3s [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean, total= 6.2s [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean, total= 6.2s [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean, total= 6.3s [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median, total= 6.3s [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median, total= 6.3s [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median, total= 6.3s [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median, total= 6.2s [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median, total= 6.2s [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.2s [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.3s [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=1, 
preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.3s [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.2s [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.2s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean, total= 6.4s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean, total= 6.5s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean, total= 6.4s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean, total= 6.5s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean, total= 6.5s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median, total= 6.5s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median, total= 6.5s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median, total= 6.4s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median, total= 
6.5s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median, total= 6.5s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.5s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.5s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.4s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.5s [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.5s [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean, total= 6.9s [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean, total= 7.2s [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean, total= 7.1s [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean, total= 6.7s [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean, total= 6.8s [CV] 
feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median, total= 6.9s [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median, total= 6.7s [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median, total= 6.6s [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median, total= 6.6s [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median, total= 6.6s [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.7s [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.7s [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.6s [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.7s [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent, total= 6.7s [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean, total= 7.3s [CV] 
feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean, total= 7.1s [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean, total= 7.2s [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean, total= 7.2s [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean, total= 7.1s [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median, total= 7.3s [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median, total= 7.1s [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median, total= 7.3s [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median, total= 7.2s [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median, total= 7.1s [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.3s [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.1s [CV] feature_selection__k=4, 
preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.3s [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.2s [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.1s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean, total= 7.5s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean, total= 7.6s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean, total= 7.5s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean, total= 7.6s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean, total= 7.4s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median, total= 7.5s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median, total= 7.6s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median, total= 7.5s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median 
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median, total= 7.6s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median, total= 7.5s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.5s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.6s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.5s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.6s [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.4s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean, total= 7.7s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean, total= 7.9s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean, total= 7.7s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean, total= 7.6s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean [CV] 
feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean, total= 7.9s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median, total= 7.7s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median, total= 7.9s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median, total= 7.7s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median, total= 7.6s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median, total= 7.9s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.7s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.9s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.7s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.6s [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent, total= 7.9s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean [CV] 
feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean, total= 8.3s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean, total= 8.2s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean, total= 8.5s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean, total= 8.5s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean, total= 8.3s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median, total= 8.9s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median, total= 8.4s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median, total= 8.4s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median, total= 8.1s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median, total= 8.2s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent, total= 8.8s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=7, 
preparation__num_pipeline__imputer__strategy=most_frequent, total= 8.3s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent, total= 8.6s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent, total= 8.2s [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent, total= 8.7s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean, total= 9.9s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean, total= 9.6s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean, total= 10.8s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean, total= 10.4s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean, total= 11.3s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median, total= 10.3s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median, total= 9.7s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=8, 
preparation__num_pipeline__imputer__strategy=median, total= 10.7s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median, total= 10.7s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median, total= 10.5s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent, total= 10.1s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent, total= 9.9s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent, total= 10.6s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent, total= 9.7s [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent, total= 10.4s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean, total= 14.3s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean, total= 14.5s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean, total= 13.6s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=9, 
preparation__num_pipeline__imputer__strategy=mean, total= 13.7s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean, total= 12.6s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median, total= 14.2s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median, total= 14.4s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median, total= 12.2s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median, total= 13.8s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median, total= 12.5s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent, total= 14.1s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent, total= 14.4s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent, total= 14.0s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent, total= 13.9s [CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=9, 
preparation__num_pipeline__imputer__strategy=most_frequent, total= 13.3s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean, total= 14.4s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean, total= 15.3s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean, total= 19.5s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean, total= 17.0s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean, total= 15.8s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median, total= 14.6s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median, total= 17.1s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median, total= 15.8s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median, total= 16.0s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median, total= 14.9s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=10, 
preparation__num_pipeline__imputer__strategy=most_frequent, total= 14.7s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent, total= 15.4s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent, total= 14.7s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent, total= 16.0s [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent, total= 16.4s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean, total= 19.0s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean, total= 17.2s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean, total= 17.5s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean, total= 18.6s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean, total= 17.9s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median, total= 16.1s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median [CV] 
feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median, total= 15.8s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median, total= 16.2s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median, total= 19.0s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median, total= 18.4s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent, total= 19.3s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent, total= 15.7s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent, total= 16.2s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent, total= 17.5s [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent, total= 20.9s [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean, total= 19.5s [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean, total= 19.1s [CV] feature_selection__k=12, 
preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean, total= 18.8s [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean, total= 19.2s [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean, total= 19.9s [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median, total= 17.8s [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median, total= 18.5s [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median, total= 20.1s [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median, total= 18.6s [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median, total= 18.6s [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent, total= 17.3s [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent, total= 18.4s [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent, total= 21.8s [CV] feature_selection__k=12, 
preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent, total= 17.7s [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent, total= 18.0s [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean, total= 23.0s [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean, total= 21.0s [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean, total= 22.5s [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean, total= 20.6s [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean, total= 17.4s [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median, total= 18.8s [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median, total= 22.2s [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median, total= 23.2s [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median, total= 22.6s [CV] feature_selection__k=13, 
preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median, total= 20.7s [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent, total= 19.6s [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent, total= 21.3s [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent, total= 21.8s [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent, total= 21.5s [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent, total= 19.2s [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean, total= 18.0s [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean, total= 20.8s [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean, total= 21.1s [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean, total= 21.2s [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean, total= 19.4s [CV] 
feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median, total= 20.6s [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median, total= 21.5s [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median, total= 21.0s [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median, total= 21.0s [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median, total= 19.1s [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent, total= 20.8s [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent, total= 18.5s [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent, total= 19.4s [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent, total= 21.6s [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent, total= 25.0s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=15, 
preparation__num_pipeline__imputer__strategy=mean, total= 21.6s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean, total= 20.6s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean, total= 22.1s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean, total= 17.5s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean, total= 20.3s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median, total= 18.6s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median, total= 20.8s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median, total= 22.7s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median, total= 22.7s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median, total= 21.0s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent, total= 22.3s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=15, 
preparation__num_pipeline__imputer__strategy=most_frequent, total= 22.3s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent, total= 22.9s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent, total= 18.8s [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent, total= 22.3s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean, total= 20.9s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean, total= 21.3s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean, total= 20.1s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean, total= 21.4s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean, total= 18.4s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median, total= 19.2s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median, total= 21.2s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=16, 
preparation__num_pipeline__imputer__strategy=median, total= 20.2s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median, total= 17.5s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median, total= 20.6s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent, total= 18.7s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent, total= 21.0s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent, total= 19.9s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent, total= 20.0s [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent [CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent, total= 21.3s
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 52.4min finished
GridSearchCV(cv=5, error_score='raise-deprecating', estimator=Pipeline(memory=None, steps=[('preparation', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3, transformer_weights=None, transformers=[('num_pipeline', Pipeline(memory=None, steps=[('selector', DataFrameSelector(attribute_names=['longitude', 'latitude', 'housing_median_age', 'tota... kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))], verbose=False), iid='warn', n_jobs=1, param_grid=[{'feature_selection__k': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], 'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent']}], pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring='neg_mean_squared_error', verbose=2)
grid_search_prep.best_params_
{'feature_selection__k': 15, 'preparation__num_pipeline__imputer__strategy': 'most_frequent'}
최상의 Imputer 정책은 most_frequent이고 거의 모든 특성이 유용합니다(16개 중 15개). 마지막 특성(ISLAND)은 잡음을 추가할 뿐입니다.
축하합니다! 이제 머신러닝에 대해 꽤 많은 것을 알게 되었습니다. :)