k-최근접 이웃 회귀¶

데이터 준비¶

In [ ]:

import numpy as np

In [ ]:

# Perch data: lengths and the matching weights, paired by index.
perch_length = np.array([
    8.4, 13.7, 15.0, 16.2, 17.4, 18.0, 18.7, 19.0, 19.6, 20.0,
    21.0, 21.0, 21.0, 21.3, 22.0, 22.0, 22.0, 22.0, 22.0, 22.5,
    22.5, 22.7, 23.0, 23.5, 24.0, 24.0, 24.6, 25.0, 25.6, 26.5,
    27.3, 27.5, 27.5, 27.5, 28.0, 28.7, 30.0, 32.8, 34.5, 35.0,
    36.5, 36.0, 37.0, 37.0, 39.0, 39.0, 39.0, 40.0, 40.0, 40.0,
    40.0, 42.0, 43.0, 43.0, 43.5, 44.0,
])
perch_weight = np.array([
    5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0,
    110.0, 115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0,
    130.0, 150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0,
    197.0, 218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0,
    514.0, 556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0,
    820.0, 850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0,
    1000.0, 1000.0,
])

k-최근접 이웃회귀¶

지도학습 알고리즘은 크게 분류와 회귀로 나뉜다. 회귀는 클래스 중 하나로 분류하는 것이 아니라 임의의 어떤 숫자를 예측하는 문제이다. 즉, 회귀는 정해진 클래스가 없고 임의의 수치를 출력한다.

k-최근접 이웃 분류 알고리즘은 예측하려는 샘플에 가장 가까운 샘플 k개를 선택한 후 이 샘플들의 클래스를 확인하여, 다수 클래스를 새로운 샘플의 클래스로 예측한다.

k-최근접 이웃 회귀 또한 예측하려는 샘플에 가장 가까운 샘플 k개를 선택하고, 이 이웃 샘플들의 타깃값을 평균하여 예측값을 구한다.

In [ ]:

import matplotlib.pyplot as plt

In [ ]:

# Scatter plot of perch length (x-axis) against weight (y-axis),
# drawn through the current Axes object.
ax = plt.gca()
ax.scatter(perch_length, perch_weight)
ax.set_xlabel('length')
ax.set_ylabel('weight')
plt.show()

In [ ]:

from sklearn.model_selection import train_test_split

In [ ]:

# Split the perch data into a training set and a test set;
# random_state is pinned so the shuffle is reproducible.
train_input, test_input, train_target, test_target = train_test_split(
    perch_length,
    perch_weight,
    random_state=42,
)

In [ ]:

# Print the shapes of the training and test inputs.
print(train_input.shape, test_input.shape)

(42,) (14,)

In [ ]:

# Small 1-D array (values 1 through 4) used to demonstrate reshape.
test_array = np.arange(1, 5)
print(test_array.shape)

(4,)

In [ ]:

# Rearrange the same four elements into a 2 x 2 array.
test_array = np.reshape(test_array, (2, 2))
print(test_array.shape)

(2, 2)

In [ ]:

# Convert each input into a 2-D, single-column array
# (-1 lets NumPy infer the number of rows).
train_input, test_input = (
    train_input.reshape(-1, 1),
    test_input.reshape(-1, 1),
)

In [ ]:

# Print the shapes again to confirm both inputs are now 2-D.
print(train_input.shape, test_input.shape)

(42, 1) (14, 1)

결정 계수 ($ R^2$)¶

$R^2 = 1 - \dfrac{\sum(\text{타깃} - \text{예측})^2}{\sum(\text{타깃} - \text{평균})^2}$

In [ ]:

from sklearn.neighbors import KNeighborsRegressor

In [ ]:

# Create a k-nearest neighbors regressor with its default settings.
knr = KNeighborsRegressor()
# Train the k-nearest neighbors regression model on the training set.
knr.fit(train_input, train_target)

Out[ ]:

KNeighborsRegressor()

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [ ]:

# Score the fitted model on the test set (the coefficient of determination, R^2).
knr.score(test_input, test_target)

Out[ ]:

0.992809406101064

In [ ]:

from sklearn.metrics import mean_absolute_error

In [ ]:

# Predict the targets of the test-set samples.
test_prediction = knr.predict(test_input)
# Mean absolute error between the true test targets and the predictions.
mae = mean_absolute_error(y_true=test_target, y_pred=test_prediction)
print(mae)

19.157142857142862

과대적합 vs 과소적합¶

훈련세트에서 점수가 좋았는데 테스트 세트에서는 점수가 나쁘다면 모델이 훈련세트에 과대적합(overfitting) 되었다고 한다.

훈련 세트보다 테스트 세트의 점수가 높거나 두 점수가 모두 낮은 경우는 모델이 훈련 세트에 과소적합(underfitting)되었다고 한다.

In [ ]:

# Score the same model on the training set, for comparison with the test score.
print(knr.score(train_input, train_target))

0.9698823289099254

In [ ]:

# Making the model more complex: for k-nearest neighbors this means
# lowering the number of neighbors k.

# Use 3 neighbors.
knr.set_params(n_neighbors=3)
# Retrain the model and report its training-set score.
knr.fit(train_input, train_target)
print(knr.score(train_input, train_target))

0.9804899950518966

In [ ]:

print(knr.score(test_input, test_target))
# The test score is now lower than the training score, so the underfitting problem is resolved.

0.9746459963987609

확인문제¶

In [ ]:

# Create the k-nearest neighbors regressor.
knr = KNeighborsRegressor()
# x coordinates 5, 6, ..., 44 (np.arange excludes the stop value 45),
# reshaped into a single-column 2-D array for predict().
x = np.arange(5, 45).reshape(-1, 1)

# Plot the predictions for n_neighbors = 1, 5 and 10.
for n in [1, 5, 10]:
    # Refit the model with the current number of neighbors.
    knr.n_neighbors = n
    knr.fit(train_input, train_target)
    # Predict over the whole x range.
    prediction = knr.predict(x)
    # Draw the training samples together with the prediction curve.
    plt.scatter(train_input, train_target)
    plt.plot(x, prediction)
    plt.title(f'n_neighbors = {n}')
    plt.xlabel('length')
    plt.ylabel('weight')
    plt.show()