#!/usr/bin/env python
# coding: utf-8
# # k-평균
#
# ## KMeans 클래스
# In[1]:
# Fetch the fruit image dataset into the working directory as fruits_300.npy.
# The standard library is used instead of the IPython-only
# `get_ipython().system('wget ...')` shell escape so that this script also runs
# under a plain Python interpreter and does not depend on a wget binary.
from urllib.request import urlretrieve

urlretrieve('https://bit.ly/fruits_300_data', 'fruits_300.npy')
# In[2]:
import numpy as np
# Load the saved image array, then flatten every sample into a single row of
# 100*100 values so that it can be handed to an estimator as a 2-D matrix.
fruits = np.load('fruits_300.npy')
fruits_2d = np.reshape(fruits, (-1, 100 * 100))
# In[3]:
from sklearn.cluster import KMeans
# Cluster the flattened images into three groups. fit() returns the estimator
# itself, so construction and fitting are chained into one statement.
km = KMeans(n_clusters=3, random_state=42).fit(fruits_2d)
# In[4]:
# Cluster index assigned to each sample.
print(km.labels_)
# In[5]:
# Distinct cluster indices together with the number of samples in each.
cluster_sizes = np.unique(km.labels_, return_counts=True)
print(cluster_sizes)
# In[6]:
import matplotlib.pyplot as plt
def draw_fruits(arr, ratio=1, per_row=10):
    """Show every image in ``arr`` on a grid of matplotlib subplots.

    Images are placed left to right, top to bottom, ``per_row`` to a line.
    When all images fit on a single line the grid is exactly as wide as the
    number of images; otherwise the unused cells of the last line stay blank.

    Args:
        arr: Sized, indexable collection of images; each item is passed to
            ``imshow`` unchanged.
        ratio: Scale factor for the figure; the figure size is
            ``(cols * ratio, rows * ratio)``.
        per_row: Maximum number of images on one grid line (default 10,
            the value that used to be hard-coded).

    Raises:
        ValueError: If ``per_row`` is smaller than 1.
    """
    if per_row < 1:
        raise ValueError("per_row must be at least 1")

    n = len(arr)  # number of samples to draw
    if n == 0:
        # A grid with zero rows cannot be created by plt.subplots, so an
        # empty selection (e.g. an empty cluster) simply draws nothing.
        return

    rows = -(-n // per_row)  # ceiling division without going through floats
    # A single line is only as wide as the number of samples.
    cols = n if rows < 2 else per_row
    _, axs = plt.subplots(rows, cols,
                          figsize=(cols * ratio, rows * ratio), squeeze=False)
    # axs is always 2-D (squeeze=False); .flat walks it row by row, so the
    # flat position of a cell equals the index of the sample shown in it.
    for idx, ax in enumerate(axs.flat):
        if idx < n:  # only the first n cells hold an image
            ax.imshow(arr[idx], cmap='gray_r')
        # Hide the axes of every cell, including the unused ones.
        ax.axis('off')
    plt.show()
# In[7]:
# Show the images assigned to clusters 0, 1 and 2, one figure per cluster.
for cluster_id in range(3):
    draw_fruits(fruits[km.labels_ == cluster_id])
# ## 클러스터 중심
# In[10]:
# Reshape the cluster centers back into 100x100 images and draw them enlarged.
centroid_images = km.cluster_centers_.reshape(-1, 100, 100)
draw_fruits(centroid_images, ratio=3)
# In[11]:
# Slice (rather than index) sample 100 so that it stays a one-row 2-D array.
sample = fruits_2d[100:101]
# Output of transform() for that sample (its position in cluster-distance space).
print(km.transform(sample))
# In[12]:
# Cluster predicted for the same sample.
print(km.predict(sample))
# In[13]:
draw_fruits(fruits[100:101])
# In[14]:
# Number of iterations the fitted model ran.
print(km.n_iter_)
# ## 최적의 k 찾기
# In[15]:
# Fit one model per candidate k (2 through 6) and record its inertia, then
# plot inertia against k to look for the elbow.
cluster_counts = range(2, 7)
inertia = []
for k in cluster_counts:
    # `km` is rebound on every pass, so it ends up holding the last model.
    km = KMeans(n_clusters=k, n_init='auto', random_state=42).fit(fruits_2d)
    inertia.append(km.inertia_)

plt.plot(cluster_counts, inertia)
plt.xlabel('k')
plt.ylabel('inertia')
plt.show()