from datetime import datetime
# Stamp the notebook with the wall-clock time of this run.
updated_at = datetime.now()
print(f'Päivitetty {updated_at}')
Päivitetty 2022-10-15 12:17:05.441070
Katsotaan, osaako K-means-klusterointi jakaa kurjenmiekat kolmeen lajiin (setosa, versicolor, virginica) terälehtien (petal) ja verholehtien (sepal) koon mukaan.
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
# The example dataset is available through the seaborn library
iris = sns.load_dataset('iris')
# Bare expression: rendered as the cell's output when run in a notebook
iris
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
# X holds only the four measurements (sepal and petal length and width).
# The columns are selected by name instead of dropping 'species', because a
# later cell adds a 'prediction' column to iris in place; re-running this cell
# after that one would otherwise feed the old cluster labels into the model.
X = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
# Fit the model.
# n_init is pinned explicitly: older scikit-learn releases ran 10 restarts by
# default, while from version 1.4 the default is 'auto'. Pinning it keeps the
# clustering the same regardless of the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=2)
kmeans.fit(X)
KMeans(n_clusters=3, random_state=2)
# Attach each row's cluster number (0, 1 or 2) to the original iris data
cluster_ids = kmeans.predict(X)
iris['prediction'] = cluster_ids
iris
sepal_length | sepal_width | petal_length | petal_width | species | prediction | |
---|---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa | 0 |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa | 0 |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa | 0 |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa | 0 |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa | 0 |
... | ... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica | 2 |
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica | 1 |
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica | 2 |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica | 2 |
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica | 1 |
150 rows × 6 columns
# Cross-tabulate species against cluster number to see how well the
# clusters correspond to the species
pd.crosstab(index=iris['species'], columns=iris['prediction'])
prediction | 0 | 1 | 2 |
---|---|---|---|
species | |||
setosa | 50 | 0 | 0 |
versicolor | 0 | 48 | 2 |
virginica | 0 | 14 | 36 |
# Cluster centres, one row per cluster.
# Column labels are taken from X (the frame the model was fitted on) rather
# than retyped by hand, so they cannot drift out of step with the features.
pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
0 | 5.006000 | 3.428000 | 1.462000 | 0.246000 |
1 | 5.901613 | 2.748387 | 4.393548 | 1.433871 |
2 | 6.850000 | 3.073684 | 5.742105 | 2.071053 |