이 노트북의 코드에 대한 설명은 분류기 체인: ClassifierChain 글을 참고하세요.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml
fetch yeast dataset (https://www.openml.org/d/40597)
사이킷런 0.20 버전에서 `fetch_openml`이 추가되었습니다. `fetch_mldata`는 0.22 버전에서 삭제될 예정입니다.
# Download version 4 of the multilabel "yeast" dataset from OpenML and split
# the returned bunch into the feature matrix X and the label matrix Y.
yeast = fetch_openml(name='yeast', version=4)
X, Y = yeast.data, yeast.target
# Display both shapes (the recorded run shows 2417 rows, 103 features, 14 labels).
X.shape, Y.shape
((2417, 103), (2417, 14))
# Display the raw targets; in the recorded run they are an object array of
# the strings 'TRUE' / 'FALSE', one column per label.
Y
array([['FALSE', 'FALSE', 'FALSE', ..., 'TRUE', 'TRUE', 'FALSE'], ['FALSE', 'FALSE', 'TRUE', ..., 'FALSE', 'FALSE', 'FALSE'], ['FALSE', 'TRUE', 'TRUE', ..., 'TRUE', 'TRUE', 'FALSE'], ..., ['FALSE', 'FALSE', 'FALSE', ..., 'TRUE', 'TRUE', 'FALSE'], ['FALSE', 'FALSE', 'FALSE', ..., 'TRUE', 'TRUE', 'FALSE'], ['FALSE', 'TRUE', 'TRUE', ..., 'TRUE', 'TRUE', 'FALSE']], dtype=object)
# Convert the 'TRUE' / 'FALSE' strings into a boolean indicator matrix,
# then display the result.
Y = (Y == 'TRUE')
Y
array([[False, False, False, ..., True, True, False], [False, False, True, ..., False, False, False], [False, True, True, ..., True, True, False], ..., [False, False, False, ..., True, True, False], [False, False, False, ..., True, True, False], [False, True, True, ..., True, True, False]])
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
for OVR model
# Baseline: one-vs-rest wrapper around a liblinear logistic regression,
# fitted on the training split and used to predict the test split.
ovr = OneVsRestClassifier(
    LogisticRegression(solver='liblinear')
)
# fit() returns the fitted estimator, so predict() can be chained onto it.
pred_ovr = ovr.fit(X_train, Y_train).predict(X_test)
# `jaccard_similarity_score` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so importing it fails on current releases. Prefer its replacement,
# `jaccard_score(..., average='samples')`, and keep the old name available as
# a thin wrapper so every call site in this notebook works on both old and
# new scikit-learn versions.
try:
    from sklearn.metrics import jaccard_score
except ImportError:
    # scikit-learn 0.20: only the original function exists.
    from sklearn.metrics import jaccard_similarity_score
else:
    def jaccard_similarity_score(y_true, y_pred):
        """Return the mean per-sample Jaccard index for multilabel predictions.

        NOTE(review): the removed function scored a sample whose true and
        predicted label sets are both empty as 1.0, whereas `jaccard_score`
        scores it as 0 (with a warning). Confirm that no such samples occur
        in the data being scored if exact parity with old results matters.
        """
        return jaccard_score(y_true, y_pred, average='samples')

# Score the one-vs-rest baseline on the held-out split and display it.
ovr_score = jaccard_similarity_score(Y_test, pred_ovr)
ovr_score
0.5081742752197298
for CC model (https://www.cs.waikato.ac.nz/ml/publications/2009/chains.pdf)
from sklearn.multioutput import ClassifierChain
# A single classifier chain over the labels, visiting them in a random
# (seeded) order, with liblinear logistic regression as the link model.
cc = ClassifierChain(
    LogisticRegression(solver='liblinear'),
    order='random',
    random_state=42,
)
# fit() returns the fitted chain, so predict() can be chained onto it.
pred_cc = cc.fit(X_train, Y_train).predict(X_test)
# Score the chain on the held-out split and display the result.
cc_score = jaccard_similarity_score(Y_test, pred_cc)
cc_score
0.5118178538633084
for CC ensemble
# Build ten chains that differ only in their random label order
# (seeds 42 through 51).
chains = [
    ClassifierChain(
        LogisticRegression(solver='liblinear'),
        order='random',
        random_state=seed,
    )
    for seed in range(42, 52)
]
# Fit every chain on the same training split.
for chain in chains:
    chain.fit(X_train, Y_train)
# Stack the per-chain test predictions along a new leading axis.
pred_chains = np.array([chain.predict(X_test) for chain in chains])
# Score each chain separately on the held-out split and display the list.
chain_scores = [
    jaccard_similarity_score(Y_test, pred_chain)
    for pred_chain in pred_chains
]
chain_scores
[0.5118178538633084, 0.5261753455451803, 0.5298506016481224, 0.5150574517310055, 0.5076409758744469, 0.49889807162534433, 0.5104657108272811, 0.49708858061130784, 0.5161265846183202, 0.4871433490751672]
# Bar chart: the first bar is the one-vs-rest baseline, the remaining bars
# are the individual chains. The number of bar positions is derived from the
# data instead of being hard-coded, so the plot keeps working if the number
# of chains changes.
plt.bar(np.arange(1 + len(chain_scores)), [ovr_score] + chain_scores)
plt.show()
# Collect the predicted probabilities of every chain along a new leading axis.
proba_chains = np.array([
    chain.predict_proba(X_test)
    for chain in chains
])
# Average the probabilities across chains (axis 0 indexes the chains).
proba_ensemble = np.mean(proba_chains, axis=0)
# Threshold the averaged probabilities at 0.5 to obtain the ensemble's label
# predictions, score them on the held-out split, and display the result.
ensemble_score = jaccard_similarity_score(Y_test, proba_ensemble >= 0.5)
ensemble_score
0.5180391578118851
# Bar chart: one-vs-rest baseline first, then each individual chain, and the
# averaged ensemble last. The number of bar positions is derived from the
# data instead of being hard-coded, so the plot keeps working if the number
# of chains changes.
plt.bar(
    np.arange(2 + len(chain_scores)),
    [ovr_score] + chain_scores + [ensemble_score],
)
plt.show()