PyOD is a comprehensive Python toolkit for identifying outlying objects in multivariate data with both unsupervised and supervised approaches.
This example demonstrates four score combination frameworks: Average, Maximization, Average of Maximum (AOM), and Maximum of Average (MOA).
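For intuition only, here is a minimal NumPy sketch of what these four rules compute on a (n_samples, n_detectors) matrix of normalized scores; the actual pyod.models.combination functions additionally handle input checks and bucket generation:

# Toy sketch of the four combination rules (illustration, not the library code)
import numpy as np

rng = np.random.RandomState(42)
scores = rng.rand(6, 4)                       # toy matrix: 6 samples x 4 detectors

avg_scores = scores.mean(axis=1)              # Average: mean score over detectors
max_scores = scores.max(axis=1)               # Maximization: max score over detectors

buckets = [scores[:, :2], scores[:, 2:]]      # two fixed buckets of 2 detectors each
aom_scores = np.mean([b.max(axis=1) for b in buckets], axis=0)  # AOM: average of bucket maxima
moa_scores = np.max([b.mean(axis=1) for b in buckets], axis=0)  # MOA: maximum of bucket averages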
from __future__ import division
from __future__ import print_function
import os
import sys
# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.io import loadmat
from pyod.models.knn import KNN
from pyod.models.combination import aom, moa, average, maximization
from pyod.utils.utility import standardizer
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
# Define data file and read X and y
# Generate some data if the source data is missing
mat_file = 'cardio.mat'
try:
    mat = loadmat(os.path.join('data', mat_file))
except (TypeError, IOError):
    print('{data_file} does not exist. Use generated data'.format(
        data_file=mat_file))
    X, y = generate_data(train_only=True)  # generate artificial data instead
else:
    X = mat['X']
    y = mat['y'].ravel()
# 60% data for training and 40% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# standardizing data for processing
X_train_norm, X_test_norm = standardizer(X_train, X_test)
print("Training data:", X_train.shape, y_train.shape)
print("Test data:", X_test.shape, y_test.shape)
Training data: (1098, 21) (1098,)
Test data: (733, 21) (733,)
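Note that the scaling is fitted on the training split and then applied to both splits. A rough equivalent, assuming standardizer performs plain z-scoring with statistics from its first argument (an approximation for intuition, not the library implementation):

# Rough sketch of the standardization above: z-score each feature using
# mean/std estimated on the training split only (assumption; see lead-in).
mu, sigma = X_train.mean(axis=0), X_train.std(axis=0)
X_train_norm_approx = (X_train - mu) / sigma
X_test_norm_approx = (X_test - mu) / sigma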
n_clf = 20 # number of base detectors
# Initialize 20 base detectors for combination
k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140,
          150, 160, 170, 180, 190, 200]
train_scores = np.zeros([X_train.shape[0], n_clf])
test_scores = np.zeros([X_test.shape[0], n_clf])
print('Initializing {n_clf} kNN detectors'.format(n_clf=n_clf))
for i in range(n_clf):
    k = k_list[i]

    clf = KNN(n_neighbors=k, method='largest')
    clf.fit(X_train_norm)

    train_scores[:, i] = clf.decision_scores_
    test_scores[:, i] = clf.decision_function(X_test_norm)

    print('Base detector %i is fitted for prediction' % i)
Initializing 20 kNN detectors
Base detector 0 is fitted for prediction
Base detector 1 is fitted for prediction
Base detector 2 is fitted for prediction
Base detector 3 is fitted for prediction
Base detector 4 is fitted for prediction
Base detector 5 is fitted for prediction
Base detector 6 is fitted for prediction
Base detector 7 is fitted for prediction
Base detector 8 is fitted for prediction
Base detector 9 is fitted for prediction
Base detector 10 is fitted for prediction
Base detector 11 is fitted for prediction
Base detector 12 is fitted for prediction
Base detector 13 is fitted for prediction
Base detector 14 is fitted for prediction
Base detector 15 is fitted for prediction
Base detector 16 is fitted for prediction
Base detector 17 is fitted for prediction
Base detector 18 is fitted for prediction
Base detector 19 is fitted for prediction
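Any PyOD detector that exposes decision_scores_ and decision_function can fill the same score matrix. As an illustration only (not part of the original example), the loop above could be repeated with LOF base detectors:

# Illustration: build a second score matrix from LOF base detectors
# (hypothetical variation, not used in the rest of this example).
from pyod.models.lof import LOF

lof_train_scores = np.zeros([X_train.shape[0], n_clf])
lof_test_scores = np.zeros([X_test.shape[0], n_clf])
for i, k in enumerate(k_list):
    lof = LOF(n_neighbors=k)
    lof.fit(X_train_norm)
    lof_train_scores[:, i] = lof.decision_scores_
    lof_test_scores[:, i] = lof.decision_function(X_test_norm)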
# Decision scores have to be normalized before combination
train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                   test_scores)
# The normalized scores from all base detectors are stored in
# train_scores_norm (training data) and test_scores_norm (test data)
print('Decision score matrix on training data', train_scores_norm.shape)
print('Decision score matrix on test data', test_scores_norm.shape)
Decision score matrix on training data (1098, 20)
Decision score matrix on test data (733, 20)
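For reference, a single base detector can be evaluated with the same utility, which makes the benefit of combination easier to judge. This is an illustrative addition, not part of the original example:

# Illustrative: evaluate one base detector (k = 100, column 9 of the matrix)
# with the same metric used for the combined scores below.
evaluate_print('Single kNN (k=100)', y_test, test_scores_norm[:, 9])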
# Combine and evaluate the combination result
# Combination by average
y_by_average = average(test_scores_norm)
evaluate_print('Combination by Average', y_test, y_by_average)
# Combination by max
y_by_maximization = maximization(test_scores_norm)
evaluate_print('Combination by Maximization', y_test, y_by_maximization)
# Combination by aom
y_by_aom = aom(test_scores_norm, n_buckets=5)
evaluate_print('Combination by AOM', y_test, y_by_aom)
# Combination by moa
y_by_moa = moa(test_scores_norm, n_buckets=5)
evaluate_print('Combination by MOA', y_test, y_by_moa)
Combination by Average ROC:0.9182, precision @ rank n:0.5522
Combination by Maximization ROC:0.9225, precision @ rank n:0.597
Combination by AOM ROC:0.9288, precision @ rank n:0.6119
Combination by MOA ROC:0.9245, precision @ rank n:0.5821
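Here, precision @ rank n can be read as the precision among the n highest-scored test points, with n taken to be the number of true outliers in y_test. A hand-rolled sketch under that assumption (evaluate_print reports the library's own computation):

# Sketch of precision @ rank n, assuming n equals the number of true outliers
# in y_test (approximation for intuition only).
n_outliers = int(y_test.sum())
top_n_idx = np.argsort(y_by_average)[-n_outliers:]  # indices of the n highest combined scores
prec_at_n = y_test[top_n_idx].sum() / n_outliers
print('Manual precision @ rank n (Average):', prec_at_n)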