#!/usr/bin/env python
# coding: utf-8

# # Example of combining multiple base outlier scores.
#
# **[PyOD](https://github.com/yzhao062/Pyod)** is a comprehensive **Python toolkit** to **identify outlying objects** in
# multivariate data with both unsupervised and supervised approaches.
#
# Four combination frameworks are demonstrated in this example:
#
# 1. Average: take the average score across all base detectors
# 2. Maximization: take the maximum score across all base detectors
# 3. Average of Maximum (AOM): split the detectors into buckets, take the maximum score within each bucket, then average the bucket maxima
# 4. Maximum of Average (MOA): split the detectors into buckets, take the average score within each bucket, then take the maximum of the bucket averages
#
# A manual sketch of the AOM/MOA bucket logic appears at the end of this example.

# In[12]:


from __future__ import division
from __future__ import print_function

import os
import sys

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

import numpy as np
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

from pyod.models.knn import KNN
from pyod.models.combination import aom, moa, average, maximization
from pyod.utils.utility import standardizer
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print


# In[13]:


# Define the data file and read X and y
# Generate some data if the source data is missing
mat_file = 'cardio.mat'
try:
    mat = loadmat(os.path.join('data', mat_file))
except (TypeError, IOError):
    print('{data_file} does not exist. Use generated data'.format(
        data_file=mat_file))
    X, y = generate_data(train_only=True)  # generate synthetic data
else:
    X = mat['X']
    y = mat['y'].ravel()

# 60% of the data for training and 40% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

# standardize the features for processing
X_train_norm, X_test_norm = standardizer(X_train, X_test)


# In[14]:


print("Training data:", X_train.shape, y_train.shape)
print("Test data:", X_test.shape, y_test.shape)


# In[15]:


n_clf = 20  # number of base detectors

# Initialize 20 base kNN detectors, one per value of k
k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140,
          150, 160, 170, 180, 190, 200]

train_scores = np.zeros([X_train.shape[0], n_clf])
test_scores = np.zeros([X_test.shape[0], n_clf])

print('Initializing {n_clf} kNN detectors'.format(n_clf=n_clf))

for i in range(n_clf):
    k = k_list[i]

    clf = KNN(n_neighbors=k, method='largest')
    clf.fit(X_train_norm)

    train_scores[:, i] = clf.decision_scores_
    test_scores[:, i] = clf.decision_function(X_test_norm)
    print('Base detector %i is fitted for prediction' % i)


# In[16]:


# Decision scores have to be normalized before combination
train_scores_norm, test_scores_norm = standardizer(train_scores, test_scores)

# The scores from all base detectors are stored in train_scores_norm
# (training data) and test_scores_norm (test data)
print('Decision score matrix on training data', train_scores_norm.shape)
print('Decision score matrix on test data', test_scores_norm.shape)
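
# In[ ]:


# A minimal sanity-check sketch (added for illustration, not part of the
# original example): with no estimator weights, pyod's average() and
# maximization() are plain row-wise reductions over the detector axis,
# so both lines below should print True.
print(np.allclose(average(test_scores_norm),
                  np.mean(test_scores_norm, axis=1)))
print(np.allclose(maximization(test_scores_norm),
                  np.max(test_scores_norm, axis=1)))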

# In[17]:


# Combine and evaluate the combination results

# Combination by average
y_by_average = average(test_scores_norm)
evaluate_print('Combination by Average', y_test, y_by_average)

# Combination by max
y_by_maximization = maximization(test_scores_norm)
evaluate_print('Combination by Maximization', y_test, y_by_maximization)

# Combination by aom
y_by_aom = aom(test_scores_norm, n_buckets=5)
evaluate_print('Combination by AOM', y_test, y_by_aom)

# Combination by moa
y_by_moa = moa(test_scores_norm, n_buckets=5)
evaluate_print('Combination by MOA', y_test, y_by_moa)
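
# In[ ]:


# A minimal sketch of the AOM/MOA bucket logic (added for illustration;
# the exact bucket assignment inside pyod's aom()/moa() may differ, e.g.
# buckets can be drawn at random rather than in order). With 20 detectors
# and n_buckets=5, each bucket holds 4 detectors: AOM takes the max
# within each bucket and averages the 5 bucket maxima, while MOA averages
# within each bucket and takes the max of the 5 bucket averages.
n_buckets = 5
bucket_size = n_clf // n_buckets  # 20 detectors / 5 buckets = 4 each
bucketed = test_scores_norm.reshape(-1, n_buckets, bucket_size)
y_aom_manual = np.mean(np.max(bucketed, axis=2), axis=1)  # AOM
y_moa_manual = np.max(np.mean(bucketed, axis=2), axis=1)  # MOA
evaluate_print('Manual AOM sketch', y_test, y_aom_manual)
evaluate_print('Manual MOA sketch', y_test, y_moa_manual)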