#!/usr/bin/env python
# coding: utf-8

# # Benchmark of various outlier detection models
#
# ### The models are evaluated on ROC, Precision @ n and execution time on
# ### the benchmark datasets listed in `mat_file_list` below (17 in the full
# ### list; 'shuttle.mat' is currently commented out). Every dataset is split
# ### 60% for training and 40% for testing.
#
# **[PyOD](https://github.com/yzhao062/pyod)** is a comprehensive **Python toolkit** to **identify outlying objects** in
# multivariate data with both unsupervised and supervised approaches.
# The models covered in this example include:
#
#   1. Linear Models for Outlier Detection:
#      1. **PCA: Principal Component Analysis** (use the sum of
#        weighted projected distances to the eigenvector hyperplane
#        as the outlier scores)
#      2. **MCD: Minimum Covariance Determinant** (use the mahalanobis distances
#        as the outlier scores)
#      3. **OCSVM: One-Class Support Vector Machines**
#
#   2. Proximity-Based Outlier Detection Models:
#      1. **LOF: Local Outlier Factor**
#      2. **CBLOF: Clustering-Based Local Outlier Factor**
#      3. **kNN: k Nearest Neighbors** (use the distance to the kth nearest
#        neighbor as the outlier score)
#      4. **Median kNN** Outlier Detection (use the median distance to k nearest
#        neighbors as the outlier score)
#      5. **HBOS: Histogram-based Outlier Score**
#
#   3. Probabilistic Models for Outlier Detection:
#      1. **ABOD: Angle-Based Outlier Detection**
#
#   4. Outlier Ensembles and Combination Frameworks
#      1. **Isolation Forest**
#      2. **Feature Bagging**
#      3. **LSCP**
#
# Corresponding file could be found at /examples/compare_all_models.py

# In[3]:

from __future__ import division
from __future__ import print_function

import os
import sys
import warnings
from time import time

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
# NOTE: "__file__" is a quoted string literal here, not the module attribute,
# so dirname() yields '' and the appended path is the parent of the current
# working directory.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

# suppress warnings for clean output; installed before the third-party
# imports below so warnings raised while importing them are silenced too
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.lscp import LSCP

from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

# In[4]:

# Define data file and read X and y
mat_file_list = ['arrhythmia.mat',
                 'cardio.mat',
                 'glass.mat',
                 'ionosphere.mat',
                 'letter.mat',
                 'lympho.mat',
                 'mnist.mat',
                 'musk.mat',
                 'optdigits.mat',
                 'pendigits.mat',
                 'pima.mat',
                 'satellite.mat',
                 'satimage-2.mat',
                 # 'shuttle.mat',
                 'vertebral.mat',
                 'vowels.mat',
                 'wbc.mat']

# Define the eleven outlier detection tools to be compared.
# One seeded generator is shared by the train/test split and by every
# detector that accepts a random_state.
random_state = np.random.RandomState(42)

# Columns describing the dataset, followed by one column per detector.
meta_columns = ['Data', '#Samples', '# Dimensions', 'Outlier Perc']
model_columns = ['ABOD', 'CBLOF', 'FB', 'HBOS', 'IForest', 'KNN', 'LOF',
                 'MCD', 'OCSVM', 'PCA', 'LSCP']
df_columns = meta_columns + model_columns

# One dict per dataset, keyed by column name. The DataFrames are built once
# after the loop instead of concatenating a one-row frame per iteration.
roc_rows = []
prn_rows = []
time_rows = []

# initialize a set of detectors for LSCP: LOF with n_neighbors = 5, 10, ..., 50
detector_list = [LOF(n_neighbors=k) for k in range(5, 55, 5)]

for mat_file in mat_file_list:
    print("\n... Processing", mat_file, '...')
    mat = loadmat(os.path.join('data', mat_file))

    X = mat['X']
    y = mat['y'].ravel()
    # fraction of non-zero labels, used as each detector's contamination
    outliers_fraction = np.count_nonzero(y) / len(y)
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    # construct containers for saving results; every result row starts with
    # the same dataset description
    dataset_info = [os.path.splitext(mat_file)[0], X.shape[0], X.shape[1],
                    outliers_percentage]
    roc_row = dict(zip(meta_columns, dataset_info))
    prn_row = dict(roc_row)
    time_row = dict(roc_row)

    # 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=random_state)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # (result column, display name, estimator). Pairing each detector with
    # its column explicitly keeps results aligned with df_columns without
    # relying on dict iteration order.
    classifiers = [
        ('ABOD', 'Angle-based Outlier Detector (ABOD)',
         ABOD(contamination=outliers_fraction)),
        ('CBLOF', 'Cluster-based Local Outlier Factor',
         CBLOF(contamination=outliers_fraction, check_estimator=False,
               random_state=random_state)),
        ('FB', 'Feature Bagging',
         FeatureBagging(contamination=outliers_fraction,
                        check_estimator=False,
                        random_state=random_state)),
        ('HBOS', 'Histogram-base Outlier Detection (HBOS)',
         HBOS(contamination=outliers_fraction)),
        ('IForest', 'Isolation Forest',
         IForest(contamination=outliers_fraction,
                 random_state=random_state)),
        ('KNN', 'K Nearest Neighbors (KNN)',
         KNN(contamination=outliers_fraction)),
        ('LOF', 'Local Outlier Factor (LOF)',
         LOF(contamination=outliers_fraction)),
        ('MCD', 'Minimum Covariance Determinant (MCD)',
         MCD(contamination=outliers_fraction,
             random_state=random_state)),
        ('OCSVM', 'One-class SVM (OCSVM)',
         OCSVM(contamination=outliers_fraction,
               random_state=random_state)),
        ('PCA', 'Principal Component Analysis (PCA)',
         PCA(contamination=outliers_fraction,
             random_state=random_state)),
        ('LSCP', 'Locally Selective Combination (LSCP)',
         LSCP(detector_list, contamination=outliers_fraction,
              random_state=random_state)),
    ]

    for column, clf_name, clf in classifiers:
        # the reported time covers fitting on the training split plus
        # scoring the test split
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)

        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, '
              'execution time: {duration}s'.format(
            clf_name=clf_name, roc=roc, prn=prn, duration=duration))

        time_row[column] = duration
        roc_row[column] = roc
        prn_row[column] = prn

    time_rows.append(time_row)
    roc_rows.append(roc_row)
    prn_rows.append(prn_row)

# Build each result table once; columns=df_columns fixes the column order.
time_df = pd.DataFrame(time_rows, columns=df_columns)
roc_df = pd.DataFrame(roc_rows, columns=df_columns)
prn_df = pd.DataFrame(prn_rows, columns=df_columns)

# In[5]:

# The tables are printed explicitly: a bare `time_df` expression is only
# rendered by a notebook and shows nothing when this file runs as a script.
print('Time complexity')
print(time_df.to_string())

# Analyze the performance of ROC and Precision @ n

# In[6]:

print('ROC Performance')
print(roc_df.to_string())

# In[7]:

print('Precision @ n Performance')
print(prn_df.to_string())

# In[ ]: