PyOD is a comprehensive Python toolkit for identifying outlying objects in multivariate data with both unsupervised and supervised approaches. The models covered in this example include:
Linear Models for Outlier Detection:
1. PCA: Principal Component Analysis (use the sum of weighted projected distances to the eigenvector hyperplane as the outlier scores) [10] 2. MCD: Minimum Covariance Determinant (use the Mahalanobis distances as the outlier scores) [11, 12] 3. OCSVM: One-Class Support Vector Machines [3]
Proximity-Based Outlier Detection Models:
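1. LOF: Local Outlier Factor 2. CBLOF: Clustering-Based Local Outlier Factor 3. kNN: k Nearest Neighbors (use the distance to the kth nearest neighbor as the outlier score) 4. Average kNN Outlier Detection (use the average distance to k nearest neighbors as the outlier score) 5. Median kNN Outlier Detection (use the median distance to k nearest neighbors as the outlier score) 6. HBOS: Histogram-based Outlier Score [5]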
Probabilistic Models for Outlier Detection:
1. ABOD: Angle-Based Outlier Detection
Outlier Ensembles and Combination Frameworks:
1. Isolation Forest 2. Feature Bagging
from __future__ import division
from __future__ import print_function
import os
import sys
from time import time
# Temporary solution for relative imports in case pyod is not installed.
# If pyod is installed, there is no need for the following statement.
# NOTE: "__file__" is a quoted string literal here, so dirname() returns ''
# and the appended entry is the parent of the current working directory.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.font_manager
# Import all models
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
# Experiment settings: total number of samples, the share of them that are
# outliers, and the offsets applied to the two inlier clusters.
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0]

# Evaluation grid used later to draw each detector's decision surface:
# 100 x 100 points covering [-7, 7] on both axes.
xx, yy = np.meshgrid(
    np.linspace(-7, 7, 100),
    np.linspace(-7, 7, 100),
)

# Split the sample budget into outliers and inliers.
n_outliers = int(outliers_fraction * n_samples)
n_inliers = int((1. - outliers_fraction) * n_samples)

# Label vector: start with all zeros (inliers), then flag the trailing
# n_outliers entries as 1 (outliers).
ground_truth = np.zeros(n_samples, dtype=int)
ground_truth[-n_outliers:] = 1

# Show the statistics of the data
print('Number of inliers: %i' % n_inliers)
print('Number of outliers: %i' % n_outliers)
_shape_message = 'Ground truth shape is {shape}. Outlier are 1 and inliers are 0.\n'
print(_shape_message.format(shape=ground_truth.shape))
Number of inliers: 150 Number of outliers: 50 Ground truth shape is (200,). Outlier are 1 and inliers are 0. [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
# One shared RandomState instance is handed to every detector that accepts
# a random_state argument.
random_state = np.random.RandomState(42)

# Define the twelve outlier detection tools to be compared. Every detector
# is configured with the same contamination rate (outliers_fraction).
# Insertion order is the order in which the models are listed and plotted.
#
# NOTE(review): several constructor lines were missing from this copy of the
# script. Arguments tagged "reconstructed" are presumed, not verified --
# confirm them against the PyOD version in use.
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),  # reconstructed
    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,  # reconstructed
              check_estimator=False, random_state=random_state),
    'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),  # reconstructed
                       contamination=outliers_fraction,  # reconstructed
                       random_state=random_state),  # reconstructed
    'Histogram-base Outlier Detection (HBOS)': HBOS(
        contamination=outliers_fraction),  # reconstructed
    'Isolation Forest': IForest(contamination=outliers_fraction,
                                random_state=random_state),  # reconstructed
    'K Nearest Neighbors (KNN)': KNN(
        contamination=outliers_fraction),  # reconstructed
    'Average KNN': KNN(method='mean',
                       contamination=outliers_fraction),  # reconstructed
    'Median KNN': KNN(method='median',
                      contamination=outliers_fraction),  # reconstructed
    'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)': MCD(
        contamination=outliers_fraction, random_state=random_state),
    # NOTE(review): the original call had one more argument on a lost
    # continuation line (presumably random_state=random_state). It is left
    # out because newer PyOD releases appear not to accept it -- confirm.
    'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
    'Principal Component Analysis (PCA)': PCA(
        contamination=outliers_fraction, random_state=random_state),
}
# Show all detectors, numbered from 1 in the dictionary's insertion order.
for i, clf in enumerate(classifiers):
    print('Model', i + 1, clf)
Model 1 Angle-based Outlier Detector (ABOD) Model 2 Cluster-based Local Outlier Factor (CBLOF) Model 3 Feature Bagging Model 4 Histogram-base Outlier Detection (HBOS) Model 5 Isolation Forest Model 6 K Nearest Neighbors (KNN) Model 7 Average KNN Model 8 Median KNN Model 9 Local Outlier Factor (LOF) Model 10 Minimum Covariance Determinant (MCD) Model 11 One-class SVM (OCSVM) Model 12 Principal Component Analysis (PCA)
# Fit the models with the generated data and
# compare model performances.
# One figure is produced per entry of clusters_separation; each figure holds
# one subplot per detector on a 3 x 4 grid.
for offset in clusters_separation:
    # Data generation: two Gaussian inlier clusters (std 0.3), shifted by
    # -offset and +offset respectively.
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
    X = np.r_[X1, X2]
    # Add outliers, drawn uniformly from [-6, 6) on both axes. They are
    # appended last so that they line up with the trailing 1s of ground_truth.
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

    # Fit the model
    plt.figure(figsize=(15, 12))
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        print(i + 1, 'fitting', clf_name)
        # Fit the data and tag outliers. The detector must be fitted before
        # decision_function / predict can be called on it.
        clf.fit(X)
        # Scores are negated; the threshold is then the
        # (100 * outliers_fraction)-th percentile of the negated scores.
        scores_pred = clf.decision_function(X) * -1
        y_pred = clf.predict(X)
        threshold = stats.scoreatpercentile(scores_pred,
                                            100 * outliers_fraction)
        n_errors = (y_pred != ground_truth).sum()

        # Plot the level lines and the points: evaluate the (negated)
        # decision function on every grid node and reshape it to the grid.
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
        Z = Z.reshape(xx.shape)
        subplot = plt.subplot(3, 4, i + 1)
        # NOTE(review): the fill styling (cmap / colors) of the two contourf
        # calls was missing from this copy; the values below are presumed --
        # confirm against the intended figure.
        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                         cmap=plt.cm.Blues_r)
        a = subplot.contour(xx, yy, Z, levels=[threshold],
                            linewidths=2, colors='red')
        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                         colors='orange')
        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white',
                            s=20, edgecolor='k')
        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black',
                            s=20, edgecolor='k')
        # NOTE(review): ContourSet.collections is deprecated in recent
        # Matplotlib releases -- confirm against the pinned version.
        # The legend font size is presumed (that line was missing as well).
        subplot.legend(
            [a.collections[0], b, c],
            ['learned decision function', 'true inliers', 'true outliers'],
            prop=matplotlib.font_manager.FontProperties(size=10),
            loc='lower right')
        subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
        subplot.set_xlim((-7, 7))
        subplot.set_ylim((-7, 7))
    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
    plt.suptitle("Outlier detection")
1 fitting Angle-based Outlier Detector (ABOD) 2 fitting Cluster-based Local Outlier Factor (CBLOF) 3 fitting Feature Bagging 4 fitting Histogram-base Outlier Detection (HBOS) 5 fitting Isolation Forest 6 fitting K Nearest Neighbors (KNN) 7 fitting Average KNN 8 fitting Median KNN 9 fitting Local Outlier Factor (LOF) 10 fitting Minimum Covariance Determinant (MCD) 11 fitting One-class SVM (OCSVM) 12 fitting Principal Component Analysis (PCA)