Commit c7db89c

Merge pull request #151 from yzhao062/development

V0.7.6

yzhao062 authored Dec 19, 2019
2 parents 6bc53c8 + ce2e152
Showing 18 changed files with 646 additions and 30 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
@@ -86,6 +86,8 @@ v<0.7.5>, <10/13/2019> -- Documentation updates.
v<0.7.5.1>, <10/15/2019> -- kNN code optimization.
v<0.7.5.1>, <12/05/2019> -- Hot fix for scikit-learn 0.22 update. To be completed.
v<0.7.5.1>, <12/05/2019> -- Disable CircleCI for Python 2.7.
+v<0.7.6>, <12/18/2019> -- Update Isolation Forest and LOF to be consistent with sklearn 0.22.
+v<0.7.6>, <12/18/2019> -- Add Deviation-based Outlier Detection (LMDD).



3 changes: 3 additions & 0 deletions README.rst
@@ -279,6 +279,7 @@ Type Abbr Algorithm
Linear Model PCA Principal Component Analysis (the sum of weighted projected distances to the eigenvector hyperplanes) 2003 [#Shyu2003A]_
Linear Model MCD Minimum Covariance Determinant (use the Mahalanobis distances as the outlier scores) 1999 [#Hardin2004Outlier]_ [#Rousseeuw1999A]_
Linear Model OCSVM One-Class Support Vector Machines 2001 [#Scholkopf2001Estimating]_
+Linear Model LMDD Deviation-based Outlier Detection (LMDD) 1996 [#Arning1996A]_
Proximity-Based LOF Local Outlier Factor 2000 [#Breunig2000LOF]_
Proximity-Based COF Connectivity-Based Outlier Factor 2002 [#Tang2002Enhancing]_
Proximity-Based CBLOF Clustering-Based Local Outlier Factor 2003 [#He2003Discovering]_
@@ -574,6 +575,8 @@ Reference
.. [#Angiulli2002Fast] Angiulli, F. and Pizzuti, C., 2002, August. Fast outlier detection in high dimensional spaces. In *European Conference on Principles of Data Mining and Knowledge Discovery* pp. 15-27.
+.. [#Arning1996A] Arning, A., Agrawal, R. and Raghavan, P., 1996, August. A Linear Method for Deviation Detection in Large Databases. In *KDD* (Vol. 1141, No. 50, pp. 972-981).
.. [#Breunig2000LOF] Breunig, M.M., Kriegel, H.P., Ng, R.T. and Sander, J., 2000, May. LOF: identifying density-based local outliers. *ACM Sigmod Record*\ , 29(2), pp. 93-104.
.. [#Goldstein2012Histogram] Goldstein, M. and Dengel, A., 2012. Histogram-based outlier score (hbos): A fast unsupervised anomaly detection algorithm. In *KI-2012: Poster and Demo Track*\ , pp.59-63.
1 change: 1 addition & 0 deletions docs/index.rst
@@ -172,6 +172,7 @@ Type Abbr Algorithm
Linear Model PCA Principal Component Analysis (the sum of weighted projected distances to the eigenvector hyperplanes) 2003 :class:`pyod.models.pca.PCA` :cite:`a-shyu2003novel`
Linear Model MCD Minimum Covariance Determinant (use the Mahalanobis distances as the outlier scores) 1999 :class:`pyod.models.mcd.MCD` :cite:`a-rousseeuw1999fast,a-hardin2004outlier`
Linear Model OCSVM One-Class Support Vector Machines 2001 :class:`pyod.models.ocsvm.OCSVM` :cite:`a-scholkopf2001estimating`
+Linear Model LMDD Deviation-based Outlier Detection (LMDD) 1996 :class:`pyod.models.lmdd.LMDD` :cite:`a-arning1996linear`
Proximity-Based LOF Local Outlier Factor 2000 :class:`pyod.models.lof.LOF` :cite:`a-breunig2000lof`
Proximity-Based COF Connectivity-Based Outlier Factor 2002 :class:`pyod.models.cof.COF` :cite:`a-tang2002enhancing`
Proximity-Based CBLOF Clustering-Based Local Outlier Factor 2003 :class:`pyod.models.cblof.CBLOF` :cite:`a-he2003discovering`
9 changes: 9 additions & 0 deletions docs/pyod.models.rst
@@ -85,6 +85,15 @@ pyod.models.knn module
:show-inheritance:
:inherited-members:

+pyod.models.lmdd module
+----------------------
+
+.. automodule:: pyod.models.lmdd
+    :members:
+    :undoc-members:
+    :show-inheritance:
+    :inherited-members:
+
pyod.models.lof module
----------------------

3 changes: 1 addition & 2 deletions docs/requirements.txt
@@ -6,8 +6,7 @@ numpy>=1.13
numba>=0.35
pytest
scipy>=0.19.1
-# scikit_learn 0.22 causes error for LOF, CBLOF, IForest, and XGBOD. Fix later
-scikit_learn>=0.19.1,<=0.21.*
+scikit_learn>=0.19.1
six
sphinxcontrib-bibtex
tensorflow
10 changes: 10 additions & 0 deletions docs/zreferences.bib
@@ -284,4 +284,14 @@ @inproceedings{gopalan2019pidforest
  booktitle={Advances in Neural Information Processing Systems},
  pages={15783--15793},
  year={2019}
}

+@inproceedings{arning1996linear,
+  title={A Linear Method for Deviation Detection in Large Databases.},
+  author={Arning, Andreas and Agrawal, Rakesh and Raghavan, Prabhakar},
+  booktitle={KDD},
+  volume={1141},
+  number={50},
+  pages={972--981},
+  year={1996}
+}
195 changes: 195 additions & 0 deletions examples/data/mat_file_conversion.py
@@ -0,0 +1,195 @@
'''Utility function for unifying mat files
'''
import os
import h5py
import scipy.io as spio  # "import scipy" alone does not expose scipy.io
import numpy as np

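# http.mat is stored in the HDF5-based MATLAB v7.3 format, which
# scipy.io.loadmat cannot read, hence h5py; the arrays are re-saved
# below in the v5 .mat format that scipy.io can load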
with h5py.File(os.path.join('../datasets', 'http.mat'), 'r') as file:
    print(list(file.keys()))
    X = list(file['X'])
    y = list(file['y'])

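# the v7.3 layout is column-major, so file['X'] arrives with one row per
# feature; stack the three feature rows into an (n_samples, 3) matrix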
X_stack = np.column_stack((X[0], X[1], X[2]))

http = {'X': X_stack,
        'y': y}

spio.savemat('http_n.mat', http)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# %%
import arff


def read_arff(file_path, misplaced_list):
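    """Load an ARFF file and return (X, y, attributes).

    For datasets named in misplaced_list the ground-truth label is stored
    in the second-to-last column rather than the last one.
    """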
    misplaced = False
    for item in misplaced_list:
        if item in file_path:
            misplaced = True

    file = arff.load(open(file_path))
    data_value = np.asarray(file['data'])
    attributes = file['attributes']

    X = data_value[:, 0:-2]
    if not misplaced:
        y = data_value[:, -1]
    else:
        y = data_value[:, -2]
    y[y == 'no'] = 0
    y[y == 'yes'] = 1
    y = y.astype('float').astype('int').ravel()

    if y.sum() > len(y):
        print(attributes)
        raise ValueError('wrong sum')

    return X, y, attributes


misplaced_list = ['Arrhythmia', 'Cardiotocography', 'Hepatitis', 'ALOI',
                  'KDDCup99']

X, y, attributes = read_arff(os.path.join('../datasets', 'seismic-bumps.arff'),
                             misplaced_list)

num_index = [3, 4, 5, 6, 8, 9, 10, 11, 12, 16]  # columns 13, 14, 15 are null
X_num = X[:, num_index].astype('float64')

# %%

# X_stack = np.column_stack((X[0], X[1], X[2]))

seismic = {'X': X_num,
           'y': y}

spio.savemat('seismic.mat', seismic)

# %%##########################################################################

import sys

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

# suppress warnings for clean output
import warnings

warnings.filterwarnings("ignore")

# read_arff() and misplaced_list are reused from the section above

arff_list = [
    os.path.join('../semantic', 'Annthyroid',
                 'Annthyroid_withoutdupl_07.arff'),
    os.path.join('../semantic', 'Arrhythmia',
                 'Arrhythmia_withoutdupl_46.arff'),
    os.path.join('../semantic', 'Cardiotocography',
                 'Cardiotocography_withoutdupl_22.arff'),
    os.path.join('../semantic', 'HeartDisease',
                 'HeartDisease_withoutdupl_44.arff'),
    os.path.join('../semantic', 'Hepatitis', 'Hepatitis_withoutdupl_16.arff'),
    os.path.join('../semantic', 'InternetAds',
                 'InternetAds_withoutdupl_norm_19.arff'),
    os.path.join('../semantic', 'PageBlocks',
                 'PageBlocks_withoutdupl_09.arff'),
    os.path.join('../semantic', 'Parkinson', 'Parkinson_withoutdupl_75.arff'),
    os.path.join('../semantic', 'Pima', 'Pima_withoutdupl_35.arff'),
    os.path.join('../semantic', 'SpamBase', 'SpamBase_withoutdupl_40.arff'),
    os.path.join('../semantic', 'Stamps', 'Stamps_withoutdupl_09.arff'),
    os.path.join('../semantic', 'Wilt', 'Wilt_withoutdupl_05.arff'),
    #
    os.path.join('../literature', 'ALOI', 'ALOI_withoutdupl.arff'),
    os.path.join('../literature', 'Glass', 'Glass_withoutdupl_norm.arff'),
    os.path.join('../literature', 'Ionosphere',
                 'Ionosphere_withoutdupl_norm.arff'),
    os.path.join('../literature', 'KDDCup99', 'KDDCup99_original.arff'),
    os.path.join('../literature', 'Lymphography',
                 'Lymphography_original.arff'),
    os.path.join('../literature', 'PenDigits',
                 'PenDigits_withoutdupl_norm_v01.arff'),
    os.path.join('../literature', 'Shuttle', 'Shuttle_withoutdupl_v01.arff'),
    os.path.join('../literature', 'Waveform', 'Waveform_withoutdupl_v01.arff'),
    os.path.join('../literature', 'WBC', 'WBC_withoutdupl_v01.arff'),
    os.path.join('../literature', 'WDBC', 'WDBC_withoutdupl_v01.arff'),
    os.path.join('../literature', 'WPBC', 'WPBC_withoutdupl_norm.arff'),
]
file_names = [
    'Annthyroid',
    'Arrhythmia',
    'Cardiotocography',
    'HeartDisease',  # too small
    'Hepatitis',  # too small
    'InternetAds',
    'PageBlocks',
    'Parkinson',  # too small
    'Pima',
    'SpamBase',
    'Stamps',
    'Wilt',
    #
    'ALOI',  # too large
    'Glass',  # too small
    'Ionosphere',
    'KDDCup99',  # too large
    'Lymphography',  # X contains categorical attributes
    'PenDigits',
    'Shuttle',
    'Waveform',
    'WBC',  # too small
    'WDBC',  # too small
    'WPBC',  # too small
]

assert len(arff_list) == len(file_names)

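# report n_samples, n_features, n_outliers and the outlier ratio for
# each dataset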
for m in range(len(file_names)):
    arff_file = arff_list[m]
    arff_file_name = file_names[m]
    # print("\n... Processing", arff_file_name, '...')

    X, y, attributes = read_arff(arff_file, misplaced_list)
    print(arff_file_name, X.shape[0], X.shape[1], y.sum(),
          y.sum() / X.shape[0])
57 changes: 57 additions & 0 deletions examples/lmdd_example.py
@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
"""Example of using Linear Method Deviation-base outlier detection (LMDD)
"""
# Author: Yahya Almardeny <almardeny@gmail.com>
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import os
import sys

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

from pyod.models.lmdd import LMDD
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize

if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train LMDD detector
    clf_name = 'LMDD'
    clf = LMDD(random_state=42)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # visualize the results
    visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=False)
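(For background: LMDD scores a point by how much its removal reduces a dissimilarity measure, the "smoothing factor", of the remaining set, following Arning et al., 1996.)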
4 changes: 2 additions & 2 deletions pyod/models/base.py
@@ -10,9 +10,9 @@
import warnings
from collections import defaultdict

-from ..utils.utility import _sklearn_version_21
+from ..utils.utility import _get_sklearn_version

-if _sklearn_version_21():
+if _get_sklearn_version() > 20:
    from inspect import signature
else:
    from sklearn.externals.funcsigs import signature
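For context: this gate relies on a helper in pyod/utils/utility.py that reduces the installed scikit-learn release to an integer minor version, so that "> 20" matches 0.21 and later. A hypothetical minimal sketch of such a helper (the real implementation may differ):

import sklearn

def _get_sklearn_version():
    # '0.21.3' -> 21; only the minor version is needed for feature gating
    return int(sklearn.__version__.split('.')[1])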
11 changes: 5 additions & 6 deletions pyod/models/iforest.py
@@ -14,7 +14,7 @@
from .base import BaseDetector
from ..utils.utility import invert_order
# noinspection PyProtectedMember
-from ..utils.utility import _sklearn_version_20
+from ..utils.utility import _get_sklearn_version


# TODO: behavior of Isolation Forest will change in sklearn 0.22. See below.
@@ -199,7 +199,8 @@ def fit(self, X, y=None):
        # In sklearn 0.20+ new behaviour is added (arg behaviour={'new','old'})
        # to IsolationForest that shifts the location of the anomaly scores
        # noinspection PyProtectedMember
-        if _sklearn_version_20():
+        sklearn_version = _get_sklearn_version()
+        if sklearn_version == 21:
            self.detector_ = IsolationForest(n_estimators=self.n_estimators,
                                             max_samples=self.max_samples,
                                             contamination=self.contamination,
@@ -210,7 +211,7 @@
                                             random_state=self.random_state,
                                             verbose=self.verbose)

-        # Do not pass behaviour argument when sklearn version is < 0.20
+        # Do not pass behaviour argument when sklearn version is < 0.20 or > 0.21
        else:  # pragma: no cover
            self.detector_ = IsolationForest(n_estimators=self.n_estimators,
                                             max_samples=self.max_samples,
@@ -221,9 +222,7 @@
                                             random_state=self.random_state,
                                             verbose=self.verbose)

-        self.detector_.fit(X=X,
-                           y=None,
-                           sample_weight=None)
+        self.detector_.fit(X=X, y=None, sample_weight=None)

        # invert decision_scores_; outliers come with higher outlier scores
        self.decision_scores_ = invert_order(
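For illustration, a hypothetical sketch of the gating pattern this hunk implements, reusing the _get_sklearn_version sketch above (make_iforest and its defaults are illustrative, not pyod's API; the behaviour argument itself is passed in the lines elided from the hunk):

from sklearn.ensemble import IsolationForest

def make_iforest(behaviour='new', **kwargs):
    # sklearn 0.20/0.21 accept behaviour={'old', 'new'}; 0.22 deprecates
    # the argument and makes the new scoring behaviour the default
    if _get_sklearn_version() == 21:
        return IsolationForest(behaviour=behaviour, **kwargs)
    # for other releases the argument is omitted entirely
    return IsolationForest(**kwargs)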
(Diff truncated: the remaining 8 of the 18 changed files are not shown.)
