From 07b6c41cae342bba9d492b762b38128b4ef27a7d Mon Sep 17 00:00:00 2001 From: Muhammad Yasirroni <48709672+yasirroni@users.noreply.github.com> Date: Wed, 13 Jan 2021 14:58:05 +0700 Subject: [PATCH 1/3] add markdown fence name --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index edbb01c..f515fda 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # rrcf 🌲🌲🌲 + [![Build Status](https://travis-ci.org/kLabUM/rrcf.svg?branch=master)](https://travis-ci.org/kLabUM/rrcf) [![Coverage Status](https://coveralls.io/repos/github/kLabUM/rrcf/badge.svg?branch=master)](https://coveralls.io/github/kLabUM/rrcf?branch=master) [![Python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/) ![GitHub](https://img.shields.io/github/license/kLabUM/rrcf.svg) [![status](http://joss.theoj.org/papers/f8c83c0b01a984d0dbf934939b53c96d/status.svg)](http://joss.theoj.org/papers/f8c83c0b01a984d0dbf934939b53c96d) Implementation of the *Robust Random Cut Forest Algorithm* for anomaly detection by [Guha et al. (2016)](http://proceedings.mlr.press/v48/guha16.pdf). 
@@ -76,7 +77,7 @@ for i in range(6): tree.insert_point(x, index=i) ``` -``` +```python ─+ ├───+ │ ├───+ @@ -92,11 +93,11 @@ for i in range(6): ### Deleting points -``` +```python tree.forget_point(2) ``` -``` +```python ─+ ├───+ │ ├───+ @@ -208,7 +209,7 @@ forest = [] for _ in range(num_trees): tree = rrcf.RCTree() forest.append(tree) - + # Use the "shingle" generator to create rolling window points = rrcf.shingle(sin, size=shingle_size) @@ -258,13 +259,13 @@ Please consider the following guidelines when contributing to the codebase: To run unit tests, first ensure that `pytest` and `pytest-cov` are installed: -``` +```shell $ pip install pytest pytest-cov ``` To run the tests, navigate to the root directory of the repo and run: -``` +```shell $ pytest --cov=rrcf/ ``` From d0b4aeefda86d9661c1d3f39e307c2c4c1084d60 Mon Sep 17 00:00:00 2001 From: Muhammad Yasirroni <48709672+yasirroni@users.noreply.github.com> Date: Wed, 13 Jan 2021 16:18:47 +0700 Subject: [PATCH 2/3] update readme for better explanation of rrcf --- README.md | 64 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index f515fda..8592000 100644 --- a/README.md +++ b/README.md @@ -115,28 +115,55 @@ The likelihood that a point is an outlier is measured by its collusive displacem ```python # Seed tree with zero-mean, normally distributed data -X = np.random.randn(100,2) -tree = rrcf.RCTree(X) +X = np.random.randn(100,2) # X has 100 data points, each 2 variables +tree = rrcf.RCTree(X) # enter X as tree.leaves points +``` -# Generate an inlier and outlier point +```python +# Generate an inlier point inlier = np.array([0, 0]) -outlier = np.array([4, 4]) # Insert into tree tree.insert_point(inlier, index='inlier') -tree.insert_point(outlier, index='outlier') -``` -```python tree.codisp('inlier') >>> 1.75 ``` ```python +# Delete point to remove from model +tree.forget_point('inlier') +``` + +```python +# Generate an 
outlier point +outlier = np.array([4, 4]) + +# Insert into tree +tree.insert_point(outlier, index='outlier') + tree.codisp('outlier') >>> 39.0 ``` +```python +# Delete point to remove from model +tree.forget_point('outlier') +``` + +### Control tree random seed + +Even with the same data, a tree (and therefore a forest) generated by `rrcf.RCTree()` depends on `np.random` and may differ from run to run (resulting in a different tree shape and different anomaly scores). To make results reproducible, call `numpy.random.seed()` first: + +```python +# Before making a tree or a forest +seed_number = 42 # your_number +np.random.seed(seed_number) +tree = rrcf.RCTree(X) +``` + +WARNING: Don't call `numpy.random.seed()` inside the loop that builds the trees, or every `tree` in the forest will be identical. + ## Batch anomaly detection This example shows how a robust random cut forest can be used to detect outliers in a batch setting. Outliers correspond to large CoDisp. @@ -146,19 +173,19 @@ import numpy as np import pandas as pd import rrcf -# Set parameters -np.random.seed(0) +# Generate data n = 2010 d = 3 -num_trees = 100 -tree_size = 256 - -# Generate data X = np.zeros((n, d)) X[:1000,0] = 5 X[1000:2000,0] = -5 X += 0.01*np.random.randn(*X.shape) +# Set forest parameters +np.random.seed(42) +num_trees = 100 +tree_size = 256 + # Construct forest forest = [] while len(forest) < num_trees: @@ -200,15 +227,16 @@ sin = A*np.sin(T*t-phi*T) + center sin[235:255] = 80 # Set tree parameters +np.random.seed(42) num_trees = 40 shingle_size = 4 tree_size = 256 # Create a forest of empty trees -forest = [] -for _ in range(num_trees): +forest = [None] * num_trees +for idx in range(num_trees): tree = rrcf.RCTree() - forest.append(tree) + forest[idx] = tree # Use the "shingle" generator to create rolling window points = rrcf.shingle(sin, size=shingle_size) @@ -220,8 +248,8 @@ avg_codisp = {} for index, point in enumerate(points): # For each tree in the forest... 
for tree in forest: - # If tree is above permitted size, drop the oldest point (FIFO) - if len(tree.leaves) > tree_size: + # If tree is already full, drop the oldest point (FIFO) + if len(tree.leaves) >= tree_size: tree.forget_point(index - tree_size) # Insert the new point into the tree tree.insert_point(point, index=index) From 3d937d4e43252df5159119fa5185e8cfc57db31e Mon Sep 17 00:00:00 2001 From: Muhammad Yasirroni <48709672+yasirroni@users.noreply.github.com> Date: Thu, 14 Jan 2021 05:49:47 +0700 Subject: [PATCH 3/3] allow replace existing leave for faster deployment --- rrcf/rrcf.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/rrcf/rrcf.py b/rrcf/rrcf.py index 540214e..c5072da 100644 --- a/rrcf/rrcf.py +++ b/rrcf/rrcf.py @@ -390,7 +390,7 @@ def _update_leaf_count_upwards(self, node, inc=1): node.n += inc node = node.u - def insert_point(self, point, index, tolerance=None): + def insert_point(self, point, index, tolerance=None, replace=False): """ Inserts a point into the tree, creating a new leaf @@ -401,6 +401,8 @@ def insert_point(self, point, index, tolerance=None): Identifier for new leaf in tree tolerance: float Tolerance for determining duplicate points + replace: bool (optional) (default=False) + Allow to replace existing index or not Returns: -------- @@ -432,10 +434,11 @@ def insert_point(self, point, index, tolerance=None): raise ValueError( "Point must be same dimension as existing points in tree.") # Check for existing index in leaves dict - try: - assert (index not in self.leaves) - except KeyError: - raise KeyError("Index already exists in leaves dict.") + if not replace: + try: + assert (index not in self.leaves) + except KeyError: + raise KeyError("Index already exists in leaves dict.") # Check for duplicate points duplicate = self.find_duplicate(point, tolerance=tolerance) if duplicate: