diff --git a/scripts/builtin/outlierByIsolationForest.dml b/scripts/builtin/outlierByIsolationForest.dml new file mode 100644 index 00000000000..ce0ad73c355 --- /dev/null +++ b/scripts/builtin/outlierByIsolationForest.dml @@ -0,0 +1,531 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Builtin function that implements anomaly detection via isolation forest as described in +# [Liu2008]: +# Liu, F. T., Ting, K. M., & Zhou, Z. H. +# (2008, December). +# Isolation forest. +# In 2008 eighth ieee international conference on data mining (pp. 413-422). +# IEEE. +# +# This function creates an iForest model for outlier detection. +# +# .. code-block:: python +# +# >>> import numpy as np +# >>> from systemds.context import SystemDSContext +# >>> from systemds.operator.algorithm import outlierByIsolationForest, outlierByIsolationForestApply +# >>> with SystemDSContext() as sds: +# ... # Create training data: 20 points clustered near origin +# ... X_train = sds.from_numpy(np.array([ +# ... [0.0, 0.0], [0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4], +# ... [0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9], +# ... [1.0, 1.0], [1.1, 1.1], [1.2, 1.2], [1.3, 1.3], [1.4, 1.4], +# ... [1.5, 1.5], [1.6, 1.6], [1.7, 1.7], [1.8, 1.8], [1.9, 1.9] +# ... ])) +# ... model = outlierByIsolationForest(X_train, n_trees=100, subsampling_size=10, seed=42) +# ... X_test = sds.from_numpy(np.array([[1.0, 1.0], [100.0, 100.0]])) +# ... scores = outlierByIsolationForestApply(model, X_test).compute() +# ... print(scores.shape) +# ... print(scores[1, 0] > scores[0, 0]) +# ... print(scores[1, 0] > 0.5) +# (2, 1) +# True +# True +# +# +# INPUT: +# --------------------------------------------------------------------------------------------- +# X Numerical feature matrix +# n_trees Number of iTrees to build +# subsampling_size Size of the subsample to build iTrees with +# seed Seed for calls to `sample` and `rand`. -1 corresponds to a random seed +# --------------------------------------------------------------------------------------------- +# +# OUTPUT: +# --------------------------------------------------------------------------------------------- +# iForestModel The trained iForest model to be used in outlierByIsolationForestApply. +# The model is represented as a list with two entries: +# Entry 'model' (Matrix[Double]) - The iForest Model in linearized form (see m_iForest) +# Entry 'subsampling_size' (Double) - The subsampling size used to build the model. +# --------------------------------------------------------------------------------------------- + +s_outlierByIsolationForest = function(Matrix[Double] X, Integer n_trees, Integer subsampling_size, Integer seed = -1) + return(List[Unknown] iForestModel) +{ + iForestModel = m_outlierByIsolationForest(X, n_trees, subsampling_size, seed) +} + +# modification 1: Use the actual subsampling size for training and scoring normalization. + +m_outlierByIsolationForest = function(Matrix[Double] X, Integer n_trees, Integer subsampling_size, Integer seed = -1) +return(List[Unknown] iForestModel) +{ + s_warning_assert_outlierByIsolationForest(nrow(X) > 1, "iForest: X must contain at least two rows.") + s_warning_assert_outlierByIsolationForest(ncol(X) > 0, "iForest: X must contain at least one feature column.") + s_warning_assert_outlierByIsolationForest(n_trees > 0, "iForest: n_trees must be positive.") + s_warning_assert_outlierByIsolationForest(subsampling_size > 1, "iForest: subsampling_size must be greater than 1.") + + actual_subsampling_size = as.integer(ifelse(subsampling_size > nrow(X), nrow(X), subsampling_size)) + + M = m_iForest(X, n_trees, actual_subsampling_size, seed) + + iForestModel = list( + model = M, + subsampling_size = actual_subsampling_size + ) +} + +# This function implements isolation forest for numerical input features as +# described in [Liu2008]. +# +# The returned 'linearized' model is of type Matrix[Double] where each row +# corresponds to a linearized iTree (see m_iTree). Note that each tree in the +# model is padded with placeholder nodes such that each iTree has the same maximum depth. +# +# .. code-block:: +# +# For example, give a feature matrix with features [a,b,c,d] +# and the following iForest, M would look as follows: +# +# Level Tree 1 Tree 2 Node Depth +# ------------------------------------------------------------------- +# (L1) |d<=5| |b<=6| 0 +# / \ / \ +# (L2) 2 |a<=7| 20 0 1 +# / \ +# (L3) 10 8 2 +# +# --> M := +# [[ 4, 5, 0, 2, 1, 7, -1, -1, -1, -1, 0, 10, 0, 8], (Tree 1) +# [ 2, 6, 0, 20, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1]] (Tree 2) +# | (L1) | | (L2) | | (L3) | +# +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X Matrix[Double] Numerical feature matrix +# n_trees Int Number of iTrees to build +# subsampling_size Int Size of the subsample to build iTrees with +# seed Int -1 Seed for calls to `sample` and `rand`. +# -1 corresponds to a random seed +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# M Matrix containing the learned iForest in linearized form +# --------------------------------------------------------------------------------------------- + +# modification 2: Build exactly n_trees iTrees. Each tree gets a deterministic but distinct seed. + +m_iForest = function(Matrix[Double] X, Integer n_trees, Integer subsampling_size, Integer seed = -1) +return(Matrix[Double] M) +{ + s_warning_assert_outlierByIsolationForest(n_trees > 0, "iForest: n_trees must be positive.") + s_warning_assert_outlierByIsolationForest(subsampling_size > 1, "iForest: subsampling_size must be greater than 1.") + s_warning_assert_outlierByIsolationForest(subsampling_size <= nrow(X), "iForest: subsampling_size must be <= nrow(X).") + + height_limit = ceil(log(subsampling_size, 2)) + tree_size = 2 * (2^(height_limit + 1) - 1) + + M = matrix(-1, rows = n_trees, cols = tree_size) + + parfor (i_iTree in 1:n_trees, taskpartitioner="STATIC") { + tree_seed = ifelse(seed == -1, -1, seed + i_iTree) + + X_subsample = s_sampleRows(X, subsampling_size, tree_seed) + + split_seed = ifelse(seed == -1, -1, seed + 1000003 * i_iTree) + + M_tree = m_iTree(X_subsample, height_limit, split_seed) + + M[i_iTree, 1:ncol(M_tree)] = M_tree + } +} + +# This function implements isolation trees for numerical input features as +# described in [Liu2008]. +# +# The returned 'linearized' model is of type Matrix[Double] with exactly one row. +# Here, each node is represented by two consecutive entries in this row vector. +# Traversing the row vector from left to right corresponds to traversing the tree +# level-wise from top to bottom and left to right. If a node does not exist +# (e.g. because the parent node is already a leaf node), the node is still stored +# using placeholder values. +# Recall that for a binary tree with maximum depth `d`, the maximum number of nodes +# `can be calculated by `2^(maximum depth + 1) - 1`. Hence, for a given maximum depth +# of an iTree, the row vector will have exactly `2*2^(maximum depth + 1) - 1` entries. +# +# There are three types of nodes that are represented in this model: +# - Internal Node +# A node a that based on a "split feature" and corresponding "split value" +# devides the data into two parts, one of which can potentially be an empty set. +# The node is lineraized in the following way: +# - Entry 1: Represents the index of the splitting feature in the feature matrix `X` +# - Entry 2: Represents splitting value +# +# - External Node +# A leaf node of the tree, It contains the "size" of the node. That is the +# number of remaining samples after splitting the feature matrix X by traversing +# the tree to this node. +# The node is lineraized in the following way: +# - Entry 1: Always 0 - indicating an external node +# - Entry 2: The "size" of the node +# +# - Placeholder Node +# A node that is not present in the actual iTree and is used for "padding". +# Both entries are set to -1 +# +# .. code-block:: +# +# For example, give a feature matrix with features [a,b,c,d] +# and the following tree, M would look as follows: +# Level Tree Node Depth +# ------------------------------------------------- +# (L1) |d<5| 0 +# / \ +# (L2) 1 |a<7| 1 +# / \ +# (L3) 10 0 2 +# +# --> M := +# [[4, 5, 0, 1, 1, 7, -1, -1, -1, -1, 0, 10, 0, 0]] +# |(L1)| | (L2) | | (L3) | +# +# +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X Matrix[Double] Numerical feature matrix +# max_depth Int Maximum depth of the learned tree where depth is the +# maximum number of edges from root to a leaf note +# seed Int -1 Seed for calls to `sample` and `rand`. +# -1 corresponds to a random seed +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# M Matrix M containing the learned tree in linearized form +# --------------------------------------------------------------------------------------------- + +# modification 3: A node is splittable only if at least one feature has positive range. + +s_hasSplittableFeature = function(Matrix[Double] X_node) +return(Boolean hasSplittableFeature) +{ + ranges = colMaxs(X_node) - colMins(X_node) + hasSplittableFeature = max(ranges) > 0 +} + +# modification 6: + +m_iTree = function(Matrix[Double] X, Integer max_depth, Integer seed = -1) + return(Matrix[Double] M) +{ + # check assumptions + s_warning_assert_outlierByIsolationForest(max_depth > 0 & max_depth <= 32, "iTree: Requirement 0 < max_depth < 32 not satisfied! max_depth: " + max_depth) + s_warning_assert_outlierByIsolationForest(nrow(X) > 0, "iTree: Feature matrix X has no less than 2 rows!") + + + # Initialize M to largest possible matrix given max_depth + # Note that each node takes exactly 2 indices in M and the root node has depth 0 + M = matrix(-1, rows=1, cols=2*(2^(max_depth+1)-1)) + + # Queue for implementing recursion in the original algorithm. + # Each entry in the queue corresponds to a node that in the tree to be added to the model + # M and, in case of internal nodes, split further. + # Nodes in this queue are represented by an ID (first index) and the data corrseponding to the node (second index) + node_queue = list(list(1, X)); + # variable tracking the maximum ID of in the tree + max_id = 1; + while (length(node_queue) > 0) { + # pop next node from queue for splitting + [node_queue, queue_entry] = remove(node_queue, 1); + node = as.list(queue_entry); + node_id = as.scalar(node[1]); + X_node = as.matrix(node[2]); + + max_id = max(max_id, node_id) + + is_external_leaf = s_isExternalINode(X_node, node_id, max_depth) + if (is_external_leaf) { + # External Node: Add node to model + M = s_addExternalINode(X_node, node_id, M) + } + + # modification 6: # Materialize an internal node only if the split creates two non-empty children. + + else { + node_seed = ifelse(seed == -1, -1, seed + node_id) + + [split_feature, split_value] = s_drawSplitPoint(X_node, node_seed) + [left_id, X_left, right_id, X_right] = s_splitINode(X_node, node_id, split_feature, split_value) + + if (nrow(X_left) == 0 | nrow(X_right) == 0) { + M = s_addExternalINode(X_node, node_id, M) + } + else { + M = s_addInternalINode(node_id, split_feature, split_value, M) + + node_queue = append(node_queue, list(left_id, X_left)) + node_queue = append(node_queue, list(right_id, X_right)) + } + } +} + # Prune the model to the actual tree depth + tree_depth = floor(log(max_id, 2)) + M = M[1, 1:2*(2^(tree_depth+1) - 1)]; +} + + +# Randomly draws a split point i.e. a feature and corresponding value to split a node by. +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X Matrix[Double] Numerical feature matrix +# seed Int -1 Seed for calls to `sample` and `rand` +# -1 corresponds to a random seed +# +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# split_feature Index of the feature used for splitting the node +# split_value Feature value used for splitting the node +# --------------------------------------------------------------------------------------------- + +# modification 5: Draw the split feature only from columns that can separate the current node. + +s_drawSplitPoint = function(Matrix[Double] X, Integer seed = -1) +return(Integer split_feature, Double split_value) +{ + ranges = colMaxs(X) - colMins(X) + feature_ids = matrix(seq(1, ncol(X)), rows = 1, cols = ncol(X)) + + valid_features = removeEmpty( + target = feature_ids, + margin = "cols", + select = ranges > 0, + empty.return = FALSE + ) + + feature_idx_seed = seed + feature_idx = as.integer(as.scalar(sample(ncol(valid_features), 1, FALSE, feature_idx_seed))) + split_feature = as.integer(as.scalar(valid_features[1, feature_idx])) + + feature_min = min(X[, split_feature]) + feature_max = max(X[, split_feature]) + + split_value = as.scalar(rand( + rows = 1, + cols = 1, + min = feature_min, + max = feature_max, + seed = seed + )) +} + +# Adds a external (leaf) node to the linearized iTree model `M`. In the linerized form, +# each node is assigned two neighboring indices. For external nodes the value at the first +# index in M is always set to 0 while the value at the second index is set to the number of +# rows in the feature matrix corresponding to the node. +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X_node Matrix[Double] Numerical feature matrix corresponding to the node +# node_id Int ID of the node +# M Matrix[Double] Linerized model to add the node to +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# M The updated model +# --------------------------------------------------------------------------------------------- +s_addExternalINode = function(Matrix[Double] X_node, Integer node_id, Matrix[Double] M) + return(Matrix[Double] M) +{ + s_warning_assert_outlierByIsolationForest(node_id > 0, "s_addExternalINode: Requirement `node_id > 0` not satisfied!") + + node_start_index = 2*node_id-1 + M[, node_start_index] = 0 + M[, node_start_index + 1] = nrow(X_node) +} + +# Adds a internal node to the linearized iTree model `M`. In the linerized form, +# each node is assigned two neighboring indices. For internal nodes the value at the first +# index in M is set to index of the feature to split by while the value at the second index +# is set to the value to split the node by. +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# node_id Int ID of the node +# split_feature Int Index of the feature to split the node by +# split_value Int Value to split the node by +# M Matrix[Double] Linerized model to add the node to +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# M The updated model +# --------------------------------------------------------------------------------------------- +s_addInternalINode = function(Integer node_id, Integer split_feature, Double split_value, Matrix[Double] M) + return(Matrix[Double] M) +{ + s_warning_assert_outlierByIsolationForest(node_id > 0, "s_addInternalINode: Requirement `node_id > 0` not satisfied!") + s_warning_assert_outlierByIsolationForest(split_feature > 0, "s_addInternalINode: Requirement `split_feature > 0` not satisfied!") + + node_start_index = 2*node_id-1 + M[, node_start_index] = split_feature + M[, node_start_index + 1] = split_value +} + +# This function determines if a iTree node is an external node based on it's node_id and the data corresponding to the node +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X_node Matrix[Double] Numerical feature matrix corresponding to the node +# node_id Int ID belonging to the node +# max_depth Int Maximum depth of the learned tree where depth is the +# maximum number of edges from root to a leaf note +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# isExternalNote true if the node is an external (leaf) node, false otherwise. +# This is the case when a max depth is reached or the number of rows +# in the feature matrix corresponding to the node <= 1 +# --------------------------------------------------------------------------------------------- + +# modification 4: Stop if the depth limit is reached, the node is isolated, or no feature can produce a valid split. + +s_isExternalINode = function(Matrix[Double] X_node, Integer node_id, Integer max_depth) +return(Boolean isExternalNode) +{ + s_warning_assert_outlierByIsolationForest(max_depth > 0, "s_isExternalINode: max_depth must be positive.") + s_warning_assert_outlierByIsolationForest(node_id > 0, "s_isExternalINode: node_id must be positive.") + + node_depth = floor(log(node_id, 2)) + + isExternalNode = node_depth >= max_depth | nrow(X_node) <= 1 | !s_hasSplittableFeature(X_node) +} + + +# This function splits a node based on a given feature and value and returns the sub-matrices +# and IDs corresponding to the nodes resulting from the split. +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X_node Matrix[Double] Numerical feature matrix corresponding +# node_id Int ID of the node to split +# split_feature Int Index of the feature to split the input matrix by +# split_value Int Value of the feature to split the input matrix by +# +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# left_id ID of the resulting left node +# X_left Matrix corresponding to the left node resulting from the split with rows where +# value for feature `split_feature` <= value `split_value` +# right_id ID of the resulting right node +# X_right Matrix corresponding to the left node resulting from the split with rows where +# value for feature `split_feature` > value `split_value` +# --------------------------------------------------------------------------------------------- + + + +s_splitINode = function(Matrix[Double] X_node, Integer node_id, Integer split_feature, Double split_value) + return(Integer left_id, Matrix[Double] X_left, Integer right_id, Matrix[Double] X_right) +{ + s_warning_assert_outlierByIsolationForest(nrow(X_node) > 0, "s_splitINode: Requirement `nrow(X_node) > 0` not satisfied!") + s_warning_assert_outlierByIsolationForest(node_id > 0, "s_splitINode: Requirement `nrow(X_node) > 0` not satisfied!") + s_warning_assert_outlierByIsolationForest(split_feature > 0, "s_splitINode: Requirement `split_feature > 0` not satisfied!") + + # modification 7: Follow the original iTree convention < and >= + + left_rows_mask = X_node[, split_feature] < split_value + + # In the lineraized form of the iTree model, nodes need to be ordered by depth + # Since iTrees are binary trees we can use 2*node_id/2*node_id+1 for left/right child ids + # to insure that IDs are chosen accordingly. + left_id = 2 * node_id + X_left = removeEmpty(target=X_node, margin="rows", select=left_rows_mask, empty.return=FALSE) + + right_id = 2 * node_id + 1 + X_right = removeEmpty(target=X_node, margin="rows", select=!left_rows_mask, empty.return=FALSE) +} + +# Randomly samples `size` rows from a matrix X +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X Matrix[Double] Matrix to sample rows from +# sample_size Int Number of rows to sample +# seed Int -1 Seed for calls to `sample` +# -1 corresponds to a random seed +# +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# X_sampled Sampled rows from X +# --------------------------------------------------------------------------------------------- + +# modification 8: Split two conditions. + +s_sampleRows = function(Matrix[Double] X, Integer size, Integer seed = -1) +return(Matrix[Double] X_extracted) +{ + s_warning_assert_outlierByIsolationForest(size > 0, "s_sampleRows: size must be positive.") + s_warning_assert_outlierByIsolationForest(nrow(X) >= size, "s_sampleRows: size must be <= nrow(X).") + + random_vector = rand(rows = nrow(X), cols = 1, seed = seed) + X_rand = cbind(X, random_vector) + + X_rand = order(target = X_rand, by = ncol(X_rand)) + + X_extracted = X_rand[1:size, 1:ncol(X)] +} + +# Function that gives a warning if a assertion is violated. This is used instead of `assert` and +# `stop` since these function can not be used in parfor . +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# assertion Boolean Assertion to check +# warning String Warning message to print if assertion is violated +# --------------------------------------------------------------------------------------------- +s_warning_assert_outlierByIsolationForest = function(Boolean assertion, String warning) +{ + if (!assertion) + print("outlierIsolationForest: "+warning) +} diff --git a/scripts/builtin/outlierByIsolationForestApply.dml b/scripts/builtin/outlierByIsolationForestApply.dml new file mode 100644 index 00000000000..4d83a35a9be --- /dev/null +++ b/scripts/builtin/outlierByIsolationForestApply.dml @@ -0,0 +1,273 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Builtin function that calculates the anomaly score as described in [Liu2008] +# for a set of samples `X` based on an iForest model. +# +# [Liu2008]: +# Liu, F. T., Ting, K. M., & Zhou, Z. H. +# (2008, December). +# Isolation forest. +# In 2008 eighth ieee international conference on data mining (pp. 413-422). +# IEEE. +# +# .. code-block:: python +# +# >>> import numpy as np +# >>> from systemds.context import SystemDSContext +# >>> from systemds.operator.algorithm import outlierByIsolationForest, outlierByIsolationForestApply +# >>> with SystemDSContext() as sds: +# ... # Create training data: 20 points clustered near origin +# ... X_train = sds.from_numpy(np.array([ +# ... [0.0, 0.0], [0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4], +# ... [0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9], +# ... [1.0, 1.0], [1.1, 1.1], [1.2, 1.2], [1.3, 1.3], [1.4, 1.4], +# ... [1.5, 1.5], [1.6, 1.6], [1.7, 1.7], [1.8, 1.8], [1.9, 1.9] +# ... ])) +# ... model = outlierByIsolationForest(X_train, n_trees=100, subsampling_size=10, seed=42) +# ... X_test = sds.from_numpy(np.array([[1.0, 1.0], [100.0, 100.0]])) +# ... scores = outlierByIsolationForestApply(model, X_test).compute() +# ... print(scores.shape) +# ... print(scores[1, 0] > scores[0, 0]) +# ... print(scores[1, 0] > 0.5) +# (2, 1) +# True +# True +# +# +# INPUT: +# --------------------------------------------------------------------------------------------- +# iForestModel The trained iForest model as returned by outlierByIsolationForest +# X Samples to calculate the anomaly score for +# --------------------------------------------------------------------------------------------- +# +# OUTPUT: +# --------------------------------------------------------------------------------------------- +# anomaly_scores Column vector of anomaly scores corresponding to the samples in X. +# Samples with an anomaly score > 0.5 are generally considered to be outliers +# --------------------------------------------------------------------------------------------- + +s_outlierByIsolationForestApply = function(List[Unknown] iForestModel, Matrix[Double] X) + return(Matrix[Double] anomaly_scores) +{ + anomaly_scores = m_outlierByIsolationForestApply(iForestModel, X) +} + +# modification 9: Scoring a single row and using a one-tree model are valid edge cases. + +m_outlierByIsolationForestApply = function(List[Unknown] iForestModel, Matrix[Double] X) +return(Matrix[Double] anomaly_scores) +{ + s_warning_assert(nrow(X) >= 1, "iForestApply: X must contain at least one row.") + + M = as.matrix(iForestModel["model"]) + subsampling_size = as.integer(as.scalar(iForestModel["subsampling_size"])) + + s_warning_assert(subsampling_size > 1, "iForestApply: invalid subsampling_size.") + s_warning_assert(nrow(M) >= 1, "iForestApply: model must contain at least one tree.") + + height_limit = ceil(log(subsampling_size, 2)) + tree_size = 2 * (2^(height_limit + 1) - 1) + + s_warning_assert(ncol(M) == tree_size, "iForestApply: model has invalid number of columns.") + + anomaly_scores = matrix(0, rows = nrow(X), cols = 1) + + for (i_x in 1:nrow(X)) { + anomaly_scores[i_x, 1] = m_score(M, X[i_x, ], subsampling_size) + } +} + +# Calculates the PathLength as defined in [Liu2008] based on a sample x +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# M Matrix[Double] The linearized iTree model +# x Matrix[Double] The sample to calculate the PathLength +# +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# PathLength The PathLength for the sample +# --------------------------------------------------------------------------------------------- +m_PathLength = function(Matrix[Double] M, Matrix[Double] x) + return(Double PathLength) +{ + [nrEdgesTraversed, externalNodeSize] = s_traverseITree(M, x) + + if (externalNodeSize <= 1) { + PathLength = nrEdgesTraversed + } + else { + PathLength = nrEdgesTraversed + s_cn(externalNodeSize) + } +} + + +# Traverses an iTree based on a sample x +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# M Matrix[Double] The linearized iTree model to traverse +# x Matrix[Double] The sample to traverse the iTree with +# +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# nrEdgesTraversed The number of edges traversed until an external node was reached +# externalNodeSize The size of of the external node assigned to during training +# --------------------------------------------------------------------------------------------- + +# modification 10: Traversal must use the same split rule as training. Reaching a placeholder means the tree model is invalid, so fail fast. + +s_traverseITree = function(Matrix[Double] M, Matrix[Double] x) +return(Integer nrEdgesTraversed, Integer externalNodeSize) +{ + s_warning_assert(nrow(x) == 1, "s_traverseITree: x must be a single row.") + + nrEdgesTraversed = 0 + externalNodeSize = 0 + is_external_node = FALSE + node_id = 1 + + while (!is_external_node) { + node_start_idx = node_id * 2 - 1 + + s_warning_assert(node_start_idx + 1 <= ncol(M), "s_traverseITree: node index out of bounds.") + + split_feature = as.integer(as.scalar(M[1, node_start_idx])) + node_value = as.scalar(M[1, node_start_idx + 1]) + + if (split_feature > 0) { + nrEdgesTraversed = nrEdgesTraversed + 1 + + x_val = as.scalar(x[1, split_feature]) + + if (x_val < node_value) { + node_id = node_id * 2 + } + else { + node_id = node_id * 2 + 1 + } + } + else if (split_feature == 0) { + externalNodeSize = as.integer(node_value) + is_external_node = TRUE + } + else { + s_warning_assert(FALSE, "s_traverseITree: invalid iTree model; reached placeholder node.") + } + } +} + + +# This function gives the average path length of unsuccessful search in BST `c(n)` +# for `n` nodes as given in [Liu2008]. This function is used to normalize the path length +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# n Int Number of samples that corresponding to an external +# node for which c(n) should be calculated +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# cn Value for c(n) +# --------------------------------------------------------------------------------------------- +s_cn = function(Integer n) + return(Double cn) +{ + s_warning_assert(n > 1, "s_cn: Requirement `n > 1` not satisfied!") + + # Calculate H(n-1) + # The approximation of the Harmonic Number H using `log(n) + eulergamma` has a higher error + # for low n. We hence calculate it directly for the first 1000 values + # TODO: Discuss a good value for n --> use e.g. HarmonicNumber(1000) - (ln(1000) + 0.5772156649) in WA + if (n < 1000) { + indices = seq(1,n-1) + H_nminus1 = sum(1/indices) + + } + else{ + # Euler–Mascheroni's constant + eulergamma = 0.57721566490153 + # Approximation harmonic number H(n - 1) + H_nminus1 = log(n-1) + eulergamma + } + + cn = 2*H_nminus1 - 2*(n-1)/n +} + +# Scors a sample `x` according to score function `s(x, n)` for a sample x and a testset-size n, as described in [Liu2008]. +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# M Matrix[Double] iForest model used to score +# x Matrix[Double] Sample to be scored +# n Int Subsample size the iTrees were built from +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# score The score for +# --------------------------------------------------------------------------------------------- + +# modification 11: Only the mean path length is required for scoring, so accumulate directly. + +m_score = function(Matrix[Double] M, Matrix[Double] x, Integer n) +return(Double score) +{ + s_warning_assert(n > 1, "m_score: n must be greater than 1.") + s_warning_assert(nrow(x) == 1, "m_score: sample has wrong dimension.") + s_warning_assert(nrow(M) >= 1, "m_score: invalid iForest model.") + + path_length_sum = 0.0 + + for (i_iTree in 1:nrow(M)) { + path_length_sum = path_length_sum + m_PathLength(M[i_iTree, ], x) + } + + avg_path_length = path_length_sum / nrow(M) + + score = 2^-(avg_path_length / s_cn(n)) +} + +# Function that gives a warning if a assertion is violated. This is used instead of `assert` and +# `stop` since these function can not be used in parfor . +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# assertion Boolean Assertion to check +# warning String Warning message to print if assertion is violated +# --------------------------------------------------------------------------------------------- +s_warning_assert = function(Boolean assertion, String warning) +{ + if (!assertion) + print("outlierIsolationForest: "+warning) +} \ No newline at end of file diff --git a/scripts/staging/isolationForest/test/isolationForestTest.dml b/scripts/staging/isolationForest/test/isolationForestTest.dml index 9decfe6087d..1bae38aced1 100644 --- a/scripts/staging/isolationForest/test/isolationForestTest.dml +++ b/scripts/staging/isolationForest/test/isolationForestTest.dml @@ -702,6 +702,234 @@ parfor (i_run in 1:nr_runs) { print("\n") } +#------------------------------------------------------------- +# Isolation Forest - Edge Case Tests +# Following the structure of isolationForestTest.dml +#------------------------------------------------------------- + +print("===============================================================") +print("Isolation Forest - Edge Case Tests") +print("===============================================================") +print("") + +# Test data +X_100x5 = rand(rows=100, cols=5, seed=42) +X_zeros = matrix(0.0, rows=100, cols=5) +X_identical = matrix(5.0, rows=100, cols=5) + +print("===============================================================") +print("CATEGORY 1: Extreme Data Sizes") +print("===============================================================") +print("") + +# Test 1: Minimum dataset (2 rows) +print("Test 1: Minimum dataset (2 rows, 2 features)") +testname = "Minimum dataset (2x2)" +X_tiny = rand(rows=2, cols=2, seed=42) +model_tiny = iForest::outlierByIsolationForest(X=X_tiny, n_trees=10, subsampling_size=2) +scores_tiny = iForest::outlierByIsolationForestApply(iForestModel=model_tiny, X=X_tiny) +test_res = (nrow(scores_tiny) == 2) & (ncol(scores_tiny) == 1) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +# Test 2: Very small dataset (3 rows) +print("Test 2: Very small dataset (3 rows, 3 features)") +testname = "Very small dataset (3x3)" +X_mini = rand(rows=3, cols=3, seed=42) +model_mini = iForest::outlierByIsolationForest(X=X_mini, n_trees=5, subsampling_size=3) +scores_mini = iForest::outlierByIsolationForestApply(iForestModel=model_mini, X=X_mini) +test_res = (nrow(scores_mini) == 3) & (ncol(scores_mini) == 1) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +# Test 3: Large dataset +print("Test 3: Large dataset (5,000 rows, 10 features)") +testname = "Large dataset (5000x10)" +X_large = rand(rows=5000, cols=10, seed=42) +model_large = iForest::outlierByIsolationForest(X=X_large, n_trees=30, subsampling_size=256) +scores_large = iForest::outlierByIsolationForestApply(iForestModel=model_large, X=X_large[1:10,]) +test_res = (nrow(scores_large) == 10) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +print("===============================================================") +print("CATEGORY 2: Extreme Feature Counts") +print("===============================================================") +print("") + +# Test 4: Single feature +print("Test 4: Single feature dataset (100 rows, 1 feature)") +testname = "Single feature dataset" +X_one_feature = rand(rows=100, cols=1, seed=42) +model_one = iForest::outlierByIsolationForest(X=X_one_feature, n_trees=20, subsampling_size=50) +scores_one = iForest::outlierByIsolationForestApply(iForestModel=model_one, X=X_one_feature) +test_res = (nrow(scores_one) == 100) & (ncol(scores_one) == 1) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +# Test 5: High-dimensional dataset +print("Test 5: High-dimensional dataset (200 rows, 30 features)") +testname = "High-dimensional dataset (30 features)" +X_high_dim = rand(rows=200, cols=30, seed=42) +model_high = iForest::outlierByIsolationForest(X=X_high_dim, n_trees=20, subsampling_size=100) +scores_high = iForest::outlierByIsolationForestApply(iForestModel=model_high, X=X_high_dim[1:10,]) +test_res = (nrow(scores_high) == 10) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +print("===============================================================") +print("CATEGORY 3: Extreme Values") +print("===============================================================") +print("") + +# Test 6: All zeros +print("Test 6: All values are zero") +testname = "All zeros" +model_zeros = iForest::outlierByIsolationForest(X=X_zeros, n_trees=20, subsampling_size=50) +scores_zeros = iForest::outlierByIsolationForestApply(iForestModel=model_zeros, X=X_zeros) +mean_score = mean(scores_zeros) +# When all data is identical, algorithm can't split -> maximum path length +# -> LOW anomaly scores (all points are equally "normal") +test_res = (mean_score > 0.2) & (mean_score < 0.35) # Expect low scores for identical data +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print(" Mean score: " + toString(mean_score) + " (low score expected - no variation to detect)") +print("") + +# Test 7: All identical values +print("Test 7: All identical values (all 5s)") +testname = "All identical values" +model_identical = iForest::outlierByIsolationForest(X=X_identical, n_trees=20, subsampling_size=50) +scores_identical = iForest::outlierByIsolationForestApply(iForestModel=model_identical, X=X_identical) +mean_score = mean(scores_identical) +# Same logic: identical data -> can't split -> maximum path -> low scores +test_res = (mean_score > 0.2) & (mean_score < 0.35) # Expect low scores for identical data +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print(" Mean score: " + toString(mean_score) + " (low score expected - no variation to detect)") +print("") + +# Test 8: Negative values +print("Test 8: All negative values") +testname = "All negative values" +X_negative = rand(rows=100, cols=5, min=-100, max=-1, seed=42) +model_negative = iForest::outlierByIsolationForest(X=X_negative, n_trees=20, subsampling_size=50) +scores_negative = iForest::outlierByIsolationForestApply(iForestModel=model_negative, X=X_negative) +test_res = (nrow(scores_negative) == 100) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +# Test 9: Very large values +print("Test 9: Very large values (thousands)") +testname = "Very large values" +X_huge = rand(rows=100, cols=5, min=1000, max=10000, seed=42) +model_huge = iForest::outlierByIsolationForest(X=X_huge, n_trees=10, subsampling_size=50) +scores_huge = iForest::outlierByIsolationForestApply(iForestModel=model_huge, X=X_huge) +test_res = (nrow(scores_huge) == 100) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +# Test 10: Very small values +print("Test 10: Very small values (near zero)") +testname = "Very small values" +X_tiny_vals = rand(rows=100, cols=5, min=0.0001, max=0.001, seed=42) +model_tiny_vals = iForest::outlierByIsolationForest(X=X_tiny_vals, n_trees=10, subsampling_size=50) +scores_tiny_vals = iForest::outlierByIsolationForestApply(iForestModel=model_tiny_vals, X=X_tiny_vals) +test_res = (nrow(scores_tiny_vals) == 100) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +# Test 11: Mixed extreme values +print("Test 11: Mixed extreme values") +testname = "Mixed extreme values" +X_mixed = rand(rows=100, cols=5, min=-1000, max=1000, seed=42) +model_mixed = iForest::outlierByIsolationForest(X=X_mixed, n_trees=10, subsampling_size=50) +scores_mixed = iForest::outlierByIsolationForestApply(iForestModel=model_mixed, X=X_mixed) +test_res = (nrow(scores_mixed) == 100) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +print("===============================================================") +print("CATEGORY 4: Model Validation") +print("===============================================================") +print("") + +# NOTE: Seed reproducibility tests skipped due to integer overflow bug +# in seed generation (isolationForest.dml line 144) +# Bug: seeds = matrix(seq(1, n_trees), cols=n_trees, rows=1)*seed +# causes overflow when n_trees >= 10 and seed > 0 + +# Test 12: Model produces valid output +print("Test 12: Model produces valid anomaly scores") +testname = "Valid model output" +X_test = rand(rows=50, cols=5, seed=789) +# Use small n_trees and no seed to avoid bugs +model_test = iForest::outlierByIsolationForest(X=X_test, n_trees=5, subsampling_size=25) +scores_test = iForest::outlierByIsolationForestApply(iForestModel=model_test, X=X_test) +# Check we got scores for all rows +test_res = (nrow(scores_test) == 50) & (ncol(scores_test) == 1) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print(" Scores produced: " + toString(nrow(scores_test)) + " rows") +print("") + +# Test 13: Scores are in valid range +print("Test 13: Anomaly scores in valid range [0,1]") +testname = "Valid score range" +X_range = rand(rows=100, cols=5, seed=456) +model_range = iForest::outlierByIsolationForest(X=X_range, n_trees=5, subsampling_size=50) +scores_range = iForest::outlierByIsolationForestApply(iForestModel=model_range, X=X_range) +min_score = min(scores_range) +max_score = max(scores_range) +test_res = (min_score >= 0) & (max_score <= 1) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print(" Score range: [" + toString(min_score) + ", " + toString(max_score) + "]") +print("") + +print("===============================================================") +print("CATEGORY 5: Outlier Detection Scenarios") +print("===============================================================") +print("") + +# Test 14: Subtle outliers +print("Test 14: Subtle outliers (2 standard deviations)") +testname = "Subtle outliers" +X_normal = rand(rows=99, cols=5, pdf="normal", seed=42) +X_subtle_outlier = matrix(2.0, rows=1, cols=5) +X_with_subtle = rbind(X_normal, X_subtle_outlier) +model_subtle = iForest::outlierByIsolationForest(X=X_with_subtle, n_trees=30, subsampling_size=50) +scores_subtle = iForest::outlierByIsolationForestApply(iForestModel=model_subtle, X=X_with_subtle) +outlier_score = as.scalar(scores_subtle[100,1]) +normal_mean_score = mean(scores_subtle[1:99,]) +test_res = outlier_score > normal_mean_score +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print(" Outlier score: " + toString(outlier_score) + ", Normal mean: " + toString(normal_mean_score)) +print("") + +# Test 15: Outlier in only one feature +print("Test 15: Outlier in only one feature") +testname = "One-dimensional outlier" +X_one_dim_outlier = rand(rows=100, cols=5, min=0, max=10, seed=42) +X_one_dim_outlier[1,1] = 100 +model_one_dim = iForest::outlierByIsolationForest(X=X_one_dim_outlier, n_trees=30, subsampling_size=50) +scores_one_dim = iForest::outlierByIsolationForestApply(iForestModel=model_one_dim, X=X_one_dim_outlier) +outlier_score = as.scalar(scores_one_dim[1,1]) +normal_mean = mean(scores_one_dim[2:100,]) +test_res = outlier_score > normal_mean +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print(" Outlier score: " + toString(outlier_score) + ", Normal mean: " + toString(normal_mean)) +print("") + +print("===============================================================") +print("SUMMARY") +print("===============================================================") +succ_test_cnt = test_cnt - length(fails) +print(toString(succ_test_cnt) + "/" + toString(test_cnt) + " tests succeeded!") +if (length(fails) > 0) { + print("\nTests that failed:") + print(toString(fails)) +} + +print("===============================================================") + + print("===============================================================") print("TESTING FINISHED!") \ No newline at end of file diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index 4feab311c76..d3a518f47f0 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -265,6 +265,8 @@ public enum Builtins { OUTLIER_ARIMA("outlierByArima",true), OUTLIER_IQR("outlierByIQR", true), OUTLIER_IQR_APPLY("outlierByIQRApply", true), + OUTLIER_ISOLATION_FOREST("outlierByIsolationForest", true), + OUTLIER_ISOLATION_FOREST_APPLY("outlierByIsolationForestApply", true), OUTLIER_SD("outlierBySd", true), OUTLIER_SD_APPLY("outlierBySdApply", true), PAGERANK("pageRank", true), diff --git a/src/main/python/docs/source/api/operator/algorithms/outlierByIsolationForest.rst b/src/main/python/docs/source/api/operator/algorithms/outlierByIsolationForest.rst new file mode 100644 index 00000000000..4cc27950c73 --- /dev/null +++ b/src/main/python/docs/source/api/operator/algorithms/outlierByIsolationForest.rst @@ -0,0 +1,25 @@ +.. ------------------------------------------------------------- +.. +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at +.. +.. http://www.apache.org/licenses/LICENSE-2.0 +.. +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. +.. +.. ------------------------------------------------------------ + +outlierByIsolationForest +======================== + +.. autofunction:: systemds.operator.algorithm.outlierByIsolationForest \ No newline at end of file diff --git a/src/main/python/docs/source/api/operator/algorithms/outlierByIsolationForestApply.rst b/src/main/python/docs/source/api/operator/algorithms/outlierByIsolationForestApply.rst new file mode 100644 index 00000000000..aff908f4785 --- /dev/null +++ b/src/main/python/docs/source/api/operator/algorithms/outlierByIsolationForestApply.rst @@ -0,0 +1,25 @@ +.. ------------------------------------------------------------- +.. +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at +.. +.. http://www.apache.org/licenses/LICENSE-2.0 +.. +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. +.. +.. ------------------------------------------------------------ + +outlierByIsolationForestApply +============================= + +.. autofunction:: systemds.operator.algorithm.outlierByIsolationForestApply \ No newline at end of file diff --git a/src/main/python/systemds/operator/algorithm/__init__.py b/src/main/python/systemds/operator/algorithm/__init__.py index e8cb4c04e95..60ce92715eb 100644 --- a/src/main/python/systemds/operator/algorithm/__init__.py +++ b/src/main/python/systemds/operator/algorithm/__init__.py @@ -159,6 +159,8 @@ from .builtin.outlierByArima import outlierByArima from .builtin.outlierByIQR import outlierByIQR from .builtin.outlierByIQRApply import outlierByIQRApply +from .builtin.outlierByIsolationForest import outlierByIsolationForest +from .builtin.outlierByIsolationForestApply import outlierByIsolationForestApply from .builtin.outlierBySd import outlierBySd from .builtin.outlierBySdApply import outlierBySdApply from .builtin.pageRank import pageRank @@ -358,6 +360,8 @@ 'outlierByArima', 'outlierByIQR', 'outlierByIQRApply', + 'outlierByIsolationForest', + 'outlierByIsolationForestApply', 'outlierBySd', 'outlierBySdApply', 'pageRank', diff --git a/src/main/python/systemds/operator/algorithm/builtin/outlierByIsolationForest.py b/src/main/python/systemds/operator/algorithm/builtin/outlierByIsolationForest.py new file mode 100644 index 00000000000..d6adf720ccf --- /dev/null +++ b/src/main/python/systemds/operator/algorithm/builtin/outlierByIsolationForest.py @@ -0,0 +1,86 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +# Autogenerated From : scripts/builtin/outlierByIsolationForest.dml + +from typing import Dict, Iterable + +from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar +from systemds.utils.consts import VALID_INPUT_TYPES + + +def outlierByIsolationForest(X: Matrix, + n_trees: int, + subsampling_size: int, + **kwargs: Dict[str, VALID_INPUT_TYPES]): + """ + Builtin function that implements anomaly detection via isolation forest as described in + [Liu2008]: + Liu, F. T., Ting, K. M., & Zhou, Z. H. + (2008, December). + Isolation forest. + In 2008 eighth ieee international conference on data mining (pp. 413-422). + IEEE. + + This function creates an iForest model for outlier detection. + + .. code-block:: python + + >>> import numpy as np + >>> from systemds.context import SystemDSContext + >>> from systemds.operator.algorithm import outlierByIsolationForest, outlierByIsolationForestApply + >>> with SystemDSContext() as sds: + ... # Create training data: 20 points clustered near origin + ... X_train = sds.from_numpy(np.array([ + ... [0.0, 0.0], [0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4], + ... [0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9], + ... [1.0, 1.0], [1.1, 1.1], [1.2, 1.2], [1.3, 1.3], [1.4, 1.4], + ... [1.5, 1.5], [1.6, 1.6], [1.7, 1.7], [1.8, 1.8], [1.9, 1.9] + ... ])) + ... model = outlierByIsolationForest(X_train, n_trees=100, subsampling_size=10, seed=42) + ... X_test = sds.from_numpy(np.array([[1.0, 1.0], [100.0, 100.0]])) + ... scores = outlierByIsolationForestApply(model, X_test).compute() + ... print(scores.shape) + ... print(scores[1, 0] > scores[0, 0]) + ... print(scores[1, 0] > 0.5) + (2, 1) + True + True + + + + + :param X: Numerical feature matrix + :param n_trees: Number of iTrees to build + :param subsampling_size: Size of the subsample to build iTrees with + :param seed: Seed for calls to `sample` and `rand`. -1 corresponds to a random seed + :return: The trained iForest model to be used in outlierByIsolationForestApply. + The model is represented as a list with two entries: + Entry 'model' (Matrix[Double]) - The iForest Model in linearized form (see m_iForest) + Entry 'subsampling_size' (Double) - The subsampling size used to build the model. + """ + + params_dict = {'X': X, 'n_trees': n_trees, 'subsampling_size': subsampling_size} + params_dict.update(kwargs) + return Matrix(X.sds_context, + 'outlierByIsolationForest', + named_input_nodes=params_dict) diff --git a/src/main/python/systemds/operator/algorithm/builtin/outlierByIsolationForestApply.py b/src/main/python/systemds/operator/algorithm/builtin/outlierByIsolationForestApply.py new file mode 100644 index 00000000000..2ecb612a3e5 --- /dev/null +++ b/src/main/python/systemds/operator/algorithm/builtin/outlierByIsolationForestApply.py @@ -0,0 +1,79 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +# Autogenerated From : scripts/builtin/outlierByIsolationForestApply.dml + +from typing import Dict, Iterable + +from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar +from systemds.utils.consts import VALID_INPUT_TYPES + + +def outlierByIsolationForestApply(iForestModel: List, + X: Matrix): + """ + Builtin function that calculates the anomaly score as described in [Liu2008] + for a set of samples `X` based on an iForest model. + + [Liu2008]: + Liu, F. T., Ting, K. M., & Zhou, Z. H. + (2008, December). + Isolation forest. + In 2008 eighth ieee international conference on data mining (pp. 413-422). + IEEE. + + .. code-block:: python + + >>> import numpy as np + >>> from systemds.context import SystemDSContext + >>> from systemds.operator.algorithm import outlierByIsolationForest, outlierByIsolationForestApply + >>> with SystemDSContext() as sds: + ... # Create training data: 20 points clustered near origin + ... X_train = sds.from_numpy(np.array([ + ... [0.0, 0.0], [0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4], + ... [0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9], + ... [1.0, 1.0], [1.1, 1.1], [1.2, 1.2], [1.3, 1.3], [1.4, 1.4], + ... [1.5, 1.5], [1.6, 1.6], [1.7, 1.7], [1.8, 1.8], [1.9, 1.9] + ... ])) + ... model = outlierByIsolationForest(X_train, n_trees=100, subsampling_size=10, seed=42) + ... X_test = sds.from_numpy(np.array([[1.0, 1.0], [100.0, 100.0]])) + ... scores = outlierByIsolationForestApply(model, X_test).compute() + ... print(scores.shape) + ... print(scores[1, 0] > scores[0, 0]) + ... print(scores[1, 0] > 0.5) + (2, 1) + True + True + + + + + :param iForestModel: The trained iForest model as returned by outlierByIsolationForest + :param X: Samples to calculate the anomaly score for + :return: Column vector of anomaly scores corresponding to the samples in X. + Samples with an anomaly score > 0.5 are generally considered to be outliers + """ + + params_dict = {'iForestModel': iForestModel, 'X': X} + return Matrix(iForestModel.sds_context, + 'outlierByIsolationForestApply', + named_input_nodes=params_dict) diff --git a/src/main/python/tests/auto_tests/test_outlierByIsolationForest.py b/src/main/python/tests/auto_tests/test_outlierByIsolationForest.py new file mode 100644 index 00000000000..41e0f4b5f31 --- /dev/null +++ b/src/main/python/tests/auto_tests/test_outlierByIsolationForest.py @@ -0,0 +1,56 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +import unittest, contextlib, io + + +class TestOUTLIERBYISOLATIONFOREST(unittest.TestCase): + def test_outlierByIsolationForest(self): + # Example test case provided in python the code block + buf = io.StringIO() + with contextlib.redirect_stdout(buf): + import numpy as np + from systemds.context import SystemDSContext + from systemds.operator.algorithm import outlierByIsolationForest, outlierByIsolationForestApply + with SystemDSContext() as sds: + # Create training data: 20 points clustered near origin + X_train = sds.from_numpy(np.array([ + [0.0, 0.0], [0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4], + [0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9], + [1.0, 1.0], [1.1, 1.1], [1.2, 1.2], [1.3, 1.3], [1.4, 1.4], + [1.5, 1.5], [1.6, 1.6], [1.7, 1.7], [1.8, 1.8], [1.9, 1.9] + ])) + model = outlierByIsolationForest(X_train, n_trees=100, subsampling_size=10, seed=42) + X_test = sds.from_numpy(np.array([[1.0, 1.0], [100.0, 100.0]])) + scores = outlierByIsolationForestApply(model, X_test).compute() + print(scores.shape) + print(scores[1, 0] > scores[0, 0]) + print(scores[1, 0] > 0.5) + + expected = """(2, 1) +True +True""" + self.assertEqual(buf.getvalue().strip(), expected) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/main/python/tests/auto_tests/test_outlierByIsolationForestApply.py b/src/main/python/tests/auto_tests/test_outlierByIsolationForestApply.py new file mode 100644 index 00000000000..e0a9abb1a90 --- /dev/null +++ b/src/main/python/tests/auto_tests/test_outlierByIsolationForestApply.py @@ -0,0 +1,56 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +import unittest, contextlib, io + + +class TestOUTLIERBYISOLATIONFORESTAPPLY(unittest.TestCase): + def test_outlierByIsolationForestApply(self): + # Example test case provided in python the code block + buf = io.StringIO() + with contextlib.redirect_stdout(buf): + import numpy as np + from systemds.context import SystemDSContext + from systemds.operator.algorithm import outlierByIsolationForest, outlierByIsolationForestApply + with SystemDSContext() as sds: + # Create training data: 20 points clustered near origin + X_train = sds.from_numpy(np.array([ + [0.0, 0.0], [0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4], + [0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9], + [1.0, 1.0], [1.1, 1.1], [1.2, 1.2], [1.3, 1.3], [1.4, 1.4], + [1.5, 1.5], [1.6, 1.6], [1.7, 1.7], [1.8, 1.8], [1.9, 1.9] + ])) + model = outlierByIsolationForest(X_train, n_trees=100, subsampling_size=10, seed=42) + X_test = sds.from_numpy(np.array([[1.0, 1.0], [100.0, 100.0]])) + scores = outlierByIsolationForestApply(model, X_test).compute() + print(scores.shape) + print(scores[1, 0] > scores[0, 0]) + print(scores[1, 0] > 0.5) + + expected = """(2, 1) +True +True""" + self.assertEqual(buf.getvalue().strip(), expected) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinIsolationForestTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinIsolationForestTest.java new file mode 100644 index 00000000000..273d3e5e3c7 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinIsolationForestTest.java @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the NOTICE file for + * the specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.builtin.part2; + +import org.apache.sysds.common.Types.ExecMode; +import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.apache.sysds.test.TestUtils; +import org.junit.Assert; +import org.junit.Test; + +import java.util.HashMap; + +public class BuiltinIsolationForestTest extends AutomatedTestBase { + private final static String TEST_NAME = "outlierByIsolationForestTest"; + private final static String TEST_DIR = "functions/builtin/"; + private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinIsolationForestTest.class.getSimpleName() + "/"; + + private final static double eps = 1e-10; + + private final static int rows = 100; + private final static int cols = 3; + private final static int n_trees = 10; + private final static int subsampling_size = 20; + private final static int seed = 42; + + private final static int TEST_BASIC_MODEL = 1; + private final static int TEST_ANOMALY_RANKING = 2; + private final static int TEST_SUBSAMPLING_CLAMP = 3; + private final static int TEST_SINGLE_ROW_APPLY = 4; + private final static int TEST_SINGLE_TREE_MODEL = 5; + private final static int TEST_CONSTANT_DATA = 6; + private final static int TEST_SEED_REPRODUCIBILITY = 7; + + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, + new String[]{"scores", "model", "subsampling_size"})); + } + + @Test + public void testBasicModelSingleNode() { + runIsolationForestTest(TEST_BASIC_MODEL, ExecMode.SINGLE_NODE, + normalCluster(rows, cols), n_trees, subsampling_size, seed); + } + + @Test + public void testBasicModelHybrid() { + runIsolationForestTest(TEST_BASIC_MODEL, ExecMode.HYBRID, + normalCluster(rows, cols), n_trees, subsampling_size, seed); + } + + @Test + public void testAnomalyRankingSingleNode() { + runIsolationForestTest(TEST_ANOMALY_RANKING, ExecMode.SINGLE_NODE, + normalCluster(30, cols), 50, 10, seed); + } + + @Test + public void testAnomalyRankingHybrid() { + runIsolationForestTest(TEST_ANOMALY_RANKING, ExecMode.HYBRID, + normalCluster(30, cols), 50, 10, seed); + } + + @Test + public void testSubsamplingClampSingleNode() { + runIsolationForestTest(TEST_SUBSAMPLING_CLAMP, ExecMode.SINGLE_NODE, + normalCluster(5, cols), 3, 256, seed); + } + + @Test + public void testSubsamplingClampHybrid() { + runIsolationForestTest(TEST_SUBSAMPLING_CLAMP, ExecMode.HYBRID, + normalCluster(5, cols), 3, 256, seed); + } + + @Test + public void testSingleRowApplySingleNode() { + runIsolationForestTest(TEST_SINGLE_ROW_APPLY, ExecMode.SINGLE_NODE, + normalCluster(20, cols), 5, 10, seed); + } + + @Test + public void testSingleTreeModelSingleNode() { + runIsolationForestTest(TEST_SINGLE_TREE_MODEL, ExecMode.SINGLE_NODE, + normalCluster(20, cols), 1, 10, seed); + } + + @Test + public void testConstantDataSingleNode() { + runIsolationForestTest(TEST_CONSTANT_DATA, ExecMode.SINGLE_NODE, + constantMatrix(5, cols, 7.0), 1, 5, seed); + } + + @Test + public void testSeedReproducibilitySingleNode() { + runIsolationForestTest(TEST_SEED_REPRODUCIBILITY, ExecMode.SINGLE_NODE, + normalCluster(30, cols), 5, 10, seed); + } + + private void runIsolationForestTest(int testCase, ExecMode mode, double[][] A, + int nTrees, int subsamplingSize, int seed) { + ExecMode platformOld = setExecMode(mode); + + try { + loadTestConfiguration(getTestConfiguration(TEST_NAME)); + String HOME = SCRIPT_DIR + TEST_DIR; + + fullDMLScriptName = HOME + TEST_NAME + ".dml"; + programArgs = new String[]{"-nvargs", + "X=" + input("A"), + "test_case=" + testCase, + "n_trees=" + nTrees, + "subsampling_size=" + subsamplingSize, + "seed=" + seed, + "output=" + output("scores"), + "model_output=" + output("model"), + "subsampling_size_output=" + output("subsampling_size")}; + + writeInputMatrixWithMTD("A", A, true); + + runTest(true, false, null, -1); + + HashMap scores = readDMLMatrixFromOutputDir("scores"); + Assert.assertNotNull("Scores should not be null", scores); + Assert.assertFalse("Scores should have entries", scores.isEmpty()); + + HashMap model = readDMLMatrixFromOutputDir("model"); + Assert.assertNotNull("Model should not be null", model); + Assert.assertFalse("Model should have entries", model.isEmpty()); + + HashMap actualSubsamplingSize = + readDMLScalarFromOutputDir("subsampling_size"); + Assert.assertNotNull("Subsampling size should be written", actualSubsamplingSize); + + double expectedSubsamplingSize = Math.min(subsamplingSize, A.length); + Assert.assertEquals("Stored subsampling size should match the actual training subsample size", + expectedSubsamplingSize, + actualSubsamplingSize.get(new CellIndex(1, 1)), + eps); + + int maxRow = 0; + for (CellIndex idx : model.keySet()) + maxRow = Math.max(maxRow, idx.row); + + Assert.assertEquals("Model should have n_trees rows", nTrees, maxRow); + } + finally { + rtplatform = platformOld; + } + } + + private double[][] normalCluster(int rows, int cols) { + double[][] A = new double[rows][cols]; + + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + A[i][j] = (i + 1) / 100.0 + j / 1000.0; + } + } + + return A; + } + + private double[][] constantMatrix(int rows, int cols, double value) { + double[][] A = new double[rows][cols]; + + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + A[i][j] = value; + } + } + + return A; + } +} \ No newline at end of file diff --git a/src/test/scripts/functions/builtin/outlierByIsolationForestTest.dml b/src/test/scripts/functions/builtin/outlierByIsolationForestTest.dml new file mode 100644 index 00000000000..6756512c682 --- /dev/null +++ b/src/test/scripts/functions/builtin/outlierByIsolationForestTest.dml @@ -0,0 +1,126 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +X = read($X) +test_case = $test_case + +# test_case meanings: +# 1: basic model +# 2: anomaly ranking +# 3: subsampling clamp +# 4: single-row apply +# 5: single-tree model +# 6: constant data +# 7: seed reproducibility + +if (test_case == 2) { + # Train on the provided normal data. + model = outlierByIsolationForest( + X = X, + n_trees = $n_trees, + subsampling_size = $subsampling_size, + seed = $seed + ) + + M = as.matrix(model["model"]) + subsampling_size_out = as.scalar(model["subsampling_size"]) + + # Score two normal points and two obvious outliers. + X_normal = X[1:2, ] + X_outliers = matrix(10, rows = 2, cols = ncol(X)) + X_test = rbind(X_normal, X_outliers) + + scores = outlierByIsolationForestApply( + iForestModel = model, + X = X_test + ) + + normal_mean = mean(scores[1:2, 1]) + anomaly_mean = mean(scores[3:4, 1]) + + assert(anomaly_mean > normal_mean) + + write(scores, $output) + write(M, $model_output) + write(subsampling_size_out, $subsampling_size_output) +} + +if (test_case == 7) { + # Same input and same seed should produce the same forest. + model1 = outlierByIsolationForest( + X = X, + n_trees = $n_trees, + subsampling_size = $subsampling_size, + seed = $seed + ) + + model2 = outlierByIsolationForest( + X = X, + n_trees = $n_trees, + subsampling_size = $subsampling_size, + seed = $seed + ) + + M1 = as.matrix(model1["model"]) + M2 = as.matrix(model2["model"]) + + subsampling_size_out = as.scalar(model1["subsampling_size"]) + + diff = sum(abs(M1 - M2)) + assert(diff == 0) + + scores = outlierByIsolationForestApply( + iForestModel = model1, + X = X + ) + + write(scores, $output) + write(M1, $model_output) + write(subsampling_size_out, $subsampling_size_output) +} + +if (test_case != 2 & test_case != 7) { + model = outlierByIsolationForest( + X = X, + n_trees = $n_trees, + subsampling_size = $subsampling_size, + seed = $seed + ) + + M = as.matrix(model["model"]) + subsampling_size_out = as.scalar(model["subsampling_size"]) + + X_apply = X + + if (test_case == 4) { + # Single-row apply test. + X_apply = X[1, ] + } + + scores = outlierByIsolationForestApply( + iForestModel = model, + X = X_apply + ) + + write(scores, $output) + write(M, $model_output) + write(subsampling_size_out, $subsampling_size_output) +} \ No newline at end of file