diff --git a/scripts/builtin/powerTransform.dml b/scripts/builtin/powerTransform.dml new file mode 100644 index 00000000000..29b055674c8 --- /dev/null +++ b/scripts/builtin/powerTransform.dml @@ -0,0 +1,158 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +# Power transformation using the Yeo-Johnson method. +# Reduces feature skewness by estimating and applying an optimal transformation parameter for each column. +# +# INPUT: +# ------------------------------------------------------------------------------------- +# X Input feature matrix of shape n-by-m +# ------------------------------------------------------------------------------------- +# +# OUTPUT: +# ------------------------------------------------------------------------------------- +# Y Power-transformed matrix of shape n-by-m +# lambdas Estimated lambda parameters of shape 1-by-m, one per column +# ------------------------------------------------------------------------------------- + +m_powerTransform = function(Matrix[Double] X) + return (Matrix[Double] Y, Matrix[Double] lambdas) +{ + m = ncol(X) + lambdas = matrix(1.0, rows=1, cols=m) # Initialize first, then replace each column with the best lambdas + + +# Estimate lambda for each column separately + for (j in 1:m){ + x = X[,j] + + # For constant columns, variance is 0, so skip optimization and use lambda = 1 + if (max(x) == min(x)) { + lambdas[1,j] = 1.0; + } + else{ + lambdas[1,j]= ptEstimateLambda(x); + } +} +# Pass the parameters to Apply + Y = powerTransformApply(X, lambdas); +} + + + + + +ptEstimateLambda = function(Matrix[Double] x) + return (Double lambda) +{ + lower = -2.0; + upper = 2.0; + + lambda = ptGoldenSectionSearch(x, lower, upper); +} + +# Compute negative log likelihood; lower lambda score is better + +ptYJNegLogLikelihood = function( + Matrix[Double] x, + Double lambda) + return (Double negLogLikelihood) +{ + # powerTransformApply needs lambda as a matrix + lambdaMatrix = matrix(lambda, rows=1, cols=1); + + # Apply one YJ transform with this lambda and use the temporary y for scoring + y = powerTransformApply(x, lambdaMatrix); + + # Safety check; give a huge penalty score when variance is below 0 + n = nrow(x); + yMean = mean(y); + yVariance = sum((y - yMean)^2) / n; + + if (yVariance <= 0.0) { + negLogLikelihood = 1e300 + } + + # Start scoring after the safety check passes + # The objective has two parts + + else { + # Variance Term + logLikelihood = -n / 2.0 * log(yVariance); + + # Jacobian term + jacobian = (lambda - 1.0) * sum(sign(x) * log(abs(x) + 1.0)); + + # Combine + logLikelihood = logLikelihood + jacobian; + # Return the negative value + negLogLikelihood = -logLikelihood; + } +} + +# Temporarily use golden section search for best lambdas; replace with Brent later +ptGoldenSectionSearch = function( + Matrix[Double] x, + Double lower, + Double upper) + return (Double lambdaOptimal) +{ + # Golden ratio, tolerance, and max attempts + goldenRatio = 0.6180339887498949; + tolerance = 1e-6; + maxIterations = 100; + + a = lower; + b = upper; + + c = b - goldenRatio * (b - a); + d = a + goldenRatio * (b - a); + + fc = ptYJNegLogLikelihood(x, c); + fd = ptYJNegLogLikelihood(x, d); + + iteration = 0; + while (((b - a) > tolerance) & (iteration < maxIterations)) { + + if (fc < fd) { + b = d; + d = c; + fd = fc; + c = b - goldenRatio * (b - a); + fc = ptYJNegLogLikelihood(x, c); + } + else { + a = c; + c = d; + fc = fd; + d = a + goldenRatio * (b - a); + fd = ptYJNegLogLikelihood(x, d); + } + + iteration = iteration + 1; + } + + # Keep the best one after shrinking + if (fc < fd) { + lambdaOptimal = c; # Whoever has the lower score wins + } else { + lambdaOptimal = d; + } +} diff --git a/scripts/builtin/powerTransformApply.dml b/scripts/builtin/powerTransformApply.dml new file mode 100644 index 00000000000..adace3382d5 --- /dev/null +++ b/scripts/builtin/powerTransformApply.dml @@ -0,0 +1,76 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +# Applies a fitted Yeo-Johnson power transformation. +# Transforms each feature using its previously estimated lambda parameter. +# +# INPUT: +# ------------------------------------------------------------------------------------- +# X Input feature matrix of shape n-by-m +# lambdas Precomputed lambda parameters of shape 1-by-m, one per column +# ------------------------------------------------------------------------------------- +# +# OUTPUT: +# ------------------------------------------------------------------------------------- +# Y Power-transformed matrix of shape n-by-m +# ------------------------------------------------------------------------------------- + + +m_powerTransformApply = function(Matrix[Double] X, Matrix[Double] lambdas) + return (Matrix[Double] Y) +{ + n = nrow(X) + m = ncol(X) + + Y = matrix(0.0, rows=n, cols=m) + + # Handle boundary points (0 and 2) + eps = 1e-12 + + # Loop over columns for transformation + for (j in 1:m){ + x = X[,j] + lambda_j = as.scalar(lambdas[1,j]) + nonnegative = x >= 0 + + # Use intermediate inputs to avoid invalid domains in scoring or fractional powers + x_pos = ifelse(nonnegative, x, 0.0) + x_neg = ifelse(nonnegative, 0.0, x) + + # Transform nonnegative values (x>=0) + if (abs(lambda_j) < eps) { + y_pos = log(x_pos + 1) + } + else { + y_pos = ((x_pos + 1)^lambda_j - 1) / lambda_j + } + + # Transform negative values (x<0) + if (abs(lambda_j - 2) < eps) { + y_neg = -log(1 - x_neg) + } + else { + y_neg = -((1 - x_neg)^(2 - lambda_j) - 1) / (2 - lambda_j) + } + + # Combine the two branches + Y[,j] = ifelse(nonnegative, y_pos, y_neg) + } +} diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index f5719641df7..d1d0d9a67f7 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -275,6 +275,8 @@ public enum Builtins { PCATRANSFORM("pcaTransform", true), PNMF("pnmf", true), PPCA("ppca", true), + POWERTRANSFORM("powerTransform", true), + POWERTRANSFORMAPPLY("powerTransformApply", true), PPRED("ppred", false), PROD("prod", false), PSNR("psnr", true), diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinPowerTransformTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinPowerTransformTest.java new file mode 100644 index 00000000000..a58604c9af1 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinPowerTransformTest.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.builtin.part2; + +import java.util.HashMap; + +import org.junit.Test; + +import org.apache.sysds.common.Types.ExecMode; +import org.apache.sysds.common.Types.ExecType; +import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.apache.sysds.test.TestUtils; + +public class BuiltinPowerTransformTest extends AutomatedTestBase { + private static final String TEST_NAME = "powerTransformApply"; + private static final String TEST_DIR = "functions/builtin/"; + private static final String TEST_CLASS_DIR = + TEST_DIR + BuiltinPowerTransformTest.class.getSimpleName() + "/"; + + private static final double EPS = 1e-9; + + @Override + public void setUp() { + // Register test configuration and declare the test output matrix Y + addTestConfiguration( + TEST_NAME, + new TestConfiguration( + TEST_CLASS_DIR, + TEST_NAME, + new String[] {"Y"} + ) + ); + } + + @Test + public void testPowerTransformApplyDenseCP() { + // Only test single-node CP execution mode here + runPowerTransformApplyTest(ExecType.CP); + } + + private void runPowerTransformApplyTest(ExecType execType) { + // Save the old execution mode and restore it after the test + ExecMode oldExecMode = setExecMode(execType); + + try { + // Load the configuration for this test + loadTestConfiguration(getTestConfiguration(TEST_NAME)); + + String home = SCRIPT_DIR + TEST_DIR; + + // Set the DML test wrapper and R reference implementation + fullDMLScriptName = home + TEST_NAME + ".dml"; + fullRScriptName = home + TEST_NAME + ".R"; + + // DML arguments in order: + // 1. Input matrix X + // 2. Lambda row matrix L + // 3. Output matrix Y + programArgs = new String[] { + "-exec", "singlenode", + "-args", + input("X"), + input("L"), + output("Y") + }; + + // R script receives the input directory and expected output directory + rCmd = "Rscript " + fullRScriptName + " " + + inputDir() + " " + expectedDir(); + + // Use the same input values in three columns to test lambda 0, 1, and 2 + double[][] X = { + {-2, -2, -2}, + {-1, -1, -1}, + { 0, 0, 0}, + { 1, 1, 1}, + { 2, 2, 2} + }; + + // First column lambda=0, second column lambda=1, third column lambda=2 + double[][] L = { + {0, 1, 2} + }; + + // Write both input matrices to the test input directory and generate metadata + writeInputMatrixWithMTD("X", X, true); + writeInputMatrixWithMTD("L", L, true); + + // DML test + runTest(true, false, null, -1); + + // R reference implementation + runRScript(true); + + // Read the results separately + HashMap dmlResult = + readDMLMatrixFromOutputDir("Y"); + + HashMap rResult = + readRMatrixFromExpectedDir("Y"); + + // Compare the two result matrices within the given tolerance + TestUtils.compareMatrices( + dmlResult, + rResult, + EPS, + "DML", + "R" + ); + } + catch (Exception exception) { + throw new RuntimeException(exception); + } + finally { + // Restore the old execution mode whether the test succeeds or fails + resetExecMode(oldExecMode); + } + } +} diff --git a/src/test/scripts/functions/builtin/powerTransformApply.R b/src/test/scripts/functions/builtin/powerTransformApply.R new file mode 100644 index 00000000000..903a791df61 --- /dev/null +++ b/src/test/scripts/functions/builtin/powerTransformApply.R @@ -0,0 +1,96 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +library("Matrix") + +# Read command line arguments passed from the Java test to the R script +args <- commandArgs(TRUE) + +# Keep higher numeric output precision +options(digits = 22) + +# Read test matrix X and lambda row matrix L from the input directory +X <- as.matrix(readMM(paste(args[1], "X.mtx", sep = ""))) +lambdas <- as.matrix(readMM(paste(args[1], "L.mtx", sep = ""))) + +# Check every column has a matching lambda +if (ncol(X) != ncol(lambdas)) { + stop("The number of lambdas must match the number of columns in X.") +} + +# Apply the YJ transform with a given lambda to one vector +yeoJohnsonApply <- function(x, lambda) { + y <- numeric(length(x)) + + # Find positions of nonnegative and negative values separately to avoid computing the wrong branch + nonnegativeIndex <- which(x >= 0) + negativeIndex <- which(x < 0) + + # Handle the x >= 0 part + if (length(nonnegativeIndex) > 0) { + xPositive <- x[nonnegativeIndex] + + # Use the log form when lambda is close to 0 + if (abs(lambda) < 1e-12) { + y[nonnegativeIndex] <- log(xPositive + 1) + } + else { + y[nonnegativeIndex] <- + ((xPositive + 1)^lambda - 1) / lambda + } + } + + # Handle the x < 0 part + if (length(negativeIndex) > 0) { + xNegative <- x[negativeIndex] + + # Use the log form when lambda is close to 2 + if (abs(lambda - 2) < 1e-12) { + y[negativeIndex] <- -log(1 - xNegative) + } + else { + y[negativeIndex] <- + -(((1 - xNegative)^(2 - lambda) - 1) / + (2 - lambda)) + } + } + + return(y) +} + +# Create a result matrix with the same dimensions as the input matrix +Y <- matrix( + 0.0, + nrow = nrow(X), + ncol = ncol(X) +) + +# Transform each column independently using its matching lambda +for (j in seq_len(ncol(X))) { + lambda <- as.numeric(lambdas[1, j]) + Y[, j] <- yeoJohnsonApply(X[, j], lambda) +} + +# Write the expected result computed by R to the expected output directory +writeMM( + as(Y, "CsparseMatrix"), + paste(args[2], "Y", sep = "") +) diff --git a/src/test/scripts/functions/builtin/powerTransformApply.dml b/src/test/scripts/functions/builtin/powerTransformApply.dml new file mode 100644 index 00000000000..bf73c2a45b3 --- /dev/null +++ b/src/test/scripts/functions/builtin/powerTransformApply.dml @@ -0,0 +1,32 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Read the matrix to transform from the first command line argument +X = read($1); + +# Read the lambda for each column from the second command line argument +lambdas = read($2); + +# Directly call the registered SystemDS builtin +Y = powerTransformApply(X, lambdas); + +# Write the transformed result to the location from the third command line argument +write(Y, $3); diff --git a/src/test/scripts/functions/builtin/powerTransformSmokeTest.dml b/src/test/scripts/functions/builtin/powerTransformSmokeTest.dml new file mode 100644 index 00000000000..e67882e64fb --- /dev/null +++ b/src/test/scripts/functions/builtin/powerTransformSmokeTest.dml @@ -0,0 +1,96 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Test matrix: +# - Column 1 contains negative, zero, and positive values. +# - Column 2 contains positively skewed values. +# - Column 3 is constant. +X = matrix( + "-2 1 5 + -1 1 5 + 0 2 5 + 1 3 5 + 2 6 5 + 4 12 5", + rows=6, + cols=3 +) + +print("Input X:") +print(toString(X, rows=6, cols=3, decimal=6)) + + +# ------------------------------------------------------------------ +# Test 1: Call powerTransformApply directly with lambda = 1 +# ------------------------------------------------------------------ + +identityLambdas = matrix("1 1 1", rows=1, cols=3) + +YIdentity = powerTransformApply(X, identityLambdas) + +print("Apply result with lambda = 1:") +print(toString(YIdentity, rows=6, cols=3, decimal=6)) + +# Yeo-Johnson with lambda = 1 should leave all values unchanged. +identityError = max(abs(YIdentity - X)) + +print("Maximum identity error: " + identityError) + +assert(identityError < 1e-10) + + +# ------------------------------------------------------------------ +# Test 2: Estimate lambdas and transform X +# ------------------------------------------------------------------ + +[Y, lambdas] = powerTransform(X) + +print("Estimated lambdas:") +print(toString(lambdas, rows=1, cols=3, decimal=6)) + +print("Transformed Y:") +print(toString(Y, rows=6, cols=3, decimal=6)) + + +# ------------------------------------------------------------------ +# Basic checks +# ------------------------------------------------------------------ + +# Output dimensions must match the input dimensions. +assert(nrow(Y) == nrow(X)) +assert(ncol(Y) == ncol(X)) + +# There must be one lambda for each input column. +assert(nrow(lambdas) == 1) +assert(ncol(lambdas) == ncol(X)) + +# The constant third column should use lambda = 1. +constantLambda = as.scalar(lambdas[1,3]) +assert(abs(constantLambda - 1.0) < 1e-10) + +# Results should not contain invalid numerical values. +assert(sum(is.nan(Y)) == 0) +assert(sum(is.infinite(Y)) == 0) + +assert(sum(is.nan(lambdas)) == 0) +assert(sum(is.infinite(lambdas)) == 0) + +print("powerTransform smoke test passed.")