From e694787c8cff6dfc6c8221d132a38a92d9ef6c0f Mon Sep 17 00:00:00 2001 From: Wenliang Cao <59223665+WenliangCao@users.noreply.github.com> Date: Sun, 21 Jun 2026 23:44:35 +0200 Subject: [PATCH 1/2] Add Yeo-Johnson PowerTransformer prototype Add two DML builtins for the midterm PowerTransformer prototype: - powerTransform.dml estimates one Yeo-Johnson lambda per input column, uses lambda = 1 for constant columns, and applies the transform. - powerTransformApply.dml applies the Yeo-Johnson transform with supplied per-column lambdas. Register both builtins so they can be resolved by SystemDS. Add powerTransformSmokeTest.dml to verify: - powerTransformApply with lambda = 1 behaves as identity - powerTransform returns output dimensions matching the input - one lambda is returned per input column - constant columns use lambda = 1 - transformed output and lambdas do not contain NaN or Inf The smoke test passes with: ./bin/systemds src/test/scripts/functions/builtin/powerTransformSmokeTest.dml --- scripts/builtin/powerTransform.dml | 160 ++++++++++++++++++ scripts/builtin/powerTransformApply.dml | 76 +++++++++ .../org/apache/sysds/common/Builtins.java | 2 + .../builtin/powerTransformSmokeTest.dml | 96 +++++++++++ 4 files changed, 334 insertions(+) create mode 100644 scripts/builtin/powerTransform.dml create mode 100644 scripts/builtin/powerTransformApply.dml create mode 100644 src/test/scripts/functions/builtin/powerTransformSmokeTest.dml diff --git a/scripts/builtin/powerTransform.dml b/scripts/builtin/powerTransform.dml new file mode 100644 index 00000000000..438841a5cd2 --- /dev/null +++ b/scripts/builtin/powerTransform.dml @@ -0,0 +1,160 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +# Power transformation using the Yeo-Johnson method. +# Reduces feature skewness by estimating and applying an optimal transformation parameter for each column. +# +# INPUT: +# ------------------------------------------------------------------------------------- +# X Input feature matrix of shape n-by-m +# ------------------------------------------------------------------------------------- +# +# OUTPUT: +# ------------------------------------------------------------------------------------- +# Y Power-transformed matrix of shape n-by-m +# lambdas Estimated lambda parameters of shape 1-by-m, one per column +# ------------------------------------------------------------------------------------- + +m_powerTransform = function(Matrix[Double] X) + return (Matrix[Double] Y, Matrix[Double] lambdas) +{ + m = ncol(X) + lambdas = matrix(1.0, rows=1, cols=m) #初始设置为0,后续为每一列替换为最优的lambdas + + +# 分开给每一列估计lambda + for (j in 1:m){ + x = X[,j] + + #考虑常数列,方差为0,不优化,拉姆达直接等于1 + if (max(x) == min(x)) { + lambdas[1,j] = 1.0; + } + else{ + lambdas[1,j]= ptEstimateLambda(x); + } +} +# 把参数传给Apply + Y = powerTransformApply(X, lambdas); +} + + + + +#暂时使用黄金分割搜索最佳lambda,后期换成Brent + + +ptEstimateLambda = function(Matrix[Double] x) + return (Double lambda) +{ + lower = -2.0; + upper = 2.0; + + lambda = ptGoldenSectionSearch(x, lower, upper); +} + +#计算负对数似然值,越低lambdas越好 + +ptYJNegLogLikelihood = function( + Matrix[Double] x, + Double lambda) + return (Double negLogLikelihood) +{ + #powerTransformApply要求lambda参数是矩阵形式 + lambdaMatrix = matrix(lambda, rows=1, cols=1); + + #使用这个lambda一次YJ变换之后,得到虚拟结果y,后续会打分 + y = powerTransformApply(x, lambdaMatrix); + + #安全检查,方差小于0时惩罚性给极大值的打分 + n = nrow(x); + yMean = mean(y); + yVariance = sum((y - yMean)^2) / n; + + if (yVariance <= 0.0) { + negLogLikelihood = 1e300 + } + + #安全检查通过后正式开始打分 + #目标函数分两部分 + + else { + # Variance Term + logLikelihood = -n / 2.0 * log(yVariance); + + # Jacobian term + jacobian = (lambda - 1.0) * sum(sign(x) * log(abs(x) + 1.0)); + + # 组合 + logLikelihood = logLikelihood + jacobian; + #取反输出 + negLogLikelihood = -logLikelihood; + } +} + +#暂时使用黄金分割寻找最优lambdas,后期替换成Brent +ptGoldenSectionSearch = function( + Matrix[Double] x, + Double lower, + Double upper) + return (Double lambdaOptimal) +{ + #黄金分割率,精度,最大尝试次数 + goldenRatio = 0.6180339887498949; + tolerance = 1e-6; + maxIterations = 100; + + a = lower; + b = upper; + + c = b - goldenRatio * (b - a); + d = a + goldenRatio * (b - a); + + fc = ptYJNegLogLikelihood(x, c); + fd = ptYJNegLogLikelihood(x, d); + + iteration = 0; + while (((b - a) > tolerance) & (iteration < maxIterations)) { + + if (fc < fd) { + b = d; + d = c; + fd = fc; + c = b - goldenRatio * (b - a); + fc = ptYJNegLogLikelihood(x, c); + } + else { + a = c; + c = d; + fc = fd; + d = a + goldenRatio * (b - a); + fd = ptYJNegLogLikelihood(x, d); + } + + iteration = iteration + 1; + } + + #收缩后留下最优的 + if (fc < fd) { + lambdaOptimal = c; # 谁的得分低,谁就是最终冠军 + } else { + lambdaOptimal = d; + } +} diff --git a/scripts/builtin/powerTransformApply.dml b/scripts/builtin/powerTransformApply.dml new file mode 100644 index 00000000000..510b9fab99d --- /dev/null +++ b/scripts/builtin/powerTransformApply.dml @@ -0,0 +1,76 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +# Applies a fitted Yeo-Johnson power transformation. +# Transforms each feature using its previously estimated lambda parameter. +# +# INPUT: +# ------------------------------------------------------------------------------------- +# X Input feature matrix of shape n-by-m +# lambdas Precomputed lambda parameters of shape 1-by-m, one per column +# ------------------------------------------------------------------------------------- +# +# OUTPUT: +# ------------------------------------------------------------------------------------- +# Y Power-transformed matrix of shape n-by-m +# ------------------------------------------------------------------------------------- + + +m_powerTransformApply = function(Matrix[Double] X, Matrix[Double] lambdas) + return (Matrix[Double] Y) +{ + n = nrow(X) + m = ncol(X) + + Y = matrix(0.0, rows=n, cols=m) + + #临界点处理(0和2) + eps = 1e-12 + + #逐列变化循环 + for (j in 1:m){ + x = X[,j] + lambda_j = as.scalar(lambdas[1,j]) + nonnegative = x >= 0 + + #使用中间值输入 避免评价算法或者分数幂超出有效域 + x_pos = ifelse(nonnegative, x, 0.0) + x_neg = ifelse(nonnegative, 0.0, x) + + #对非负值的转化 (x>=0) + if (abs(lambda_j) < eps) { + y_pos = log(x_pos + 1) + } + else { + y_pos = ((x_pos + 1)^lambda_j - 1) / lambda_j + } + + #对负值转化 (x<0) + if (abs(lambda_j - 2) < eps) { + y_neg = -log(1 - x_neg) + } + else { + y_neg = -((1 - x_neg)^(2 - lambda_j) - 1) / (2 - lambda_j) + } + + #俩支线合并 + Y[,j] = ifelse(nonnegative, y_pos, y_neg) + } +} diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index f5719641df7..d1d0d9a67f7 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -275,6 +275,8 @@ public enum Builtins { PCATRANSFORM("pcaTransform", true), PNMF("pnmf", true), PPCA("ppca", true), + POWERTRANSFORM("powerTransform", true), + POWERTRANSFORMAPPLY("powerTransformApply", true), PPRED("ppred", false), PROD("prod", false), PSNR("psnr", true), diff --git a/src/test/scripts/functions/builtin/powerTransformSmokeTest.dml b/src/test/scripts/functions/builtin/powerTransformSmokeTest.dml new file mode 100644 index 00000000000..e67882e64fb --- /dev/null +++ b/src/test/scripts/functions/builtin/powerTransformSmokeTest.dml @@ -0,0 +1,96 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Test matrix: +# - Column 1 contains negative, zero, and positive values. +# - Column 2 contains positively skewed values. +# - Column 3 is constant. +X = matrix( + "-2 1 5 + -1 1 5 + 0 2 5 + 1 3 5 + 2 6 5 + 4 12 5", + rows=6, + cols=3 +) + +print("Input X:") +print(toString(X, rows=6, cols=3, decimal=6)) + + +# ------------------------------------------------------------------ +# Test 1: Call powerTransformApply directly with lambda = 1 +# ------------------------------------------------------------------ + +identityLambdas = matrix("1 1 1", rows=1, cols=3) + +YIdentity = powerTransformApply(X, identityLambdas) + +print("Apply result with lambda = 1:") +print(toString(YIdentity, rows=6, cols=3, decimal=6)) + +# Yeo-Johnson with lambda = 1 should leave all values unchanged. +identityError = max(abs(YIdentity - X)) + +print("Maximum identity error: " + identityError) + +assert(identityError < 1e-10) + + +# ------------------------------------------------------------------ +# Test 2: Estimate lambdas and transform X +# ------------------------------------------------------------------ + +[Y, lambdas] = powerTransform(X) + +print("Estimated lambdas:") +print(toString(lambdas, rows=1, cols=3, decimal=6)) + +print("Transformed Y:") +print(toString(Y, rows=6, cols=3, decimal=6)) + + +# ------------------------------------------------------------------ +# Basic checks +# ------------------------------------------------------------------ + +# Output dimensions must match the input dimensions. +assert(nrow(Y) == nrow(X)) +assert(ncol(Y) == ncol(X)) + +# There must be one lambda for each input column. +assert(nrow(lambdas) == 1) +assert(ncol(lambdas) == ncol(X)) + +# The constant third column should use lambda = 1. +constantLambda = as.scalar(lambdas[1,3]) +assert(abs(constantLambda - 1.0) < 1e-10) + +# Results should not contain invalid numerical values. +assert(sum(is.nan(Y)) == 0) +assert(sum(is.infinite(Y)) == 0) + +assert(sum(is.nan(lambdas)) == 0) +assert(sum(is.infinite(lambdas)) == 0) + +print("powerTransform smoke test passed.") From 130c741eb5e3201bdcd059f81ae22b8dcb0b957c Mon Sep 17 00:00:00 2001 From: Wenliang Cao <59223665+WenliangCao@users.noreply.github.com> Date: Sun, 21 Jun 2026 23:44:35 +0200 Subject: [PATCH 2/2] Add R reference test for PowerTransformApply Add a focused PowerTransformApply test that compares the DML builtin against an independent R reference implementation. The test adds: - powerTransformApply.dml as a small DML wrapper around the registered builtin - powerTransformApply.R as the reference Yeo-Johnson apply implementation - BuiltinPowerTransformTest.java to run the DML and R scripts and compare Y The test covers the key apply branches with fixed lambdas: - lambda = 0 for the positive log branch - lambda = 1 for the identity-style middle case - lambda = 2 for the negative log branch Also translate the remaining PowerTransformer comments from Chinese to English in the prototype DML files. Verified with: Rscript -e 'parse(file="src/test/scripts/functions/builtin/powerTransformApply.R"); cat("R syntax OK\n")' ./bin/systemds src/test/scripts/functions/builtin/powerTransformSmokeTest.dml mvn -Dtest=BuiltinPowerTransformTest test --- scripts/builtin/powerTransform.dml | 34 ++--- scripts/builtin/powerTransformApply.dml | 12 +- .../part2/BuiltinPowerTransformTest.java | 138 ++++++++++++++++++ .../functions/builtin/powerTransformApply.R | 96 ++++++++++++ .../functions/builtin/powerTransformApply.dml | 32 ++++ 5 files changed, 288 insertions(+), 24 deletions(-) create mode 100644 src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinPowerTransformTest.java create mode 100644 src/test/scripts/functions/builtin/powerTransformApply.R create mode 100644 src/test/scripts/functions/builtin/powerTransformApply.dml diff --git a/scripts/builtin/powerTransform.dml b/scripts/builtin/powerTransform.dml index 438841a5cd2..29b055674c8 100644 --- a/scripts/builtin/powerTransform.dml +++ b/scripts/builtin/powerTransform.dml @@ -36,14 +36,14 @@ m_powerTransform = function(Matrix[Double] X) return (Matrix[Double] Y, Matrix[Double] lambdas) { m = ncol(X) - lambdas = matrix(1.0, rows=1, cols=m) #初始设置为0,后续为每一列替换为最优的lambdas + lambdas = matrix(1.0, rows=1, cols=m) # Initialize first, then replace each column with the best lambdas -# 分开给每一列估计lambda +# Estimate lambda for each column separately for (j in 1:m){ x = X[,j] - #考虑常数列,方差为0,不优化,拉姆达直接等于1 + # For constant columns, variance is 0, so skip optimization and use lambda = 1 if (max(x) == min(x)) { lambdas[1,j] = 1.0; } @@ -51,15 +51,13 @@ m_powerTransform = function(Matrix[Double] X) lambdas[1,j]= ptEstimateLambda(x); } } -# 把参数传给Apply +# Pass the parameters to Apply Y = powerTransformApply(X, lambdas); } -#暂时使用黄金分割搜索最佳lambda,后期换成Brent - ptEstimateLambda = function(Matrix[Double] x) return (Double lambda) @@ -70,20 +68,20 @@ ptEstimateLambda = function(Matrix[Double] x) lambda = ptGoldenSectionSearch(x, lower, upper); } -#计算负对数似然值,越低lambdas越好 +# Compute negative log likelihood; lower lambda score is better ptYJNegLogLikelihood = function( Matrix[Double] x, Double lambda) return (Double negLogLikelihood) { - #powerTransformApply要求lambda参数是矩阵形式 + # powerTransformApply needs lambda as a matrix lambdaMatrix = matrix(lambda, rows=1, cols=1); - #使用这个lambda一次YJ变换之后,得到虚拟结果y,后续会打分 + # Apply one YJ transform with this lambda and use the temporary y for scoring y = powerTransformApply(x, lambdaMatrix); - #安全检查,方差小于0时惩罚性给极大值的打分 + # Safety check; give a huge penalty score when variance is below 0 n = nrow(x); yMean = mean(y); yVariance = sum((y - yMean)^2) / n; @@ -92,8 +90,8 @@ ptYJNegLogLikelihood = function( negLogLikelihood = 1e300 } - #安全检查通过后正式开始打分 - #目标函数分两部分 + # Start scoring after the safety check passes + # The objective has two parts else { # Variance Term @@ -102,21 +100,21 @@ ptYJNegLogLikelihood = function( # Jacobian term jacobian = (lambda - 1.0) * sum(sign(x) * log(abs(x) + 1.0)); - # 组合 + # Combine logLikelihood = logLikelihood + jacobian; - #取反输出 + # Return the negative value negLogLikelihood = -logLikelihood; } } -#暂时使用黄金分割寻找最优lambdas,后期替换成Brent +# Temporarily use golden section search for best lambdas; replace with Brent later ptGoldenSectionSearch = function( Matrix[Double] x, Double lower, Double upper) return (Double lambdaOptimal) { - #黄金分割率,精度,最大尝试次数 + # Golden ratio, tolerance, and max attempts goldenRatio = 0.6180339887498949; tolerance = 1e-6; maxIterations = 100; @@ -151,9 +149,9 @@ ptGoldenSectionSearch = function( iteration = iteration + 1; } - #收缩后留下最优的 + # Keep the best one after shrinking if (fc < fd) { - lambdaOptimal = c; # 谁的得分低,谁就是最终冠军 + lambdaOptimal = c; # Whoever has the lower score wins } else { lambdaOptimal = d; } diff --git a/scripts/builtin/powerTransformApply.dml b/scripts/builtin/powerTransformApply.dml index 510b9fab99d..adace3382d5 100644 --- a/scripts/builtin/powerTransformApply.dml +++ b/scripts/builtin/powerTransformApply.dml @@ -41,20 +41,20 @@ m_powerTransformApply = function(Matrix[Double] X, Matrix[Double] lambdas) Y = matrix(0.0, rows=n, cols=m) - #临界点处理(0和2) + # Handle boundary points (0 and 2) eps = 1e-12 - #逐列变化循环 + # Loop over columns for transformation for (j in 1:m){ x = X[,j] lambda_j = as.scalar(lambdas[1,j]) nonnegative = x >= 0 - #使用中间值输入 避免评价算法或者分数幂超出有效域 + # Use intermediate inputs to avoid invalid domains in scoring or fractional powers x_pos = ifelse(nonnegative, x, 0.0) x_neg = ifelse(nonnegative, 0.0, x) - #对非负值的转化 (x>=0) + # Transform nonnegative values (x>=0) if (abs(lambda_j) < eps) { y_pos = log(x_pos + 1) } @@ -62,7 +62,7 @@ m_powerTransformApply = function(Matrix[Double] X, Matrix[Double] lambdas) y_pos = ((x_pos + 1)^lambda_j - 1) / lambda_j } - #对负值转化 (x<0) + # Transform negative values (x<0) if (abs(lambda_j - 2) < eps) { y_neg = -log(1 - x_neg) } @@ -70,7 +70,7 @@ m_powerTransformApply = function(Matrix[Double] X, Matrix[Double] lambdas) y_neg = -((1 - x_neg)^(2 - lambda_j) - 1) / (2 - lambda_j) } - #俩支线合并 + # Combine the two branches Y[,j] = ifelse(nonnegative, y_pos, y_neg) } } diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinPowerTransformTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinPowerTransformTest.java new file mode 100644 index 00000000000..a58604c9af1 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinPowerTransformTest.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.builtin.part2; + +import java.util.HashMap; + +import org.junit.Test; + +import org.apache.sysds.common.Types.ExecMode; +import org.apache.sysds.common.Types.ExecType; +import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.apache.sysds.test.TestUtils; + +public class BuiltinPowerTransformTest extends AutomatedTestBase { + private static final String TEST_NAME = "powerTransformApply"; + private static final String TEST_DIR = "functions/builtin/"; + private static final String TEST_CLASS_DIR = + TEST_DIR + BuiltinPowerTransformTest.class.getSimpleName() + "/"; + + private static final double EPS = 1e-9; + + @Override + public void setUp() { + // Register test configuration and declare the test output matrix Y + addTestConfiguration( + TEST_NAME, + new TestConfiguration( + TEST_CLASS_DIR, + TEST_NAME, + new String[] {"Y"} + ) + ); + } + + @Test + public void testPowerTransformApplyDenseCP() { + // Only test single-node CP execution mode here + runPowerTransformApplyTest(ExecType.CP); + } + + private void runPowerTransformApplyTest(ExecType execType) { + // Save the old execution mode and restore it after the test + ExecMode oldExecMode = setExecMode(execType); + + try { + // Load the configuration for this test + loadTestConfiguration(getTestConfiguration(TEST_NAME)); + + String home = SCRIPT_DIR + TEST_DIR; + + // Set the DML test wrapper and R reference implementation + fullDMLScriptName = home + TEST_NAME + ".dml"; + fullRScriptName = home + TEST_NAME + ".R"; + + // DML arguments in order: + // 1. Input matrix X + // 2. Lambda row matrix L + // 3. Output matrix Y + programArgs = new String[] { + "-exec", "singlenode", + "-args", + input("X"), + input("L"), + output("Y") + }; + + // R script receives the input directory and expected output directory + rCmd = "Rscript " + fullRScriptName + " " + + inputDir() + " " + expectedDir(); + + // Use the same input values in three columns to test lambda 0, 1, and 2 + double[][] X = { + {-2, -2, -2}, + {-1, -1, -1}, + { 0, 0, 0}, + { 1, 1, 1}, + { 2, 2, 2} + }; + + // First column lambda=0, second column lambda=1, third column lambda=2 + double[][] L = { + {0, 1, 2} + }; + + // Write both input matrices to the test input directory and generate metadata + writeInputMatrixWithMTD("X", X, true); + writeInputMatrixWithMTD("L", L, true); + + // DML test + runTest(true, false, null, -1); + + // R reference implementation + runRScript(true); + + // Read the results separately + HashMap dmlResult = + readDMLMatrixFromOutputDir("Y"); + + HashMap rResult = + readRMatrixFromExpectedDir("Y"); + + // Compare the two result matrices within the given tolerance + TestUtils.compareMatrices( + dmlResult, + rResult, + EPS, + "DML", + "R" + ); + } + catch (Exception exception) { + throw new RuntimeException(exception); + } + finally { + // Restore the old execution mode whether the test succeeds or fails + resetExecMode(oldExecMode); + } + } +} diff --git a/src/test/scripts/functions/builtin/powerTransformApply.R b/src/test/scripts/functions/builtin/powerTransformApply.R new file mode 100644 index 00000000000..903a791df61 --- /dev/null +++ b/src/test/scripts/functions/builtin/powerTransformApply.R @@ -0,0 +1,96 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +library("Matrix") + +# Read command line arguments passed from the Java test to the R script +args <- commandArgs(TRUE) + +# Keep higher numeric output precision +options(digits = 22) + +# Read test matrix X and lambda row matrix L from the input directory +X <- as.matrix(readMM(paste(args[1], "X.mtx", sep = ""))) +lambdas <- as.matrix(readMM(paste(args[1], "L.mtx", sep = ""))) + +# Check every column has a matching lambda +if (ncol(X) != ncol(lambdas)) { + stop("The number of lambdas must match the number of columns in X.") +} + +# Apply the YJ transform with a given lambda to one vector +yeoJohnsonApply <- function(x, lambda) { + y <- numeric(length(x)) + + # Find positions of nonnegative and negative values separately to avoid computing the wrong branch + nonnegativeIndex <- which(x >= 0) + negativeIndex <- which(x < 0) + + # Handle the x >= 0 part + if (length(nonnegativeIndex) > 0) { + xPositive <- x[nonnegativeIndex] + + # Use the log form when lambda is close to 0 + if (abs(lambda) < 1e-12) { + y[nonnegativeIndex] <- log(xPositive + 1) + } + else { + y[nonnegativeIndex] <- + ((xPositive + 1)^lambda - 1) / lambda + } + } + + # Handle the x < 0 part + if (length(negativeIndex) > 0) { + xNegative <- x[negativeIndex] + + # Use the log form when lambda is close to 2 + if (abs(lambda - 2) < 1e-12) { + y[negativeIndex] <- -log(1 - xNegative) + } + else { + y[negativeIndex] <- + -(((1 - xNegative)^(2 - lambda) - 1) / + (2 - lambda)) + } + } + + return(y) +} + +# Create a result matrix with the same dimensions as the input matrix +Y <- matrix( + 0.0, + nrow = nrow(X), + ncol = ncol(X) +) + +# Transform each column independently using its matching lambda +for (j in seq_len(ncol(X))) { + lambda <- as.numeric(lambdas[1, j]) + Y[, j] <- yeoJohnsonApply(X[, j], lambda) +} + +# Write the expected result computed by R to the expected output directory +writeMM( + as(Y, "CsparseMatrix"), + paste(args[2], "Y", sep = "") +) diff --git a/src/test/scripts/functions/builtin/powerTransformApply.dml b/src/test/scripts/functions/builtin/powerTransformApply.dml new file mode 100644 index 00000000000..bf73c2a45b3 --- /dev/null +++ b/src/test/scripts/functions/builtin/powerTransformApply.dml @@ -0,0 +1,32 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Read the matrix to transform from the first command line argument +X = read($1); + +# Read the lambda for each column from the second command line argument +lambdas = read($2); + +# Directly call the registered SystemDS builtin +Y = powerTransformApply(X, lambdas); + +# Write the transformed result to the location from the third command line argument +write(Y, $3);