Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 158 additions & 0 deletions scripts/builtin/powerTransform.dml
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Power transformation using the Yeo-Johnson method.
# Reduces feature skewness by estimating and applying an optimal transformation parameter for each column.
#
# INPUT:
# -------------------------------------------------------------------------------------
# X Input feature matrix of shape n-by-m
# -------------------------------------------------------------------------------------
#
# OUTPUT:
# -------------------------------------------------------------------------------------
# Y Power-transformed matrix of shape n-by-m
# lambdas Estimated lambda parameters of shape 1-by-m, one per column
# -------------------------------------------------------------------------------------

m_powerTransform = function(Matrix[Double] X)
return (Matrix[Double] Y, Matrix[Double] lambdas)
{
m = ncol(X)
lambdas = matrix(1.0, rows=1, cols=m) # Initialize first, then replace each column with the best lambdas


# Estimate lambda for each column separately
for (j in 1:m){
x = X[,j]

# For constant columns, variance is 0, so skip optimization and use lambda = 1
if (max(x) == min(x)) {
lambdas[1,j] = 1.0;
}
else{
lambdas[1,j]= ptEstimateLambda(x);
}
}
# Pass the parameters to Apply
Y = powerTransformApply(X, lambdas);
}





ptEstimateLambda = function(Matrix[Double] x)
return (Double lambda)
{
lower = -2.0;
upper = 2.0;

lambda = ptGoldenSectionSearch(x, lower, upper);
}

# Compute negative log likelihood; lower lambda score is better

ptYJNegLogLikelihood = function(
Matrix[Double] x,
Double lambda)
return (Double negLogLikelihood)
{
# powerTransformApply needs lambda as a matrix
lambdaMatrix = matrix(lambda, rows=1, cols=1);

# Apply one YJ transform with this lambda and use the temporary y for scoring
y = powerTransformApply(x, lambdaMatrix);

# Safety check; give a huge penalty score when variance is below 0
n = nrow(x);
yMean = mean(y);
yVariance = sum((y - yMean)^2) / n;

if (yVariance <= 0.0) {
negLogLikelihood = 1e300
}

# Start scoring after the safety check passes
# The objective has two parts

else {
# Variance Term
logLikelihood = -n / 2.0 * log(yVariance);

# Jacobian term
jacobian = (lambda - 1.0) * sum(sign(x) * log(abs(x) + 1.0));

# Combine
logLikelihood = logLikelihood + jacobian;
# Return the negative value
negLogLikelihood = -logLikelihood;
}
}

# Temporarily use golden section search for best lambdas; replace with Brent later
ptGoldenSectionSearch = function(
Matrix[Double] x,
Double lower,
Double upper)
return (Double lambdaOptimal)
{
# Golden ratio, tolerance, and max attempts
goldenRatio = 0.6180339887498949;
tolerance = 1e-6;
maxIterations = 100;

a = lower;
b = upper;

c = b - goldenRatio * (b - a);
d = a + goldenRatio * (b - a);

fc = ptYJNegLogLikelihood(x, c);
fd = ptYJNegLogLikelihood(x, d);

iteration = 0;
while (((b - a) > tolerance) & (iteration < maxIterations)) {

if (fc < fd) {
b = d;
d = c;
fd = fc;
c = b - goldenRatio * (b - a);
fc = ptYJNegLogLikelihood(x, c);
}
else {
a = c;
c = d;
fc = fd;
d = a + goldenRatio * (b - a);
fd = ptYJNegLogLikelihood(x, d);
}

iteration = iteration + 1;
}

# Keep the best one after shrinking
if (fc < fd) {
lambdaOptimal = c; # Whoever has the lower score wins
} else {
lambdaOptimal = d;
}
}
76 changes: 76 additions & 0 deletions scripts/builtin/powerTransformApply.dml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Applies a fitted Yeo-Johnson power transformation.
# Transforms each feature using its previously estimated lambda parameter.
#
# INPUT:
# -------------------------------------------------------------------------------------
# X Input feature matrix of shape n-by-m
# lambdas Precomputed lambda parameters of shape 1-by-m, one per column
# -------------------------------------------------------------------------------------
#
# OUTPUT:
# -------------------------------------------------------------------------------------
# Y Power-transformed matrix of shape n-by-m
# -------------------------------------------------------------------------------------


m_powerTransformApply = function(Matrix[Double] X, Matrix[Double] lambdas)
return (Matrix[Double] Y)
{
n = nrow(X)
m = ncol(X)

Y = matrix(0.0, rows=n, cols=m)

# Handle boundary points (0 and 2)
eps = 1e-12

# Loop over columns for transformation
for (j in 1:m){
x = X[,j]
lambda_j = as.scalar(lambdas[1,j])
nonnegative = x >= 0

# Use intermediate inputs to avoid invalid domains in scoring or fractional powers
x_pos = ifelse(nonnegative, x, 0.0)
x_neg = ifelse(nonnegative, 0.0, x)

# Transform nonnegative values (x>=0)
if (abs(lambda_j) < eps) {
y_pos = log(x_pos + 1)
}
else {
y_pos = ((x_pos + 1)^lambda_j - 1) / lambda_j
}

# Transform negative values (x<0)
if (abs(lambda_j - 2) < eps) {
y_neg = -log(1 - x_neg)
}
else {
y_neg = -((1 - x_neg)^(2 - lambda_j) - 1) / (2 - lambda_j)
}

# Combine the two branches
Y[,j] = ifelse(nonnegative, y_pos, y_neg)
}
}
2 changes: 2 additions & 0 deletions src/main/java/org/apache/sysds/common/Builtins.java
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,8 @@ public enum Builtins {
PCATRANSFORM("pcaTransform", true),
PNMF("pnmf", true),
PPCA("ppca", true),
POWERTRANSFORM("powerTransform", true),
POWERTRANSFORMAPPLY("powerTransformApply", true),
PPRED("ppred", false),
PROD("prod", false),
PSNR("psnr", true),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.sysds.test.functions.builtin.part2;

import java.util.HashMap;

import org.junit.Test;

import org.apache.sysds.common.Types.ExecMode;
import org.apache.sysds.common.Types.ExecType;
import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
import org.apache.sysds.test.AutomatedTestBase;
import org.apache.sysds.test.TestConfiguration;
import org.apache.sysds.test.TestUtils;

public class BuiltinPowerTransformTest extends AutomatedTestBase {
private static final String TEST_NAME = "powerTransformApply";
private static final String TEST_DIR = "functions/builtin/";
private static final String TEST_CLASS_DIR =
TEST_DIR + BuiltinPowerTransformTest.class.getSimpleName() + "/";

private static final double EPS = 1e-9;

@Override
public void setUp() {
// Register test configuration and declare the test output matrix Y
addTestConfiguration(
TEST_NAME,
new TestConfiguration(
TEST_CLASS_DIR,
TEST_NAME,
new String[] {"Y"}
)
);
}

@Test
public void testPowerTransformApplyDenseCP() {
// Only test single-node CP execution mode here
runPowerTransformApplyTest(ExecType.CP);
}

private void runPowerTransformApplyTest(ExecType execType) {
// Save the old execution mode and restore it after the test
ExecMode oldExecMode = setExecMode(execType);

try {
// Load the configuration for this test
loadTestConfiguration(getTestConfiguration(TEST_NAME));

String home = SCRIPT_DIR + TEST_DIR;

// Set the DML test wrapper and R reference implementation
fullDMLScriptName = home + TEST_NAME + ".dml";
fullRScriptName = home + TEST_NAME + ".R";

// DML arguments in order:
// 1. Input matrix X
// 2. Lambda row matrix L
// 3. Output matrix Y
programArgs = new String[] {
"-exec", "singlenode",
"-args",
input("X"),
input("L"),
output("Y")
};

// R script receives the input directory and expected output directory
rCmd = "Rscript " + fullRScriptName + " "
+ inputDir() + " " + expectedDir();

// Use the same input values in three columns to test lambda 0, 1, and 2
double[][] X = {
{-2, -2, -2},
{-1, -1, -1},
{ 0, 0, 0},
{ 1, 1, 1},
{ 2, 2, 2}
};

// First column lambda=0, second column lambda=1, third column lambda=2
double[][] L = {
{0, 1, 2}
};

// Write both input matrices to the test input directory and generate metadata
writeInputMatrixWithMTD("X", X, true);
writeInputMatrixWithMTD("L", L, true);

// DML test
runTest(true, false, null, -1);

// R reference implementation
runRScript(true);

// Read the results separately
HashMap<CellIndex, Double> dmlResult =
readDMLMatrixFromOutputDir("Y");

HashMap<CellIndex, Double> rResult =
readRMatrixFromExpectedDir("Y");

// Compare the two result matrices within the given tolerance
TestUtils.compareMatrices(
dmlResult,
rResult,
EPS,
"DML",
"R"
);
}
catch (Exception exception) {
throw new RuntimeException(exception);
}
finally {
// Restore the old execution mode whether the test succeeds or fails
resetExecMode(oldExecMode);
}
}
}
Loading