OpenMathLib · amritahs-ibm · Jun 10, 2026
diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c
@@ -94,8 +94,8 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
 #endif
 #define KERNEL(i) \
     rowA = (vec_t *)&AO[(i)<< 3];\
-    rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(i) << 3])); \
-    rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[((i) << 3) + 4])); \
+    rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[(i) << 3])); \
+    rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[((i) << 3) + 4])); \
     __builtin_mma_xvf64gerpp(&acc0, rowB, rowA[0]);\
     __builtin_mma_xvf64gerpp(&acc1, rowB1, rowA[0]);\
     __builtin_mma_xvf64gerpp(&acc2, rowB, rowA[1]);\
@@ -200,8 +200,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
             BLASLONG l = 1;
             vec_t *rowA = (vec_t *) & AO[0];
             __vector_pair rowB, rowB1;
-            rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
-            rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[4]));
+            rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
+            rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[4]));
             __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
             __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
             __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
@@ -283,16 +283,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
             BLASLONG l = 0;
             vec_t *rowA = (vec_t *) & AO[0];
             __vector_pair rowB, rowB1;
-            rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
-            rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[4]));
+            rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
+            rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[4]));
             __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
             __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
             __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
             __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
             for (l = 1; l < temp; l++) {
                 rowA = (vec_t *) & AO[l << 2];
-                rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 3]));
-                rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(l << 3) + 4]));
+                rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 3]));
+                rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[(l << 3) + 4]));
                 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
                 __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
                 __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
@@ -323,14 +323,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
             BLASLONG l = 0;
             vec_t *rowA = (vec_t *) & AO[0];
             __vector_pair rowB, rowB1;
-            rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
-            rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[4]));
+            rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
+            rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[4]));
             __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
             __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
             for (l = 1; l < temp; l++) {
                 rowA = (vec_t *) & AO[l << 1];
-                rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 3]));
-                rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(l << 3) + 4]));
+                rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 3]));
+                rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[(l << 3) + 4]));
                 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
                 __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
             }
@@ -428,14 +428,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
             BLASLONG l = 0;
             vec_t *rowA = (vec_t *) & AO[0];
             __vector_pair rowB;
-            rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
+            rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
             __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
             __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
             __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
             __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
             for (l = 1; l < temp; l++) {
                 rowA = (vec_t *) & AO[l << 3];
-                rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2]));
+                rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 2]));
                 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
                 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
                 __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
@@ -466,12 +466,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
             BLASLONG l = 0;
             vec_t *rowA = (vec_t *) & AO[0];
             __vector_pair rowB;
-            rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
+            rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
             __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
             __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
             for (l = 1; l < temp; l++) {
                 rowA = (vec_t *) & AO[l << 2];
-                rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2]));
+                rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 2]));
                 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
                 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
             }
@@ -498,11 +498,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
             BLASLONG l = 0;
             vec_t *rowA = (vec_t *) & AO[0];
             __vector_pair rowB;
-            rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
+            rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
             __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
             for (l = 1; l < temp; l++) {
                 rowA = (vec_t *) & AO[l << 1];
-                rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2]));
+                rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 2]));
                 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
             }
             SAVE_ACC (&acc0, 0);

diff --git a/kernel/power/dgemm_small_kernel_nn_power10.c b/kernel/power/dgemm_small_kernel_nn_power10.c
@@ -314,8 +314,8 @@ typedef __vector unsigned char vec_t;
   *((__vector_pair *)(void *)(packB+(k*8)+4+offset)) = pb1;
 
 #define LOAD_PACKED_B(pb0, pb1, offset)                        \
-  pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)(packB+(k*8)+0+offset)));  \
-  pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)(packB+(k*8)+4+offset)));
+  pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)(packB+(k*8)+0+offset)));  \
+  pb1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)(packB+(k*8)+4+offset)));
 
 #ifdef B0
 int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)

diff --git a/kernel/power/dgemm_small_kernel_nt_power10.c b/kernel/power/dgemm_small_kernel_nt_power10.c
@@ -144,11 +144,11 @@ typedef __vector unsigned char vec_t;
 #define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M]);
 
 #define LOAD_BP_1x8(K, N)                                 \
-  pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));  \
-  pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
+  pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));  \
+  pb1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
 
 #define LOAD_BP_1x4(K, N)                                \
-  pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
+  pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
 
 #define LOAD_BP_1x2(K, N)                                  \
   t0 = vec_xl(0, B+(K*ldb)+N);                             \

diff --git a/kernel/power/dgemm_small_kernel_tt_power10.c b/kernel/power/dgemm_small_kernel_tt_power10.c
@@ -207,11 +207,11 @@ typedef __vector unsigned char vec_t;
 #define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]);
 
 #define LOAD_BP_1x8(K, N)                                 \
-  pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));  \
-  pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
+  pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));  \
+  pb1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
 
 #define LOAD_BP_1x4(K, N)                                 \
-  pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
+  pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
 
 #define LOAD_BP_1x2(K, N)                                  \
   t0 = vec_xl(0, B+((K)*ldb)+N);                           \

diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c
@@ -61,37 +61,37 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
     a6 = a5 + lda;
     a7 = a6 + lda;
     for (i = 0; i < n/2; i += 2) {
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a0[i*2]));
-        vx = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&x[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a0[i*2]));
+        vx = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&x[i*2]));
         __builtin_vsx_disassemble_pair (res, &vx);
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp0 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp0);
         temp0 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp0);
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a1[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a1[i*2]));
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp1 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp1);
         temp1 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp1);
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a2[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a2[i*2]));
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp2 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp2);
         temp2 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp2);
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a3[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a3[i*2]));
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp3 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp3);
         temp3 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp3);
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a4[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a4[i*2]));
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp4 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp4);
         temp4 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp4);
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a5[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a5[i*2]));
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp5 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp5);
         temp5 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp5);
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a6[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a6[i*2]));
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp6 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp6);
         temp6 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp6);
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a7[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a7[i*2]));
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp7 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp7);
         temp7 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp7);