From 464781235b4b4dced59ebc188863410d1412d1b7 Mon Sep 17 00:00:00 2001
From: Amrita H S <amritahs@linux.vnet.ibm.com>
Date: Wed, 10 Jun 2026 02:07:05 -0500
Subject: [PATCH] Power10: Fix __builtin_vsx_lxvp offset for Clang

Use 0L instead of 0 as the offset argument of __builtin_vsx_lxvp in the
Power10 DGEMM, DGEMV and ZGEMM kernels to satisfy Clang, which expects
that argument to be a long rather than an int.

Signed-off-by: Amrita H S <amritahs@linux.vnet.ibm.com>
---
 kernel/power/dgemm_kernel_power10.c          |  36 +++---
 kernel/power/dgemm_small_kernel_nn_power10.c |   4 +-
 kernel/power/dgemm_small_kernel_nt_power10.c |   6 +-
 kernel/power/dgemm_small_kernel_tt_power10.c |   6 +-
 kernel/power/dgemv_t_power10.c               |  18 +--
 kernel/power/zgemm_kernel_power10.c          | 120 +++++++++----------
 6 files changed, 95 insertions(+), 95 deletions(-)

diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c
index 6ec6ee51b2..4141fbcbc4 100644
--- a/kernel/power/dgemm_kernel_power10.c
+++ b/kernel/power/dgemm_kernel_power10.c
@@ -94,8 +94,8 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
 #endif
 #define KERNEL(i) \
     rowA = (vec_t *)&AO[(i)<< 3];\
-    rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(i) << 3])); \
-    rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[((i) << 3) + 4])); \
+    rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[(i) << 3])); \
+    rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[((i) << 3) + 4])); \
     __builtin_mma_xvf64gerpp(&acc0, rowB, rowA[0]);\
     __builtin_mma_xvf64gerpp(&acc1, rowB1, rowA[0]);\
     __builtin_mma_xvf64gerpp(&acc2, rowB, rowA[1]);\
@@ -200,8 +200,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
             BLASLONG l = 1;
             vec_t *rowA = (vec_t *) & AO[0];
             __vector_pair rowB, rowB1;
-            rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
-            rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[4]));
+            rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
+            rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[4]));
             __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
             __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
             __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
@@ -283,16 +283,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
             BLASLONG l = 0;
             vec_t *rowA = (vec_t *) & AO[0];
             __vector_pair rowB, rowB1;
-            rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
-            rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[4]));
+            rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
+            rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[4]));
             __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
             __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
             __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
             __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
             for (l = 1; l < temp; l++) {
                 rowA = (vec_t *) & AO[l << 2];
-                rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 3]));
-                rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(l << 3) + 4]));
+                rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 3]));
+                rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[(l << 3) + 4]));
                 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
                 __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
                 __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
@@ -323,14 +323,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
             BLASLONG l = 0;
             vec_t *rowA = (vec_t *) & AO[0];
             __vector_pair rowB, rowB1;
-            rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
-            rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[4]));
+            rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
+            rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[4]));
             __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
             __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
             for (l = 1; l < temp; l++) {
                 rowA = (vec_t *) & AO[l << 1];
-                rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 3]));
-                rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(l << 3) + 4]));
+                rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 3]));
+                rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[(l << 3) + 4]));
                 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
                 __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
             }
@@ -428,14 +428,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
             BLASLONG l = 0;
             vec_t *rowA = (vec_t *) & AO[0];
             __vector_pair rowB;
-            rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
+            rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
             __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
             __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
             __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
             __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
             for (l = 1; l < temp; l++) {
                 rowA = (vec_t *) & AO[l << 3];
-                rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2]));
+                rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 2]));
                 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
                 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
                 __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
@@ -466,12 +466,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
             BLASLONG l = 0;
             vec_t *rowA = (vec_t *) & AO[0];
             __vector_pair rowB;
-            rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
+            rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
             __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
             __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
             for (l = 1; l < temp; l++) {
                 rowA = (vec_t *) & AO[l << 2];
-                rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2]));
+                rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 2]));
                 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
                 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
             }
@@ -498,11 +498,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
             BLASLONG l = 0;
             vec_t *rowA = (vec_t *) & AO[0];
             __vector_pair rowB;
-            rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
+            rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
             __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
             for (l = 1; l < temp; l++) {
                 rowA = (vec_t *) & AO[l << 1];
-                rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2]));
+                rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 2]));
                 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
             }
             SAVE_ACC (&acc0, 0);
diff --git a/kernel/power/dgemm_small_kernel_nn_power10.c b/kernel/power/dgemm_small_kernel_nn_power10.c
index 59bee5fe5d..6a373a1fd8 100644
--- a/kernel/power/dgemm_small_kernel_nn_power10.c
+++ b/kernel/power/dgemm_small_kernel_nn_power10.c
@@ -314,8 +314,8 @@ typedef __vector unsigned char vec_t;
   *((__vector_pair *)(void *)(packB+(k*8)+4+offset)) = pb1;
 
 #define LOAD_PACKED_B(pb0, pb1, offset)                        \
-  pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)(packB+(k*8)+0+offset)));  \
-  pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)(packB+(k*8)+4+offset)));
+  pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)(packB+(k*8)+0+offset)));  \
+  pb1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)(packB+(k*8)+4+offset)));
 
 #ifdef B0
 int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
diff --git a/kernel/power/dgemm_small_kernel_nt_power10.c b/kernel/power/dgemm_small_kernel_nt_power10.c
index 009f2dd6cd..b3a8b8f3c6 100644
--- a/kernel/power/dgemm_small_kernel_nt_power10.c
+++ b/kernel/power/dgemm_small_kernel_nt_power10.c
@@ -144,11 +144,11 @@ typedef __vector unsigned char vec_t;
 #define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M]);
 
 #define LOAD_BP_1x8(K, N)                                 \
-  pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));  \
-  pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
+  pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));  \
+  pb1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
 
 #define LOAD_BP_1x4(K, N)                                \
-  pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
+  pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
 
 #define LOAD_BP_1x2(K, N)                                  \
   t0 = vec_xl(0, B+(K*ldb)+N);                             \
diff --git a/kernel/power/dgemm_small_kernel_tt_power10.c b/kernel/power/dgemm_small_kernel_tt_power10.c
index 62e3b8b678..13e6b1cb2d 100644
--- a/kernel/power/dgemm_small_kernel_tt_power10.c
+++ b/kernel/power/dgemm_small_kernel_tt_power10.c
@@ -207,11 +207,11 @@ typedef __vector unsigned char vec_t;
 #define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]);
 
 #define LOAD_BP_1x8(K, N)                                 \
-  pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));  \
-  pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
+  pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));  \
+  pb1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
 
 #define LOAD_BP_1x4(K, N)                                 \
-  pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
+  pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
 
 #define LOAD_BP_1x2(K, N)                                  \
   t0 = vec_xl(0, B+((K)*ldb)+N);                           \
diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c
index 9aaeec902f..099517b05a 100644
--- a/kernel/power/dgemv_t_power10.c
+++ b/kernel/power/dgemv_t_power10.c
@@ -61,37 +61,37 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
     a6 = a5 + lda;
     a7 = a6 + lda;
     for (i = 0; i < n/2; i += 2) {
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a0[i*2]));
-        vx = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&x[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a0[i*2]));
+        vx = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&x[i*2]));
         __builtin_vsx_disassemble_pair (res, &vx);
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp0 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp0);
         temp0 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp0);
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a1[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a1[i*2]));
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp1 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp1);
         temp1 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp1);
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a2[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a2[i*2]));
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp2 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp2);
         temp2 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp2);
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a3[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a3[i*2]));
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp3 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp3);
         temp3 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp3);
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a4[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a4[i*2]));
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp4 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp4);
         temp4 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp4);
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a5[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a5[i*2]));
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp5 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp5);
         temp5 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp5);
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a6[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a6[i*2]));
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp6 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp6);
         temp6 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp6);
-        vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a7[i*2]));
+        vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a7[i*2]));
         __builtin_vsx_disassemble_pair (res1, &vp);
         temp7 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp7);
         temp7 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp7);
diff --git a/kernel/power/zgemm_kernel_power10.c b/kernel/power/zgemm_kernel_power10.c
index 8091418683..b141307bf2 100644
--- a/kernel/power/zgemm_kernel_power10.c
+++ b/kernel/power/zgemm_kernel_power10.c
@@ -316,10 +316,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
           SET_ACC_ZERO()
 	  for (l = 0; l < temp; ++l)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<4]));
-              __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+4]));
-              __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+8]));
-              __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+12]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<4]));
+              __vector_pair rowA2 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<4)+4]));
+              __vector_pair rowA3 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<4)+8]));
+              __vector_pair rowA4 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<4)+12]));
               vec_t rowB1 = *(vec_t *) & BO[l<<2];
               vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
               __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
@@ -406,10 +406,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
           SET_ACC_ZERO()
 	  for (l = 0; l < (temp & (~1)); l+=2)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<3]));
-              __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+4]));
-              __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+8]));
-              __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+12]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<3]));
+              __vector_pair rowA2 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<3)+4]));
+              __vector_pair rowA3 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<3)+8]));
+              __vector_pair rowA4 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<3)+12]));
               vec_t rowB1 = *(vec_t *) & BO[l<<2];
               vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
               vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
@@ -425,8 +425,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
 	    }
 	  for (l = (temp & (~1)); l < temp; ++l)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<3]));
-              __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+4]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<3]));
+              __vector_pair rowA2 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<3)+4]));
               vec_t rowB1 = *(vec_t *) & BO[l<<2];
               vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
               __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
@@ -454,10 +454,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
           SET_ACC_ZERO()
 	  for (l = 0; l < (temp & (~3)); l+=4)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<2]));
-              __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+4]));
-              __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+8]));
-              __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+12]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<2]));
+              __vector_pair rowA2 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<2)+4]));
+              __vector_pair rowA3 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<2)+8]));
+              __vector_pair rowA4 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<2)+12]));
               vec_t rowB1 = *(vec_t *) & BO[l<<2];
               vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
               vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
@@ -477,7 +477,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
 	    }
 	  for (l = (temp & (~3)); l < temp; ++l)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<2]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<2]));
               vec_t rowB1 = *(vec_t *) & BO[l<<2];
               vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
               __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
@@ -503,10 +503,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
           SET_ACC_ZERO()
 	  for (l = 0; l < (temp & (~3)); l+=4)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<1]));
-              __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+2]));
-              __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+4]));
-              __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+6]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<1]));
+              __vector_pair rowA2 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<1)+2]));
+              __vector_pair rowA3 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<1)+4]));
+              __vector_pair rowA4 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<1)+6]));
               vec_t rowB1 = *(vec_t *) & BO[l<<2];
               vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
               vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
@@ -526,7 +526,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
 	    }
 	  for (l = (temp & (~3)); l < temp; ++l)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<1]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<1]));
               vec_t rowB1 = *(vec_t *) & BO[l<<2];
               vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
               __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
@@ -564,14 +564,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
           SET_ACC_ZERO()
 	  for (l = 0; l < (temp & (~1)); l+=2)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<4]));
-              __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+4]));
-              __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+8]));
-              __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+12]));
-              __vector_pair rowA5 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+16]));
-              __vector_pair rowA6 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+20]));
-              __vector_pair rowA7 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+24]));
-              __vector_pair rowA8 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+28]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<4]));
+              __vector_pair rowA2 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<4)+4]));
+              __vector_pair rowA3 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<4)+8]));
+              __vector_pair rowA4 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<4)+12]));
+              __vector_pair rowA5 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<4)+16]));
+              __vector_pair rowA6 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<4)+20]));
+              __vector_pair rowA7 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<4)+24]));
+              __vector_pair rowA8 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<4)+28]));
               vec_t rowB1 = *(vec_t *) & BO[l<<1];
               vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
               __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
@@ -585,10 +585,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
 	    }
 	  for (l = (temp & (~1)); l < temp; ++l)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<4]));
-              __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+4]));
-              __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+8]));
-              __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+12]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<4]));
+              __vector_pair rowA2 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<4)+4]));
+              __vector_pair rowA3 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<4)+8]));
+              __vector_pair rowA4 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<4)+12]));
               vec_t rowB1 = *(vec_t *) & BO[l<<1];
               __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
               __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
@@ -615,14 +615,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
           SET_ACC_ZERO()
 	  for (l = 0; l < (temp & (~3)); l+=4)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<3]));
-              __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+4]));
-              __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+8]));
-              __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+12]));
-              __vector_pair rowA5 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+16]));
-              __vector_pair rowA6 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+20]));
-              __vector_pair rowA7 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+24]));
-              __vector_pair rowA8 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+28]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<3]));
+              __vector_pair rowA2 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<3)+4]));
+              __vector_pair rowA3 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<3)+8]));
+              __vector_pair rowA4 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<3)+12]));
+              __vector_pair rowA5 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<3)+16]));
+              __vector_pair rowA6 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<3)+20]));
+              __vector_pair rowA7 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<3)+24]));
+              __vector_pair rowA8 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<3)+28]));
               vec_t rowB1 = *(vec_t *) & BO[l<<1];
               vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
               vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
@@ -638,8 +638,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
 	    }
 	  for (l = (temp & (~3)); l < temp; ++l)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<3]));
-              __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+4]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<3]));
+              __vector_pair rowA2 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<3)+4]));
               vec_t rowB1 = *(vec_t *) & BO[l<<1];
               __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
               __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
@@ -662,14 +662,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
           SET_ACC_ZERO()
 	  for (l = 0; l < (temp & (~7)); l+=8)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<2]));
-              __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+4]));
-              __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+8]));
-              __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+12]));
-              __vector_pair rowA5 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+16]));
-              __vector_pair rowA6 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+20]));
-              __vector_pair rowA7 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+24]));
-              __vector_pair rowA8 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+28]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<2]));
+              __vector_pair rowA2 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<2)+4]));
+              __vector_pair rowA3 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<2)+8]));
+              __vector_pair rowA4 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<2)+12]));
+              __vector_pair rowA5 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<2)+16]));
+              __vector_pair rowA6 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<2)+20]));
+              __vector_pair rowA7 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<2)+24]));
+              __vector_pair rowA8 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<2)+28]));
               vec_t rowB1 = *(vec_t *) & BO[l<<1];
               vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
               vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
@@ -689,7 +689,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
 	    }
 	  for (l = (temp & (~7)); l < temp; ++l)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<2]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<2]));
               vec_t rowB1 = *(vec_t *) & BO[l<<1];
               __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
 	    }
@@ -713,14 +713,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
           SET_ACC_ZERO()
 	  for (l = 0; l < (temp & (~7)); l+=8)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<1]));
-              __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+2]));
-              __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+4]));
-              __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+6]));
-              __vector_pair rowA5 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+8]));
-              __vector_pair rowA6 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+10]));
-              __vector_pair rowA7 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+12]));
-              __vector_pair rowA8 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+14]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<1]));
+              __vector_pair rowA2 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<1)+2]));
+              __vector_pair rowA3 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<1)+4]));
+              __vector_pair rowA4 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<1)+6]));
+              __vector_pair rowA5 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<1)+8]));
+              __vector_pair rowA6 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<1)+10]));
+              __vector_pair rowA7 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<1)+12]));
+              __vector_pair rowA8 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[(l<<1)+14]));
               vec_t rowB1 = *(vec_t *) & BO[l<<1];
               vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
               vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
@@ -740,7 +740,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
 	    }
 	  for (l = (temp & (~7)); l < temp; ++l)
 	    {
-              __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<1]));
+              __vector_pair rowA1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&AO[l<<1]));
               vec_t rowB1 = *(vec_t *) & BO[l<<1];
               __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
 	    }