Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 18 additions & 18 deletions kernel/power/dgemm_kernel_power10.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,8 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
#endif
#define KERNEL(i) \
rowA = (vec_t *)&AO[(i)<< 3];\
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(i) << 3])); \
rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[((i) << 3) + 4])); \
rowB = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[(i) << 3])); \
rowB1 = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[((i) << 3) + 4])); \
__builtin_mma_xvf64gerpp(&acc0, rowB, rowA[0]);\
__builtin_mma_xvf64gerpp(&acc1, rowB1, rowA[0]);\
__builtin_mma_xvf64gerpp(&acc2, rowB, rowA[1]);\
Expand Down Expand Up @@ -200,8 +200,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
BLASLONG l = 1;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB, rowB1;
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[4]));
rowB = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[0]));
rowB1 = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[4]));
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
Expand Down Expand Up @@ -283,16 +283,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
BLASLONG l = 0;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB, rowB1;
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[4]));
rowB = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[0]));
rowB1 = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[4]));
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
__builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
for (l = 1; l < temp; l++) {
rowA = (vec_t *) & AO[l << 2];
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 3]));
rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(l << 3) + 4]));
rowB = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[l << 3]));
rowB1 = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[(l << 3) + 4]));
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
Expand Down Expand Up @@ -323,14 +323,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
BLASLONG l = 0;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB, rowB1;
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[4]));
rowB = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[0]));
rowB1 = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[4]));
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
for (l = 1; l < temp; l++) {
rowA = (vec_t *) & AO[l << 1];
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 3]));
rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(l << 3) + 4]));
rowB = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[l << 3]));
rowB1 = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[(l << 3) + 4]));
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
}
Expand Down Expand Up @@ -428,14 +428,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
BLASLONG l = 0;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB;
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
rowB = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[0]));
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
for (l = 1; l < temp; l++) {
rowA = (vec_t *) & AO[l << 3];
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2]));
rowB = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[l << 2]));
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
Expand Down Expand Up @@ -466,12 +466,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
BLASLONG l = 0;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB;
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
rowB = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[0]));
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
for (l = 1; l < temp; l++) {
rowA = (vec_t *) & AO[l << 2];
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2]));
rowB = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[l << 2]));
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
}
Expand All @@ -498,11 +498,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
BLASLONG l = 0;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB;
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
rowB = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[0]));
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
for (l = 1; l < temp; l++) {
rowA = (vec_t *) & AO[l << 1];
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2]));
rowB = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&BO[l << 2]));
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
}
SAVE_ACC (&acc0, 0);
Expand Down
4 changes: 2 additions & 2 deletions kernel/power/dgemm_small_kernel_nn_power10.c
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,8 @@ typedef __vector unsigned char vec_t;
*((__vector_pair *)(void *)(packB+(k*8)+4+offset)) = pb1;

#define LOAD_PACKED_B(pb0, pb1, offset) \
pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)(packB+(k*8)+0+offset))); \
pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)(packB+(k*8)+4+offset)));
pb0 = __builtin_vsx_lxvp(0L, (const __vector_pair *)((packB+(k*8)+0+offset))); \
pb1 = __builtin_vsx_lxvp(0L, (const __vector_pair *)((packB+(k*8)+4+offset)));

#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
Expand Down
6 changes: 3 additions & 3 deletions kernel/power/dgemm_small_kernel_nt_power10.c
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,11 @@ typedef __vector unsigned char vec_t;
#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M]);

#define LOAD_BP_1x8(K, N) \
pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \
pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
pb0 = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&B[((K)*ldb)+N+0])); \
pb1 = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&B[((K)*ldb)+N+4]));

#define LOAD_BP_1x4(K, N) \
pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
pb0 = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&B[((K)*ldb)+N+0]));

#define LOAD_BP_1x2(K, N) \
t0 = vec_xl(0, B+(K*ldb)+N); \
Expand Down
6 changes: 3 additions & 3 deletions kernel/power/dgemm_small_kernel_tt_power10.c
Original file line number Diff line number Diff line change
Expand Up @@ -207,11 +207,11 @@ typedef __vector unsigned char vec_t;
#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]);

#define LOAD_BP_1x8(K, N) \
pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \
pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
pb0 = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&B[((K)*ldb)+N+0])); \
pb1 = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&B[((K)*ldb)+N+4]));

#define LOAD_BP_1x4(K, N) \
pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
pb0 = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&B[((K)*ldb)+N+0]));

#define LOAD_BP_1x2(K, N) \
t0 = vec_xl(0, B+((K)*ldb)+N); \
Expand Down
18 changes: 9 additions & 9 deletions kernel/power/dgemv_t_power10.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,37 +61,37 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
a6 = a5 + lda;
a7 = a6 + lda;
for (i = 0; i < n/2; i += 2) {
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a0[i*2]));
vx = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&x[i*2]));
vp = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&a0[i*2]));
vx = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&x[i*2]));
__builtin_vsx_disassemble_pair (res, &vx);
__builtin_vsx_disassemble_pair (res1, &vp);
temp0 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp0);
temp0 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp0);
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a1[i*2]));
vp = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&a1[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp1 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp1);
temp1 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp1);
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a2[i*2]));
vp = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&a2[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp2 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp2);
temp2 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp2);
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a3[i*2]));
vp = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&a3[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp3 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp3);
temp3 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp3);
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a4[i*2]));
vp = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&a4[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp4 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp4);
temp4 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp4);
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a5[i*2]));
vp = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&a5[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp5 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp5);
temp5 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp5);
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a6[i*2]));
vp = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&a6[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp6 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp6);
temp6 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp6);
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a7[i*2]));
vp = __builtin_vsx_lxvp(0L, (const __vector_pair *)(&a7[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp7 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp7);
temp7 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp7);
Expand Down
Loading
Loading