Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 18 additions & 18 deletions kernel/power/dgemm_kernel_power10.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,8 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
#endif
#define KERNEL(i) \
rowA = (vec_t *)&AO[(i)<< 3];\
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(i) << 3])); \
rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[((i) << 3) + 4])); \
rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[(i) << 3])); \
rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[((i) << 3) + 4])); \
__builtin_mma_xvf64gerpp(&acc0, rowB, rowA[0]);\
__builtin_mma_xvf64gerpp(&acc1, rowB1, rowA[0]);\
__builtin_mma_xvf64gerpp(&acc2, rowB, rowA[1]);\
Expand Down Expand Up @@ -200,8 +200,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
BLASLONG l = 1;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB, rowB1;
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[4]));
rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[4]));
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
Expand Down Expand Up @@ -283,16 +283,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
BLASLONG l = 0;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB, rowB1;
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[4]));
rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[4]));
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
__builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
for (l = 1; l < temp; l++) {
rowA = (vec_t *) & AO[l << 2];
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 3]));
rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(l << 3) + 4]));
rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 3]));
rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[(l << 3) + 4]));
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
Expand Down Expand Up @@ -323,14 +323,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
BLASLONG l = 0;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB, rowB1;
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[4]));
rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[4]));
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
for (l = 1; l < temp; l++) {
rowA = (vec_t *) & AO[l << 1];
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 3]));
rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(l << 3) + 4]));
rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 3]));
rowB1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[(l << 3) + 4]));
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
}
Expand Down Expand Up @@ -428,14 +428,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
BLASLONG l = 0;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB;
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
for (l = 1; l < temp; l++) {
rowA = (vec_t *) & AO[l << 3];
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2]));
rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 2]));
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
Expand Down Expand Up @@ -466,12 +466,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
BLASLONG l = 0;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB;
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
for (l = 1; l < temp; l++) {
rowA = (vec_t *) & AO[l << 2];
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2]));
rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 2]));
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
}
Expand All @@ -498,11 +498,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
BLASLONG l = 0;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB;
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0]));
rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[0]));
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
for (l = 1; l < temp; l++) {
rowA = (vec_t *) & AO[l << 1];
rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2]));
rowB = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&BO[l << 2]));
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
}
SAVE_ACC (&acc0, 0);
Expand Down
4 changes: 2 additions & 2 deletions kernel/power/dgemm_small_kernel_nn_power10.c
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,8 @@ typedef __vector unsigned char vec_t;
*((__vector_pair *)(void *)(packB+(k*8)+4+offset)) = pb1;

#define LOAD_PACKED_B(pb0, pb1, offset) \
pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)(packB+(k*8)+0+offset))); \
pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)(packB+(k*8)+4+offset)));
pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)(packB+(k*8)+0+offset))); \
pb1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)(packB+(k*8)+4+offset)));

#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
Expand Down
6 changes: 3 additions & 3 deletions kernel/power/dgemm_small_kernel_nt_power10.c
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,11 @@ typedef __vector unsigned char vec_t;
#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M]);

#define LOAD_BP_1x8(K, N) \
pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \
pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \
pb1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+4]));

#define LOAD_BP_1x4(K, N) \
pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));

#define LOAD_BP_1x2(K, N) \
t0 = vec_xl(0, B+(K*ldb)+N); \
Expand Down
6 changes: 3 additions & 3 deletions kernel/power/dgemm_small_kernel_tt_power10.c
Original file line number Diff line number Diff line change
Expand Up @@ -207,11 +207,11 @@ typedef __vector unsigned char vec_t;
#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]);

#define LOAD_BP_1x8(K, N) \
pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \
pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \
pb1 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+4]));

#define LOAD_BP_1x4(K, N) \
pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
pb0 = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&B[((K)*ldb)+N+0]));

#define LOAD_BP_1x2(K, N) \
t0 = vec_xl(0, B+((K)*ldb)+N); \
Expand Down
18 changes: 9 additions & 9 deletions kernel/power/dgemv_t_power10.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,37 +61,37 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
a6 = a5 + lda;
a7 = a6 + lda;
for (i = 0; i < n/2; i += 2) {
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a0[i*2]));
vx = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&x[i*2]));
vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a0[i*2]));
vx = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&x[i*2]));
__builtin_vsx_disassemble_pair (res, &vx);
__builtin_vsx_disassemble_pair (res1, &vp);
temp0 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp0);
temp0 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp0);
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a1[i*2]));
vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a1[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp1 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp1);
temp1 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp1);
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a2[i*2]));
vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a2[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp2 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp2);
temp2 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp2);
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a3[i*2]));
vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a3[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp3 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp3);
temp3 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp3);
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a4[i*2]));
vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a4[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp4 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp4);
temp4 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp4);
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a5[i*2]));
vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a5[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp5 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp5);
temp5 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp5);
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a6[i*2]));
vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a6[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp6 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp6);
temp6 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp6);
vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a7[i*2]));
vp = __builtin_vsx_lxvp(0L, (__vector_pair *)((void *)&a7[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp7 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp7);
temp7 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp7);
Expand Down
Loading
Loading