From 6731dac68b3060cd7e4f6eb4aa405a6f7a510099 Mon Sep 17 00:00:00 2001 From: pengxu Date: Fri, 29 May 2026 09:44:00 +0800 Subject: [PATCH 1/3] optimize zgemm lsx kernel for 2k3000 cpu --- driver/others/parameter.c | 23 ++++ kernel/loongarch64/zgemm_kernel_4x4_lsx.S | 160 +++++----------------- kernel/setparam-ref.c | 86 ++++++++---- param.h | 24 ++-- 4 files changed, 126 insertions(+), 167 deletions(-) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 9a1ff57358..3a1363e0ea 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -790,6 +790,17 @@ int get_L3_size() { return ((ret & 0xffff) + 1) * pow(2, ((ret >> 16) & 0xff)) * pow(2, ((ret >> 24) & 0x7f)) / 1024 / 1024; // MB } +int get_cpu_prid() { /* executes cpucfg with id = 0 and returns the resulting word unchanged; blas_set_parameter() compares it with a known processor id */ + int ret = 0, id = 0x0; + __asm__ volatile ( + "cpucfg %[ret], %[id]" + : [ret]"=r"(ret) + : [id]"r"(id) + : "memory" + ); + return ret; +} + void blas_set_parameter(void){ #if defined(LA464) int L3_size = get_L3_size(); @@ -868,6 +879,18 @@ void blas_set_parameter(void){ } } #endif +#elif defined(LA264) + int prid = get_cpu_prid(); + if (prid == 0x0014b020) { //2k3000: ZGEMM blocking sizes chosen for this CPU. NOTE(review): exact match on the whole word - confirm every 2K3000 part reports exactly this value + + zgemm_p = 128; + zgemm_q = 176; + zgemm_r = 360; + } else { /* any other LA264 CPU: the same 64/120/4096 that this patch removes from the ZGEMM_DEFAULT_P/Q/R defines in param.h */ + zgemm_p = 64; + zgemm_q = 120; + zgemm_r = 4096; + } #endif } #endif diff --git a/kernel/loongarch64/zgemm_kernel_4x4_lsx.S b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S index 070bb640b9..ece7ecfd72 100644 --- a/kernel/loongarch64/zgemm_kernel_4x4_lsx.S +++ b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S @@ -271,10 +271,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vld D4, B0, 0x30 // b3ri vld D0, A0, 0x00 // a0ri - vand.v D5, D0, D0 - vand.v D6, D0, D0 - vshuf4i.d D5, D0, 0x00 //a0rr - vshuf4i.d D6, D0, 0x55 //a0ii + vshuf4i.d D5, D0, 0x0a //a0rr: built directly from D0, so the vand.v copy into D5 is no longer needed + vshuf4i.d D6, D0, 0x0f //a0ii: likewise for D6 vpackev.d D7, D2, D1 //b0r b1r vpackod.d D8, D2, D1 //b0i b1i @@ -294,10 +292,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vld D0, A0, 0x10 // a1ri - vand.v D5, D0, D0 - vand.v D6, D0, D0 - vshuf4i.d D5, D0, 0x00 //a1rr - vshuf4i.d D6, D0, 0x55 //a1ii + vshuf4i.d D5, D0, 0x0a //a1rr + vshuf4i.d D6, D0, 0x0f //a1ii VMADD1 U4, D5, D7, U4 //01r 11r VMADD2 U5, D6, D7, U5 //01i 11i @@ -311,10 +307,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vld D0, A0, 0x20 // a2ri - vand.v D5, D0, D0 - vand.v D6, D0, D0 - vshuf4i.d D5, D0, 0x00 //a2rr - vshuf4i.d D6, D0, 0x55 //a2ii + vshuf4i.d D5, D0, 0x0a //a2rr + vshuf4i.d D6, D0, 0x0f //a2ii VMADD1 U8, D5, D7, U8 //02r 12r VMADD2 U9, D6, D7, U9 //02i 12i @@ -328,10 +322,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vld D0, A0, 0x30 // a3ri - vand.v D5, D0, D0 - vand.v D6, D0, D0 - vshuf4i.d D5, D0, 0x00 //a3rr - vshuf4i.d D6, D0, 0x55 //a3ii + vshuf4i.d D5, D0, 0x0a //a3rr + vshuf4i.d D6, D0, 0x0f //a3ii VMADD1 U12, D5, D7, U12 //03r 13r VMADD2 U13, D6, D7, U13 //03i 13i @@ -523,70 +515,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vld D0, C0, 0x00 //c0: 0 1 vld D1, C1, 0x00 //c1: 0 1 - vst U0, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U1, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U2, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U3, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U4, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U5, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U6, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U7, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U8, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U9, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U10, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U11, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U12, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U13, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U14, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - - vst U15, C0, 0x00 - fld.d $f27, C0, 0x00 - fld.d $f27, C0, 0x08 - vpackev.d D2, D1, D0 //c0[0] c1[0] vpackod.d D3, D1, D0 //c0[1] c1[1] @@ -823,10 +751,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vld D4, B0, 0x30 // b3ri vld D0, A0, 0x00 // a0ri - vand.v D5, D0, D0 - vand.v D6, D0, D0 - vshuf4i.d D5, D0, 0x00 //a0rr - vshuf4i.d D6, D0, 0x55 //a0ii + vshuf4i.d D5, D0, 0x0a //a0rr + vshuf4i.d D6, D0, 0x0f //a0ii vpackev.d D7, D2, D1 //b0r b1r vpackod.d D8, D2, D1 //b0i b1i @@ -846,10 +772,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vld D0, A0, 0x10 // a1ri - vand.v D5, D0, D0 - vand.v D6, D0, D0 - vshuf4i.d D5, D0, 0x00 //a1rr - vshuf4i.d D6, D0, 0x55 //a1ii + vshuf4i.d D5, D0, 0x0a //a1rr + vshuf4i.d D6, D0, 0x0f //a1ii VMADD1 U4, D5, D7, U4 //01r 11r VMADD2 U5, D6, D7, U5 //01i 11i @@ -1100,10 +1024,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vld D4, B0, 0x30 // b3ri vld D0, A0, 0x00 // a0ri - vand.v D5, D0, D0 - vand.v D6, D0, D0 - vshuf4i.d D5, D0, 0x00 //a0rr - vshuf4i.d D6, D0, 0x55 //a0ii + vshuf4i.d D5, D0, 0x0a //a0rr + vshuf4i.d D6, D0, 0x0f //a0ii vpackev.d D7, D2, D1 //b0r b1r vpackod.d D8, D2, D1 //b0i b1i @@ -1309,10 +1231,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vld D2, B0, 0x10 // b1ri vld D0, A0, 0x00 // a0ri - vand.v D5, D0, D0 - vand.v D6, D0, D0 - vshuf4i.d D5, D0, 0x00 //a0rr - vshuf4i.d D6, D0, 0x55 //a0ii + vshuf4i.d D5, D0, 0x0a //a0rr + vshuf4i.d D6, D0, 0x0f //a0ii vpackev.d D7, D2, D1 //b0r b1r vpackod.d D8, D2, D1 //b0i b1i @@ -1324,10 +1244,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vld D0, A0, 0x10 // a1ri - vand.v D5, D0, D0 - vand.v D6, D0, D0 - vshuf4i.d D5, D0, 0x00 //a1rr - vshuf4i.d D6, D0, 0x55 //a1ii + vshuf4i.d D5, D0, 0x0a //a1rr + vshuf4i.d D6, D0, 0x0f //a1ii VMADD1 U2, D5, D7, U2 //01r 11r VMADD2 U3, D6, D7, U3 //01i 11i @@ -1336,10 +1254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vld D0, A0, 0x20 // a2ri - vand.v D5, D0, D0 - vand.v D6, D0, D0 - vshuf4i.d D5, D0, 0x00 //a2rr - vshuf4i.d D6, D0, 0x55 //a2ii + vshuf4i.d D5, D0, 0x0a //a2rr + vshuf4i.d D6, D0, 0x0f //a2ii VMADD1 U4, D5, D7, U4 //02r 12r VMADD2 U5, D6, D7, U5 //02i 12i @@ -1348,10 +1264,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vld D0, A0, 0x30 // a3ri - vand.v D5, D0, D0 - vand.v D6, D0, D0 - vshuf4i.d D5, D0, 0x00 //a3rr - vshuf4i.d D6, D0, 0x55 //a3ii + vshuf4i.d D5, D0, 0x0a //a3rr + vshuf4i.d D6, D0, 0x0f //a3ii VMADD1 U6, D5, D7, U6 //03r 13r VMADD2 U7, D6, D7, U7 //03i 13i @@ -1598,10 +1512,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vld D2, B0, 0x10 // b1ri vld D0, A0, 0x00 // a0ri - vand.v D5, D0, D0 - vand.v D6, D0, D0 - vshuf4i.d D5, D0, 0x00 //a0rr - vshuf4i.d D6, D0, 0x55 //a0ii + vshuf4i.d D5, D0, 0x0a //a0rr + vshuf4i.d D6, D0, 0x0f //a0ii vpackev.d D7, D2, D1 //b0r b1r vpackod.d D8, D2, D1 //b0i b1i @@ -1613,10 +1525,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vld D0, A0, 0x10 // a1ri - vand.v D5, D0, D0 - vand.v D6, D0, D0 - vshuf4i.d D5, D0, 0x00 //a1rr - vshuf4i.d D6, D0, 0x55 //a1ii + vshuf4i.d D5, D0, 0x0a //a1rr + vshuf4i.d D6, D0, 0x0f //a1ii VMADD1 U2, D5, D7, U2 //01r 11r VMADD2 U3, D6, D7, U3 //01i 11i @@ -1775,10 +1685,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vld D2, B0, 0x10 // b1ri vld D0, A0, 0x00 // a0ri - vand.v D5, D0, D0 - vand.v D6, D0, D0 - vshuf4i.d D5, D0, 0x00 //a0rr - vshuf4i.d D6, D0, 0x55 //a0ii + vshuf4i.d D5, D0, 0x0a //a0rr + vshuf4i.d D6, D0, 0x0f //a0ii vpackev.d D7, D2, D1 //b0r b1r vpackod.d D8, D2, D1 //b0i b1i @@ -1930,10 +1838,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vpackev.d D5, D2, D0 //a0r a1r vpackod.d D6, D2, D0 //a0i a1i - vand.v D7, D1, D1 - vand.v D8, D1, D1 - vshuf4i.d D7, D1, 0x00 //b0rr - vshuf4i.d D8, D1, 0x55 //b0ii + vshuf4i.d D7, D1, 0x0a //b0rr + vshuf4i.d D8, D1, 0x0f //b0ii VMADD1 U0, D5, D7, U0 //00r 01r VMADD2 U1, D6, D7, U1 //00i 01i @@ -2108,10 +2014,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vpackev.d D5, D2, D0 //a0r a1r vpackod.d D6, D2, D0 //a0i a1i - vand.v D7, D1, D1 - vand.v D8, D1, D1 - vshuf4i.d D7, D1, 0x00 //b0rr - vshuf4i.d D8, D1, 0x55 //b0ii + vshuf4i.d D7, D1, 0x0a //b0rr + vshuf4i.d D8, D1, 0x0f //b0ii VMADD1 U0, D5, D7, U0 //00r 01r VMADD2 U1, D6, D7, U1 //00i 01i diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 044ececd18..a6959f2599 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -187,12 +187,12 @@ gotoblas_t TABLE_NAME = { #endif #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) isamax_kTS, -#endif +#endif #if (BUILD_SINGLE==1 ) || (BUILD_COMPLEX==1) isamin_kTS, ismax_kTS, ismin_kTS, snrm2_kTS, sasum_kTS, -#endif -#if BUILD_SINGLE == 1 +#endif +#if BUILD_SINGLE == 1 ssum_kTS, #endif @@ -203,7 +203,7 @@ gotoblas_t TABLE_NAME = { #endif #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) sscal_kTS, -#endif +#endif #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) sswap_kTS, sgemv_nTS, sgemv_tTS, @@ -211,18 +211,18 @@ gotoblas_t TABLE_NAME = { #if BUILD_SINGLE == 1 sger_kTS, #endif -#if BUILD_SINGLE == 1 +#if BUILD_SINGLE == 1 ssymv_LTS, ssymv_UTS, #endif #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) #ifdef ARCH_X86_64 sgemm_directTS, - sgemm_direct_performantTS, + sgemm_direct_performantTS, #endif #ifdef ARCH_ARM64 sgemm_directTS, - sgemm_direct_performantTS, + sgemm_direct_performantTS, sgemm_direct_alpha_betaTS, ssymm_direct_alpha_betaLUTS, ssymm_direct_alpha_betaLLTS, @@ -293,7 +293,7 @@ gotoblas_t TABLE_NAME = { #endif #endif -#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) 0, 0, 0, DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, #ifdef DGEMM_DEFAULT_UNROLL_MN @@ -304,34 +304,34 @@ gotoblas_t TABLE_NAME = { #endif -#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, idamax_kTS, 
idamin_kTS, idmax_kTS, idmin_kTS, dnrm2_kTS, dasum_kTS, -#endif -#if (BUILD_DOUBLE==1) +#endif +#if (BUILD_DOUBLE==1) dsum_kTS, #endif -#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) dcopy_kTS, ddot_kTS, #endif -#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) dsdot_kTS, #endif -#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) drot_kTS, drotm_kTS, daxpy_kTS, - dscal_kTS, + dscal_kTS, dswap_kTS, dgemv_nTS, dgemv_tTS, #endif -#if (BUILD_DOUBLE==1) +#if (BUILD_DOUBLE==1) dger_kTS, dsymv_LTS, dsymv_UTS, #endif -#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) dgemm_kernelTS, dgemm_betaTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dgemm_incopyTS, dgemm_itcopyTS, @@ -341,14 +341,14 @@ gotoblas_t TABLE_NAME = { dgemm_oncopyTS, dgemm_otcopyTS, #endif -#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) #ifdef SMALL_MATRIX_OPT dgemm_small_matrix_permitTS, dgemm_small_kernel_nnTS, dgemm_small_kernel_ntTS, dgemm_small_kernel_tnTS, dgemm_small_kernel_ttTS, dgemm_small_kernel_b0_nnTS, dgemm_small_kernel_b0_ntTS, dgemm_small_kernel_b0_tnTS, dgemm_small_kernel_b0_ttTS, #endif #endif -#if (BUILD_DOUBLE==1) +#if (BUILD_DOUBLE==1) dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS, @@ -449,7 +449,7 @@ gotoblas_t TABLE_NAME = { camax_kTS, camin_kTS, #endif #if (BUILD_COMPLEX) - icamax_kTS, + icamax_kTS, #endif #if (BUILD_COMPLEX) icamin_kTS, @@ -463,8 +463,8 @@ gotoblas_t TABLE_NAME = { #endif #if (BUILD_COMPLEX) caxpy_kTS, - caxpyc_kTS, - cscal_kTS, + caxpyc_kTS, + cscal_kTS, cswap_kTS, cgemv_nTS, cgemv_tTS, cgemv_rTS, cgemv_cTS, @@ -604,7 +604,7 @@ gotoblas_t TABLE_NAME = { #if (BUILD_COMPLEX) #ifndef NO_LAPACK 
cneg_tcopyTS, - + claswp_ncopyTS, #else NULL, NULL, @@ -1139,6 +1139,16 @@ static int get_L3_size() { ); return ((ret & 0xffff) + 1) * pow(2, ((ret >> 16) & 0xff)) * pow(2, ((ret >> 24) & 0x7f)) / 1024 / 1024; // MB } +static int get_cpu_prid() { /* executes cpucfg with id = 0 and returns the resulting word unchanged; init_parameter() compares it with a known processor id */ + int ret = 0, id = 0x0; + __asm__ volatile ( + "cpucfg %[ret], %[id]" + : [ret]"=r"(ret) + : [id]"r"(id) + : "memory" + ); + return ret; +} static void init_parameter(void) { #ifdef BUILD_BFLOAT16 @@ -1228,6 +1238,28 @@ static void init_parameter(void) { } } #endif +#elif defined(LA264) + int prid = get_cpu_prid(); + if (prid == 0x0014b020) { //2k3000: same ZGEMM blocking sizes as the LA264 branch added to driver/others/parameter.c. NOTE(review): exact match on the whole word - confirm every 2K3000 part reports exactly this value + TABLE_NAME.zgemm_p = 128; + TABLE_NAME.zgemm_q = 176; + TABLE_NAME.zgemm_r = 360; + } else { /* any other LA264 CPU: the 64/120/4096 values that were compile-time constants before this patch */ + TABLE_NAME.zgemm_p = 64; + TABLE_NAME.zgemm_q = 120; + TABLE_NAME.zgemm_r = 4096; + } + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; /* s/d/c GEMM blocking keeps the compile-time defaults; only zgemm is chosen at run time above */ + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; #else TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; @@ -1493,7 +1525,7 @@ static void init_parameter(void) { (void) l2; /* dirty trick to suppress unused variable warning for targets */ /* where the GEMM unrolling parameters do not depend on l2 */ - + #ifdef BUILD_BFLOAT16 TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; @@ -1551,7 +1583,7 @@ static void init_parameter(void) { #if BUILD_DOUBLE == 1 || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = 32 * (l2 >> 7); #endif -#if BUILD_COMPLEX==1 +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 32 * (l2 >> 7); #endif #if BUILD_COMPLEX16==1 @@ -1575,7 +1607,7 @@ static void init_parameter(void) { #if BUILD_DOUBLE == 1 || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = 48 * (l2 >> 7); #endif -#if BUILD_COMPLEX==1 +#if BUILD_COMPLEX==1 
TABLE_NAME.cgemm_p = 48 * (l2 >> 7); #endif #if BUILD_COMPLEX16==1 @@ -2131,7 +2163,7 @@ static void init_parameter(void) { ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15); #endif -#if BUILD_COMPLEX ==1 +#if BUILD_COMPLEX ==1 TABLE_NAME.cgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) diff --git a/param.h b/param.h index c4a1b2520a..8e7ab29f24 100644 --- a/param.h +++ b/param.h @@ -13,9 +13,9 @@ modification, are permitted provided that the following conditions are notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -2433,7 +2433,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_Q 104 #define CGEMM_DEFAULT_R 1012 - + #define ZGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_Q 104 #define ZGEMM_DEFAULT_R 1012 @@ -2518,7 +2518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SNUMOPT 16 #define DNUMOPT 8 -#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL @@ -2572,7 +2572,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SNUMOPT 16 #define DNUMOPT 8 -#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL @@ -2635,7 +2635,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 512 #define CGEMM_DEFAULT_Q 384 -#define ZGEMM_DEFAULT_Q 384 +#define ZGEMM_DEFAULT_Q 384 #define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 4096 @@ -2925,17 +2925,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 -#define ZGEMM_DEFAULT_P 64 +#define ZGEMM_DEFAULT_P zgemm_p /* now names a variable instead of a constant; zgemm_p is assigned in blas_set_parameter() (driver/others/parameter.c, earlier in this patch) */ #define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 -#define ZGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q zgemm_q /* variable, see ZGEMM_DEFAULT_P */ #define SGEMM_DEFAULT_R 12288 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R zgemm_r /* variable, see ZGEMM_DEFAULT_P */ #define SYMV_P 16 #endif @@ -3391,7 +3391,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*FIXME: this should be using the cache size, but there is currently no easy way to query that on ARM. So if getarch counted more than 8 cores we simply assume the host -is a big desktop or server with abundant cache rather than a phone or embedded device */ +is a big desktop or server with abundant cache rather than a phone or embedded device */ #if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)|| defined(CORTEXX1) || defined(VORTEXM4) #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 256 @@ -3772,7 +3772,7 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout /* When all BLAS3 routines are implemeted with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl". Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. 
*/ -#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_MN 32 From d1df5928aed8cb8a4388ef93ec7ded8e745a25fd Mon Sep 17 00:00:00 2001 From: pengxu Date: Mon, 8 Jun 2026 14:25:26 +0800 Subject: [PATCH 2/3] optimize ic/zamin lsx kernel --- kernel/loongarch64/icamin_lsx.S | 61 ++++++++++----------------------- 1 file changed, 19 insertions(+), 42 deletions(-) diff --git a/kernel/loongarch64/icamin_lsx.S b/kernel/loongarch64/icamin_lsx.S index 982a41fe25..d47c30e1d4 100644 --- a/kernel/loongarch64/icamin_lsx.S +++ b/kernel/loongarch64/icamin_lsx.S @@ -58,6 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VI3 $vr8 #define VI4 $vr19 #define VT0 $vr23 +#define VMASK $vr7 /* sign-bit-clear mask; ANDed with x1/x2 to take their absolute values */ PROLOGUE li.d i0, 0 @@ -76,6 +77,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. li.d I, -1 vreplgr2vr.d VI4, I vffint.d.l VI4, VI4 // -1 + li.d I, 0x7fffffffffffffff // Mask for clearing the sign bit + vreplgr2vr.d VMASK, I // copy the 64-bit mask from I into VMASK bne INCX, TEMP, .L20 // incx != 1 // Init Index @@ -99,17 +102,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vld VX1, X, 2 * SIZE vpickev.d x1, VX1, VX0 vpickod.d x2, VX1, VX0 - vfmul.d x3, VI4, x1 - vfmul.d x4, VI4, x2 - vfcmp.clt.d VT0, x1, VI3 - vfcmp.clt.d VINC8, x2, VI3 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VINC8 + vand.v x1, x1, VMASK // |x1|: one AND replaces the removed vfmul-by--1 / vfcmp.clt / vbitsel sequence + vand.v x2, x2, VMASK // |x2| vfadd.d VM0, x1, x2 #else li.w I, -1 vreplgr2vr.w VI4, I vffint.s.w VI4, VI4 // -1 + li.d I, 0x7fffffff // Mask for clearing the sign bit + vreplgr2vr.w VMASK, I // copy the 32-bit mask from I into VMASK bne INCX, TEMP, .L20 // incx != 1 // Init Index @@ -141,12 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vld VX1, X, 4 * SIZE vpickev.w x1, VX1, VX0 vpickod.w x2, VX1, VX0 - vfmul.s x3, VI4, x1 - vfmul.s x4, VI4, x2 - vfcmp.clt.s VT0, x1, VI3 - vfcmp.clt.s VINC8, x2, VI3 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VINC8 + vand.v x1, x1, VMASK + vand.v x2, x2, VMASK vfadd.s VM0, x1, x2 #endif .align 3 @@ -159,12 +156,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d I, I, -1 vpickev.d x1, VX1, VX0 vpickod.d x2, VX1, VX0 - vfmul.d x3, VI4, x1 - vfmul.d x4, VI4, x2 - vfcmp.clt.d VT0, x1, VI3 - vfcmp.clt.d VINC8, x2, VI3 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VINC8 + vand.v x1, x1, VMASK + vand.v x2, x2, VMASK vfadd.d x1, x1, x2 vfmin.d x3, VM0, x1 vfcmp.ceq.d VT0, x3, VM0 @@ -183,12 +176,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vpickev.w x1, VX1, VX0 vpickod.w x2, VX1, VX0 #endif - VFMUL x3, VI4, x1 - VFMUL x4, VI4, x2 - VCMPLT VT0, x1, VI3 - VCMPLT VINC8, x2, VI3 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VINC8 + vand.v x1, x1, VMASK + vand.v x2, x2, VMASK VFADD x1, x1, x2 VFMIN x3, VM0, x1 VCMPEQ VT0, x3, VM0 @@ -264,12 +253,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vinsgr2vr.d x2, t2, 0 vinsgr2vr.d x1, t3, 1 vinsgr2vr.d x2, t4, 1 - vfmul.d x3, VI4, x1 - vfmul.d x4, VI4, x2 - vfcmp.clt.d VT0, x1, VI3 - vfcmp.clt.d VINC8, x2, VI3 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VINC8 + vand.v x1, x1, VMASK + vand.v x2, x2, VMASK vfadd.d VM0, x1, x2 #else addi.w i0, i0, 1 @@ -339,12 +324,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vinsgr2vr.d x1, t3, 1 vinsgr2vr.d x2, t4, 1 vadd.d VI1, VI1, VINC4 - vfmul.d x3, VI4, x1 - vfmul.d x4, VI4, x2 - vfcmp.clt.d VT0, x1, VI3 - vfcmp.clt.d VINC8, x2, VI3 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VINC8 + vand.v x1, x1, VMASK + vand.v x2, x2, VMASK vfadd.d x1, x1, x2 vfmin.d x3, VM0, x1 ld.d t1, X, 0 * SIZE @@ -385,12 +366,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vinsgr2vr.w x2, t4, 3 #endif addi.d I, I, -1 - VFMUL x3, VI4, x1 - VFMUL x4, VI4, x2 - VCMPLT VT0, x1, VI3 - VCMPLT VINC8, x2, VI3 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VINC8 + vand.v x1, x1, VMASK + vand.v x2, x2, VMASK VFADD x1, x1, x2 VFMIN x3, VM0, x1 VCMPEQ VT0, x3, VM0 From 4850f86e3d9b8e578a6c715b1083410e94fa06cf Mon Sep 17 00:00:00 2001 From: pengxu Date: Mon, 8 Jun 2026 14:26:03 +0800 Subject: [PATCH 3/3] optimize sdot lsx kernel --- kernel/loongarch64/KERNEL.LA264 | 2 +- kernel/loongarch64/sdot_lsx.S | 121 ++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 kernel/loongarch64/sdot_lsx.S diff --git a/kernel/loongarch64/KERNEL.LA264 b/kernel/loongarch64/KERNEL.LA264 index 068b3cf4c4..18f18af738 100644 --- a/kernel/loongarch64/KERNEL.LA264 +++ b/kernel/loongarch64/KERNEL.LA264 @@ -1,6 +1,6 @@ ifndef NO_LSX -SDOTKERNEL = dot_lsx.S +SDOTKERNEL = sdot_lsx.S DSDOTKERNEL = dot_lsx.S DDOTKERNEL = dot_lsx.S CDOTKERNEL = cdot_lsx.S diff --git a/kernel/loongarch64/sdot_lsx.S b/kernel/loongarch64/sdot_lsx.S new file mode 100644 index 0000000000..9adf1006bb --- /dev/null +++ b/kernel/loongarch64/sdot_lsx.S @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 + +#define I $r17 +#define TEMP $r18 + +/* Don't change following FR unless you know the effects. 
*/ +#define s1 $f8 +#define s2 $f9 +#define a1 $f10 +#define b1 $f11 + + PROLOGUE + + vxor.v $vr8, $vr8, $vr8 /* zero vector accumulator 0 */ + vxor.v $vr9, $vr9, $vr9 /* zero vector accumulator 1 */ + slli.d INCX, INCX, BASE_SHIFT /* scale inc_x by 2^BASE_SHIFT (presumably element stride -> byte stride) */ + li.d TEMP, SIZE /* the scaled strides are compared against SIZE below */ + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 /* n <= 0: nothing to accumulate */ + bne INCX, TEMP, .L20 /* inc_x != 1: take the strided path */ + bne INCY, TEMP, .L20 /* inc_y != 1: take the strided path */ + + /* ((inc_x == 1) && (inc_y == 1)) */ + srai.d I, N, 4 /* I = n >> 4: number of 16-element passes */ + bge $r0, I, .L12 /* FLOAT: <16 */ +.L11: + /* FLOAT: 16~ */ + vld $vr0, X, 0 /* 4 x 16 bytes from X into $vr0..$vr3 */ + vld $vr1, X, 16 + vld $vr2, X, 32 + vld $vr3, X, 48 + vld $vr4, Y, 0 /* 4 x 16 bytes from Y into $vr4..$vr7 */ + vld $vr5, Y, 16 + vld $vr6, Y, 32 + vld $vr7, Y, 48 + addi.d I, I, -1 + addi.d X, X, 64 /* advance past the 64 bytes just loaded */ + addi.d Y, Y, 64 + + vfmadd.s $vr8, $vr0, $vr4, $vr8 /* products alternate between the two accumulators $vr8 and $vr9 */ + vfmadd.s $vr9, $vr1, $vr5, $vr9 + vfmadd.s $vr8, $vr2, $vr6, $vr8 + vfmadd.s $vr9, $vr3, $vr7, $vr9 + + bnez I, .L11 + + vfadd.s $vr8, $vr8, $vr9 /* combine the two vector accumulators */ + vextrins.w $vr1, $vr8, 0x01 /* horizontal sum: elements 1..3 of $vr8 are extracted (presumably into lane 0 of $vr1..$vr3) and added into $f8 below */ + vextrins.w $vr2, $vr8, 0x02 + vextrins.w $vr3, $vr8, 0x03 + fadd.s $f8, $f8, $f1 + fadd.s $f8, $f8, $f2 + fadd.s $f8, $f8, $f3 +.L12: + andi I, N, 0xf /* leftover n % 16 elements */ + bge $r0, I, .L999 +.L13: + addi.d I, I, -1 /* scalar tail: one multiply-add into $f8 per element */ + fld.s $f0, X, 0 + fld.s $f4, Y, 0 + addi.d X, X, 4 + addi.d Y, Y, 4 + fmadd.s $f8, $f0, $f4, $f8 + bnez I, .L13 + b .L999 + + /* !((inc_x == 1) && (inc_y == 1)) */ +.L20: + move I, N /* strided path: all n elements are handled one at a time */ +.L21: + addi.d I, I, -1 + + fld.s $f0, X, 0 + fld.s $f4, Y, 0 + add.d X, X, INCX /* step by the scaled inc_x */ + add.d Y, Y, INCY /* step by the scaled inc_y */ + fmadd.s $f8, $f0, $f4, $f8 + + bnez I, .L21 + b .L999 + +.L999: + fmov.s $f0, $f8 /* copy the accumulated sum into $f0 (presumably the return register) */ + jirl $r0, $r1, 0x0 + + EPILOGUE