From 1415e946fa4e5b15aafc3f19f14a5381b5f119e8 Mon Sep 17 00:00:00 2001 From: Michael Yeh <111819036+myeh01@users.noreply.github.com> Date: Thu, 9 May 2024 13:07:24 -0700 Subject: [PATCH 1/8] Use intrinsics for all sifive_x280 level 1, 1f, 1m kernels --- .../sifive_x280/bli_cntx_init_sifive_x280.c | 80 +- config/sifive_x280/make_defs.mk | 2 +- .../sifive_x280/1/bli_amaxv_sifive_x280_asm.c | 293 -- .../bli_amaxv_sifive_x280_intr.c | 179 + .../bli_amaxv_sifive_x280_intr_complex.c | 105 + .../bli_amaxv_sifive_x280_intr_real.c | 100 + .../bli_axpbyv_sifive_x280_intr.c | 4 +- .../sifive_x280/1/bli_copyv_sifive_x280_asm.c | 272 -- .../bli_copyv_sifive_x280_intr.c | 116 + .../bli_copyv_sifive_x280_intr_complex.c | 75 + .../bli_copyv_sifive_x280_intr_real.c | 68 + .../1/bli_invertv_sifive_x280_asm.c | 221 -- .../bli_invertv_sifive_x280_intr.c | 118 + .../bli_invertv_sifive_x280_intr_complex.c | 83 + .../bli_invertv_sifive_x280_intr_real.c | 68 + .../1/bli_invscalv_sifive_x280_asm.c | 266 -- .../bli_invscalv_sifive_x280_intr.c | 117 + .../bli_invscalv_sifive_x280_intr_complex.c | 83 + .../bli_invscalv_sifive_x280_intr_real.c | 75 + .../bli_scal2v_sifive_x280_intr.c | 4 +- .../bli_scalv_sifive_x280_intr.c | 2 +- .../sifive_x280/1/bli_setv_sifive_x280_asm.c | 204 -- .../bli_setv_sifive_x280_intr.c | 116 + .../bli_setv_sifive_x280_intr_complex.c | 71 + .../bli_setv_sifive_x280_intr_real.c | 64 + .../sifive_x280/1/bli_swapv_sifive_x280_asm.c | 245 -- .../bli_swapv_sifive_x280_intr.c | 115 + .../bli_swapv_sifive_x280_intr_complex.c | 76 + .../bli_swapv_sifive_x280_intr_real.c | 76 + .../bli_xpbyv_sifive_x280_intr.c | 2 +- .../1f/bli_axpyf_sifive_x280_asm.c | 430 --- .../bli_axpyf_sifive_x280_intr.c | 121 + .../bli_axpyf_sifive_x280_intr_complex.c | 149 + .../bli_axpyf_sifive_x280_intr_real.c | 96 + .../1f/bli_dotxaxpyf_sifive_x280_asm.c | 3120 ----------------- .../bli_dotxaxpyf_sifive_x280_intr.c | 137 + .../bli_dotxaxpyf_sifive_x280_intr_complex.c | 427 +++ .../bli_dotxaxpyf_sifive_x280_intr_real.c | 283 ++ .../1f/bli_dotxf_sifive_x280_asm.c | 2645 -------------- .../bli_dotxf_sifive_x280_intr.c | 132 + .../bli_dotxf_sifive_x280_intr_complex.c | 324 ++ .../bli_dotxf_sifive_x280_intr_real.c | 262 ++ .../1m/bli_packm_sifive_x280_asm.c | 1465 -------- .../bli_packm_sifive_x280_intr.c | 168 + .../bli_packm_sifive_x280_intr_complex.c | 545 +++ .../bli_packm_sifive_x280_intr_real.c | 364 ++ kernels/sifive_x280/bli_kernels_sifive_x280.h | 80 +- kernels/sifive_x280/riscv_cmul_macros_intr.h | 129 + .../sifive_x280/riscv_overloaded_intrinsics.h | 86 +- 49 files changed, 5012 insertions(+), 9251 deletions(-) delete mode 100644 kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c 
create mode 100644 kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c create mode 100644 kernels/sifive_x280/riscv_cmul_macros_intr.h diff --git a/config/sifive_x280/bli_cntx_init_sifive_x280.c b/config/sifive_x280/bli_cntx_init_sifive_x280.c index 56a1a66d53..0f5a39d104 100644 --- a/config/sifive_x280/bli_cntx_init_sifive_x280.c +++ b/config/sifive_x280/bli_cntx_init_sifive_x280.c @@ -54,10 +54,10 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx ) BLIS_ADDV_KER, BLIS_SCOMPLEX, bli_caddv_sifive_x280_intr, BLIS_ADDV_KER, BLIS_DCOMPLEX, bli_zaddv_sifive_x280_intr, - BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_sifive_x280_asm, - BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_sifive_x280_asm, - BLIS_AMAXV_KER, BLIS_SCOMPLEX, 
bli_camaxv_sifive_x280_asm, - BLIS_AMAXV_KER, BLIS_DCOMPLEX, bli_zamaxv_sifive_x280_asm, + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_sifive_x280_intr, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_sifive_x280_intr, + BLIS_AMAXV_KER, BLIS_SCOMPLEX, bli_camaxv_sifive_x280_intr, + BLIS_AMAXV_KER, BLIS_DCOMPLEX, bli_zamaxv_sifive_x280_intr, BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_sifive_x280_intr, BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_sifive_x280_intr, @@ -69,10 +69,10 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx ) BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_sifive_x280_intr, BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_sifive_x280_intr, - BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_sifive_x280_asm, - BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_sifive_x280_asm, - BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_sifive_x280_asm, - BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_sifive_x280_asm, + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_sifive_x280_intr, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_sifive_x280_intr, + BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_sifive_x280_intr, + BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_sifive_x280_intr, BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_sifive_x280_intr, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_sifive_x280_intr, @@ -84,15 +84,15 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx ) BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_sifive_x280_intr, BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_sifive_x280_intr, - BLIS_INVERTV_KER, BLIS_FLOAT, bli_sinvertv_sifive_x280_asm, - BLIS_INVERTV_KER, BLIS_DOUBLE, bli_dinvertv_sifive_x280_asm, - BLIS_INVERTV_KER, BLIS_SCOMPLEX, bli_cinvertv_sifive_x280_asm, - BLIS_INVERTV_KER, BLIS_DCOMPLEX, bli_zinvertv_sifive_x280_asm, + BLIS_INVERTV_KER, BLIS_FLOAT, bli_sinvertv_sifive_x280_intr, + BLIS_INVERTV_KER, BLIS_DOUBLE, bli_dinvertv_sifive_x280_intr, + BLIS_INVERTV_KER, BLIS_SCOMPLEX, bli_cinvertv_sifive_x280_intr, + BLIS_INVERTV_KER, BLIS_DCOMPLEX, bli_zinvertv_sifive_x280_intr, - BLIS_INVSCALV_KER, BLIS_FLOAT, bli_sinvscalv_sifive_x280_asm, - BLIS_INVSCALV_KER, BLIS_DOUBLE, bli_dinvscalv_sifive_x280_asm, - BLIS_INVSCALV_KER, BLIS_SCOMPLEX, bli_cinvscalv_sifive_x280_asm, - BLIS_INVSCALV_KER, BLIS_DCOMPLEX, bli_zinvscalv_sifive_x280_asm, + BLIS_INVSCALV_KER, BLIS_FLOAT, bli_sinvscalv_sifive_x280_intr, + BLIS_INVSCALV_KER, BLIS_DOUBLE, bli_dinvscalv_sifive_x280_intr, + BLIS_INVSCALV_KER, BLIS_SCOMPLEX, bli_cinvscalv_sifive_x280_intr, + BLIS_INVSCALV_KER, BLIS_DCOMPLEX, bli_zinvscalv_sifive_x280_intr, BLIS_SCAL2V_KER, BLIS_FLOAT, bli_sscal2v_sifive_x280_intr, BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_sifive_x280_intr, @@ -104,20 +104,20 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_sifive_x280_intr, BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_sifive_x280_intr, - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_sifive_x280_asm, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_sifive_x280_asm, - BLIS_SETV_KER, BLIS_SCOMPLEX, bli_csetv_sifive_x280_asm, - BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_sifive_x280_asm, + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_sifive_x280_intr, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_sifive_x280_intr, + BLIS_SETV_KER, BLIS_SCOMPLEX, bli_csetv_sifive_x280_intr, + BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_sifive_x280_intr, BLIS_SUBV_KER, BLIS_FLOAT, bli_ssubv_sifive_x280_intr, BLIS_SUBV_KER, BLIS_DOUBLE, bli_dsubv_sifive_x280_intr, BLIS_SUBV_KER, BLIS_SCOMPLEX, bli_csubv_sifive_x280_intr, BLIS_SUBV_KER, BLIS_DCOMPLEX, bli_zsubv_sifive_x280_intr, - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_sifive_x280_asm, - BLIS_SWAPV_KER, BLIS_DOUBLE, 
bli_dswapv_sifive_x280_asm, - BLIS_SWAPV_KER, BLIS_SCOMPLEX, bli_cswapv_sifive_x280_asm, - BLIS_SWAPV_KER, BLIS_DCOMPLEX, bli_zswapv_sifive_x280_asm, + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_sifive_x280_intr, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_sifive_x280_intr, + BLIS_SWAPV_KER, BLIS_SCOMPLEX, bli_cswapv_sifive_x280_intr, + BLIS_SWAPV_KER, BLIS_DCOMPLEX, bli_zswapv_sifive_x280_intr, BLIS_XPBYV_KER, BLIS_FLOAT, bli_sxpbyv_sifive_x280_intr, BLIS_XPBYV_KER, BLIS_DOUBLE, bli_dxpbyv_sifive_x280_intr, @@ -130,31 +130,31 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx ) BLIS_AXPY2V_KER, BLIS_SCOMPLEX, bli_caxpy2v_sifive_x280_intr, BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_sifive_x280_intr, - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_sifive_x280_asm, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_sifive_x280_asm, - BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_sifive_x280_asm, - BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_sifive_x280_asm, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_sifive_x280_intr, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_sifive_x280_intr, + BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_sifive_x280_intr, + BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_sifive_x280_intr, - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_sifive_x280_asm, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_sifive_x280_asm, - BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_sifive_x280_asm, - BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_sifive_x280_asm, + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_sifive_x280_intr, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_sifive_x280_intr, + BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_sifive_x280_intr, + BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_sifive_x280_intr, BLIS_DOTAXPYV_KER, BLIS_FLOAT, bli_sdotaxpyv_sifive_x280_intr, BLIS_DOTAXPYV_KER, BLIS_DOUBLE, bli_ddotaxpyv_sifive_x280_intr, BLIS_DOTAXPYV_KER, BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_x280_intr, BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_x280_intr, - BLIS_DOTXAXPYF_KER, BLIS_FLOAT, bli_sdotxaxpyf_sifive_x280_asm, - BLIS_DOTXAXPYF_KER, BLIS_DOUBLE, bli_ddotxaxpyf_sifive_x280_asm, - BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_x280_asm, - BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_x280_asm, + BLIS_DOTXAXPYF_KER, BLIS_FLOAT, bli_sdotxaxpyf_sifive_x280_intr, + BLIS_DOTXAXPYF_KER, BLIS_DOUBLE, bli_ddotxaxpyf_sifive_x280_intr, + BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_x280_intr, + BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_x280_intr, // Level 1m - BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_sifive_x280_asm_7m4, - BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_sifive_x280_asm_7m4, - BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_asm_6m2, - BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_asm_6m2, + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_sifive_x280_intr, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_sifive_x280_intr, + BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_intr, + BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_intr, // Level 3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sifive_x280_asm_7m4, diff --git a/config/sifive_x280/make_defs.mk b/config/sifive_x280/make_defs.mk index acdf5a3611..31b31e387a 100644 --- a/config/sifive_x280/make_defs.mk +++ b/config/sifive_x280/make_defs.mk @@ -61,7 +61,7 @@ endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else -COPTFLAGS := -Ofast +COPTFLAGS := -O3 endif # Flags specific to optimized kernels. 
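
Before the per-kernel diffs: every kernel converted in this patch follows the same vector-length-agnostic stripmining structure once the inline assembly is replaced by intrinsics — request a vector length with vsetvl, load with a unit-stride or strided intrinsic, do the element-wise work, store, then advance the pointers by vl elements. The sketch below is illustrative only and is not part of the patch: it writes the pattern directly against the raw RVV intrinsics instead of the RVV_TYPE_F/VLE_V_F-style overloading macros the patch uses from riscv_overloaded_intrinsics.h, the helper name copyv_f32_sketch and the fixed e32/m8 configuration are assumptions made for the example, and the __riscv_-prefixed spellings assume a toolchain implementing the v0.12 RVV C intrinsics API (older toolchains spell these without the prefix).

    #include <riscv_vector.h>
    #include <stddef.h>

    // Hypothetical helper, not part of the patch: copy n floats from x
    // (stride incx, in elements) to y (stride incy) with a stripmining loop.
    static void copyv_f32_sketch(size_t n, const float *restrict x, ptrdiff_t incx,
                                 float *restrict y, ptrdiff_t incy)
    {
        size_t avl = n;                                // elements still to process
        while (avl) {
            size_t vl = __riscv_vsetvl_e32m8(avl);     // elements handled this pass
            vfloat32m8_t xvec;
            if (incx == 1)
                xvec = __riscv_vle32_v_f32m8(x, vl);                         // unit stride
            else
                xvec = __riscv_vlse32_v_f32m8(x, sizeof(float) * incx, vl);  // byte stride
            if (incy == 1)
                __riscv_vse32_v_f32m8(y, xvec, vl);
            else
                __riscv_vsse32_v_f32m8(y, sizeof(float) * incy, xvec, vl);
            x += vl * incx;
            y += vl * incy;
            avl -= vl;
        }
    }

The actual kernels in the diffs that follow differ from this sketch mainly in the element-wise operation performed between the load and the store, in the LMUL and element width selected per datatype, and in the segment loads/stores used for the complex variants.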
diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c deleted file mode 100644 index c423dd131d..0000000000 --- a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c +++ /dev/null @@ -1,293 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" -#include -#include -#include -#include - -void bli_samaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx, - dim_t *index, const cntx_t *cntx) { - // assumes 64-bit index - (void)cntx; - const float* restrict x = x_; - - if (n <= 1) { - *index = 0; - return; - } - incx *= 4; - size_t avl = n; - size_t offset = 0; - bool first = true; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e32, m4, tu, ma" - : "=r"(vl) - : "r"(avl)); - if (incx == 4) - __asm__("vle32.v v24, (%0)" : : "r"(x)); - else - __asm__("vlse32.v v24, (%0), %1" : : "r"(x), "r"(incx)); - // check for NaN - __asm__ volatile("vmfne.vv v0, v24, v24"); - dim_t nan_index; - __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index)); - if (nan_index != -1) { - *index = nan_index + offset; - return; - } - if (first) { - __asm__("vfabs.v v8, v24"); - // keep vl same, change SEW and LMUL - __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma"); - __asm__("vid.v v16"); - first = false; - } else { - __asm__("vfabs.v v24, v24"); - __asm__("vmflt.vv v0, v8, v24"); - __asm__("vmerge.vvm v8, v8, v24, v0"); - // keep vl same, change SEW and LMUL - __asm__ volatile("vsetvli zero, zero, e64, m8, tu, ma"); - __asm__("vid.v v24"); - __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); - __asm__("vmerge.vvm v16, v16, v24, v0"); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - offset += vl; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r"(n)); - __asm__("vmv.s.x v0, zero"); - __asm__("vfredmax.vs v0, v8, v0"); - __asm__("vrgather.vi v24, v0, 0"); - __asm__("vmfeq.vv v0, v8, v24"); - __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma"); - uint64_t imax = -1; - __asm__("vmv.s.x v24, %0" : : "r"(imax)); - __asm__("vredminu.vs v24, v16, v24, v0.t"); - __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma"); - __asm__("vse64.v v24, (%0)" : : "r"(index)); - return; -} - -void bli_damaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx, - dim_t *index, const cntx_t *cntx) { - // assumes 64-bit index - (void)cntx; - const double* restrict x = x_; - - if (n <= 1) { - *index = 0; - return; - } - incx *= 8; - size_t avl = n; - size_t offset = 0; - bool first = true; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e64, m8, tu, ma" - : "=r"(vl) - : "r"(avl)); - if (incx == 8) - __asm__("vle64.v v24, (%0)" : : "r"(x)); - else - __asm__("vlse64.v v24, (%0), %1" : : "r"(x), "r"(incx)); - // check for NaN - __asm__ volatile("vmfne.vv v0, v24, v24"); - dim_t nan_index; - __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index)); - if (nan_index != -1) { - *index = nan_index + offset; - return; - } - if (first) { - __asm__("vfabs.v v8, v24"); - __asm__("vid.v v16"); - first = false; - } else { - __asm__("vfabs.v v24, v24"); - __asm__("vmflt.vv v0, v8, v24"); - __asm__("vmerge.vvm v8, v8, v24, v0"); - __asm__("vid.v v24"); - __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); - __asm__("vmerge.vvm v16, v16, v24, v0"); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - offset += vl; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e64, m8, ta, ma" : : "r"(n)); - __asm__("vmv.s.x v0, zero"); - __asm__("vfredmax.vs v0, v8, v0"); - __asm__("vrgather.vi v24, v0, 0"); - __asm__("vmfeq.vv v0, v8, v24"); - uint64_t imax = -1; - __asm__("vmv.s.x v24, %0" : : "r"(imax)); - __asm__("vredminu.vs v24, v16, v24, v0.t"); - __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma"); - __asm__("vse64.v v24, (%0)" : : "r"(index)); - return; -} - -void 
bli_camaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx, - dim_t *index, const cntx_t *cntx) { - // assumes 64-bit index - (void)cntx; - const scomplex* restrict x = x_; - - if (n <= 1) { - *index = 0; - return; - } - incx *= 8; - size_t avl = n; - size_t offset = 0; - bool first = true; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e32, m4, tu, ma" - : "=r"(vl) - : "r"(avl)); - if (incx == 8) - __asm__("vlseg2e32.v v24, (%0)" : : "r"(x)); - else - __asm__("vlsseg2e32.v v24, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfabs.v v24, v24"); - __asm__("vfabs.v v28, v28"); - __asm__("vfadd.vv v24, v24, v28"); - // check for NaN - __asm__ volatile("vmfne.vv v0, v24, v24"); - dim_t nan_index; - __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index)); - if (nan_index != -1) { - *index = nan_index + offset; - return; - } - if (first) { - __asm__("vmv4r.v v8, v24"); - // keep vl same, change SEW and LMUL - __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma"); - __asm__("vid.v v16"); - first = false; - } else { - __asm__("vmflt.vv v0, v8, v24"); - __asm__("vmerge.vvm v8, v8, v24, v0"); - // keep vl same, change SEW and LMUL - __asm__ volatile("vsetvli zero, zero, e64, m8, tu, ma"); - __asm__("vid.v v24"); - __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); - __asm__("vmerge.vvm v16, v16, v24, v0"); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - offset += vl; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r"(n)); - __asm__("vmv.s.x v0, zero"); - __asm__("vfredmax.vs v0, v8, v0"); - __asm__("vrgather.vi v24, v0, 0"); - __asm__("vmfeq.vv v0, v8, v24"); - __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma"); - uint64_t imax = -1; - __asm__("vmv.s.x v24, %0" : : "r"(imax)); - __asm__("vredminu.vs v24, v16, v24, v0.t"); - __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma"); - __asm__("vse64.v v24, (%0)" : : "r"(index)); - return; -} - -void bli_zamaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx, - dim_t *index, const cntx_t *cntx) { - // assumes 64-bit index - (void)cntx; - const dcomplex* restrict x = x_; - - if (n <= 1) { - *index = 0; - return; - } - incx *= 16; - size_t avl = n; - size_t offset = 0; - bool first = true; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e64, m4, tu, ma" - : "=r"(vl) - : "r"(avl)); - if (incx == 16) - __asm__("vlseg2e64.v v24, (%0)" : : "r"(x)); - else - __asm__("vlsseg2e64.v v24, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfabs.v v24, v24"); - __asm__("vfabs.v v28, v28"); - __asm__("vfadd.vv v24, v24, v28"); - // check for NaN - __asm__ volatile("vmfne.vv v0, v24, v24"); - dim_t nan_index; - __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index)); - if (nan_index != -1) { - *index = nan_index + offset; - return; - } - if (first) { - __asm__("vmv4r.v v8, v24"); - __asm__("vid.v v16"); - first = false; - } else { - __asm__("vmflt.vv v0, v8, v24"); - __asm__("vmerge.vvm v8, v8, v24, v0"); - __asm__("vid.v v24"); - __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); - __asm__("vmerge.vvm v16, v16, v24, v0"); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - offset += vl; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e64, m4, ta, ma" : : "r"(n)); - __asm__("vmv.s.x v0, zero"); - __asm__("vfredmax.vs v0, v8, v0"); - __asm__("vrgather.vi v24, v0, 0"); - __asm__("vmfeq.vv v0, v8, v24"); - uint64_t imax = -1; - __asm__("vmv.s.x v24, %0" : : "r"(imax)); - __asm__("vredminu.vs v24, v16, v24, v0.t"); - __asm__ volatile("vsetivli zero, 1, e64, m1, 
ta, ma"); - __asm__("vse64.v v24, (%0)" : : "r"(index)); - return; -} diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c new file mode 100644 index 0000000000..4f7d546304 --- /dev/null +++ b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c @@ -0,0 +1,179 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include +#include +#include + +#define AMAXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##amaxv_sifive_x280_intr(\ + dim_t n, \ + const T* restrict x_, inc_t incx, \ + dim_t* index, \ + const cntx_t* cntx \ +) + +#define AMAXV(...) AMAXV_(__VA_ARGS__) + +// BLIS defines integers to be 32 or 64 bits according to BLIS_INT_TYPE_SIZE. +// If BLIS_INT_TYPE_SIZE is any other value, integers are defined to be longs. +#if BLIS_INT_TYPE_SIZE == 32 || BLIS_INT_TYPE_SIZE == 64 +#define AMAXV_SIFIVE_X280_INT_SIZE BLIS_INT_TYPE_SIZE +#elif LONG_MAX == INT32_MAX +#define AMAXV_SIFIVE_X280_INT_SIZE 32 +#elif LONG_MAX == INT64_MAX +#define AMAXV_SIFIVE_X280_INT_SIZE 64 +#else +#error "Integers must be 32- or 64-bits for bli_?amaxv_sifive_x280_intr." 
+#endif + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC_X 32 +#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE +#if PREC_I == 32 +#define LMUL_X m4 +#define LMUL_I m4 +#define RATIO 8 +#elif PREC_I == 64 +#define LMUL_X m4 +#define LMUL_I m8 +#define RATIO 8 +#endif +#define FLT_SIZE sizeof(float) + +#include "./bli_amaxv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC_X +#undef PREC_I +#undef LMUL_X +#undef LMUL_I +#undef RATIO +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC_X 64 +#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE +#if PREC_I == 32 +#define LMUL_X m8 +#define LMUL_I m4 +#define RATIO 8 +#elif PREC_I == 64 +#define LMUL_X m4 +#define LMUL_I m4 +#define RATIO 16 +#endif +#define FLT_SIZE sizeof(double) + +#include "./bli_amaxv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC_X +#undef PREC_I +#undef LMUL_X +#undef LMUL_I +#undef RATIO +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC_X 32 +#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE +#if PREC_I == 32 +#define LMUL_X m4 +#define LMUL_I m4 +#define RATIO 8 +#elif PREC_I == 64 +#define LMUL_X m4 +#define LMUL_I m8 +#define RATIO 8 +#endif +#define FLT_SIZE sizeof(float) + +#include "./bli_amaxv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC_X +#undef PREC_I +#undef LMUL_X +#undef LMUL_I +#undef RATIO +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC_X 64 +#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE +#if PREC_I == 32 +#define LMUL_X m8 +#define LMUL_I m4 +#define RATIO 8 +#elif PREC_I == 64 +#define LMUL_X m4 +#define LMUL_I m4 +#define RATIO 16 +#endif +#define FLT_SIZE sizeof(double) + +#include "./bli_amaxv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC_X +#undef PREC_I +#undef LMUL_X +#undef LMUL_I +#undef RATIO +#undef FLT_SIZE + +#undef AMAXV_SIFIVE_X280_INT_SIZE + +#undef AMAXV +#undef AMAXV_ diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..f1f3a749e7 --- /dev/null +++ b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef AMAXV + +AMAXV(PRECISION_CHAR, void) +{ + (void)cntx; + const DATATYPE* restrict x = x_; + + if (n <= 1) { + *index = 0; + return; + } + + RVV_TYPE_F(PREC_X, LMUL_X) xacc; + // Indices will be unsigned and of the same width as dim_t. + RVV_TYPE_U(PREC_I, LMUL_I) iacc; + RVV_TYPE_U(PREC_I, LMUL_I) vid_vec = VID_V(PREC_I, LMUL_I)(n); + bool first = true; + guint_t offset = 0; + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC_X, LMUL_X)(avl); + RVV_TYPE_FX(PREC_X, LMUL_X, 2) xvec; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC_X, LMUL_X, 2)((BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC_X, LMUL_X, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, vl); + + RVV_TYPE_F(PREC_X, LMUL_X) xvec_real = VGET_V_F(PREC_X, LMUL_X, 2)(xvec, 0); + RVV_TYPE_F(PREC_X, LMUL_X) xvec_imag = VGET_V_F(PREC_X, LMUL_X, 2)(xvec, 1); + RVV_TYPE_F(PREC_X, LMUL_X) xvec_real_abs = VFABS_V(PREC_X, LMUL_X)(xvec_real, vl); + RVV_TYPE_F(PREC_X, LMUL_X) xvec_imag_abs = VFABS_V(PREC_X, LMUL_X)(xvec_imag, vl); + RVV_TYPE_F(PREC_X, LMUL_X) xvec_abs = VFADD_VV(PREC_X, LMUL_X)(xvec_real_abs, xvec_imag_abs, vl); + + RVV_TYPE_B(RATIO) is_nan = VMFNE_VV(PREC_X, LMUL_X, RATIO)(xvec_abs, xvec_abs, vl); + int nan_index = VFIRST_M(RATIO)(is_nan, vl); + if (nan_index != -1) { + *index = (guint_t) nan_index + offset; + return; + } + + if (first) { + xacc = xvec_abs; + iacc = vid_vec; + first = false; + } + else { + RVV_TYPE_B(RATIO) mask = VMFGT_VV(PREC_X, LMUL_X, RATIO)(xvec_abs, xacc, vl); + xacc = VFMAX_VV_TU(PREC_X, LMUL_X)(xacc, xvec_abs, xacc, vl); + RVV_TYPE_U(PREC_I, LMUL_I) ivec = VADD_VX_U(PREC_I, LMUL_I)(vid_vec, offset, vl); + iacc = VMERGE_VVM_TU_U(PREC_I, LMUL_I)(iacc, iacc, ivec, mask, vl); + } + + x += vl * incx; + offset += vl; + avl -= vl; + } + + RVV_TYPE_F(PREC_X, m1) xmax = VFMV_S_F(PREC_X, m1)(0., 1); + xmax = VFREDMAX_VS(PREC_X, LMUL_X)(xacc, xmax, n); + RVV_TYPE_F(PREC_X, LMUL_X) xmax_splat = VLMUL_EXT_V_F_M1(PREC_X, LMUL_X)(xmax); + xmax_splat = VRGATHER_VX_F(PREC_X, LMUL_X)(xmax_splat, 0, n); + RVV_TYPE_B(RATIO) mask = VMFEQ_VV(PREC_X, LMUL_X, RATIO)(xacc, xmax_splat, n); + RVV_TYPE_U(PREC_I, m1) imax = VMV_S_X_U(PREC_I, m1)(-1, 1); + imax = VREDMINU_VS_M(PREC_I, LMUL_I)(mask, iacc, imax, n); + *index = VMV_X_S_U(PREC_I)(imax); + return; +} + +#endif // AMAXV diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..bcc4ee99de --- /dev/null +++ b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c @@ -0,0 +1,100 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef AMAXV + +AMAXV(PRECISION_CHAR, void) +{ + (void)cntx; + const DATATYPE* restrict x = x_; + + if (n <= 1) { + *index = 0; + return; + } + + RVV_TYPE_F(PREC_X, LMUL_X) xacc; + // Indices will be unsigned and of the same width as dim_t. 
+ RVV_TYPE_U(PREC_I, LMUL_I) iacc; + RVV_TYPE_U(PREC_I, LMUL_I) vid_vec = VID_V(PREC_I, LMUL_I)(n); + bool first = true; + guint_t offset = 0; + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC_X, LMUL_X)(avl); + RVV_TYPE_F(PREC_X, LMUL_X) xvec; + + if (incx == 1) + xvec = VLE_V_F(PREC_X, LMUL_X)(x, vl); + else + xvec = VLSE_V_F(PREC_X, LMUL_X)(x, FLT_SIZE * incx, vl); + + RVV_TYPE_B(RATIO) is_nan = VMFNE_VV(PREC_X, LMUL_X, RATIO)(xvec, xvec, vl); + int nan_index = VFIRST_M(RATIO)(is_nan, vl); + if (nan_index != -1) { + *index = (guint_t) nan_index + offset; + return; + } + + if (first) { + xacc = VFABS_V(PREC_X, LMUL_X)(xvec, vl); + iacc = vid_vec; + first = false; + } + else { + xvec = VFABS_V(PREC_X, LMUL_X)(xvec, vl); + RVV_TYPE_B(RATIO) mask = VMFGT_VV(PREC_X, LMUL_X, RATIO)(xvec, xacc, vl); + xacc = VFMAX_VV_TU(PREC_X, LMUL_X)(xacc, xvec, xacc, vl); + RVV_TYPE_U(PREC_I, LMUL_I) ivec = VADD_VX_U(PREC_I, LMUL_I)(vid_vec, offset, vl); + iacc = VMERGE_VVM_TU_U(PREC_I, LMUL_I)(iacc, iacc, ivec, mask, vl); + } + + x += vl * incx; + offset += vl; + avl -= vl; + } + + RVV_TYPE_F(PREC_X, m1) xmax = VFMV_S_F(PREC_X, m1)(0., 1); + xmax = VFREDMAX_VS(PREC_X, LMUL_X)(xacc, xmax, n); + RVV_TYPE_F(PREC_X, LMUL_X) xmax_splat = VLMUL_EXT_V_F_M1(PREC_X, LMUL_X)(xmax); + xmax_splat = VRGATHER_VX_F(PREC_X, LMUL_X)(xmax_splat, 0, n); + RVV_TYPE_B(RATIO) mask = VMFEQ_VV(PREC_X, LMUL_X, RATIO)(xacc, xmax_splat, n); + RVV_TYPE_U(PREC_I, m1) imax = VMV_S_X_U(PREC_I, m1)(-1, 1); + imax = VREDMINU_VS_M(PREC_I, LMUL_I)(mask, iacc, imax, n); + *index = VMV_X_S_U(PREC_I)(imax); + return; +} + +#endif // AMAXV diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c index 3b29f898df..389292f90f 100644 --- a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c +++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c @@ -52,9 +52,7 @@ #define AXPBYV(...) AXPBYV_(__VA_ARGS__) -#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm -#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR) -#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) #define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr #define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR) diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c deleted file mode 100644 index 3571877759..0000000000 --- a/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c +++ /dev/null @@ -1,272 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_scopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx, - void * restrict y_, inc_t incy, const cntx_t *cntx) { - (void)conjx; - (void)cntx; - const float* restrict x = x_; - float* restrict y = y_; - if (n <= 0) - return; - - incx *= FLT_SIZE; - incy *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(x)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - - if (incy == FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(y)); - else - __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_dcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx, - void * restrict y_, inc_t incy, const cntx_t *cntx) { - (void)conjx; - (void)cntx; - const double* restrict x = x_; - double* restrict y = y_; - if (n <= 0) - return; - - incx *= FLT_SIZE; - incy *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(x)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - - if (incy == FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(y)); - else - __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " -#define VLSEG2 "vlseg2e32.v " -#define VLSSEG2 "vlsseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " - -void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx, - void * restrict y_, inc_t incy, const cntx_t *cntx) { - (void)cntx; - const scomplex* 
restrict x = x_; - scomplex* restrict y = y_; - if (n <= 0) - return; - - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - if (conjx == BLIS_NO_CONJUGATE) { - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * 2 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(x)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - - if (incy == 2 * FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(y)); - else - __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - } else { - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); - else - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - - __asm__("vfneg.v v4, v4"); - - if (incy == 2 * FLT_SIZE) - __asm__(VSSEG2 "v0, (%0)" : : "r"(y)); - else - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - } - return; -} - -#undef FLT_SIZE -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 - -#define FLT_SIZE 8 -#define SH_ADD "sh3add " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " - -void bli_zcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx, - void * restrict y_, inc_t incy, const cntx_t *cntx) { - (void)cntx; - const dcomplex* restrict x = x_; - dcomplex* restrict y = y_; - if (n <= 0) - return; - - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - if (conjx == BLIS_NO_CONJUGATE && incx == 2 * FLT_SIZE && - incy == 2 * FLT_SIZE) { - size_t avl = 2 * n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - __asm__(VLE "v0, (%0)" : : "r"(x)); - __asm__(VSE "v0, (%0)" : : "r"(y)); - __asm__(SH_ADD "%0, %1, %0" : "+r"(x) : "r"(vl)); - __asm__(SH_ADD "%0, %1, %0" : "+r"(y) : "r"(vl)); - avl -= vl; - } - } else { - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); - else - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - - if (conjx == BLIS_CONJUGATE) - __asm__("vfneg.v v4, v4"); - - if (incy == 2 * FLT_SIZE) - __asm__(VSSEG2 "v0, (%0)" : : "r"(y)); - else - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - } - return; -} diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c new file mode 100644 index 0000000000..e030d85ff3 --- /dev/null +++ b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c @@ -0,0 +1,116 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define COPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##copyv_sifive_x280_intr(\ + conj_t conjx, \ + dim_t n, \ + const T* restrict x_, inc_t incx, \ + T* restrict y_, inc_t incy, \ + const cntx_t* cntx \ +) + +#define COPYV(...) COPYV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_copyv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_copyv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_copyv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_copyv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef COPYV +#undef COPYV_ diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..21e5959679 --- /dev/null +++ b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef COPYV + +COPYV(PRECISION_CHAR, void) +{ + (void)cntx; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, vl); + + if (bli_is_conj(conjx)) { + RVV_TYPE_F(PREC, LMUL) xvec_imag; + xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + xvec_imag = VFNEG_VF(PREC, LMUL)(xvec_imag, vl); + xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 1, xvec_imag); + } + + if (incy == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, xvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2 * FLT_SIZE * incy, xvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } + return; +} + +#endif // COPYV diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..00bb8ed494 --- /dev/null +++ b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c @@ -0,0 +1,68 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef COPYV + +COPYV(PRECISION_CHAR, void) +{ + (void)conjx; + (void)cntx; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + if (incy == 1) + VSE_V_F(PREC, LMUL)(y, xvec, vl); + else + VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, xvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } + return; +} + +#endif // COPYV diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c deleted file mode 100644 index cbca885929..0000000000 --- a/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c +++ /dev/null @@ -1,221 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_sinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, - const cntx_t *cntx) { - (void)cntx; - float* restrict x = x_; - if (n <= 0) - return; - - float one = 1.f; - __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one)); - incx *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(x)); - __asm__("vfrdiv.vf v0, v0, f0"); - __asm__(VSE "v0, (%0)" : : "r"(x)); - } else { - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfrdiv.vf v0, v0, f0"); - __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_dinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, - const cntx_t *cntx) { - (void)cntx; - double* restrict x = x_; - if (n <= 0) - return; - - double one = 1.; - __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one)); - incx *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(x)); - __asm__("vfrdiv.vf v0, v0, f0"); - __asm__(VSE "v0, (%0)" : : "r"(x)); - } else { - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfrdiv.vf v0, v0, f0"); - __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define VLSEG2 "vlseg2e32.v " -#define VLSSEG2 "vlsseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " - -void bli_cinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, - const cntx_t *cntx) { - (void)cntx; - scomplex* restrict x = x_; - if (n <= 0) - return; - - incx *= 2 * FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); - __asm__("vfneg.v v4, v4"); - __asm__("vfmul.vv v8, v0, v0"); - __asm__("vfmacc.vv v8, v4, v4"); - __asm__("vfdiv.vv v0, v0, v8"); - __asm__("vfdiv.vv v4, v4, v8"); - __asm__(VSSEG2 "v0, (%0)" : : "r"(x)); - } else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfneg.v v4, v4"); - __asm__("vfmul.vv v8, v0, v0"); - __asm__("vfmacc.vv v8, v4, v4"); - __asm__("vfdiv.vv v0, v0, v8"); - __asm__("vfdiv.vv v4, v4, v8"); - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 - -#define FLT_SIZE 8 -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " - -void bli_zinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t 
incx, - const cntx_t *cntx) { - (void)cntx; - dcomplex* restrict x = x_; - if (n <= 0) - return; - - incx *= 2 * FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); - __asm__("vfneg.v v4, v4"); - __asm__("vfmul.vv v8, v0, v0"); - __asm__("vfmacc.vv v8, v4, v4"); - __asm__("vfdiv.vv v0, v0, v8"); - __asm__("vfdiv.vv v4, v4, v8"); - __asm__(VSSEG2 "v0, (%0)" : : "r"(x)); - } else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfneg.v v4, v4"); - __asm__("vfmul.vv v8, v0, v0"); - __asm__("vfmacc.vv v8, v4, v4"); - __asm__("vfdiv.vv v0, v0, v8"); - __asm__("vfdiv.vv v4, v4, v8"); - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c new file mode 100644 index 0000000000..fc8f8a76d7 --- /dev/null +++ b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c @@ -0,0 +1,118 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define INVERTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##invertv_sifive_x280_intr(\ + dim_t n, \ + T* restrict x_, inc_t incx, \ + const cntx_t* cntx \ +) + +#define INVERTV(...) 
INVERTV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_invertv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_invertv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define RATIO 8 +#define FLT_SIZE sizeof(float) + +#include "./bli_invertv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef RATIO +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define RATIO 16 +#define FLT_SIZE sizeof(double) + +#include "./bli_invertv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef RATIO +#undef FLT_SIZE + +#undef INVERTV +#undef INVERTV_ diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..994ae3075c --- /dev/null +++ b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c @@ -0,0 +1,83 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
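Each *_intr.c dispatcher instantiates the shared kernel body once per datatype by defining DATATYPE/PRECISION_CHAR/PREC/LMUL (plus BASE_DT, RATIO, FLT_SIZE where needed) and then #include-ing the _real.c or _complex.c template. For the 's' instantiation above, the INVERTV(PRECISION_CHAR, void) header expands to the prototype sketched below; the overloaded helpers presumably resolve to the matching riscv_vector.h intrinsics (the exact spellings live in riscv_overloaded_intrinsics.h):

/* Illustration only: what the template yields for PRECISION_CHAR=s,
   PREC=32, LMUL=m8.  The build generates this through the macros above. */
#include "blis.h"

void bli_sinvertv_sifive_x280_intr(dim_t n, void* restrict x_, inc_t incx,
                                   const cntx_t* cntx);

/* Roughly, the helper mappings (see riscv_overloaded_intrinsics.h):
     RVV_TYPE_F(32, m8)  -> vfloat32m8_t
     VSETVL(32, m8)      -> vsetvl_e32m8
     VLE_V_F(32, m8)     -> vle32_v_f32m8
     VFRDIV_VF(32, m8)   -> vfrdiv_vf_f32m8   (scalar divided by each element) */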
+ +*/ + +// clang-format off +#ifdef INVERTV + +INVERTV(PRECISION_CHAR, void) +{ + (void)cntx; + DATATYPE* restrict x = x_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + RVV_TYPE_F(PREC, LMUL) xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + RVV_TYPE_F(PREC, LMUL) xvec_real_abs = VFABS_V(PREC, LMUL)(xvec_real, vl); + RVV_TYPE_F(PREC, LMUL) xvec_imag_abs = VFABS_V(PREC, LMUL)(xvec_imag, vl); + RVV_TYPE_B(RATIO) mask = VMFGE_VV(PREC, LMUL, RATIO)(xvec_real_abs, xvec_imag_abs, vl); + RVV_TYPE_F(PREC, LMUL) max = VMERGE_VVM_F(PREC, LMUL)(xvec_imag, xvec_real, mask, vl); + RVV_TYPE_F(PREC, LMUL) min = VMERGE_VVM_F(PREC, LMUL)(xvec_real, xvec_imag, mask, vl); + RVV_TYPE_F(PREC, LMUL) f = VFDIV_VV(PREC, LMUL)(min, max, vl); + RVV_TYPE_F(PREC, LMUL) denom = VFMACC_VV(PREC, LMUL)(max, f, min, vl); + RVV_TYPE_F(PREC, LMUL) t1 = VFRDIV_VF(PREC, LMUL)(denom, 1., vl); + RVV_TYPE_F(PREC, LMUL) t2 = VFDIV_VV(PREC, LMUL)(f, denom, vl); + xvec_real = VMERGE_VVM_F(PREC, LMUL)(t2, t1, mask, vl); + xvec_imag = VMERGE_VVM_F(PREC, LMUL)(t1, t2, mask, vl); + xvec_imag = VFNEG_VF(PREC, LMUL)(xvec_imag, vl); + xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 0, xvec_real); + xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 1, xvec_imag); + + if (incx == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, xvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, xvec, vl); + + x += vl * incx; + avl -= vl; + } + return; +} + +#endif // INVERTV diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..621e88c9f6 --- /dev/null +++ b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c @@ -0,0 +1,68 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef INVERTV + +INVERTV(PRECISION_CHAR, void) +{ + (void)cntx; + DATATYPE* restrict x = x_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + xvec = VFRDIV_VF(PREC, LMUL)(xvec, 1., vl); + + if (incx == 1) + VSE_V_F(PREC, LMUL)(x, xvec, vl); + else + VSSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, xvec, vl); + + x += vl * incx; + avl -= vl; + } + return; +} + +#endif // INVERTV diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c deleted file mode 100644 index 51edc92214..0000000000 --- a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
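The real invertv variant is simply a vector reverse-divide (1.0 / x[i]). For the complex types, the kernel above computes 1/(a+bi) with a scaled (Smith-style) division to avoid overflow/underflow in a*a + b*b: the component of larger magnitude is selected with a mask, the ratio f = min/max is formed, and the result is assembled from 1/denom and f/denom with denom = max + f*min. A scalar model of that masked computation, for reference (hypothetical helper, float shown):

/* Scalar model of the masked computation in the complex invertv kernel:
   1/(a+bi), scaling by the component of larger magnitude. */
#include <math.h>

static void cinvert_f32_sketch(float *re, float *im)
{
    float a = *re, b = *im;
    if (fabsf(a) >= fabsf(b)) {
        float f = b / a;             /* |f| <= 1, so f*b cannot overflow */
        float denom = a + f * b;     /* == (a*a + b*b) / a               */
        *re =  1.0f / denom;
        *im = -(f / denom);
    } else {
        float f = a / b;
        float denom = b + f * a;     /* == (a*a + b*b) / b               */
        *re =  f / denom;
        *im = -(1.0f / denom);
    }
}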
- -*/ - -#include "blis.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define FDIV "fdiv.s " -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_sinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, - const cntx_t *cntx) { - (void)conjalpha; - (void)cntx; - const float* restrict alpha = alpha_; - float* restrict x = x_; - if (n <= 0 || *alpha == 0.f || *alpha == 1.f) - return; - - float one = 1.f; - __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one)); - __asm__(FLT_LOAD "f1, (%0)" : : "r"(alpha)); - __asm__(FDIV "f0, f0, f1"); - incx *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(x)); - __asm__("vfmul.vf v0, v0, f0"); - __asm__(VSE "v0, (%0)" : : "r"(x)); - } else { - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfmul.vf v0, v0, f0"); - __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FDIV -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define FDIV "fdiv.d " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_dinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, - const cntx_t *cntx) { - (void)conjalpha; - (void)cntx; - const double* restrict alpha = alpha_; - double* restrict x = x_; - if (n <= 0 || *alpha == 0. || *alpha == 1.) 
- return; - - double one = 1.; - __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one)); - __asm__(FLT_LOAD "f1, (%0)" : : "r"(alpha)); - __asm__(FDIV "f0, f0, f1"); - incx *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(x)); - __asm__("vfmul.vf v0, v0, f0"); - __asm__(VSE "v0, (%0)" : : "r"(x)); - } else { - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfmul.vf v0, v0, f0"); - __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FDIV -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define FMUL "fmul.s " -#define FMADD "fmadd.s " -#define FDIV "fdiv.s " -#define FNEG "fneg.s " -#define VLSEG2 "vlseg2e32.v " -#define VLSSEG2 "vlsseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " - -void bli_cinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, - const cntx_t *cntx) { - (void)cntx; - const scomplex* restrict alpha = alpha_; - scomplex* restrict x = x_; - if (n <= 0 || (alpha->real == 0.f && alpha->imag == 0.f) || (alpha->real == 1.f && alpha->imag == 0.f)) - return; - - __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(FMUL "f2, f0, f0"); - __asm__(FMADD "f2, f1, f1, f2"); - __asm__(FDIV "f0, f0, f2"); - __asm__(FDIV "f1, f1, f2"); - if (conjalpha == BLIS_NO_CONJUGATE) - __asm__(FNEG "f1, f1"); - incx *= 2 * FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); - __asm__("vfmul.vf v8, v0, f0"); - __asm__("vfmul.vf v12, v4, f0"); - __asm__("vfnmsac.vf v8, f1, v4"); - __asm__("vfmacc.vf v12, f1, v0"); - __asm__(VSSEG2 "v8, (%0)" : : "r"(x)); - } else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfmul.vf v8, v0, f0"); - __asm__("vfmul.vf v12, v4, f0"); - __asm__("vfnmsac.vf v8, f1, v4"); - __asm__("vfmacc.vf v12, f1, v0"); - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FMUL -#undef FMADD -#undef FDIV -#undef FNEG -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define FMUL "fmul.d " -#define FMADD "fmadd.d " -#define FDIV "fdiv.d " -#define FNEG "fneg.d " -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " - -void bli_zinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, - const cntx_t *cntx) { - (void)cntx; - const dcomplex* restrict alpha = alpha_; - dcomplex* restrict x = x_; - if (n <= 0 || (alpha->real == 0. && alpha->imag == 0.) || (alpha->real == 1. 
&& alpha->imag == 0.)) - return; - - __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(FMUL "f2, f0, f0"); - __asm__(FMADD "f2, f1, f1, f2"); - __asm__(FDIV "f0, f0, f2"); - __asm__(FDIV "f1, f1, f2"); - if (conjalpha == BLIS_NO_CONJUGATE) - __asm__(FNEG "f1, f1"); - incx *= 2 * FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); - __asm__("vfmul.vf v8, v0, f0"); - __asm__("vfmul.vf v12, v4, f0"); - __asm__("vfnmsac.vf v8, f1, v4"); - __asm__("vfmacc.vf v12, f1, v0"); - __asm__(VSSEG2 "v8, (%0)" : : "r"(x)); - } else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfmul.vf v8, v0, f0"); - __asm__("vfmul.vf v12, v4, f0"); - __asm__("vfnmsac.vf v8, f1, v4"); - __asm__("vfmacc.vf v12, f1, v0"); - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c new file mode 100644 index 0000000000..a5c7561bd8 --- /dev/null +++ b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c @@ -0,0 +1,117 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "../../riscv_cmul_macros_intr.h" +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define INVSCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##invscalv_sifive_x280_intr(\ + conj_t conjalpha, \ + dim_t n, \ + const T* restrict alpha_, \ + T* restrict x_, inc_t incx, \ + const cntx_t* cntx \ +) + +#define INVSCALV(...) 
INVSCALV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_invscalv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_invscalv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_invscalv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_invscalv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef INVSCALV +#undef INVSCALV_ diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..077e9dd061 --- /dev/null +++ b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c @@ -0,0 +1,83 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef INVSCALV + +INVSCALV(PRECISION_CHAR, void) +{ + (void)cntx; + const DATATYPE* restrict alpha = alpha_; + DATATYPE* restrict x = x_; + + if (n <= 0) return; + if (PASTEMAC(PRECISION_CHAR, eq1)(*alpha)) return; + if (PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) return; + + DATATYPE alpha_conj_inv; + PASTEMAC(PRECISION_CHAR, copycjs)(conjalpha, *alpha, alpha_conj_inv); + PASTEMAC(PRECISION_CHAR, inverts)(alpha_conj_inv); + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + RVV_TYPE_F(PREC, LMUL) xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + RVV_TYPE_F(PREC, LMUL) yvec_real, yvec_imag; + + VCMUL_VF(PREC, LMUL, yvec_real, yvec_imag, xvec_real, xvec_imag, alpha_conj_inv.real, alpha_conj_inv.imag, vl); + + RVV_TYPE_FX(PREC, LMUL, 2) yvec = VUNDEFINED_FX(PREC, LMUL, 2)(); + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real); + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag); + + if (incx == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, yvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, yvec, vl); + + x += vl * incx; + avl -= vl; + } + return; +} + +#endif // INVSCALV diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..a38b97c335 --- /dev/null +++ b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
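For complex types, invscalv folds the division into a single scalar step: alpha is conjugated per conjalpha and inverted once with copycjs/inverts, so the loop only performs a vector-by-scalar complex multiply through the new VCMUL_VF macro. One plausible shape for that helper, shown with explicit float/LMUL=4 intrinsics (the macro in riscv_cmul_macros_intr.h is the authoritative version; the names here are illustrative):

/* Possible shape of VCMUL_VF: (yr + i*yi) = (xr + i*xi) * (ar + i*ai),
   with ar/ai scalar.  Two fused ops per component, as in the deleted asm. */
#include <riscv_vector.h>
#include <stddef.h>

static inline void cmul_vf_f32m4_sketch(vfloat32m4_t *yr, vfloat32m4_t *yi,
                                        vfloat32m4_t xr, vfloat32m4_t xi,
                                        float ar, float ai, size_t vl)
{
    *yr = __riscv_vfmul_vf_f32m4(xr, ar, vl);        /* ar*xr        */
    *yi = __riscv_vfmul_vf_f32m4(xi, ar, vl);        /* ar*xi        */
    *yr = __riscv_vfnmsac_vf_f32m4(*yr, ai, xi, vl); /* ... - ai*xi  */
    *yi = __riscv_vfmacc_vf_f32m4(*yi, ai, xr, vl);  /* ... + ai*xr  */
}

Using vfmacc/vfnmsac keeps the real and imaginary accumulations to two fused instructions each, matching what the deleted asm did with vfmacc.vf/vfnmsac.vf.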
+ +*/ + +// clang-format off +#ifdef INVSCALV + +INVSCALV(PRECISION_CHAR, void) +{ + (void)conjalpha; + (void)cntx; + const DATATYPE* restrict alpha = alpha_; + DATATYPE* restrict x = x_; + + if (n <= 0) return; + if (PASTEMAC(PRECISION_CHAR, eq1)(*alpha)) return; + if (PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) return; + + DATATYPE alpha_inv = *alpha; + PASTEMAC(PRECISION_CHAR, inverts)(alpha_inv); + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + xvec = VFMUL_VF(PREC, LMUL)(xvec, alpha_inv, vl); + + if (incx == 1) + VSE_V_F(PREC, LMUL)(x, xvec, vl); + else + VSSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, xvec, vl); + + x += vl * incx; + avl -= vl; + } + return; +} + +#endif // INVSCALV diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c index cd2dd2c188..4cae8257c3 100644 --- a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c +++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c @@ -51,9 +51,9 @@ #define SCAL2V(...) SCAL2V_(__VA_ARGS__) -#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm +#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_intr #define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR) -#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) // Single precision real diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c index b5788d632d..d1fb9940eb 100644 --- a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c +++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c @@ -49,7 +49,7 @@ #define SCALV(...) SCALV_(__VA_ARGS__) -#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) // Single precision real diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c deleted file mode 100644 index ef9091f16c..0000000000 --- a/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c +++ /dev/null @@ -1,204 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_ssetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, const cntx_t *cntx) { - (void)conjalpha; - (void)cntx; - const float* restrict alpha = alpha_; - float* restrict x = x_; - if (n <= 0) - return; - - __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma" - : - : "r"(n), "i"(8 * FLT_SIZE)); - __asm__(VLSE "v0, (%0), zero" : : "r"(alpha)); - incx *= FLT_SIZE; - - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(x)); - else - __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_dsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, const cntx_t *cntx) { - (void)conjalpha; - (void)cntx; - const double* restrict alpha = alpha_; - double* restrict x = x_; - if (n <= 0) - return; - - __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma" - : - : "r"(n), "i"(8 * FLT_SIZE)); - __asm__(VLSE "v0, (%0), zero" : : "r"(alpha)); - incx *= FLT_SIZE; - - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(x)); - else - __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define VLSE "vlse32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " - -void bli_csetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, const cntx_t *cntx) { - (void)cntx; - const scomplex* restrict alpha = alpha_; - scomplex* restrict x = x_; - if (n <= 0) - return; - - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" - : - : "r"(n), "i"(8 * FLT_SIZE)); - __asm__(VLSE "v0, (%0), zero" : : "r"(alpha)); - __asm__("addi t0, %0, %1" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(VLSE "v4, (t0), zero"); - if (conjalpha == BLIS_CONJUGATE) - __asm__("vfneg.v v4, v4"); - incx *= 2 * FLT_SIZE; - - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : 
"r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VSSEG2 "v0, (%0)" : : "r"(x)); - else - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLSE -#undef VSSEG2 -#undef VSSSEG2 - -#define FLT_SIZE 8 -#define VLSE "vlse64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " - -void bli_zsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, const cntx_t *cntx) { - (void)cntx; - const dcomplex* restrict alpha = alpha_; - dcomplex* restrict x = x_; - if (n <= 0) - return; - - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" - : - : "r"(n), "i"(8 * FLT_SIZE)); - __asm__(VLSE "v0, (%0), zero" : : "r"(alpha)); - __asm__("addi t0, %0, %1" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(VLSE "v4, (t0), zero"); - if (conjalpha == BLIS_CONJUGATE) - __asm__("vfneg.v v4, v4"); - incx *= 2 * FLT_SIZE; - - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VSSEG2 "v0, (%0)" : : "r"(x)); - else - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c new file mode 100644 index 0000000000..8c2ba7c72a --- /dev/null +++ b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c @@ -0,0 +1,116 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define SETV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##setv_sifive_x280_intr(\ + conj_t conjalpha, \ + dim_t n, \ + const T* restrict alpha_, \ + T* restrict x_, inc_t incx, \ + const cntx_t* cntx \ +) + +#define SETV(...) SETV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_setv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_setv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_setv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_setv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef SETV +#undef SETV_ diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..efee3a7f60 --- /dev/null +++ b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c @@ -0,0 +1,71 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef SETV + +SETV(PRECISION_CHAR, void) +{ + (void)cntx; + const DATATYPE* restrict alpha = alpha_; + DATATYPE* restrict x = x_; + + if (n <= 0) return; + + DATATYPE alpha_conj; + PASTEMAC(PRECISION_CHAR, copycjs)(conjalpha, *alpha, alpha_conj); + + RVV_TYPE_F(PREC, LMUL) alpha_conj_real_vec = VFMV_V_F(PREC, LMUL)(alpha_conj.real, n); + RVV_TYPE_F(PREC, LMUL) alpha_conj_imag_vec = VFMV_V_F(PREC, LMUL)(alpha_conj.imag, n); + + RVV_TYPE_FX(PREC, LMUL, 2) alpha_conj_vec = VUNDEFINED_FX(PREC, LMUL, 2)(); + alpha_conj_vec = VSET_V_F(PREC, LMUL, 2)(alpha_conj_vec, 0, alpha_conj_real_vec); + alpha_conj_vec = VSET_V_F(PREC, LMUL, 2)(alpha_conj_vec, 1, alpha_conj_imag_vec); + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + + if (incx == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, alpha_conj_vec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, alpha_conj_vec, vl); + + x += vl * incx; + avl -= vl; + } + return; +} + +#endif // SETV diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..4b73de5c4c --- /dev/null +++ b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c @@ -0,0 +1,64 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
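setv broadcasts alpha (conjugated if requested) once outside the loop and then only issues stores; for the complex types the real and imaginary broadcasts are packed into a two-field segment tuple so that vsseg2/vssseg2 writes the interleaved (re, im) layout in one instruction. A standalone sketch of that pattern for scomplex-style data, assuming the tuple-type segment intrinsics (spellings vary between intrinsics versions; the wrappers in the patch hide this):

/* Sketch of a complex setv body for float, broadcasting once and storing
   interleaved (re, im) pairs per strip. */
#include <riscv_vector.h>
#include <stddef.h>

static void csetv_f32_sketch(size_t n, float alpha_re, float alpha_im,
                             float *x /* interleaved re,im */, ptrdiff_t incx)
{
    if (n == 0) return;

    size_t vlmax = __riscv_vsetvlmax_e32m4();
    vfloat32m4_t re = __riscv_vfmv_v_f_f32m4(alpha_re, vlmax);   /* broadcast once */
    vfloat32m4_t im = __riscv_vfmv_v_f_f32m4(alpha_im, vlmax);
    vfloat32m4x2_t v = __riscv_vundefined_f32m4x2();
    v = __riscv_vset_v_f32m4_f32m4x2(v, 0, re);
    v = __riscv_vset_v_f32m4_f32m4x2(v, 1, im);

    size_t avl = n;
    while (avl) {
        size_t vl = __riscv_vsetvl_e32m4(avl);
        if (incx == 1)
            __riscv_vsseg2e32_v_f32m4x2(x, v, vl);
        else
            __riscv_vssseg2e32_v_f32m4x2(x, 2 * (ptrdiff_t)sizeof(float) * incx, v, vl);
        x += 2 * vl * incx;   /* x counts floats, so 2 per complex element */
        avl -= vl;
    }
}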
+ +*/ + +// clang-format off +#ifdef SETV + +SETV(PRECISION_CHAR, void) +{ + (void)conjalpha; + (void)cntx; + const DATATYPE* restrict alpha = alpha_; + DATATYPE* restrict x = x_; + + if (n <= 0) return; + + RVV_TYPE_F(PREC, LMUL) alpha_vec = VFMV_V_F(PREC, LMUL)(*alpha, n); + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + + if (incx == 1) + VSE_V_F(PREC, LMUL)(x, alpha_vec, vl); + else + VSSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, alpha_vec, vl); + + x += vl * incx; + avl -= vl; + } + return; +} + +#endif // SETV diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c deleted file mode 100644 index 2342e254a2..0000000000 --- a/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c +++ /dev/null @@ -1,245 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_sswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, void * restrict y_, - inc_t incy, const cntx_t *cntx) { - (void)cntx; - float* restrict x = x_; - float* restrict y = y_; - if (n <= 0) - return; - - incx *= FLT_SIZE; - incy *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(x)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == FLT_SIZE) - __asm__(VLE "v8, (%0)" : : "r"(y)); - else - __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy)); - - if (incx == FLT_SIZE) - __asm__(VSE "v8, (%0)" : : "r"(x)); - else - __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(y)); - else - __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_dswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, - void * restrict y_, inc_t incy, const cntx_t *cntx) { - (void)cntx; - double* restrict x = x_; - double* restrict y = y_; - if (n <= 0) - return; - - incx *= FLT_SIZE; - incy *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(x)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == FLT_SIZE) - __asm__(VLE "v8, (%0)" : : "r"(y)); - else - __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy)); - - if (incx == FLT_SIZE) - __asm__(VSE "v8, (%0)" : : "r"(x)); - else - __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(y)); - else - __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_cswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, - void * restrict y_, inc_t incy, const cntx_t *cntx) { - (void)cntx; - scomplex* restrict x = x_; - scomplex* restrict y = y_; - if (n <= 0) - return; - - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * 2 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(x)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == 2 * FLT_SIZE) - __asm__(VLE "v8, (%0)" : : "r"(y)); - else - __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy)); - - if (incx == 2 * FLT_SIZE) - __asm__(VSE "v8, (%0)" : : "r"(x)); - else - __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == 2 * FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(y)); - 
else - __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " - -void bli_zswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, - void * restrict y_, inc_t incy, const cntx_t *cntx) { - (void)cntx; - dcomplex* restrict x = x_; - dcomplex* restrict y = y_; - if (n <= 0) - return; - - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); - else - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == 2 * FLT_SIZE) - __asm__(VLSEG2 "v8, (%0)" : : "r"(y)); - else - __asm__(VLSSEG2 "v8, (%0), %1" : : "r"(y), "r"(incy)); - - if (incx == 2 * FLT_SIZE) - __asm__(VSSEG2 "v8, (%0)" : : "r"(x)); - else - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == 2 * FLT_SIZE) - __asm__(VSSEG2 "v0, (%0)" : : "r"(y)); - else - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c new file mode 100644 index 0000000000..baf685d35f --- /dev/null +++ b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c @@ -0,0 +1,115 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define SWAPV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##swapv_sifive_x280_intr(\ + dim_t n, \ + T* restrict x_, inc_t incx, \ + T* restrict y_, inc_t incy, \ + const cntx_t* cntx \ +) + +#define SWAPV(...) SWAPV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_swapv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_swapv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_swapv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_swapv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef SWAPV +#undef SWAPV_ diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..104ba52235 --- /dev/null +++ b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c @@ -0,0 +1,76 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef SWAPV + +SWAPV(PRECISION_CHAR, void) +{ + (void)cntx; + DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, vl); + if (incy == 1) + yvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, vl); + else + yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2 * FLT_SIZE * incy, vl); + + if (incx == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, yvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, yvec, vl); + if (incy == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, xvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2 * FLT_SIZE * incy, xvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } + return; +} + +#endif // SWAPV diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..efa7222abf --- /dev/null +++ b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c @@ -0,0 +1,76 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef SWAPV + +SWAPV(PRECISION_CHAR, void) +{ + (void)cntx; + DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec, yvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + if (incy == 1) + yvec = VLE_V_F(PREC, LMUL)(y, vl); + else + yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl); + + if (incx == 1) + VSE_V_F(PREC, LMUL)(x, yvec, vl); + else + VSSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, yvec, vl); + if (incy == 1) + VSE_V_F(PREC, LMUL)(y, xvec, vl); + else + VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, xvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } + return; +} + +#endif // SWAPV diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c index dce4085bff..da688851d0 100644 --- a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c +++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c @@ -51,7 +51,7 @@ #define XPBYV(...) XPBYV_(__VA_ARGS__) -#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm +#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_intr #define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR) // Single precision real diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c deleted file mode 100644 index 43c2ba44e2..0000000000 --- a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c +++ /dev/null @@ -1,430 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_saxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, - const void *restrict alpha_, const void *restrict a_, inc_t inca, - inc_t lda, const void *restrict x_, inc_t incx, - void *restrict y_, inc_t incy, const cntx_t *restrict cntx) { - (void)conja; - (void)conjx; - (void)cntx; - const float *restrict alpha = alpha_; - const float *restrict a = a_; - const float *restrict x = x_; - float *restrict y = y_; - - if (m == 0 || b == 0) - return; - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(alpha)); - inca *= FLT_SIZE; - lda *= FLT_SIZE; - incx *= FLT_SIZE; - incy *= FLT_SIZE; - size_t avl = m; - while (avl) { - // process vl elements of y at a time - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - // x_tmp traverses x - // a points to the vl x b block of a needed this iteration - // a_tmp traverses the columns of this block - const float* restrict x_tmp = x; - const float* restrict a_tmp = a; - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - if (inca == FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(a_tmp), "r"(inca)); - __asm__("vfmul.vf v0, v0, ft0"); - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - - for (dim_t i = 1; i < b; ++i) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - if (inca == FLT_SIZE) - __asm__(VLE "v24, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSE "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - __asm__("vfmacc.vf v0, ft0, v24"); - } - - if (incy == FLT_SIZE) { - __asm__(VLE "v24, (%0)" : : "r"(y)); - __asm__("vfmacc.vf v24, ft11, v0"); - __asm__(VSE "v24, (%0)" : : "r"(y)); - } else { - __asm__(VLSE "v24, (%0), %1" : : "r"(y), "r"(incy)); - __asm__("vfmacc.vf v24, ft11, v0"); - __asm__(VSSE "v24, (%0), %1" : : "r"(y), "r"(incy)); - } - - __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_daxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, - const void *restrict alpha_, const void *restrict a_, inc_t inca, - inc_t lda, const void *restrict x_, inc_t incx, - void *restrict y_, inc_t incy, const cntx_t *restrict cntx) { - (void)conja; - (void)conjx; - (void)cntx; - const double *restrict alpha = alpha_; - const double *restrict a = a_; - const double *restrict x = x_; - double *restrict y = y_; - - if (m == 0 || b == 0) - return; - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(alpha)); - inca *= FLT_SIZE; - lda *= FLT_SIZE; - incx *= FLT_SIZE; - incy *= FLT_SIZE; - size_t avl = m; - while (avl) { - // process vl elements of y at a time - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - // x_tmp traverses x - // a points to the vl x b block of a needed this iteration - // a_tmp traverses the columns of this block - const double* restrict x_tmp = x; - const double* 
restrict a_tmp = a; - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - if (inca == FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(a_tmp), "r"(inca)); - __asm__("vfmul.vf v0, v0, ft0"); - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - - for (dim_t i = 1; i < b; ++i) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - if (inca == FLT_SIZE) - __asm__(VLE "v24, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSE "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - __asm__("vfmacc.vf v0, ft0, v24"); - } - - if (incy == FLT_SIZE) { - __asm__(VLE "v24, (%0)" : : "r"(y)); - __asm__("vfmacc.vf v24, ft11, v0"); - __asm__(VSE "v24, (%0)" : : "r"(y)); - } else { - __asm__(VLSE "v24, (%0), %1" : : "r"(y), "r"(incy)); - __asm__("vfmacc.vf v24, ft11, v0"); - __asm__(VSSE "v24, (%0), %1" : : "r"(y), "r"(incy)); - } - - __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define VLSEG "vlseg2e32.v " -#define VLSSEG "vlsseg2e32.v " -#define VSSEG "vsseg2e32.v " -#define VSSSEG "vssseg2e32.v " - -void bli_caxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, - const void *restrict alpha_, const void *restrict a_, - inc_t inca, inc_t lda, const void *restrict x_, - inc_t incx, void *restrict y_, inc_t incy, - const cntx_t *restrict cntx) { - (void)cntx; - const scomplex *restrict alpha = alpha_; - const scomplex *restrict a = a_; - const scomplex *restrict x = x_; - scomplex *restrict y = y_; - - if (m == 0 || b == 0) - return; - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - inca *= 2 * FLT_SIZE; - lda *= 2 * FLT_SIZE; - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - size_t avl = m; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - const scomplex* restrict x_tmp = x; - const scomplex* restrict a_tmp = a; - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); - if (inca == 2 * FLT_SIZE) - __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - __asm__("vfmul.vf v0, v24, ft0"); - __asm__("vfmul.vf v4, v24, ft1"); - if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfmacc.vf v4, ft0, v28"); - } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { - __asm__("vfmacc.vf v0, ft1, v28"); - __asm__("vfmsac.vf v4, ft0, v28"); - } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfmacc.vf v0, ft1, v28"); - __asm__("vfnmsac.vf v4, ft0, v28"); - } else { - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfnmacc.vf v4, ft0, v28"); - } - - for (dim_t i = 1; i < b; ++i) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); - if (inca == 2 * FLT_SIZE) - __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); 
- __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - __asm__("vfmacc.vf v0, ft0, v24"); - if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfmacc.vf v4, ft1, v24"); - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfmacc.vf v4, ft0, v28"); - } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { - __asm__("vfnmsac.vf v4, ft1, v24"); - __asm__("vfmacc.vf v0, ft1, v28"); - __asm__("vfmacc.vf v4, ft0, v28"); - } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfmacc.vf v4, ft1, v24"); - __asm__("vfmacc.vf v0, ft1, v28"); - __asm__("vfnmsac.vf v4, ft0, v28"); - } else { // conja == BLIS_CONJUGATE && conjx == BLIS_CONJUGATE - __asm__("vfnmsac.vf v4, ft1, v24"); - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfnmsac.vf v4, ft0, v28"); - } - } - - if (incy == 2 * FLT_SIZE) { - __asm__(VLSEG "v24, (%0)" : : "r"(y)); - __asm__("vfmacc.vf v24, ft10, v0"); - __asm__("vfmacc.vf v28, ft10, v4"); - __asm__("vfnmsac.vf v24, ft11, v4"); - __asm__("vfmacc.vf v28, ft11, v0"); - __asm__(VSSEG "v24, (%0)" : : "r"(y)); - } else { - __asm__(VLSSEG "v24, (%0), %1" : : "r"(y), "r"(incy)); - __asm__("vfmacc.vf v24, ft10, v0"); - __asm__("vfmacc.vf v28, ft10, v4"); - __asm__("vfnmsac.vf v24, ft11, v4"); - __asm__("vfmacc.vf v28, ft11, v0"); - __asm__(VSSSEG "v24, (%0), %1" : : "r"(y), "r"(incy)); - } - - __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLSEG -#undef VLSSEG -#undef VSSEG -#undef VSSSEG - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define VLSEG "vlseg2e64.v " -#define VLSSEG "vlsseg2e64.v " -#define VSSEG "vsseg2e64.v " -#define VSSSEG "vssseg2e64.v " - -void bli_zaxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, - const void *restrict alpha_, const void *restrict a_, - inc_t inca, inc_t lda, const void *restrict x_, - inc_t incx, void *restrict y_, inc_t incy, - const cntx_t *restrict cntx) { - (void)cntx; - const dcomplex *restrict alpha = alpha_; - const dcomplex *restrict a = a_; - const dcomplex *restrict x = x_; - dcomplex *restrict y = y_; - - if (m == 0 || b == 0) - return; - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - inca *= 2 * FLT_SIZE; - lda *= 2 * FLT_SIZE; - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - size_t avl = m; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - const dcomplex* restrict x_tmp = x; - const dcomplex* restrict a_tmp = a; - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); - if (inca == 2 * FLT_SIZE) - __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - __asm__("vfmul.vf v0, v24, ft0"); - __asm__("vfmul.vf v4, v24, ft1"); - if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfmacc.vf v4, ft0, v28"); - } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { - __asm__("vfmacc.vf v0, ft1, v28"); - __asm__("vfmsac.vf v4, ft0, v28"); - } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfmacc.vf v0, ft1, v28"); - 
__asm__("vfnmsac.vf v4, ft0, v28"); - } else { - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfnmacc.vf v4, ft0, v28"); - } - - for (dim_t i = 1; i < b; ++i) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); - if (inca == 2 * FLT_SIZE) - __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - __asm__("vfmacc.vf v0, ft0, v24"); - if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfmacc.vf v4, ft1, v24"); - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfmacc.vf v4, ft0, v28"); - } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { - __asm__("vfnmsac.vf v4, ft1, v24"); - __asm__("vfmacc.vf v0, ft1, v28"); - __asm__("vfmacc.vf v4, ft0, v28"); - } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfmacc.vf v4, ft1, v24"); - __asm__("vfmacc.vf v0, ft1, v28"); - __asm__("vfnmsac.vf v4, ft0, v28"); - } else { // conja == BLIS_CONJUGATE && conjx == BLIS_CONJUGATE - __asm__("vfnmsac.vf v4, ft1, v24"); - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfnmsac.vf v4, ft0, v28"); - } - } - - if (incy == 2 * FLT_SIZE) { - __asm__(VLSEG "v24, (%0)" : : "r"(y)); - __asm__("vfmacc.vf v24, ft10, v0"); - __asm__("vfmacc.vf v28, ft10, v4"); - __asm__("vfnmsac.vf v24, ft11, v4"); - __asm__("vfmacc.vf v28, ft11, v0"); - __asm__(VSSEG "v24, (%0)" : : "r"(y)); - } else { - __asm__(VLSSEG "v24, (%0), %1" : : "r"(y), "r"(incy)); - __asm__("vfmacc.vf v24, ft10, v0"); - __asm__("vfmacc.vf v28, ft10, v4"); - __asm__("vfnmsac.vf v24, ft11, v4"); - __asm__("vfmacc.vf v28, ft11, v0"); - __asm__(VSSSEG "v24, (%0), %1" : : "r"(y), "r"(incy)); - } - - __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c new file mode 100644 index 0000000000..a5e0268467 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c @@ -0,0 +1,121 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "../../riscv_cmul_macros_intr.h"
+#include "../../riscv_overloaded_intrinsics.h"
+#include "blis.h"
+#include <riscv_vector.h>
+#include <stddef.h>
+
+#define AXPYF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyf_sifive_x280_intr(\
+    conj_t conja, \
+    conj_t conjx, \
+    dim_t m, \
+    dim_t b, \
+    const T* restrict alpha_, \
+    const T* restrict a_, inc_t inca, inc_t lda, \
+    const T* restrict x_, inc_t incx, \
+    T* restrict y_, inc_t incy, \
+    const cntx_t* restrict cntx \
+)
+
+#define AXPYF(...) AXPYF_(__VA_ARGS__)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpyf_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpyf_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpyf_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpyf_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef AXPYF
+#undef AXPYF_
diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..0ab5509fab
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c
@@ -0,0 +1,149 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef AXPYF + +AXPYF(PRECISION_CHAR, void) +{ + // Computes y := y + alpha * conja(A) * conjx(x) + + (void) cntx; // Suppress unused parameter warnings + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (m <= 0 || b <= 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) + return; + + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict x_tmp = x; + RVV_TYPE_F(PREC, LMUL) ax_vec_real, ax_vec_imag; + + for (size_t i = 0; i < b; ++i) { + DATATYPE x_tmp_conj; + PASTEMAC(PRECISION_CHAR, copycjs)(conjx, *x_tmp, x_tmp_conj); + + RVV_TYPE_FX(PREC, LMUL, 2) acol_vec; + if (inca == 1) + acol_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) a_tmp, vl); + else + acol_vec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) a_tmp, 2 * FLT_SIZE * inca, vl); + + RVV_TYPE_F(PREC, LMUL) acol_vec_real = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0); + RVV_TYPE_F(PREC, LMUL) acol_vec_imag = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1); + + if (bli_is_conj(conja)) { + if (i == 0) + VCMUL_VF_CONJ + ( + PREC, LMUL, + ax_vec_real, ax_vec_imag, + acol_vec_real, acol_vec_imag, + x_tmp_conj.real, x_tmp_conj.imag, + vl + ); + else + VCMACC_VF_CONJ + ( + PREC, LMUL, + ax_vec_real, ax_vec_imag, + x_tmp_conj.real, x_tmp_conj.imag, + acol_vec_real, acol_vec_imag, + vl + ); + } + else { + if (i == 0) + VCMUL_VF + ( + PREC, LMUL, + ax_vec_real, ax_vec_imag, + acol_vec_real, acol_vec_imag, + x_tmp_conj.real, x_tmp_conj.imag, + vl + ); + else + VCMACC_VF + ( + PREC, LMUL, + ax_vec_real, ax_vec_imag, + x_tmp_conj.real, x_tmp_conj.imag, + acol_vec_real, acol_vec_imag, + vl + ); + } + + a_tmp += lda; + x_tmp += incx; + } + + RVV_TYPE_FX(PREC, LMUL, 2) yvec; + if (incy == 1) + yvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, vl); + else + yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2 * FLT_SIZE * incy, vl); + + RVV_TYPE_F(PREC, LMUL) yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0); + RVV_TYPE_F(PREC, LMUL) yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1); + + VCMACC_VF + ( + PREC, LMUL, + yvec_real, yvec_imag, + alpha->real, alpha->imag, + ax_vec_real, ax_vec_imag, + vl + ); + + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real); + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag); + + if (incy == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, yvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2 * FLT_SIZE * incy, yvec, vl); + + a += vl * inca; + y += vl * incy; + avl -= vl; + } + return; +} + +#endif // AXPYF diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c new 
file mode 100644 index 0000000000..ae7dcb21d5 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c @@ -0,0 +1,96 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef AXPYF + +AXPYF(PRECISION_CHAR, void) +{ + // Computes y := y + alpha * conja(A) * conjx(x) + + (void) conja; // Suppress unused parameter warnings + (void) conjx; + (void) cntx; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (m <= 0 || b <= 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) + return; + + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict x_tmp = x; + RVV_TYPE_F(PREC, LMUL) ax_vec; + + for (size_t i = 0; i < b; ++i) { + RVV_TYPE_F(PREC, LMUL) acol_vec; + if (inca == 1) + acol_vec = VLE_V_F(PREC, LMUL)(a_tmp, vl); + else + acol_vec = VLSE_V_F(PREC, LMUL)(a_tmp, FLT_SIZE * inca, vl); + + if (i == 0) + ax_vec = VFMUL_VF(PREC, LMUL)(acol_vec, *x_tmp, vl); + else + ax_vec = VFMACC_VF(PREC, LMUL)(ax_vec, *x_tmp, acol_vec, vl); + + a_tmp += lda; + x_tmp += incx; + } + + RVV_TYPE_F(PREC, LMUL) yvec; + if (incy == 1) + yvec = VLE_V_F(PREC, LMUL)(y, vl); + else + yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl); + + yvec = VFMACC_VF(PREC, LMUL)(yvec, *alpha, ax_vec, vl); + + if (incy == 1) + VSE_V_F(PREC, LMUL)(y, yvec, vl); + else + VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl); + + a += vl * inca; + y += vl * incy; + avl -= vl; + } + return; +} + +#endif // AXPYF diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c deleted file mode 100644 index ecb340707b..0000000000 --- a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c +++ /dev/null @@ -1,3120 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - 
libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" -#include "../riscv_cmul_macros_asm.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define FMUL "fmul.s " -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_sdotxaxpyf_sifive_x280_asm( - conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict w_, inc_t incw, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* restrict y_, inc_t incy, - void* restrict z_, inc_t incz, - const cntx_t* restrict cntx - ) { - (void)conjat; - (void)conja; - (void)conjw; - (void)conjx; - (void)cntx; - const float *restrict alpha = alpha_; - const float *restrict beta = beta_; - const float *restrict a = a_; - const float *restrict w = w_; - const float *restrict x = x_; - float *restrict y = y_; - float *restrict z = z_; - - if (b == 0) - return; - else if (m == 0 || *alpha == 0.f) { - // scale y by beta - if (*beta == 0.f) - bli_ssetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_sscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - inca *= FLT_SIZE; - lda *= FLT_SIZE; - incw *= FLT_SIZE; - incx *= FLT_SIZE; - incy *= FLT_SIZE; - incz *= FLT_SIZE; - inc_t a_bump = 5 * lda; - while (b >= 5) { - // compute dot product of w with 5 rows of a - const float* restrict w_tmp = w; - const float* restrict z_tmp = z; - const float* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const float* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - if (inca 
== FLT_SIZE) { - // a unit stride - if (first) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmul.vv v0, v24, v28"); - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmul.vv v12, v24, v28"); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("vfmacc.vf v20, ft4, v24"); - __asm__("vfmul.vv v16, v24, v28"); - first = false; - } - else { - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmacc.vv v0, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vf v20, ft4, v24"); - __asm__("vfmacc.vv v16, v24, v28"); - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmul.vv v0, v24, v28"); - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmul.vv v12, v24, v28"); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("vfmacc.vf v20, ft4, 
v24"); - __asm__("vfmul.vv v16, v24, v28"); - first = false; - } - else { - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmacc.vv v0, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vf v20, ft4, v24"); - __asm__("vfmacc.vv v16, v24, v28"); - } - } // end a non-unit stride - - if (incz == FLT_SIZE) { - __asm__(VLE "v24, (%0)" : : "r"(z_tmp)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSE "v24, (%0)" : : "r"(z_tmp)); - } else { - __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - 
__asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v16, v16, ft10"); - __asm__(VSE "v16, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v16"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 5; - } - - if (b > 0) { - const float* restrict w_tmp = w; - const float* restrict z_tmp = z; - const float* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx)); - size_t avl = m; - bool first = true; - while (avl) { - const float* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - __asm__("vmv.v.i v20, 0"); - if (inca == FLT_SIZE) { - // a unit stride - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmul.vv v12, v24, v28"); - case 3: - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - case 2: - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - case 1: - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmul.vv v0, v24, v28"); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - 
__asm__("vfmul.vv v12, v24, v28"); - case 3: - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - case 2: - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - case 1: - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmul.vv v0, v24, v28"); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a non-unit stride - - if (incz == FLT_SIZE) { - __asm__(VLE "v24, (%0)" : : "r"(z_tmp)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSE "v24, (%0)" : : "r"(z_tmp)); - } else { - __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - __asm__("vmv.s.x v31, x0"); - - switch (b) { - case 4: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 3: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 2: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * 
FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 1: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - } - } // end cleanup - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FMUL -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define FMUL "fmul.d " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_ddotxaxpyf_sifive_x280_asm( - conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict w_, inc_t incw, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* restrict y_, inc_t incy, - void* restrict z_, inc_t incz, - const cntx_t* restrict cntx - ) { - (void)conjat; - (void)conja; - (void)conjw; - (void)conjx; - (void)cntx; - const double *restrict alpha = alpha_; - const double *restrict beta = beta_; - const double *restrict a = a_; - const double *restrict w = w_; - const double *restrict x = x_; - double *restrict y = y_; - double *restrict z = z_; - - if (b == 0) - return; - else if (m == 0 || *alpha == 0.) { - // scale y by beta - if (*beta == 0.) 
- bli_dsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_dscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - inca *= FLT_SIZE; - lda *= FLT_SIZE; - incw *= FLT_SIZE; - incx *= FLT_SIZE; - incy *= FLT_SIZE; - incz *= FLT_SIZE; - inc_t a_bump = 5 * lda; - while (b >= 5) { - // compute dot product of w with 5 rows of a - const double* restrict w_tmp = w; - const double* restrict z_tmp = z; - const double* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const double* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - if (inca == FLT_SIZE) { - // a unit stride - if (first) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmul.vv v0, v24, v28"); - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmul.vv v12, v24, v28"); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("vfmacc.vf v20, ft4, v24"); - __asm__("vfmul.vv v16, v24, v28"); - first = false; - } - else { - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmacc.vv v0, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vf v20, ft4, v24"); - __asm__("vfmacc.vv v16, v24, v28"); - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmul.vv v0, v24, v28"); - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), 
"r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmul.vv v12, v24, v28"); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("vfmacc.vf v20, ft4, v24"); - __asm__("vfmul.vv v16, v24, v28"); - first = false; - } - else { - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmacc.vv v0, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vf v20, ft4, v24"); - __asm__("vfmacc.vv v16, v24, v28"); - } - } // end a non-unit stride - - if (incz == FLT_SIZE) { - __asm__(VLE "v24, (%0)" : : "r"(z_tmp)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSE "v24, (%0)" : : "r"(z_tmp)); - } else { - __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) 
{ - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v16, v16, ft10"); - __asm__(VSE "v16, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v16"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 5; - } - - if (b > 0) { - const double* restrict w_tmp = w; - const double* restrict z_tmp = z; - const double* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx)); - size_t avl = m; - bool first = true; - while (avl) { - const double* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - __asm__("vmv.v.i v20, 0"); - if (inca == FLT_SIZE) { - // a unit stride - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmul.vv v12, v24, v28"); - case 3: - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - case 2: - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - 
case 1: - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmul.vv v0, v24, v28"); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmul.vv v12, v24, v28"); - case 3: - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - case 2: - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - case 1: - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmul.vv v0, v24, v28"); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a non-unit stride - - if (incz == FLT_SIZE) { - __asm__(VLE "v24, (%0)" : : "r"(z_tmp)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSE "v24, (%0)" : : "r"(z_tmp)); - } else { - __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - __asm__("vmv.s.x v31, x0"); - - switch (b) { - case 4: - __asm__("vsetvli zero, %0, e%1, 
m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 3: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 2: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 1: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) 
{ - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - } - } // end cleanup - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FMUL -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define FMUL "fmul.s " -#define FMADD "fmadd.s " -#define FNMSUB "fnmsub.s " -#define FNEG "fneg.s " -#define VLSEG2 "vlseg2e32.v " -#define VLSSEG2 "vlsseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " -#define VSE "vse32.v " - -void bli_cdotxaxpyf_sifive_x280_asm - ( - conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict w_, inc_t incw, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* restrict y_, inc_t incy, - void* restrict z_, inc_t incz, - const cntx_t* restrict cntx - ) -{ - (void)cntx; - const scomplex *restrict alpha = alpha_; - const scomplex *restrict beta = beta_; - const scomplex *restrict a = a_; - const scomplex *restrict w = w_; - const scomplex *restrict x = x_; - scomplex *restrict y = y_; - scomplex *restrict z = z_; - - if (b == 0) - return; - else if (m == 0 || (alpha->real == 0.f && alpha->imag == 0.f)) { - // scale y by beta - if (beta->real == 0.f && beta->imag == 0.f) - bli_csetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_cscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - // use ft0-ft9 to store 5 entries of x, ft10-ft11 to store alpha, - // and fa6-fa7 to store beta - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "fa6, (%0)" : : "r"(beta)); - __asm__(FLT_LOAD "fa7, %1(%0)" : : "r"(beta), "I"(FLT_SIZE)); - // Reduce to case when A^T is not conjugated, then conjugate - // computed product A^T * w if needed. - conj_t conjatw = BLIS_NO_CONJUGATE; - if (conjat == BLIS_CONJUGATE) { - bli_toggle_conj(&conjat); - bli_toggle_conj(&conjw); - bli_toggle_conj(&conjatw); - } - conj_t conjax = BLIS_NO_CONJUGATE; - if (conja == BLIS_CONJUGATE) { - bli_toggle_conj(&conja); - bli_toggle_conj(&conjx); - bli_toggle_conj(&conjax); - } - inca *= 2 * FLT_SIZE; - lda *= 2 * FLT_SIZE; - incw *= 2 * FLT_SIZE; - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - incz *= 2 * FLT_SIZE; - // these are used to bump a and y, resp. 
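For orientation while reading the complex variants that follow: every dotxaxpyf kernel in this file implements the fused dotxf + axpyf operation y := beta * y + alpha * conjat(A^T) * conjw(w) and z := z + alpha * conja(A) * conjx(x), where A is m x b. A minimal scalar sketch of the real, double-precision case (no conjugation, hypothetical name ddotxaxpyf_ref_sketch) that the vector code has to match:

#include "blis.h" // for dim_t / inc_t

// Scalar reference sketch: y := beta*y + alpha*A^T*w, z := z + alpha*A*x,
// with A stored m x b using row stride inca and column stride lda.
static void ddotxaxpyf_ref_sketch(dim_t m, dim_t b, double alpha,
                                  const double* a, inc_t inca, inc_t lda,
                                  const double* w, inc_t incw,
                                  const double* x, inc_t incx,
                                  double beta, double* y, inc_t incy,
                                  double* z, inc_t incz)
{
    for (dim_t j = 0; j < b; ++j) {
        double dot = 0.0;
        for (dim_t i = 0; i < m; ++i)
            dot += a[i * inca + j * lda] * w[i * incw];
        // beta == 0 overwrites y without reading it, as in the kernels here
        y[j * incy] = (beta == 0.0) ? alpha * dot
                                    : beta * y[j * incy] + alpha * dot;
    }
    for (dim_t i = 0; i < m; ++i) {
        double t = 0.0;
        for (dim_t j = 0; j < b; ++j)
            t += a[i * inca + j * lda] * x[j * incx];
        z[i * incz] += alpha * t;
    }
}

The conjat/conjw toggling a few lines up relies on conj(A^T) w = conj(A^T conj(w)), so only the non-conjugated A^T path needs dedicated vector code; the conja/conjx pair is reduced the same way for the axpyf half.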
- inc_t a_bump = 5 * lda; - inc_t y_bump = incy - FLT_SIZE; - while (b >= 5) { - // compute dot product of w with 6 rows of a - const scomplex* restrict w_tmp = w; - const scomplex* restrict z_tmp = z; - const scomplex* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const scomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - if (inca == 2 * FLT_SIZE) { - if (conjw == BLIS_NO_CONJUGATE) { - // a unit stride, conjw = no conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv(v16, v18, 
v24, v26, v28, v30); - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a unit stride, conjw = conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - } - } // end conjw == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjw == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjw = no conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" 
: "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a non-unit stride, conjw = conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - 
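The vcmul_vf / vcmacc_vf / vcmul_vv / vcmacc_vv macros used throughout this loop are the standard four-multiply complex product and accumulate, operating on deinterleaved real/imaginary register groups. As a rough guide to how the same arithmetic looks once expressed with intrinsics, here is a sketch using explicit, non-overloaded RVV C intrinsics and f32m2 types; the kernels added by this patch go through overloaded intrinsic wrappers and their own complex-multiply helper macros, whose names and argument order may differ.

#include <riscv_vector.h>

// Sketch: (cr, ci) = (ar + i*ai) * (br + i*bi), vector * vector.
static inline void cmul_vv_sketch(vfloat32m2_t* cr, vfloat32m2_t* ci,
                                  vfloat32m2_t ar, vfloat32m2_t ai,
                                  vfloat32m2_t br, vfloat32m2_t bi, size_t vl)
{
    *cr = __riscv_vfmul_vv_f32m2(ar, br, vl);          // ar*br
    *cr = __riscv_vfnmsac_vv_f32m2(*cr, ai, bi, vl);   // - ai*bi
    *ci = __riscv_vfmul_vv_f32m2(ar, bi, vl);          // ar*bi
    *ci = __riscv_vfmacc_vv_f32m2(*ci, ai, br, vl);    // + ai*br
}

// Sketch: (cr, ci) += (xr + i*xi) * (ar + i*ai), scalar * vector.
static inline void cmacc_vf_sketch(vfloat32m2_t* cr, vfloat32m2_t* ci,
                                   float xr, float xi,
                                   vfloat32m2_t ar, vfloat32m2_t ai, size_t vl)
{
    *cr = __riscv_vfmacc_vf_f32m2(*cr, xr, ar, vl);    // + xr*ar
    *cr = __riscv_vfnmsac_vf_f32m2(*cr, xi, ai, vl);   // - xi*ai
    *ci = __riscv_vfmacc_vf_f32m2(*ci, xr, ai, vl);    // + xr*ai
    *ci = __riscv_vfmacc_vf_f32m2(*ci, xi, ar, vl);    // + xi*ar
}

The _conj variants only flip the sign applied to the second operand's imaginary part, which is also why conjx can be handled up front by negating the loaded imaginary scalar (the FNEG instructions above).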
__asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - } - } // end conjw == BLIS_CONJUGATE - } // end a non-unit stride - - if (incz == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp)); - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - 
vcmul_vf(v28, v29, v0, v2, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : 
"r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vfredusum.vs v18, v18, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v16, v18, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v16, v18, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v16, v18); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v16, v18); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - // a += 5 * lda; - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 5; - } - - if (b > 0) { - // cleanup loop, 0 < b < 5 - const scomplex* restrict w_tmp = w; - const scomplex* restrict z_tmp = z; - const scomplex* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx)); - size_t avl = m; - bool first = true; - while (avl) { - const scomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - __asm__("vmv.v.i v20, 0"); - __asm__("vmv.v.i v22, 0"); - if (inca == 2 * FLT_SIZE) { - if (conjw == BLIS_NO_CONJUGATE) { - // a unit stride, conjw = no conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { 
- case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a unit stride, conjw = conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjw == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjw = no conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - 
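As in the real-domain kernel earlier in this file, the cleanup path handles the last 0 < b < 5 columns with a fall-through switch instead of a loop: execution enters at case b and deliberately runs through every lower case (there are no break statements), so the remaining columns are processed from the highest index down to column 0. Stripped of the vector work, the control flow is just the following, with process_column as a hypothetical stand-in for the per-column load/FMA sequence:

#include <stdio.h>

// Hypothetical stand-in for one column's worth of loads and FMAs.
static void process_column(int j) { printf("processing column %d\n", j); }

// Fall-through cleanup over the trailing b columns, 0 < b < 5.
static void cleanup_sketch(int b)
{
    switch (b) {
        case 4: process_column(3); /* fall through */
        case 3: process_column(2); /* fall through */
        case 2: process_column(1); /* fall through */
        case 1: process_column(0);
    }
}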
__asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a non-unit stride, conjw = conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { 
__asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_CONJUGATE - } // end a non-unit stride - - if (incz == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp)); - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - y_bump = incy + FLT_SIZE; - __asm__("vmv.s.x v31, x0"); - - switch (b) { - case 4: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 3: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - 
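Each column's epilogue follows the shape of the vfredusum.vs pair just above: the partial products accumulated across the strip-mined m-loop are reduced to a single value, which is then combined with alpha and, unless beta is zero, with beta * y[j]. A sketch of that step for the real double-precision case, written with explicit RVV C intrinsics (the kernels this patch introduces use overloaded wrappers and handle the complex case through the complex-multiply helpers):

#include <riscv_vector.h>

// Sketch: finish one column. 'acc' holds the partial products for column j,
// accumulated across the strip-mined loop at LMUL = 4.
static inline void column_epilogue_sketch(double* yj, double alpha, double beta,
                                          vfloat64m4_t acc, size_t m)
{
    size_t vl = __riscv_vsetvl_e64m4(m);                          // clamp m to VLMAX
    vfloat64m1_t zero = __riscv_vfmv_s_f_f64m1(0.0, 1);           // scalar 0 seed
    vfloat64m1_t red  = __riscv_vfredusum_vs_f64m4_f64m1(acc, zero, vl);
    double dot = __riscv_vfmv_f_s_f64m1_f64(red);
    *yj = (beta == 0.0) ? alpha * dot : beta * (*yj) + alpha * dot;
}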
__asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 2: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 1: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v0, v2, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - } - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FMUL -#undef FMADD -#undef FNMSUB -#undef FNEG -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 -#undef VSE - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define FMUL "fmul.d " -#define FMADD "fmadd.d " -#define FNMSUB "fnmsub.d " -#define FNEG "fneg.d " -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " -#define VSE "vse64.v " - -void bli_zdotxaxpyf_sifive_x280_asm - ( - conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict w_, inc_t incw, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* 
restrict y_, inc_t incy, - void* restrict z_, inc_t incz, - const cntx_t* restrict cntx - ) -{ - (void)cntx; - const dcomplex *restrict alpha = alpha_; - const dcomplex *restrict beta = beta_; - const dcomplex *restrict a = a_; - const dcomplex *restrict w = w_; - const dcomplex *restrict x = x_; - dcomplex *restrict y = y_; - dcomplex *restrict z = z_; - - if (b == 0) - return; - else if (m == 0 || (alpha->real == 0. && alpha->imag == 0.)) { - // scale y by beta - if (beta->real == 0. && beta->imag == 0.) - bli_zsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_zscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - // use ft0-ft9 to store 5 entries of x, ft10-ft11 to store alpha, - // and fa6-fa7 to store beta - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "fa6, (%0)" : : "r"(beta)); - __asm__(FLT_LOAD "fa7, %1(%0)" : : "r"(beta), "I"(FLT_SIZE)); - // Reduce to case when A^T is not conjugated, then conjugate - // computed product A^T * w if needed. - conj_t conjatw = BLIS_NO_CONJUGATE; - if (conjat == BLIS_CONJUGATE) { - bli_toggle_conj(&conjat); - bli_toggle_conj(&conjw); - bli_toggle_conj(&conjatw); - } - conj_t conjax = BLIS_NO_CONJUGATE; - if (conja == BLIS_CONJUGATE) { - bli_toggle_conj(&conja); - bli_toggle_conj(&conjx); - bli_toggle_conj(&conjax); - } - inca *= 2 * FLT_SIZE; - lda *= 2 * FLT_SIZE; - incw *= 2 * FLT_SIZE; - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - incz *= 2 * FLT_SIZE; - // these are used to bump a and y, resp. - inc_t a_bump = 5 * lda; - inc_t y_bump = incy - FLT_SIZE; - while (b >= 5) { - // compute dot product of w with 6 rows of a - const dcomplex* restrict w_tmp = w; - const dcomplex* restrict z_tmp = z; - const dcomplex* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const dcomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - if (inca == 2 * FLT_SIZE) { - if (conjw == BLIS_NO_CONJUGATE) { - // a unit stride, conjw = no conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, 
%1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a unit stride, conjw = conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx 
== BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - } - } // end conjw == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjw == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjw = no conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv(v0, v2, 
v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a non-unit stride, conjw = conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, 
ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - } - } // end conjw == BLIS_CONJUGATE - } // end a non-unit stride - - if (incz == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp)); - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v0, v2, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) 
{ - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vfredusum.vs v18, v18, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) 
{ - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v16, v18, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v16, v18, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v16, v18); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v16, v18); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - // a += 5 * lda; - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 5; - } - - if (b > 0) { - // cleanup loop, 0 < b < 5 - const dcomplex* restrict w_tmp = w; - const dcomplex* restrict z_tmp = z; - const dcomplex* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx)); - size_t avl = m; - bool first = true; - while (avl) { - const dcomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - __asm__("vmv.v.i v20, 0"); - __asm__("vmv.v.i v22, 0"); - if (inca == 2 * FLT_SIZE) { - if (conjw == BLIS_NO_CONJUGATE) { - // a unit stride, conjw = no conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, 
v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a unit stride, conjw = conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjw == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjw = no conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, 
%1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a non-unit stride, conjw = conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 
"v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_CONJUGATE - } // end a non-unit stride - - if (incz == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp)); - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - y_bump = incy + FLT_SIZE; - __asm__("vmv.s.x v31, x0"); - - switch (b) { - case 4: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 3: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) 
{ - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 2: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 1: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v0, v2, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - } - } - return; -} diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c new file mode 100644 index 0000000000..dc1bca9f6a --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c @@ -0,0 +1,137 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off + +#include "../../riscv_cmul_macros_intr.h" +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define DOTXAXPYF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxaxpyf_sifive_x280_intr(\ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b, \ + const T* restrict alpha_, \ + const T* restrict a_, inc_t inca, inc_t lda, \ + const T* restrict w_, inc_t incw, \ + const T* restrict x_, inc_t incx, \ + const T* restrict beta_, \ + T* restrict y_, inc_t incy, \ + T* restrict z_, inc_t incz, \ + const cntx_t* restrict cntx \ +) + +#define DOTXAXPYF(...) 
DOTXAXPYF_(__VA_ARGS__) + +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr +#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) +#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr +#define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_dotxaxpyf_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_dotxaxpyf_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m2 +#define FLT_SIZE sizeof(float) + +#include "./bli_dotxaxpyf_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m2 +#define FLT_SIZE sizeof(double) + +#include "./bli_dotxaxpyf_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef SETV_ +#undef SETV +#undef SCALV_ +#undef SCALV + +#undef DOTXAXPYF +#undef DOTXAXPYF_ diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..d8a984064d --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c @@ -0,0 +1,427 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef DOTXAXPYF + +#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL(i) \ + do { \ + acol_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), vl); \ + acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0); \ + acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1); \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED(i) \ + do { \ + acol_vec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), 2 * FLT_SIZE * inca, vl); \ + acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0); \ + acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1); \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ + do { \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc0_r, yacc0_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ + VCMUL_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, acol_vec_r, acol_vec_i, x[0 * incx].real, x[0 * incx].imag, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc1_r, yacc1_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc2_r, yacc2_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc3_r, yacc3_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[3 * incx].real, x[3 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ + do { \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc0_r, yacc0_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ + VCMUL_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, acol_vec_r, acol_vec_i, x[0 * incx].real, x[0 * incx].imag, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc1_r, yacc1_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc2_r, yacc2_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc3_r, yacc3_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[3 * incx].real, x[3 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + } while (0) + +#define 
DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ + do { \ + switch (b) { \ + case 3: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc2_r, yacc2_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + case 2: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc1_r, yacc1_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + case 1: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc0_r, yacc0_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[0 * incx].real, x[0 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + } \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ + do { \ + switch (b) { \ + case 3: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc2_r, yacc2_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + case 2: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc1_r, yacc1_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + case 1: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc0_r, yacc0_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[0 * incx].real, x[0 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + } \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_REDUCE(i) \ + do { \ + RVV_TYPE_F(PREC, m1) dot##i##_r = VFMV_S_F(PREC, m1)(0., 1); \ + RVV_TYPE_F(PREC, m1) dot##i##_i = VFMV_S_F(PREC, m1)(0., 1); \ + dot##i##_r = VF_REDUSUM_VS(PREC, LMUL)(yacc##i##_r, dot##i##_r, m); \ + dot##i##_i = VF_REDUSUM_VS(PREC, LMUL)(yacc##i##_i, dot##i##_i, m); \ + RVV_TYPE_F(PREC, m1) y##i##_r, y##i##_i; \ + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { \ + if (bli_is_conj(conjatw)) \ + VCMUL_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alpha->real, alpha->imag, 1); \ + else \ + VCMUL_VF(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alpha->real, alpha->imag, 1); \ + y[i * incy].real = VFMV_F_S(PREC)(y##i##_r); \ + y[i * incy].imag = VFMV_F_S(PREC)(y##i##_i); \ + } \ + else { \ + PASTEMAC(PRECISION_CHAR, scals)(*beta, y[i * incy]) \ + y##i##_r = VFMV_S_F(PREC, m1)(y[i * incy].real, 1); \ + y##i##_i = VFMV_S_F(PREC, m1)(y[i * incy].imag, 1); \ + if (bli_is_conj(conjatw)) \ + VCMACC_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, alpha->real, alpha->imag, dot##i##_r, dot##i##_i, 1); \ + else \ + VCMACC_VF(PREC, m1, y##i##_r, y##i##_i, alpha->real, alpha->imag, dot##i##_r, dot##i##_i, 1); \ + y[i * incy].real = VFMV_F_S(PREC)(y##i##_r); \ + y[i * incy].imag = VFMV_F_S(PREC)(y##i##_i); \ + } \ + } while (0) + +DOTXAXPYF(PRECISION_CHAR, void) +{ + // Computes y := beta * y + alpha * conjat(A^T) * conjx(x) + + (void) cntx; // Suppress unused parameter warnings + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict w = w_; + const 
DATATYPE* restrict x = x_; + const DATATYPE* restrict beta = beta_; + DATATYPE* restrict y = y_; + DATATYPE* restrict z = z_; + + if (b == 0) return; + if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) + SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + conj_t conjatw = BLIS_NO_CONJUGATE; + conj_t conjax = BLIS_NO_CONJUGATE; + if (bli_is_conj(conjw)) { + bli_toggle_conj(&conjat); + bli_toggle_conj(&conjw); + bli_toggle_conj(&conjatw); + } + if (bli_is_conj(conjx)) { + bli_toggle_conj(&conja); + bli_toggle_conj(&conjx); + bli_toggle_conj(&conjax); + } + + while (b >= 4) { + // Compute dot product of w with 4 columns of a. + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict w_tmp = w; + DATATYPE* restrict z_tmp = z; + RVV_TYPE_F(PREC, LMUL) yacc0_r, yacc0_i, yacc1_r, yacc1_i, + yacc2_r, yacc2_i, yacc3_r, yacc3_i; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) wvec, acol_vec; + RVV_TYPE_F(PREC, LMUL) wvec_r, wvec_i, acol_vec_r, acol_vec_i; + RVV_TYPE_F(PREC, LMUL) zacc_r, zacc_i; + if (incw == 1) + wvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) w_tmp, vl); + else + wvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) w_tmp, 2 * FLT_SIZE * incw, vl); + wvec_r = VGET_V_F(PREC, LMUL, 2)(wvec, 0); + wvec_i = VGET_V_F(PREC, LMUL, 2)(wvec, 1); + + if (first) { + if (bli_is_conj(conjat)) { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ, _CONJ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ, _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ, ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ, ); + } + } + else { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , , _CONJ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, , _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , , ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, , ); + } + } + first = false; + } + else { + if (bli_is_conj(conjat)) { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , _CONJ, _CONJ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ, _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , _CONJ, ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ, ); + } + } + else { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , , _CONJ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, , _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , , ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, , ); + } + } + } + + RVV_TYPE_FX(PREC, LMUL, 2) zvec; + if (incz == 1) + zvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, vl); + else + zvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, 2 * FLT_SIZE * incz, vl); + RVV_TYPE_F(PREC, LMUL) zvec_r = VGET_V_F(PREC, LMUL, 2)(zvec, 0); + RVV_TYPE_F(PREC, LMUL) zvec_i = VGET_V_F(PREC, LMUL, 2)(zvec, 1); + if (bli_is_conj(conjax)) + VCMACC_VF_CONJ(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl); + else + VCMACC_VF(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl); + zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_r); + zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_i); + if (incz == 1) + VSSEG2_V_F(PREC, LMUL, 
2)((BASE_DT*) z_tmp, zvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, 2 * FLT_SIZE * incz, zvec, vl); + + a_tmp += vl * inca; + w_tmp += vl * incw; + z_tmp += vl * incz; + avl -= vl; + } + + DOTXAXPYF_SIFIVE_X280_REDUCE(0); + DOTXAXPYF_SIFIVE_X280_REDUCE(1); + DOTXAXPYF_SIFIVE_X280_REDUCE(2); + DOTXAXPYF_SIFIVE_X280_REDUCE(3); + + a += 4 * lda; + x += 4 * incx; + y += 4 * incy; + b -= 4; + } + + if (b > 0) { + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict w_tmp = w; + DATATYPE* restrict z_tmp = z; + RVV_TYPE_F(PREC, LMUL) yacc0_r, yacc0_i, yacc1_r, yacc1_i, yacc2_r, yacc2_i; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) wvec, acol_vec; + RVV_TYPE_F(PREC, LMUL) wvec_r, wvec_i, acol_vec_r, acol_vec_i; + RVV_TYPE_F(PREC, LMUL) zacc_r = VFMV_V_F(PREC, LMUL)(0, vl); + RVV_TYPE_F(PREC, LMUL) zacc_i = VFMV_V_F(PREC, LMUL)(0, vl); + if (incw == 1) + wvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) w_tmp, vl); + else + wvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) w_tmp, 2 * FLT_SIZE * incw, vl); + wvec_r = VGET_V_F(PREC, LMUL, 2)(wvec, 0); + wvec_i = VGET_V_F(PREC, LMUL, 2)(wvec, 1); + + if (first) { + if (bli_is_conj(conjat)) { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ, _CONJ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ, _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ, ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ, ); + } + } + else { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , , _CONJ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, , _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , , ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, , ); + } + } + first = false; + } + else { + if (bli_is_conj(conjat)) { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , _CONJ, _CONJ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ, _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , _CONJ, ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ, ); + } + } + else { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , , _CONJ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, , _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , , ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, , ); + } + } + } + + RVV_TYPE_FX(PREC, LMUL, 2) zvec; + if (incz == 1) + zvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, vl); + else + zvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, 2 * FLT_SIZE * incz, vl); + RVV_TYPE_F(PREC, LMUL) zvec_r = VGET_V_F(PREC, LMUL, 2)(zvec, 0); + RVV_TYPE_F(PREC, LMUL) zvec_i = VGET_V_F(PREC, LMUL, 2)(zvec, 1); + if (bli_is_conj(conjax)) + VCMACC_VF_CONJ(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl); + else + VCMACC_VF(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl); + zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_r); + zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_i); + if (incz == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, zvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, 2 * FLT_SIZE * incz, zvec, vl); + + a_tmp += vl * inca; + w_tmp += vl * incw; + z_tmp += vl * incz; + avl -= vl; + } + + switch (b) { + case 3: + 
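+            // Intentional fall-through: for the cleanup tile (0 < b < 4), reduce accumulators b-1 down to 0 and store each result into y.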
DOTXAXPYF_SIFIVE_X280_REDUCE(2); + case 2: + DOTXAXPYF_SIFIVE_X280_REDUCE(1); + case 1: + DOTXAXPYF_SIFIVE_X280_REDUCE(0); + } + } + return; +} + +#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL +#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED +#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST +#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY +#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST +#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY +#undef DOTXAXPYF_SIFIVE_X280_REDUCE + +#endif // DOTXAXPYF diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c new file mode 100644 index 0000000000..57ef4f7447 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c @@ -0,0 +1,283 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef DOTXAXPYF + +#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL(i) \ + do { \ + acol_vec = VLE_V_F(PREC, LMUL)(a_tmp + i * lda, vl); \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED(i) \ + do { \ + acol_vec = VLSE_V_F(PREC, LMUL)(a_tmp + i * lda, FLT_SIZE * inca, vl); \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF) \ + do { \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + yacc0 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ + zacc = VFMUL_VF(PREC, LMUL)(acol_vec, x[0 * incx], vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + yacc1 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + yacc2 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + yacc3 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[3 * incx], acol_vec, vl); \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY(LOAD_SUF) \ + do { \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + yacc0 = VFMACC_VV_TU(PREC, LMUL)(yacc0, acol_vec, wvec, vl); \ + zacc = VFMUL_VF(PREC, LMUL)(acol_vec, x[0 * incx], vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + yacc1 = VFMACC_VV_TU(PREC, LMUL)(yacc1, acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + yacc2 = VFMACC_VV_TU(PREC, LMUL)(yacc2, acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + yacc3 = VFMACC_VV_TU(PREC, LMUL)(yacc3, acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[3 * incx], acol_vec, vl); \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF) \ + do { \ + switch (b) { \ + case 3: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + yacc2 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \ + case 2: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + yacc1 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \ + case 1: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + yacc0 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[0 * incx], acol_vec, vl); \ + } \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF) \ + do { \ + switch (b) { \ + case 3: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + yacc2 = VFMACC_VV_TU(PREC, LMUL)(yacc2, acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \ + case 2: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + yacc1 = VFMACC_VV_TU(PREC, LMUL)(yacc1, acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \ + case 1: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + yacc0 = VFMACC_VV_TU(PREC, LMUL)(yacc0, acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[0 * incx], acol_vec, vl); \ + } \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_REDUCE(i) \ + do { \ + RVV_TYPE_F(PREC, m1) dot##i = VFMV_S_F(PREC, m1)(0., 1); \ + dot##i = VF_REDUSUM_VS(PREC, LMUL)(yacc##i, dot##i, m); \ + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { \ + dot##i = VFMUL_VF(PREC, m1)(dot##i, *alpha, 1); \ + y[i * incy] = VFMV_F_S(PREC)(dot##i); \ + } 
\ + else { \ + y[i * incy] *= *beta; \ + RVV_TYPE_F(PREC, m1) y##i = VFMV_S_F(PREC, m1)(y[i * incy], 1); \ + y##i = VFMACC_VF(PREC, m1)(y##i, *alpha, dot##i, 1); \ + y[i * incy] = VFMV_F_S(PREC)(y##i); \ + } \ + } while (0) + +DOTXAXPYF(PRECISION_CHAR, void) +{ + // Computes y := beta * y + alpha * conjat(A^T) * conjw(w) + // z := z + alpha * conja(A) * conjx(x) + + (void) conjat; // Suppress unused parameter warnings + (void) conja; + (void) conjw; + (void) conjx; + (void) cntx; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict w = w_; + const DATATYPE* restrict x = x_; + const DATATYPE* restrict beta = beta_; + DATATYPE* restrict y = y_; + DATATYPE* restrict z = z_; + + if (b == 0) return; + if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) + SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + while (b >= 4) { + // Process 4 columns of a at a time. + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict w_tmp = w; + DATATYPE* restrict z_tmp = z; + RVV_TYPE_F(PREC, LMUL) yacc0, yacc1, yacc2, yacc3; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) wvec, acol_vec; + RVV_TYPE_F(PREC, LMUL) zacc; + if (incw == 1) + wvec = VLE_V_F(PREC, LMUL)(w_tmp, vl); + else + wvec = VLSE_V_F(PREC, LMUL)(w_tmp, FLT_SIZE * incw, vl); + if (first) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED); + first = false; + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY( ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED); + } + + RVV_TYPE_F(PREC, LMUL) zvec; + if (incz == 1) + zvec = VLE_V_F(PREC, LMUL)(z_tmp, vl); + else + zvec = VLSE_V_F(PREC, LMUL)(z_tmp, FLT_SIZE * incz, vl); + zvec = VFMACC_VF(PREC, LMUL)(zvec, *alpha, zacc, vl); + if (incz == 1) + VSE_V_F(PREC, LMUL)(z_tmp, zvec, vl); + else + VSSE_V_F(PREC, LMUL)(z_tmp, FLT_SIZE * incz, zvec, vl); + + a_tmp += vl * inca; + w_tmp += vl * incw; + z_tmp += vl * incz; + avl -= vl; + } + + DOTXAXPYF_SIFIVE_X280_REDUCE(0); + DOTXAXPYF_SIFIVE_X280_REDUCE(1); + DOTXAXPYF_SIFIVE_X280_REDUCE(2); + DOTXAXPYF_SIFIVE_X280_REDUCE(3); + + a += 4 * lda; + x += 4 * incx; + y += 4 * incy; + b -= 4; + } + + if (b > 0) { + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict w_tmp = w; + DATATYPE* restrict z_tmp = z; + RVV_TYPE_F(PREC, LMUL) yacc0, yacc1, yacc2; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) wvec, acol_vec; + RVV_TYPE_F(PREC, LMUL) zacc = VFMV_V_F(PREC, LMUL)(0, vl); + if (incw == 1) + wvec = VLE_V_F(PREC, LMUL)(w_tmp, vl); + else + wvec = VLSE_V_F(PREC, LMUL)(w_tmp, FLT_SIZE * incw, vl); + if (first) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED); + first = false; + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED); + } + + RVV_TYPE_F(PREC, LMUL) zvec; + if (incz == 1) + zvec = VLE_V_F(PREC, LMUL)(z_tmp, vl); + else + zvec = VLSE_V_F(PREC, LMUL)(z_tmp, FLT_SIZE * incz, vl); + zvec = VFMACC_VF(PREC, LMUL)(zvec, *alpha, zacc, vl); + if (incz == 1) + VSE_V_F(PREC, LMUL)(z_tmp, zvec, vl); + else + VSSE_V_F(PREC, LMUL)(z_tmp, FLT_SIZE * incz, zvec, vl); + + a_tmp += 
vl * inca; + w_tmp += vl * incw; + z_tmp += vl * incz; + avl -= vl; + } + + switch (b) { + case 3: + DOTXAXPYF_SIFIVE_X280_REDUCE(2); + case 2: + DOTXAXPYF_SIFIVE_X280_REDUCE(1); + case 1: + DOTXAXPYF_SIFIVE_X280_REDUCE(0); + } + } + return; +} + +#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL +#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED +#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST +#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY +#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST +#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY +#undef DOTXAXPYF_SIFIVE_X280_REDUCE + +#endif // DOTXAXPYF diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c deleted file mode 100644 index 5ac2d41667..0000000000 --- a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c +++ /dev/null @@ -1,2645 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" -#include "../riscv_cmul_macros_asm.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define FMUL "fmul.s " -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_sdotxf_sifive_x280_asm( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* restrict y_, inc_t incy, - const cntx_t* restrict cntx - ) { - // think of a as b x m row major matrix (i.e. 
rsa = lda, csa = inca) - // we process 6 elements of y per iteration, using y_tmp to load/store from - // y a points to the 6 x m block of a needed this iteration each 6 x m block - // is broken into 6 x vl blocks a_col points to the current 6 x vl block, we - // use x_tmp to load from x a_row is used to load each of the 6 rows of this - // 6 x vl block - (void)conjat; - (void)conjx; - (void)cntx; - const float* restrict alpha = alpha_; - const float* restrict a = a_; - const float* restrict x = x_; - const float* restrict beta = beta_; - float* restrict y = y_; - - if (b == 0) - return; - else if (m == 0 || *alpha == 0.f) { - // scale y by beta - if (*beta == 0.f) - bli_ssetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_sscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - inca *= FLT_SIZE; - lda *= FLT_SIZE; - incx *= FLT_SIZE; - incy *= FLT_SIZE; - inc_t a_bump = 6 * lda; // to bump a down 6 rows - - while (b >= 6) { - // compute dot product of x with 6 rows of a - const float* restrict x_tmp = x; - const float* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const float* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == FLT_SIZE) { - // a unit stride - if (first) { - __asm__(VLE "v0, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v0, v0, v28"); - __asm__(VLE "v4, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - __asm__(VLE "v8, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v8, v8, v28"); - __asm__(VLE "v12, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - __asm__(VLE "v16, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - __asm__(VLE "v20, (%0)" : : "r"(a_row)); - __asm__("vfmul.vv v20, v20, v28"); - first = false; - } - else { - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v0, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vv v20, v24, v28"); - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v0, v0, v28"); - __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), 
"r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v8, v8, v28"); - __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - __asm__(VLSE "v20, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmul.vv v20, v20, v28"); - first = false; - } - else { - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v0, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vv v20, v24, v28"); - } - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - 
__asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v16, v16, ft10"); - __asm__(VSE "v16, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v16"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v20, v20, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v20, v20, ft10"); - __asm__(VSE "v20, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v20"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - // a += 6 * lda; - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 6; - } - - if (b > 0) { - // compute dot product of x with remaining < 6 rows of a - const float* restrict x_tmp = x; - // a_col will move along the last row of a! - const float* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - size_t avl = m; - bool first = true; - while (avl) { - const float* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == FLT_SIZE) { - // a unit stride - if (first) { - switch (b) { - case 5: - __asm__(VLE "v16, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - case 4: - __asm__(VLE "v12, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - case 3: - __asm__(VLE "v8, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v8, v8, v28"); - case 2: - __asm__(VLE "v4, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - case 1: - __asm__(VLE "v0, (%0)" : : "r"(a_row)); - __asm__("vfmul.vv v0, v0, v28"); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - case 4: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - switch (b) { - case 5: - __asm__(VLSE "v16, (%0), 
%1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - case 4: - __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - case 3: - __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v8, v8, v28"); - case 2: - __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - case 1: - __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmul.vv v0, v0, v28"); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - case 4: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - __asm__("vmv.s.x v31, x0"); - switch (b) { - case 5: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v16, v16, ft10"); - __asm__(VSE "v16, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v16"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 4: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 3: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 2: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, 
v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 1: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - } - } // end cleanup - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FMUL -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define FMUL "fmul.d " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_ddotxf_sifive_x280_asm( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* restrict y_, inc_t incy, - const cntx_t* restrict cntx - ) { - // think of a as b x m row major matrix (i.e. rsa = lda, csa = inca) - // we process 6 elements of y per iteration, using y_tmp to load/store from - // y a points to the 6 x m block of a needed this iteration each 6 x m block - // is broken into 6 x vl blocks a_col points to the current 6 x vl block, we - // use x_tmp to load from x a_row is used to load each of the 6 rows of this - // 6 x vl block - (void)conjat; - (void)conjx; - (void)cntx; - const double* restrict alpha = alpha_; - const double* restrict a = a_; - const double* restrict x = x_; - const double* restrict beta = beta_; - double* restrict y = y_; - - if (b == 0) - return; - else if (m == 0 || *alpha == 0.) { - // scale y by beta - if (*beta == 0.) 
- bli_dsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_dscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - inca *= FLT_SIZE; - lda *= FLT_SIZE; - incx *= FLT_SIZE; - incy *= FLT_SIZE; - inc_t a_bump = 6 * lda; // to bump a down 6 rows - - while (b >= 6) { - // compute dot product of x with 6 rows of a - const double* restrict x_tmp = x; - const double* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const double* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == FLT_SIZE) { - // a unit stride - if (first) { - __asm__(VLE "v0, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v0, v0, v28"); - __asm__(VLE "v4, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - __asm__(VLE "v8, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v8, v8, v28"); - __asm__(VLE "v12, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - __asm__(VLE "v16, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - __asm__(VLE "v20, (%0)" : : "r"(a_row)); - __asm__("vfmul.vv v20, v20, v28"); - first = false; - } - else { - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v0, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vv v20, v24, v28"); - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v0, v0, v28"); - __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v8, v8, v28"); - __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - __asm__(VLSE "v20, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmul.vv v20, v20, v28"); - first = false; - } - else { - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v0, v24, v28"); - 
__asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vv v20, v24, v28"); - } - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) 
{ - __asm__("vfmul.vf v16, v16, ft10"); - __asm__(VSE "v16, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v16"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v20, v20, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v20, v20, ft10"); - __asm__(VSE "v20, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v20"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - // a += 6 * lda; - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 6; - } - - if (b > 0) { - // compute dot product of x with remaining < 6 rows of a - const double* restrict x_tmp = x; - // a_col will move along the last row of a! - const double* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - size_t avl = m; - bool first = true; - while (avl) { - const double* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == FLT_SIZE) { - // a unit stride - if (first) { - switch (b) { - case 5: - __asm__(VLE "v16, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - case 4: - __asm__(VLE "v12, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - case 3: - __asm__(VLE "v8, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v8, v8, v28"); - case 2: - __asm__(VLE "v4, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - case 1: - __asm__(VLE "v0, (%0)" : : "r"(a_row)); - __asm__("vfmul.vv v0, v0, v28"); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - case 4: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - switch (b) { - case 5: - __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - case 4: - __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - case 3: - __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - 
__asm__("vfmul.vv v8, v8, v28"); - case 2: - __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - case 1: - __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmul.vv v0, v0, v28"); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - case 4: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - __asm__("vmv.s.x v31, x0"); - switch (b) { - case 5: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v16, v16, ft10"); - __asm__(VSE "v16, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v16"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 4: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 3: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 2: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) 
{ - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 1: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - } - } // end cleanup - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FMUL -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define FMUL "fmul.s " -#define FMADD "fmadd.s " -#define FNMSUB "fnmsub.s " -#define VLSEG2 "vlseg2e32.v " -#define VLSSEG2 "vlsseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " -#define VSE "vse32.v " - -void bli_cdotxf_sifive_x280_asm( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* restrict y_, inc_t incy, - const cntx_t* restrict cntx - ) { - (void)cntx; - const scomplex* restrict alpha = alpha_; - const scomplex* restrict a = a_; - const scomplex* restrict x = x_; - const scomplex* restrict beta = beta_; - scomplex* restrict y = y_; - - if (b == 0) - return; - else if (m == 0 || (alpha->real == 0.f && alpha->imag == 0.f)) { - // scale y by beta - if (beta->real == 0.f && beta->imag == 0.f) - bli_csetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_cscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(beta)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(beta), "I"(FLT_SIZE)); - // Reduce to case when A^T is not conjugated, then conjugate - // computed product A^T * x if needed. - conj_t conjatx = BLIS_NO_CONJUGATE; - if (conjat == BLIS_CONJUGATE) { - bli_toggle_conj(&conjat); - bli_toggle_conj(&conjx); - bli_toggle_conj(&conjatx); - } - inca *= 2 * FLT_SIZE; - lda *= 2 * FLT_SIZE; - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - // these are used to bump a and y, resp. 
- inc_t a_bump = 6 * lda; - inc_t y_bump = incy - FLT_SIZE; - while (b >= 6) { - // compute dot product of x with 6 rows of a - const scomplex* restrict x_tmp = x; - const scomplex* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const scomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == 2 * FLT_SIZE) { - if (conjx == BLIS_NO_CONJUGATE) { - // a unit stride, conjx = no conj - if (first) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx == BLIS_CONJUGATE - // a unit stride, conjx = conj - if (first) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv_conj(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, 
v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv_conj(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjx == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjx = no conj - if (first) { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmul_vv(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx = BLIS_CONJUGATE - // a non-unit stride, conjx = conj - if (first) { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - 
vcmul_vv_conj(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv_conj(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_CONJUGATE - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v0, v2, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft8, ft9); - } - else { - 
vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vfredusum.vs v18, v18, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v16, v18, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v16, v18); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v20, v20, v31"); - __asm__("vfredusum.vs v22, v22, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v20, v22, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v20, v22, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v20, v22); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v20, v22); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); 
- - // a += 6 * lda; - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 6; - } - - if (b > 0) { - // cleanup loop, 0 < b < 6 - const scomplex* restrict x_tmp = x; - const scomplex* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - size_t avl = m; - bool first = true; - while (avl) { - const scomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == 2 * FLT_SIZE) { - if (conjx == BLIS_NO_CONJUGATE) { - // a unit stride, conjx = no conj - if (first) { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx == BLIS_CONJUGATE - // a unit stride, conjx = conj - if (first) { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : 
"r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjx == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjx = no conj - if (first) { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx == BLIS_CONJUGATE - // a non-unit stride, conjx = conj - if (first) { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), 
"r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_CONJUGATE - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - y_bump = incy + FLT_SIZE; - __asm__("vmv.s.x v31, x0"); - - switch (b) { - case 5: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vfredusum.vs v18, v18, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v16, v18, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v16, v18); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 4: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 3: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10); - } - } - __asm__(VSE "v28, 
(%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 2: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 1: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v0, v2, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - } - } // end cleanup - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FMUL -#undef FMADD -#undef FNMSUB -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 -#undef VSE - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define FMUL "fmul.d " -#define FMADD "fmadd.d " -#define FNMSUB "fnmsub.d " -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " -#define VSE "vse64.v " - -void bli_zdotxf_sifive_x280_asm( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* restrict y_, inc_t incy, - const cntx_t* restrict cntx - ) { - (void)cntx; - const dcomplex* restrict alpha = alpha_; - const dcomplex* restrict a = a_; - const dcomplex* restrict x = x_; - const dcomplex* restrict beta = beta_; - dcomplex* restrict y = y_; - - if (b == 0) - return; - else if (m == 0 || (alpha->real == 0. && alpha->imag == 0.)) { - // scale y by beta - if (beta->real == 0. && beta->imag == 0.) 
- bli_zsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_zscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(beta)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(beta), "I"(FLT_SIZE)); - // Reduce to case when A^T is not conjugated, then conjugate - // computed product A^T * x if needed. - conj_t conjatx = BLIS_NO_CONJUGATE; - if (conjat == BLIS_CONJUGATE) { - bli_toggle_conj(&conjat); - bli_toggle_conj(&conjx); - bli_toggle_conj(&conjatx); - } - inca *= 2 * FLT_SIZE; - lda *= 2 * FLT_SIZE; - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - // these are used to bump a and y, resp. - inc_t a_bump = 6 * lda; - inc_t y_bump = incy - FLT_SIZE; - while (b >= 6) { - // compute dot product of x with 6 rows of a - const dcomplex* restrict x_tmp = x; - const dcomplex* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const dcomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == 2 * FLT_SIZE) { - if (conjx == BLIS_NO_CONJUGATE) { - // a unit stride, conjx = no conj - if (first) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx == BLIS_CONJUGATE - // a unit stride, conjx = conj - if (first) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - 
__asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv_conj(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv_conj(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjx == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjx = no conj - if (first) { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmul_vv(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx = BLIS_CONJUGATE - // a non-unit stride, conjx = conj - if (first) { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, 
%0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmul_vv_conj(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv_conj(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_CONJUGATE - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v0, v2, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) 
{ - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vfredusum.vs v18, v18, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) 
{ - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v16, v18, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v16, v18); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v20, v20, v31"); - __asm__("vfredusum.vs v22, v22, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v20, v22, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v20, v22, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v20, v22); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v20, v22); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - // a += 6 * lda; - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 6; - } - - if (b > 0) { - // cleanup loop, 0 < b < 6 - const dcomplex* restrict x_tmp = x; - const dcomplex* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - size_t avl = m; - bool first = true; - while (avl) { - const dcomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == 2 * FLT_SIZE) { - if (conjx == BLIS_NO_CONJUGATE) { - // a unit stride, conjx = no conj - if (first) { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : 
"r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx == BLIS_CONJUGATE - // a unit stride, conjx = conj - if (first) { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjx == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjx = no conj - if (first) { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 
2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx == BLIS_CONJUGATE - // a non-unit stride, conjx = conj - if (first) { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_CONJUGATE - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - y_bump = incy + FLT_SIZE; - __asm__("vmv.s.x v31, x0"); - - switch (b) { - case 5: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vfredusum.vs v18, v18, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) 
{ - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v16, v18, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v16, v18); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 4: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 3: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 2: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) 
{ - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 1: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v0, v2, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - } - } // end cleanup - return; -} diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c new file mode 100644 index 0000000000..9396515b30 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c @@ -0,0 +1,132 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "../../riscv_cmul_macros_intr.h" +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define DOTXF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxf_sifive_x280_intr(\ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b, \ + const T* restrict alpha_, \ + const T* restrict a_, inc_t inca, inc_t lda, \ + const T* restrict x_, inc_t incx, \ + const T* restrict beta_, \ + T* restrict y_, inc_t incy, \ + const cntx_t* restrict cntx \ +) + +#define DOTXF(...) DOTXF_(__VA_ARGS__) + +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr +#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) +#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr +#define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_dotxf_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_dotxf_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m2 +#define FLT_SIZE sizeof(float) + +#include "./bli_dotxf_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m2 +#define FLT_SIZE sizeof(double) + +#include "./bli_dotxf_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef SETV_ +#undef SETV +#undef SCALV_ +#undef SCALV + +#undef DOTXF +#undef DOTXF_ diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..463a111f07 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c @@ -0,0 +1,324 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef DOTXF + +#define DOTXF_SIFIVE_X280_LOAD_ACOL(i) \ + do { \ + acol_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), vl); \ + acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0); \ + acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1); \ + } while (0) + +#define DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED(i) \ + do { \ + acol_vec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), 2 * FLT_SIZE * inca, vl); \ + acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0); \ + acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1); \ + } while (0) + +#define DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF, CONJ_SUF) \ + do { \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc0_r, acc0_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc1_r, acc1_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc2_r, acc2_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc3_r, acc3_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc4_r, acc4_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc5_r, acc5_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + } while (0) + +#define DOTXF_SIFIVE_X280_LOOP_BODY(LOAD_SUF, CONJ_SUF) \ + do { \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc0_r, acc0_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc1_r, acc1_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc2_r, acc2_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc3_r, acc3_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc4_r, acc4_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + 
DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc5_r, acc5_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + } while (0) + +#define DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF, CONJ_SUF) \ + do { \ + switch (b) { \ + case 5: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc4_r, acc4_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + case 4: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc3_r, acc3_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + case 3: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc2_r, acc2_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + case 2: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc1_r, acc1_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + case 1: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc0_r, acc0_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + } \ + } while (0) + +#define DOTXF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF, CONJ_SUF) \ + do { \ + switch (b) { \ + case 5: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc4_r, acc4_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + case 4: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc3_r, acc3_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + case 3: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc2_r, acc2_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + case 2: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc1_r, acc1_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + case 1: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc0_r, acc0_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + } \ + } while (0) + +#define DOTXF_SIFIVE_X280_REDUCE(i) \ + do { \ + RVV_TYPE_F(PREC, m1) dot##i##_r = VFMV_S_F(PREC, m1)(0., 1); \ + RVV_TYPE_F(PREC, m1) dot##i##_i = VFMV_S_F(PREC, m1)(0., 1); \ + dot##i##_r = VF_REDUSUM_VS(PREC, LMUL)(acc##i##_r, dot##i##_r, m); \ + dot##i##_i = VF_REDUSUM_VS(PREC, LMUL)(acc##i##_i, dot##i##_i, m); \ + RVV_TYPE_F(PREC, m1) y##i##_r, y##i##_i; \ + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { \ + if (bli_is_conj(conjatx)) \ + VCMUL_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alpha->real, alpha->imag, 1); \ + else \ + VCMUL_VF(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alpha->real, alpha->imag, 1); \ + y[i * incy].real = VFMV_F_S(PREC)(y##i##_r); \ + y[i * incy].imag = VFMV_F_S(PREC)(y##i##_i); \ + } \ + else { \ + PASTEMAC(PRECISION_CHAR, scals)(*beta, y[i * incy]) \ + y##i##_r = VFMV_S_F(PREC, m1)(y[i * incy].real, 1); \ + y##i##_i = VFMV_S_F(PREC, m1)(y[i * incy].imag, 1); \ + if (bli_is_conj(conjatx)) \ + VCMACC_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, alpha->real, alpha->imag, dot##i##_r, dot##i##_i, 1); \ + else \ + VCMACC_VF(PREC, m1, y##i##_r, y##i##_i, alpha->real, alpha->imag, dot##i##_r, dot##i##_i, 1); \ + y[i * incy].real = VFMV_F_S(PREC)(y##i##_r); \ + y[i * incy].imag = VFMV_F_S(PREC)(y##i##_i); \ + } \ + } while (0) + +DOTXF(PRECISION_CHAR, void) +{ + // Computes y := beta * y + alpha * conjat(A^T) * conjx(x) + + (void) cntx; // Suppress unused parameter warnings + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict x = x_; + const DATATYPE* restrict beta = beta_; 
+ DATATYPE* restrict y = y_; + + if (b == 0) return; + if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) + SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + conj_t conjatx = BLIS_NO_CONJUGATE; + if (bli_is_conj(conjx)) { + bli_toggle_conj(&conjat); + bli_toggle_conj(&conjx); + bli_toggle_conj(&conjatx); + } + + while (b >= 6) { + // Compute dot product of x with 6 columns of a. + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict x_tmp = x; + RVV_TYPE_F(PREC, LMUL) acc0_r, acc0_i, acc1_r, acc1_i, acc2_r, acc2_i, + acc3_r, acc3_i, acc4_r, acc4_i, acc5_r, acc5_i; + RVV_TYPE_FX(PREC, LMUL, 2) xvec, acol_vec; + RVV_TYPE_F(PREC, LMUL) xvec_r, xvec_i, acol_vec_r, acol_vec_i; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_tmp, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_tmp, 2 * FLT_SIZE * incx, vl); + xvec_r = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_i = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + + if (first) { + if (bli_is_conj(conjat)) { + if (inca == 1) + DOTXF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ); + else + DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ); + } + else { + if (inca == 1) + DOTXF_SIFIVE_X280_LOOP_BODY_FIRST( , ); + else + DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, ); + } + first = false; + } + else { + if (bli_is_conj(conjat)) { + if (inca == 1) + DOTXF_SIFIVE_X280_LOOP_BODY( , _CONJ); + else + DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ); + } + else { + if (inca == 1) + DOTXF_SIFIVE_X280_LOOP_BODY( , ); + else + DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED, ); + } + } + + a_tmp += vl * inca; + x_tmp += vl * incx; + avl -= vl; + } + + DOTXF_SIFIVE_X280_REDUCE(0); + DOTXF_SIFIVE_X280_REDUCE(1); + DOTXF_SIFIVE_X280_REDUCE(2); + DOTXF_SIFIVE_X280_REDUCE(3); + DOTXF_SIFIVE_X280_REDUCE(4); + DOTXF_SIFIVE_X280_REDUCE(5); + + a += 6 * lda; + y += 6 * incy; + b -= 6; + } + + if (b > 0) { + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict x_tmp = x; + RVV_TYPE_F(PREC, LMUL) acc0_r, acc0_i, acc1_r, acc1_i, acc2_r, acc2_i, + acc3_r, acc3_i, acc4_r, acc4_i; + RVV_TYPE_FX(PREC, LMUL, 2) xvec, acol_vec; + RVV_TYPE_F(PREC, LMUL) xvec_r, xvec_i, acol_vec_r, acol_vec_i; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_tmp, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_tmp, 2 * FLT_SIZE * incx, vl); + xvec_r = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_i = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + + if (first) { + if (bli_is_conj(conjat)) { + if (inca == 1) + DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ); + else + DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ); + } + else { + if (inca == 1) + DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST( , ); + else + DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, ); + } + first = false; + } + else { + if (bli_is_conj(conjat)) { + if (inca == 1) + DOTXF_SIFIVE_X280_CLEANUP_BODY( , _CONJ); + else + DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ); + } + else { + if (inca == 1) + DOTXF_SIFIVE_X280_CLEANUP_BODY( , ); + else + DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, ); + } + } + + a_tmp += vl * inca; + x_tmp += vl * incx; + avl -= vl; + } + + switch (b) { + case 5: + DOTXF_SIFIVE_X280_REDUCE(4); + case 4: + DOTXF_SIFIVE_X280_REDUCE(3); + case 3: + 
DOTXF_SIFIVE_X280_REDUCE(2); + case 2: + DOTXF_SIFIVE_X280_REDUCE(1); + case 1: + DOTXF_SIFIVE_X280_REDUCE(0); + } + } + return; +} + +#undef DOTXF_SIFIVE_X280_LOAD_ACOL +#undef DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED +#undef DOTXF_SIFIVE_X280_LOOP_BODY_FIRST +#undef DOTXF_SIFIVE_X280_LOOP_BODY +#undef DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST +#undef DOTXF_SIFIVE_X280_CLEANUP_BODY +#undef DOTXF_SIFIVE_X280_REDUCE + +#endif // DOTXF diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c new file mode 100644 index 0000000000..8286e2476f --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c @@ -0,0 +1,262 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef DOTXF + +#define DOTXF_SIFIVE_X280_LOAD_ACOL(i) \ + do { \ + acol_vec = VLE_V_F(PREC, LMUL)(a_tmp + i * lda, vl); \ + } while (0) + +#define DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED(i) \ + do { \ + acol_vec = VLSE_V_F(PREC, LMUL)(a_tmp + i * lda, FLT_SIZE * inca, vl); \ + } while (0) + +#define DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF) \ + do { \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + acc0 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + acc1 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + acc2 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + acc3 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + acc4 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5); \ + acc5 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + } while (0) + +#define DOTXF_SIFIVE_X280_LOOP_BODY(LOAD_SUF) \ + do { \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + acc0 = VFMACC_VV_TU(PREC, LMUL)(acc0, acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + acc1 = VFMACC_VV_TU(PREC, LMUL)(acc1, acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + acc2 = VFMACC_VV_TU(PREC, LMUL)(acc2, acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + acc3 = VFMACC_VV_TU(PREC, LMUL)(acc3, acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + acc4 = VFMACC_VV_TU(PREC, LMUL)(acc4, acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5); \ + acc5 = VFMACC_VV_TU(PREC, LMUL)(acc5, acol_vec, xvec, vl); \ + } while (0) + +#define DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF) \ + do { \ + switch (b) { \ + case 5: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + acc4 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + case 4: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + acc3 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + case 3: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + acc2 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + case 2: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + acc1 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + case 1: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + acc0 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + } \ + } while (0) + +#define DOTXF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF) \ + do { \ + switch (b) { \ + case 5: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + acc4 = VFMACC_VV_TU(PREC, LMUL)(acc4, acol_vec, xvec, vl); \ + case 4: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + acc3 = VFMACC_VV_TU(PREC, LMUL)(acc3, acol_vec, xvec, vl); \ + case 3: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + acc2 = VFMACC_VV_TU(PREC, LMUL)(acc2, acol_vec, xvec, vl); \ + case 2: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + acc1 = VFMACC_VV_TU(PREC, LMUL)(acc1, acol_vec, xvec, vl); \ + case 1: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + acc0 = VFMACC_VV_TU(PREC, LMUL)(acc0, acol_vec, xvec, vl); \ + } \ + } while (0) + +#define DOTXF_SIFIVE_X280_REDUCE(i) \ + do { \ + RVV_TYPE_F(PREC, m1) dot##i = VFMV_S_F(PREC, m1)(0., 1); \ + dot##i = VF_REDUSUM_VS(PREC, LMUL)(acc##i, dot##i, m); \ + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { \ + dot##i = VFMUL_VF(PREC, m1)(dot##i, *alpha, 1); \ + y[i * incy] = VFMV_F_S(PREC)(dot##i); \ + } \ + else { \ + y[i * incy] *= *beta; \ + RVV_TYPE_F(PREC, m1) y##i = VFMV_S_F(PREC, m1)(y[i * incy], 1); \ + y##i = VFMACC_VF(PREC, m1)(y##i, *alpha, 
dot##i, 1); \ + y[i * incy] = VFMV_F_S(PREC)(y##i); \ + } \ + } while (0) + +DOTXF(PRECISION_CHAR, void) +{ + // Computes y := beta * y + alpha * conjat(A^T) * conjx(x) + + (void) conjat; // Suppress unused parameter warnings + (void) conjx; + (void) cntx; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict x = x_; + const DATATYPE* restrict beta = beta_; + DATATYPE* restrict y = y_; + + if (b == 0) return; + if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) + SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + while (b >= 6) { + // Compute dot product of x with 6 columns of a. + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict x_tmp = x; + RVV_TYPE_F(PREC, LMUL) acc0, acc1, acc2, acc3, acc4, acc5; + RVV_TYPE_F(PREC, LMUL) xvec, acol_vec; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x_tmp, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x_tmp, FLT_SIZE * incx, vl); + if (first) { + if (inca == 1) + DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(); + else + DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED); + first = false; + } + else { + if (inca == 1) + DOTXF_SIFIVE_X280_LOOP_BODY(); + else + DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED); + } + + a_tmp += vl * inca; + x_tmp += vl * incx; + avl -= vl; + } + + DOTXF_SIFIVE_X280_REDUCE(0); + DOTXF_SIFIVE_X280_REDUCE(1); + DOTXF_SIFIVE_X280_REDUCE(2); + DOTXF_SIFIVE_X280_REDUCE(3); + DOTXF_SIFIVE_X280_REDUCE(4); + DOTXF_SIFIVE_X280_REDUCE(5); + + a += 6 * lda; + y += 6 * incy; + b -= 6; + } + + if (b > 0) { + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict x_tmp = x; + RVV_TYPE_F(PREC, LMUL) acc0, acc1, acc2, acc3, acc4; + RVV_TYPE_F(PREC, LMUL) xvec, acol_vec; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x_tmp, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x_tmp, FLT_SIZE * incx, vl); + if (first) { + if (inca == 1) + DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(); + else + DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED); + first = false; + } + else { + if (inca == 1) + DOTXF_SIFIVE_X280_CLEANUP_BODY(); + else + DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED); + } + + a_tmp += vl * inca; + x_tmp += vl * incx; + avl -= vl; + } + + switch (b) { + case 5: + DOTXF_SIFIVE_X280_REDUCE(4); + case 4: + DOTXF_SIFIVE_X280_REDUCE(3); + case 3: + DOTXF_SIFIVE_X280_REDUCE(2); + case 2: + DOTXF_SIFIVE_X280_REDUCE(1); + case 1: + DOTXF_SIFIVE_X280_REDUCE(0); + } + + } + return; +} + +#undef DOTXF_SIFIVE_X280_LOAD_ACOL +#undef DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED +#undef DOTXF_SIFIVE_X280_LOOP_BODY_FIRST +#undef DOTXF_SIFIVE_X280_LOOP_BODY +#undef DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST +#undef DOTXF_SIFIVE_X280_CLEANUP_BODY +#undef DOTXF_SIFIVE_X280_REDUCE + +#endif // DOTXF diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c deleted file mode 100644 index 3ee4cdd20c..0000000000 --- a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c +++ /dev/null @@ -1,1465 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" -#include "../riscv_cmul_macros_asm.h" -#include "../bli_kernels_sifive_x280.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " -#define VSSSEG8 "vssseg8e32.v " -#define VSSSEG7 "vssseg7e32.v " -#define VSSSEG6 "vssseg6e32.v " -#define VSSSEG5 "vssseg5e32.v " -#define VSSSEG4 "vssseg4e32.v " -#define VSSSEG3 "vssseg3e32.v " -#define VSSSEG2 "vssseg2e32.v " -#define NR 64 - -void bli_spackm_sifive_x280_asm_7m4 - ( - conj_t conja, - pack_t schema, - dim_t cdim, - dim_t cdim_max, - dim_t cdim_bcast, - dim_t n, - dim_t n_max, - const void* restrict kappa_, - const void* restrict a_, inc_t inca, inc_t lda, - void* restrict p_, inc_t ldp, - const void* restrict params, - const cntx_t* cntx - ) -{ - (void) conja; - (void) cntx; - const float* kappa = kappa_; - const float* a = a_; - float* p = p_; - - float kappa_cast = *kappa; - - // MRxk kernel - if (cdim <= 7 && cdim_max == 7 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); - switch (cdim) { - case 0: __asm__("vmv.v.i v0, 0"); - case 1: __asm__("vmv.v.i v1, 0"); - case 2: __asm__("vmv.v.i v2, 0"); - case 3: __asm__("vmv.v.i v3, 0"); - case 4: __asm__("vmv.v.i v4, 0"); - case 5: __asm__("vmv.v.i v5, 0"); - case 6: __asm__("vmv.v.i v6, 0"); - } - a += (cdim - 1) * inca; - size_t avl = n; - while (avl) { - const float* a_tmp = a; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - switch (cdim) { - case 7: - __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 6: - __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 5: - __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 4: - __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 3: - __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); - a_tmp -= 
inca; - case 1: - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast != 1.f) { - switch (cdim) { - case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); - case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); - case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); - case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); - case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); - case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); - case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - } - __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp)); - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= FLT_SIZE; - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast != 1.f) { - __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__(VSE "v0, (%0)" : : "r"(p)); - a += lda; - p += ldp; - } - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // NRxk kernel - else if (cdim <= 64 && cdim_max == 64 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v8, 0"); - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - dim_t cdim_tmp = cdim; - const float* a_tmp = a; - float* p_tmp = p; - while (cdim_tmp >= 8) { - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v7, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - if (kappa_cast != 1.f) { - __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast)); - } - __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - p_tmp += 8; - cdim_tmp -= 8; - } - if (cdim_tmp > 0) { - a_tmp += (cdim_tmp - 1) * inca; - switch (cdim_tmp) { - case 7: - __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 6: - __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 5: - __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 4: - __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - 
case 3: - __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 1: - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast != 1.f) { - switch (cdim_tmp) { - case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); - case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); - case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); - case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); - case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); - case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); - case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - } - switch (cdim_tmp) { - case 7: - __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 6: - __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 5: - __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 4: - __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 3: - __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 2: - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 1: - __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - } - p_tmp += cdim_tmp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); - for (size_t i = 0; i < vl; ++i) { - __asm__(VSE "v8, (%0)" : : "r"(p_tmp)); - p_tmp += ldp; - } - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v8, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= FLT_SIZE; - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast != 1.f) { - __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__(VSE "v0, (%0)" : : "r"(p)); - a += lda; - p += ldp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // generic kernel - else - { - bli_sspackm_sifive_x280_ref - ( - conja, - schema, - cdim, - cdim_max, - cdim_bcast, - n, - n_max, - kappa, - a, inca, lda, - p, ldp, - params, - cntx - ); - } -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE -#undef VSSSEG8 -#undef VSSSEG7 -#undef VSSSEG6 -#undef VSSSEG5 -#undef VSSSEG4 -#undef VSSSEG3 -#undef VSSSEG2 -#undef NR - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " -#define VSSSEG8 "vssseg8e64.v " -#define VSSSEG7 "vssseg7e64.v " -#define VSSSEG6 "vssseg6e64.v " -#define VSSSEG5 "vssseg5e64.v " -#define VSSSEG4 "vssseg4e64.v " -#define VSSSEG3 "vssseg3e64.v " -#define VSSSEG2 "vssseg2e64.v " -#define NR 32 - -void bli_dpackm_sifive_x280_asm_7m4 - ( - conj_t conja, - pack_t schema, - dim_t cdim, - dim_t cdim_max, - 
dim_t cdim_bcast, - dim_t n, - dim_t n_max, - const void* restrict kappa_, - const void* restrict a_, inc_t inca, inc_t lda, - void* restrict p_, inc_t ldp, - const void* restrict params, - const cntx_t* cntx - ) -{ - (void) conja; - (void) cntx; - const double* kappa = kappa_; - const double* a = a_; - double* p = p_; - - double kappa_cast = *kappa; - - // MRxk kernel - if (cdim <= 7 && cdim_max == 7 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); - switch (cdim) { - case 0: __asm__("vmv.v.i v0, 0"); - case 1: __asm__("vmv.v.i v1, 0"); - case 2: __asm__("vmv.v.i v2, 0"); - case 3: __asm__("vmv.v.i v3, 0"); - case 4: __asm__("vmv.v.i v4, 0"); - case 5: __asm__("vmv.v.i v5, 0"); - case 6: __asm__("vmv.v.i v6, 0"); - } - a += (cdim - 1) * inca; - size_t avl = n; - while (avl) { - const double* a_tmp = a; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - switch (cdim) { - case 7: - __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 6: - __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 5: - __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 4: - __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 3: - __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 1: - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast != 1.) { - switch (cdim) { - case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); - case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); - case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); - case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); - case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); - case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); - case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - } - __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp)); - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= FLT_SIZE; - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast != 1.) 
{ - __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__(VSE "v0, (%0)" : : "r"(p)); - a += lda; - p += ldp; - } - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // NRxk kernel - else if (cdim <= 32 && cdim_max == 32 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v8, 0"); - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - dim_t cdim_tmp = cdim; - const double* a_tmp = a; - double* p_tmp = p; - while (cdim_tmp >= 8) { - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v7, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - if (kappa_cast != 1.) { - __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast)); - } - __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - p_tmp += 8; - cdim_tmp -= 8; - } - if (cdim_tmp > 0) { - a_tmp += (cdim_tmp - 1) * inca; - switch (cdim_tmp) { - case 7: - __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 6: - __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 5: - __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 4: - __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 3: - __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 1: - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast != 1.) 
{ - switch (cdim_tmp) { - case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); - case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); - case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); - case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); - case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); - case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); - case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - } - switch (cdim_tmp) { - case 7: - __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 6: - __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 5: - __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 4: - __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 3: - __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 2: - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 1: - __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - } - p_tmp += cdim_tmp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); - for (size_t i = 0; i < vl; ++i) { - __asm__(VSE "v8, (%0)" : : "r"(p_tmp)); - p_tmp += ldp; - } - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v8, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= FLT_SIZE; - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast != 1.) 
{ - __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__(VSE "v0, (%0)" : : "r"(p)); - a += lda; - p += ldp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // generic kernel - else - { - bli_ddpackm_sifive_x280_ref - ( - conja, - schema, - cdim, - cdim_max, - cdim_bcast, - n, - n_max, - kappa, - a, inca, lda, - p, ldp, - params, - cntx - ); - } -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE -#undef VSSSEG8 -#undef VSSSEG7 -#undef VSSSEG6 -#undef VSSSEG5 -#undef VSSSEG4 -#undef VSSSEG3 -#undef VSSSEG2 -#undef NR - -#define FLT_SIZE 4 -#define VLSEG2 "vlseg2e32.v " -#define VLSSEG2 "vlsseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " -#define VSSSEG4 "vssseg4e32.v " -#define VSSSEG6 "vssseg6e32.v " -#define VSSSEG8 "vssseg8e32.v " -#define NR 32 - -void bli_cpackm_sifive_x280_asm_6m2 - ( - conj_t conja, - pack_t schema, - dim_t cdim, - dim_t cdim_max, - dim_t cdim_bcast, - dim_t n, - dim_t n_max, - const void* restrict kappa_, - const void* restrict a_, inc_t inca, inc_t lda, - void* restrict p_, inc_t ldp, - const void* restrict params, - const cntx_t* cntx - ) -{ - (void) cntx; - const scomplex* kappa = kappa_; - const scomplex* a = a_; - scomplex* p = p_; - - scomplex kappa_cast = *kappa; - - // MRxk kernel - if (cdim <= 6 && cdim_max == 6 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); - if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { - switch (cdim) { - case 0: - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - case 1: - __asm__("vmv.v.i v2, 0"); - __asm__("vmv.v.i v3, 0"); - case 2: - __asm__("vmv.v.i v4, 0"); - __asm__("vmv.v.i v5, 0"); - case 3: - __asm__("vmv.v.i v6, 0"); - __asm__("vmv.v.i v7, 0"); - case 4: - __asm__("vmv.v.i v8, 0"); - __asm__("vmv.v.i v9, 0"); - case 5: - __asm__("vmv.v.i v10, 0"); - __asm__("vmv.v.i v11, 0"); - } - } - else { - switch (cdim) { - case 0: - __asm__("vmv.v.i v12, 0"); - __asm__("vmv.v.i v13, 0"); - case 1: - __asm__("vmv.v.i v14, 0"); - __asm__("vmv.v.i v15, 0"); - case 2: - __asm__("vmv.v.i v16, 0"); - __asm__("vmv.v.i v17, 0"); - case 3: - __asm__("vmv.v.i v18, 0"); - __asm__("vmv.v.i v19, 0"); - case 4: - __asm__("vmv.v.i v20, 0"); - __asm__("vmv.v.i v21, 0"); - case 5: - __asm__("vmv.v.i v22, 0"); - __asm__("vmv.v.i v23, 0"); - } - } - a += (cdim - 1) * inca; - size_t avl = n; - while (avl) { - const scomplex* a_tmp = a; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - switch (cdim) { - case 6: - __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 5: - __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 4: - __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 3: - __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 1: - __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { - if (conja == BLIS_CONJUGATE) { - switch (cdim) { - case 6: __asm__("vfneg.v v11, v11"); - case 5: __asm__("vfneg.v v9, v9"); - case 4: __asm__("vfneg.v v7, v7"); - case 3: __asm__("vfneg.v v5, 
v5"); - case 2: __asm__("vfneg.v v3, v3"); - case 1: __asm__("vfneg.v v1, v1"); - } - } - __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); - __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - switch (cdim) { - case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag); - case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag); - case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag); - case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - else { - switch (cdim) { - case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag); - case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag); - case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag); - case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); - __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); - } - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= 2 * FLT_SIZE; - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - __asm__("vmv.v.i v2, 0"); - __asm__("vmv.v.i v3, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { - if (conja == BLIS_CONJUGATE) { - __asm__("vfneg.v v1, v1"); - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); - } - else { - vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v2, (%0)" : : "r"(p)); - } - a += lda; - p += ldp; - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // NRxk kernel - else if (cdim <= 32 && cdim_max == 32 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v16, 0"); - __asm__("vmv.v.i v18, 0"); - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - dim_t cdim_tmp = cdim; - const scomplex* a_tmp = a; - scomplex* p_tmp = p; - while (cdim_tmp >= 4) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); - 
a_tmp += inca; - __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { - if (conja == BLIS_CONJUGATE) { - __asm__("vfneg.v v1, v1"); - __asm__("vfneg.v v3, v3"); - __asm__("vfneg.v v5, v5"); - __asm__("vfneg.v v7, v7"); - } - __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag); - } - else { - vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag); - } - __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - } - p_tmp += 4; - cdim_tmp -= 4; - } - if (cdim_tmp > 0) { - a_tmp += (cdim_tmp - 1) * inca; - switch (cdim_tmp) { - case 3: - __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 1: - __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { - if (conja == BLIS_CONJUGATE) { - switch (cdim_tmp) { - case 3: __asm__("vfneg.v v5, v5"); - case 2: __asm__("vfneg.v v3, v3"); - case 1: __asm__("vfneg.v v1, v1"); - } - } - switch (cdim_tmp) { - case 3: - __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 2: - __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 1: - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - } - } - else { - if (conja == BLIS_NO_CONJUGATE) { - switch (cdim_tmp) { - case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - else { - switch (cdim_tmp) { - case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - switch (cdim_tmp) { - case 3: - __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 2: - __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 1: - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - } - } - p_tmp += cdim_tmp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); - for (size_t i = 0; i < vl; ++i) { - __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp)); - p_tmp += ldp; - } - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v16, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= 2 * FLT_SIZE; - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - __asm__("vmv.v.i 
v4, 0"); - __asm__("vmv.v.i v6, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { - if (conja == BLIS_CONJUGATE) { - __asm__("vfneg.v v2, v2"); - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag); - } - else { - vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag); - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v4, (%0)" : : "r"(p)); - } - a += lda; - p += ldp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // generic kernel - else - { - bli_ccpackm_sifive_x280_ref - ( - conja, - schema, - cdim, - cdim_max, - cdim_bcast, - n, - n_max, - kappa, - a, inca, lda, - p, ldp, - params, - cntx - ); - } -} - -#undef FLT_SIZE -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 -#undef VSSSEG4 -#undef VSSSEG6 -#undef VSSSEG8 -#undef NR - -#define FLT_SIZE 8 -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " -#define VSSSEG4 "vssseg4e64.v " -#define VSSSEG6 "vssseg6e64.v " -#define VSSSEG8 "vssseg8e64.v " -#define NR 16 - -void bli_zpackm_sifive_x280_asm_6m2 - ( - conj_t conja, - pack_t schema, - dim_t cdim, - dim_t cdim_max, - dim_t cdim_bcast, - dim_t n, - dim_t n_max, - const void* restrict kappa_, - const void* restrict a_, inc_t inca, inc_t lda, - void* restrict p_, inc_t ldp, - const void* restrict params, - const cntx_t* cntx - ) -{ - (void) cntx; - const dcomplex* kappa = kappa_; - const dcomplex* a = a_; - dcomplex* p = p_; - - dcomplex kappa_cast = *kappa; - - // MRxk kernel - if (cdim <= 6 && cdim_max == 6 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); - if (kappa_cast.real == 1. && kappa_cast.imag == 0.) 
{ - switch (cdim) { - case 0: - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - case 1: - __asm__("vmv.v.i v2, 0"); - __asm__("vmv.v.i v3, 0"); - case 2: - __asm__("vmv.v.i v4, 0"); - __asm__("vmv.v.i v5, 0"); - case 3: - __asm__("vmv.v.i v6, 0"); - __asm__("vmv.v.i v7, 0"); - case 4: - __asm__("vmv.v.i v8, 0"); - __asm__("vmv.v.i v9, 0"); - case 5: - __asm__("vmv.v.i v10, 0"); - __asm__("vmv.v.i v11, 0"); - } - } - else { - switch (cdim) { - case 0: - __asm__("vmv.v.i v12, 0"); - __asm__("vmv.v.i v13, 0"); - case 1: - __asm__("vmv.v.i v14, 0"); - __asm__("vmv.v.i v15, 0"); - case 2: - __asm__("vmv.v.i v16, 0"); - __asm__("vmv.v.i v17, 0"); - case 3: - __asm__("vmv.v.i v18, 0"); - __asm__("vmv.v.i v19, 0"); - case 4: - __asm__("vmv.v.i v20, 0"); - __asm__("vmv.v.i v21, 0"); - case 5: - __asm__("vmv.v.i v22, 0"); - __asm__("vmv.v.i v23, 0"); - } - } - a += (cdim - 1) * inca; - size_t avl = n; - while (avl) { - const dcomplex* a_tmp = a; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - switch (cdim) { - case 6: - __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 5: - __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 4: - __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 3: - __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 1: - __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { - if (conja == BLIS_CONJUGATE) { - switch (cdim) { - case 6: __asm__("vfneg.v v11, v11"); - case 5: __asm__("vfneg.v v9, v9"); - case 4: __asm__("vfneg.v v7, v7"); - case 3: __asm__("vfneg.v v5, v5"); - case 2: __asm__("vfneg.v v3, v3"); - case 1: __asm__("vfneg.v v1, v1"); - } - } - __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); - __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - switch (cdim) { - case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag); - case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag); - case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag); - case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - else { - switch (cdim) { - case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag); - case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag); - case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag); - case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); - __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); - } - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= 2 * FLT_SIZE; - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); 
- __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - __asm__("vmv.v.i v2, 0"); - __asm__("vmv.v.i v3, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { - if (conja == BLIS_CONJUGATE) { - __asm__("vfneg.v v1, v1"); - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); - } - else { - vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v2, (%0)" : : "r"(p)); - } - a += lda; - p += ldp; - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // NRxk kernel - else if (cdim <= 16 && cdim_max == 16 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v16, 0"); - __asm__("vmv.v.i v18, 0"); - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - dim_t cdim_tmp = cdim; - const dcomplex* a_tmp = a; - dcomplex* p_tmp = p; - while (cdim_tmp >= 4) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { - if (conja == BLIS_CONJUGATE) { - __asm__("vfneg.v v1, v1"); - __asm__("vfneg.v v3, v3"); - __asm__("vfneg.v v5, v5"); - __asm__("vfneg.v v7, v7"); - } - __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag); - } - else { - vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag); - } - __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - } - p_tmp += 4; - cdim_tmp -= 4; - } - if (cdim_tmp > 0) { - a_tmp += (cdim_tmp - 1) * inca; - switch (cdim_tmp) { - case 3: - __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 1: - __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast.real == 1. && kappa_cast.imag == 0.) 
{ - if (conja == BLIS_CONJUGATE) { - switch (cdim_tmp) { - case 3: __asm__("vfneg.v v5, v5"); - case 2: __asm__("vfneg.v v3, v3"); - case 1: __asm__("vfneg.v v1, v1"); - } - } - switch (cdim_tmp) { - case 3: - __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 2: - __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 1: - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - } - } - else { - if (conja == BLIS_NO_CONJUGATE) { - switch (cdim_tmp) { - case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - else { - switch (cdim_tmp) { - case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - switch (cdim_tmp) { - case 3: - __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 2: - __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 1: - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - } - } - p_tmp += cdim_tmp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); - for (size_t i = 0; i < vl; ++i) { - __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp)); - p_tmp += ldp; - } - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v16, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= 2 * FLT_SIZE; - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - __asm__("vmv.v.i v4, 0"); - __asm__("vmv.v.i v6, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast.real == 1. && kappa_cast.imag == 0.) 
{ - if (conja == BLIS_CONJUGATE) { - __asm__("vfneg.v v2, v2"); - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag); - } - else { - vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag); - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v4, (%0)" : : "r"(p)); - } - a += lda; - p += ldp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // generic kernel - else - { - bli_zzpackm_sifive_x280_ref - ( - conja, - schema, - cdim, - cdim_max, - cdim_bcast, - n, - n_max, - kappa, - a, inca, lda, - p, ldp, - params, - cntx - ); - } -} diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c new file mode 100644 index 0000000000..119872197a --- /dev/null +++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c @@ -0,0 +1,168 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off + +#include "../../riscv_cmul_macros_intr.h" +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define PACKM_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##packm_sifive_x280_intr(\ + conj_t conja, \ + pack_t schema, \ + dim_t cdim, \ + dim_t cdim_max, \ + dim_t cdim_bcast, \ + dim_t n, \ + dim_t n_max, \ + const T* restrict kappa_, \ + const T* restrict a_, inc_t inca, inc_t lda, \ + T* restrict p_, inc_t ldp, \ + const T* restrict params, \ + const cntx_t* cntx \ +) + +#define PACKM(...) 
PACKM_(__VA_ARGS__) + +#define REF_KERNEL_(PRECISION_CHAR) bli_##PRECISION_CHAR##PRECISION_CHAR##packm_sifive_x280_ref +#define REF_KERNEL(PRECISION_CHAR) REF_KERNEL_(PRECISION_CHAR) + +// LMUL is the LMUL used when a is "row major" (lda == 1). Since we use +// segment stores with more than 4 fields, this is usually m1. +// LMUL_MR is an LMUL large enough to hold MR floats (for spackm, cpackm) +// or doubles (for dpackm, zpackm). LMUL_NR is analogous. + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m1 +#define LMUL_MR m1 +#define LMUL_NR m4 +#define FLT_SIZE sizeof(float) +#define MR 7 +#define NR 64 + +#include "./bli_packm_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef LMUL_MR +#undef LMUL_NR +#undef FLT_SIZE +#undef MR +#undef NR + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m1 +#define LMUL_MR m1 +#define LMUL_NR m4 +#define FLT_SIZE sizeof(double) +#define MR 7 +#define NR 32 + +#include "./bli_packm_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef LMUL_MR +#undef LMUL_NR +#undef FLT_SIZE +#undef MR +#undef NR + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m1 +#define LMUL_MR m1 +#define LMUL_NR m2 +#define FLT_SIZE sizeof(float) +#define MR 6 +#define NR 32 + +#include "./bli_packm_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef LMUL_MR +#undef LMUL_NR +#undef FLT_SIZE +#undef MR +#undef NR + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m1 +#define LMUL_MR m1 +#define LMUL_NR m2 +#define FLT_SIZE sizeof(double) +#define MR 6 +#define NR 16 + +#include "./bli_packm_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef LMUL_MR +#undef LMUL_NR +#undef FLT_SIZE +#undef MR +#undef NR + +#undef REF_KERNEL_ +#undef REF_KERNEL + +#undef PACKM +#undef PACKM_ diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..ee49090dc9 --- /dev/null +++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c @@ -0,0 +1,545 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef PACKM + +PACKM(PRECISION_CHAR, void) +{ + (void) schema; // Suppress unused parameter warnings + (void) params; + (void) cntx; + const DATATYPE* restrict kappa = kappa_; + const DATATYPE* restrict a = a_; + DATATYPE* restrict p = p_; + + // MRxk kernel + if (cdim <= MR && cdim_max == MR && cdim_bcast == 1) + { + if (lda == 1) { + // a is "row major" + RVV_TYPE_F(PREC, LMUL) arow0_r, arow1_r, arow2_r, arow3_r, arow4_r, arow5_r; + RVV_TYPE_F(PREC, LMUL) arow0_i, arow1_i, arow2_i, arow3_i, arow4_i, arow5_i; + RVV_TYPE_F(PREC, LMUL) kappa_arow0_r, kappa_arow1_r, kappa_arow2_r, + kappa_arow3_r, kappa_arow4_r, kappa_arow5_r; + RVV_TYPE_F(PREC, LMUL) kappa_arow0_i, kappa_arow1_i, kappa_arow2_i, + kappa_arow3_i, kappa_arow4_i, kappa_arow5_i; + // pad lower edge + if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + switch (cdim) { + case 0: + arow0_r = VFMV_V_F(PREC, LMUL)(0., n); + arow0_i = VFMV_V_F(PREC, LMUL)(0., n); + case 1: + arow1_r = VFMV_V_F(PREC, LMUL)(0., n); + arow1_i = VFMV_V_F(PREC, LMUL)(0., n); + case 2: + arow2_r = VFMV_V_F(PREC, LMUL)(0., n); + arow2_i = VFMV_V_F(PREC, LMUL)(0., n); + case 3: + arow3_r = VFMV_V_F(PREC, LMUL)(0., n); + arow3_i = VFMV_V_F(PREC, LMUL)(0., n); + case 4: + arow4_r = VFMV_V_F(PREC, LMUL)(0., n); + arow4_i = VFMV_V_F(PREC, LMUL)(0., n); + case 5: + arow5_r = VFMV_V_F(PREC, LMUL)(0., n); + arow5_i = VFMV_V_F(PREC, LMUL)(0., n); + } + } else { + switch (cdim) { + case 0: + kappa_arow0_r = VFMV_V_F(PREC, LMUL)(0., n); + kappa_arow0_i = VFMV_V_F(PREC, LMUL)(0., n); + case 1: + kappa_arow1_r = VFMV_V_F(PREC, LMUL)(0., n); + kappa_arow1_i = VFMV_V_F(PREC, LMUL)(0., n); + case 2: + kappa_arow2_r = VFMV_V_F(PREC, LMUL)(0., n); + kappa_arow2_i = VFMV_V_F(PREC, LMUL)(0., n); + case 3: + kappa_arow3_r = VFMV_V_F(PREC, LMUL)(0., n); + kappa_arow3_i = VFMV_V_F(PREC, LMUL)(0., n); + case 4: + kappa_arow4_r = VFMV_V_F(PREC, LMUL)(0., n); + kappa_arow4_i = VFMV_V_F(PREC, LMUL)(0., n); + case 5: + kappa_arow5_r = VFMV_V_F(PREC, LMUL)(0., n); + kappa_arow5_i = VFMV_V_F(PREC, LMUL)(0., n); + } + } + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) arow_vec; + switch (cdim) { + case 6: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 5 * inca), vl); + arow5_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow5_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + case 5: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 4 * inca), vl); + arow4_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow4_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + case 4: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 3 * inca), vl); + arow3_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow3_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + case 3: + 
arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 2 * inca), vl); + arow2_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow2_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + case 2: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 1 * inca), vl); + arow1_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow1_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + case 1: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 0 * inca), vl); + arow0_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow0_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + } + + if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + if (bli_is_conj(conja)) { + switch (cdim) { + case 6: + arow5_i = VFNEG_VF(PREC, LMUL)(arow5_i, vl); + case 5: + arow4_i = VFNEG_VF(PREC, LMUL)(arow4_i, vl); + case 4: + arow3_i = VFNEG_VF(PREC, LMUL)(arow3_i, vl); + case 3: + arow2_i = VFNEG_VF(PREC, LMUL)(arow2_i, vl); + case 2: + arow1_i = VFNEG_VF(PREC, LMUL)(arow1_i, vl); + case 1: + arow0_i = VFNEG_VF(PREC, LMUL)(arow0_i, vl); + } + } + + RVV_TYPE_FX(PREC, LMUL, 6) ablock = VUNDEFINED_FX(PREC, LMUL, 6)(); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 0, arow0_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 1, arow0_i); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 2, arow1_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 3, arow1_i); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 4, arow2_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 5, arow2_i); + VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*) p, 2 * FLT_SIZE * ldp, ablock, vl); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 0, arow3_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 1, arow3_i); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 2, arow4_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 3, arow4_i); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 4, arow5_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 5, arow5_i); + VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*)(p + 3), 2 * FLT_SIZE * ldp, ablock, vl); + } else { + if (bli_is_conj(conja)) { + switch (cdim) { + case 6: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow5_r, kappa_arow5_i, arow5_r, arow5_i, kappa->real, kappa->imag, vl); + case 5: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow4_r, kappa_arow4_i, arow4_r, arow4_i, kappa->real, kappa->imag, vl); + case 4: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow3_r, kappa_arow3_i, arow3_r, arow3_i, kappa->real, kappa->imag, vl); + case 3: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl); + case 2: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl); + case 1: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl); + } + } else { + switch (cdim) { + case 6: + VCMUL_VF(PREC, LMUL, kappa_arow5_r, kappa_arow5_i, arow5_r, arow5_i, kappa->real, kappa->imag, vl); + case 5: + VCMUL_VF(PREC, LMUL, kappa_arow4_r, kappa_arow4_i, arow4_r, arow4_i, kappa->real, kappa->imag, vl); + case 4: + VCMUL_VF(PREC, LMUL, kappa_arow3_r, kappa_arow3_i, arow3_r, arow3_i, kappa->real, kappa->imag, vl); + case 3: + VCMUL_VF(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl); + case 2: + VCMUL_VF(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl); + case 1: + VCMUL_VF(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl); + } + } + + RVV_TYPE_FX(PREC, LMUL, 6) ablock = VUNDEFINED_FX(PREC, LMUL, 6)(); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 0, kappa_arow0_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 1, kappa_arow0_i); + ablock 
= VSET_V_F(PREC, LMUL, 6)(ablock, 2, kappa_arow1_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 3, kappa_arow1_i); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 4, kappa_arow2_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 5, kappa_arow2_i); + VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*) p, 2 * FLT_SIZE * ldp, ablock, vl); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 0, kappa_arow3_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 1, kappa_arow3_i); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 2, kappa_arow4_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 3, kappa_arow4_i); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 4, kappa_arow5_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 5, kappa_arow5_i); + VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*)(p + 3), 2 * FLT_SIZE * ldp, ablock, vl); + } + + a += vl; + p += vl * ldp; + avl -= vl; + } + + RVV_TYPE_FX(PREC, LMUL_MR, 2) zero_padding = VUNDEFINED_FX(PREC, LMUL_MR, 2)(); + zero_padding = VSET_V_F(PREC, LMUL_MR, 2)(zero_padding, 0, VFMV_V_F(PREC, LMUL_MR)(0., cdim_max)); + zero_padding = VSET_V_F(PREC, LMUL_MR, 2)(zero_padding, 1, VFMV_V_F(PREC, LMUL_MR)(0., cdim_max)); + for (size_t i = n; i < n_max; ++i) { + VSSEG2_V_F(PREC, LMUL_MR, 2)((BASE_DT*) p, zero_padding, cdim_max); + p += ldp; + } + } + else { + RVV_TYPE_FX(PREC, LMUL_MR, 2) zero_padding = VUNDEFINED_FX(PREC, LMUL_MR, 2)(); + zero_padding = VSET_V_F(PREC, LMUL_MR, 2)(zero_padding, 0, VFMV_V_F(PREC, LMUL_MR)(0., cdim_max)); + zero_padding = VSET_V_F(PREC, LMUL_MR, 2)(zero_padding, 1, VFMV_V_F(PREC, LMUL_MR)(0., cdim_max)); + + for (size_t i = 0; i < n; ++i) { + RVV_TYPE_FX(PREC, LMUL_MR, 2) acol; + if (inca == 1) + acol = VLSEG2_V_F_TU(PREC, LMUL_MR, 2)(zero_padding, (BASE_DT*) a, cdim); + else + acol = VLSSEG2_V_F_TU(PREC, LMUL_MR, 2)(zero_padding, (BASE_DT*) a, 2 * FLT_SIZE * inca, cdim); + RVV_TYPE_F(PREC, LMUL_MR) acol_r = VGET_V_F(PREC, LMUL_MR, 2)(acol, 0); + RVV_TYPE_F(PREC, LMUL_MR) acol_i = VGET_V_F(PREC, LMUL_MR, 2)(acol, 1); + + if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + if (bli_is_conj(conja)) { + acol_i = VFNEG_VF_TU(PREC, LMUL_MR)(acol_i, acol_i, cdim); + acol = VSET_V_F(PREC, LMUL_MR, 2)(acol, 0, acol_r); + acol = VSET_V_F(PREC, LMUL_MR, 2)(acol, 1, acol_i); + } + } else { + RVV_TYPE_F(PREC, LMUL_MR) kappa_acol_r, kappa_acol_i; + if (bli_is_conj(conja)) + VCMUL_VF_CONJ_TU(PREC, LMUL_MR, kappa_acol_r, kappa_acol_i, acol_r, acol_i, kappa->real, kappa->imag, cdim); + else + VCMUL_VF_TU(PREC, LMUL_MR, kappa_acol_r, kappa_acol_i, acol_r, acol_i, kappa->real, kappa->imag, cdim); + acol = VSET_V_F(PREC, LMUL_MR, 2)(acol, 0, kappa_acol_r); + acol = VSET_V_F(PREC, LMUL_MR, 2)(acol, 1, kappa_acol_i); + } + + VSSEG2_V_F(PREC, LMUL_MR, 2)((BASE_DT*) p, acol, cdim_max); + + a += lda; + p += ldp; + } + + for (size_t i = n; i < n_max; ++i) { + VSSEG2_V_F(PREC, LMUL_MR, 2)((BASE_DT*) p, zero_padding, cdim_max); + p += ldp; + } + } + } + // NRxk kernel + else if (cdim <= NR && cdim_max == NR && cdim_bcast == 1) + { + if (lda == 1) { + // a is "row major" + RVV_TYPE_FX(PREC, LMUL_NR, 2) zero_padding = VUNDEFINED_FX(PREC, LMUL_NR, 2)(); + zero_padding = VSET_V_F(PREC, LMUL_NR, 2)(zero_padding, 0, VFMV_V_F(PREC, LMUL_NR)(0., cdim_max)); + zero_padding = VSET_V_F(PREC, LMUL_NR, 2)(zero_padding, 1, VFMV_V_F(PREC, LMUL_NR)(0., cdim_max)); + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + dim_t cdim_tmp = cdim; + const DATATYPE* restrict a_tmp = a; + DATATYPE* restrict p_tmp = p; + while (cdim_tmp >= 4) { + RVV_TYPE_FX(PREC, LMUL, 2) arow_vec; + RVV_TYPE_F(PREC, LMUL) arow0_r, arow1_r, 
arow2_r, arow3_r; + RVV_TYPE_F(PREC, LMUL) arow0_i, arow1_i, arow2_i, arow3_i; + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 0 * inca), vl); + arow0_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow0_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 1 * inca), vl); + arow1_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow1_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 2 * inca), vl); + arow2_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow2_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 3 * inca), vl); + arow3_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow3_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + + if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + if (bli_is_conj(conja)) { + arow0_i = VFNEG_VF(PREC, LMUL)(arow0_i, vl); + arow1_i = VFNEG_VF(PREC, LMUL)(arow1_i, vl); + arow2_i = VFNEG_VF(PREC, LMUL)(arow2_i, vl); + arow3_i = VFNEG_VF(PREC, LMUL)(arow3_i, vl); + } + + RVV_TYPE_FX(PREC, LMUL, 8) ablock = VUNDEFINED_FX(PREC, LMUL, 8)(); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 0, arow0_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 1, arow0_i); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 2, arow1_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 3, arow1_i); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 4, arow2_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 5, arow2_i); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 6, arow3_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 7, arow3_i); + VSSSEG8_V_F(PREC, LMUL, 8)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock, vl); + } else { + RVV_TYPE_F(PREC, LMUL) kappa_arow0_r, kappa_arow1_r, kappa_arow2_r, kappa_arow3_r; + RVV_TYPE_F(PREC, LMUL) kappa_arow0_i, kappa_arow1_i, kappa_arow2_i, kappa_arow3_i; + if (bli_is_conj(conja)) { + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl); + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl); + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl); + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow3_r, kappa_arow3_i, arow3_r, arow3_i, kappa->real, kappa->imag, vl); + } else { + VCMUL_VF(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl); + VCMUL_VF(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl); + VCMUL_VF(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl); + VCMUL_VF(PREC, LMUL, kappa_arow3_r, kappa_arow3_i, arow3_r, arow3_i, kappa->real, kappa->imag, vl); + } + + RVV_TYPE_FX(PREC, LMUL, 8) ablock = VUNDEFINED_FX(PREC, LMUL, 8)(); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 0, kappa_arow0_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 1, kappa_arow0_i); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 2, kappa_arow1_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 3, kappa_arow1_i); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 4, kappa_arow2_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 5, kappa_arow2_i); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 6, kappa_arow3_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 7, kappa_arow3_i); + VSSSEG8_V_F(PREC, LMUL, 8)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock, vl); + } + + a_tmp += 4 * inca; + p_tmp += 4; + cdim_tmp -= 4; + } + + if (cdim_tmp > 0) { + RVV_TYPE_FX(PREC, LMUL, 2) arow_vec; + RVV_TYPE_F(PREC, LMUL) arow0_r, arow1_r, arow2_r; + RVV_TYPE_F(PREC, LMUL) 
arow0_i, arow1_i, arow2_i; + switch (cdim_tmp) { + case 3: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 2 * inca), vl); + arow2_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow2_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + case 2: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 1 * inca), vl); + arow1_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow1_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + case 1: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 0 * inca), vl); + arow0_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow0_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + } + + if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + if (bli_is_conj(conja)) { + switch (cdim_tmp) { + case 3: + arow2_i = VFNEG_VF(PREC, LMUL)(arow2_i, vl); + case 2: + arow1_i = VFNEG_VF(PREC, LMUL)(arow1_i, vl); + case 1: + arow0_i = VFNEG_VF(PREC, LMUL)(arow0_i, vl); + } + } + + RVV_TYPE_FX(PREC, LMUL, 6) ablock3 = VUNDEFINED_FX(PREC, LMUL, 6)(); + RVV_TYPE_FX(PREC, LMUL, 4) ablock2 = VUNDEFINED_FX(PREC, LMUL, 4)(); + RVV_TYPE_FX(PREC, LMUL, 2) ablock1 = VUNDEFINED_FX(PREC, LMUL, 2)(); + switch (cdim_tmp) { + case 3: + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 0, arow0_r); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 1, arow0_i); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 2, arow1_r); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 3, arow1_i); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 4, arow2_r); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 5, arow2_i); + VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock3, vl); + break; + case 2: + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 0, arow0_r); + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 1, arow0_i); + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 2, arow1_r); + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 3, arow1_i); + VSSSEG4_V_F(PREC, LMUL, 4)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock2, vl); + break; + case 1: + ablock1 = VSET_V_F(PREC, LMUL, 2)(ablock1, 0, arow0_r); + ablock1 = VSET_V_F(PREC, LMUL, 2)(ablock1, 1, arow0_i); + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock1, vl); + break; + } + } else { + RVV_TYPE_F(PREC, LMUL) kappa_arow0_r, kappa_arow1_r, kappa_arow2_r; + RVV_TYPE_F(PREC, LMUL) kappa_arow0_i, kappa_arow1_i, kappa_arow2_i; + if (bli_is_conj(conja)) { + switch (cdim_tmp) { + case 3: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl); + case 2: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl); + case 1: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl); + } + } else { + switch (cdim_tmp) { + case 3: + VCMUL_VF(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl); + case 2: + VCMUL_VF(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl); + case 1: + VCMUL_VF(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl); + } + } + + RVV_TYPE_FX(PREC, LMUL, 6) ablock3 = VUNDEFINED_FX(PREC, LMUL, 6)(); + RVV_TYPE_FX(PREC, LMUL, 4) ablock2 = VUNDEFINED_FX(PREC, LMUL, 4)(); + RVV_TYPE_FX(PREC, LMUL, 2) ablock1 = VUNDEFINED_FX(PREC, LMUL, 2)(); + switch (cdim_tmp) { + case 3: + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 0, kappa_arow0_r); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 1, kappa_arow0_i); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 2, kappa_arow1_r); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 3, 
kappa_arow1_i); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 4, kappa_arow2_r); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 5, kappa_arow2_i); + VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock3, vl); + break; + case 2: + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 0, kappa_arow0_r); + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 1, kappa_arow0_i); + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 2, kappa_arow1_r); + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 3, kappa_arow1_i); + VSSSEG4_V_F(PREC, LMUL, 4)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock2, vl); + break; + case 1: + ablock1 = VSET_V_F(PREC, LMUL, 2)(ablock1, 0, kappa_arow0_r); + ablock1 = VSET_V_F(PREC, LMUL, 2)(ablock1, 1, kappa_arow0_i); + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock1, vl); + break; + } + } + + p_tmp += cdim_tmp; + } + + // pad lower edge + for (size_t i = 0; i < vl; ++i) { + VSSEG2_V_F(PREC, LMUL_NR, 2)((BASE_DT*) p_tmp, zero_padding, cdim_max - cdim); + p_tmp += ldp; + } + + a += vl; + p += vl * ldp; + avl -= vl; + } + + // pad right edge + for (size_t i = n; i < n_max; ++i) { + VSSEG2_V_F(PREC, LMUL_NR, 2)((BASE_DT*) p, zero_padding, cdim_max); + p += ldp; + } + } else { + RVV_TYPE_FX(PREC, LMUL_NR, 2) zero_padding = VUNDEFINED_FX(PREC, LMUL_NR, 2)(); + zero_padding = VSET_V_F(PREC, LMUL_NR, 2)(zero_padding, 0, VFMV_V_F(PREC, LMUL_NR)(0., cdim_max)); + zero_padding = VSET_V_F(PREC, LMUL_NR, 2)(zero_padding, 1, VFMV_V_F(PREC, LMUL_NR)(0., cdim_max)); + + for (size_t i = 0; i < n; ++i) { + RVV_TYPE_FX(PREC, LMUL_NR, 2) acol; + if (inca == 1) + acol = VLSEG2_V_F_TU(PREC, LMUL_NR, 2)(zero_padding, (BASE_DT*) a, cdim); + else + acol = VLSSEG2_V_F_TU(PREC, LMUL_NR, 2)(zero_padding, (BASE_DT*) a, 2 * FLT_SIZE * inca, cdim); + RVV_TYPE_F(PREC, LMUL_NR) acol_r = VGET_V_F(PREC, LMUL_NR, 2)(acol, 0); + RVV_TYPE_F(PREC, LMUL_NR) acol_i = VGET_V_F(PREC, LMUL_NR, 2)(acol, 1); + + if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + if (bli_is_conj(conja)) { + acol_i = VFNEG_VF_TU(PREC, LMUL_NR)(acol_i, acol_i, cdim); + acol = VSET_V_F(PREC, LMUL_NR, 2)(acol, 0, acol_r); + acol = VSET_V_F(PREC, LMUL_NR, 2)(acol, 1, acol_i); + } + } else { + RVV_TYPE_F(PREC, LMUL_NR) kappa_acol_r, kappa_acol_i; + if (bli_is_conj(conja)) + VCMUL_VF_CONJ_TU(PREC, LMUL_NR, kappa_acol_r, kappa_acol_i, acol_r, acol_i, kappa->real, kappa->imag, cdim); + else + VCMUL_VF_TU(PREC, LMUL_NR, kappa_acol_r, kappa_acol_i, acol_r, acol_i, kappa->real, kappa->imag, cdim); + acol = VSET_V_F(PREC, LMUL_NR, 2)(acol, 0, kappa_acol_r); + acol = VSET_V_F(PREC, LMUL_NR, 2)(acol, 1, kappa_acol_i); + } + + VSSEG2_V_F(PREC, LMUL_NR, 2)((BASE_DT*) p, acol, cdim_max); + + a += lda; + p += ldp; + } + + for (size_t i = n; i < n_max; ++i) { + VSSEG2_V_F(PREC, LMUL_NR, 2)((BASE_DT*) p, zero_padding, cdim_max); + p += ldp; + } + } + } + // generic kernel + else + { + REF_KERNEL(PRECISION_CHAR) + ( + conja, + schema, + cdim, + cdim_max, + cdim_bcast, + n, + n_max, + kappa, + a, inca, lda, + p, ldp, + params, + cntx + ); + } + + return; +} + +#endif // PACKM diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c new file mode 100644 index 0000000000..741714d60a --- /dev/null +++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c @@ -0,0 +1,364 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef PACKM + +PACKM(PRECISION_CHAR, void) +{ + (void) conja; // Suppress unused parameter warnings + (void) schema; + (void) params; + (void) cntx; + const DATATYPE* restrict kappa = kappa_; + const DATATYPE* restrict a = a_; + DATATYPE* restrict p = p_; + + // MRxk kernel + if (cdim <= MR && cdim_max == MR && cdim_bcast == 1) + { + if (lda == 1) { + // a is "row major" + // pad the lower edge with zeros + RVV_TYPE_F(PREC, LMUL) arow0, arow1, arow2, arow3, arow4, arow5, arow6; + switch (cdim) { + case 0: + arow0 = VFMV_V_F(PREC, LMUL)(0., n); + case 1: + arow1 = VFMV_V_F(PREC, LMUL)(0., n); + case 2: + arow2 = VFMV_V_F(PREC, LMUL)(0., n); + case 3: + arow3 = VFMV_V_F(PREC, LMUL)(0., n); + case 4: + arow4 = VFMV_V_F(PREC, LMUL)(0., n); + case 5: + arow5 = VFMV_V_F(PREC, LMUL)(0., n); + case 6: + arow6 = VFMV_V_F(PREC, LMUL)(0., n); + } + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + switch (cdim) { + case 7: + arow6 = VLE_V_F(PREC, LMUL)(a + 6 * inca, vl); + case 6: + arow5 = VLE_V_F(PREC, LMUL)(a + 5 * inca, vl); + case 5: + arow4 = VLE_V_F(PREC, LMUL)(a + 4 * inca, vl); + case 4: + arow3 = VLE_V_F(PREC, LMUL)(a + 3 * inca, vl); + case 3: + arow2 = VLE_V_F(PREC, LMUL)(a + 2 * inca, vl); + case 2: + arow1 = VLE_V_F(PREC, LMUL)(a + 1 * inca, vl); + case 1: + arow0 = VLE_V_F(PREC, LMUL)(a + 0 * inca, vl); + } + + if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + switch (cdim) { + case 7: + arow6 = VFMUL_VF(PREC, LMUL)(arow6, *kappa, vl); + case 6: + arow5 = VFMUL_VF(PREC, LMUL)(arow5, *kappa, vl); + case 5: + arow4 = VFMUL_VF(PREC, LMUL)(arow4, *kappa, vl); + case 4: + arow3 = VFMUL_VF(PREC, LMUL)(arow3, *kappa, vl); + case 3: + arow2 = VFMUL_VF(PREC, LMUL)(arow2, *kappa, vl); + case 2: + arow1 = VFMUL_VF(PREC, LMUL)(arow1, *kappa, vl); + case 1: + arow0 = VFMUL_VF(PREC, LMUL)(arow0, *kappa, vl); + } + } + + RVV_TYPE_FX(PREC, LMUL, 7) ablock = VUNDEFINED_FX(PREC, LMUL, 7)(); + ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 0, arow0); + ablock = 
VSET_V_F(PREC, LMUL, 7)(ablock, 1, arow1); + ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 2, arow2); + ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 3, arow3); + ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 4, arow4); + ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 5, arow5); + ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 6, arow6); + VSSSEG7_V_F(PREC, LMUL, 7)(p, FLT_SIZE * ldp, ablock, vl); + + a += vl; + p += vl * ldp; + avl -= vl; + } + + RVV_TYPE_F(PREC, LMUL_MR) zero_padding = VFMV_V_F(PREC, LMUL_MR)(0., cdim_max); + for (size_t i = n; i < n_max; ++i) { + VSE_V_F(PREC, LMUL_MR)(p, zero_padding, cdim_max); + p += ldp; + } + } + else { + RVV_TYPE_F(PREC, LMUL_MR) zero_padding = VFMV_V_F(PREC, LMUL_MR)(0., cdim_max); + for (size_t i = 0; i < n; ++i) { + RVV_TYPE_F(PREC, LMUL_MR) acol_vec; + if (inca == 1) + acol_vec = VLE_V_F_TU(PREC, LMUL_MR)(zero_padding, a, cdim); + else + acol_vec = VLSE_V_F_TU(PREC, LMUL_MR)(zero_padding, a, FLT_SIZE * inca, cdim); + + if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) + acol_vec = VFMUL_VF_TU(PREC, LMUL_MR)(acol_vec, acol_vec, *kappa, cdim); + + VSE_V_F(PREC, LMUL_MR)(p, acol_vec, cdim_max); + + a += lda; + p += ldp; + } + + for (size_t i = n; i < n_max; ++i) { + VSE_V_F(PREC, LMUL_MR)(p, zero_padding, cdim_max); + p += ldp; + } + } + } + // NRxk kernel + else if (cdim <= NR && cdim_max == NR && cdim_bcast == 1) + { + if (lda == 1) { + // a is "row major" + RVV_TYPE_F(PREC, LMUL_NR) zero_padding = VFMV_V_F(PREC, LMUL_NR)(0., cdim_max); + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + dim_t cdim_tmp = cdim; + const DATATYPE* restrict a_tmp = a; + DATATYPE* restrict p_tmp = p; + while (cdim_tmp >= 8) { + RVV_TYPE_F(PREC, LMUL) arow0, arow1, arow2, arow3, arow4, arow5, arow6, arow7; + arow0 = VLE_V_F(PREC, LMUL)(a_tmp + 0 * inca, vl); + arow1 = VLE_V_F(PREC, LMUL)(a_tmp + 1 * inca, vl); + arow2 = VLE_V_F(PREC, LMUL)(a_tmp + 2 * inca, vl); + arow3 = VLE_V_F(PREC, LMUL)(a_tmp + 3 * inca, vl); + arow4 = VLE_V_F(PREC, LMUL)(a_tmp + 4 * inca, vl); + arow5 = VLE_V_F(PREC, LMUL)(a_tmp + 5 * inca, vl); + arow6 = VLE_V_F(PREC, LMUL)(a_tmp + 6 * inca, vl); + arow7 = VLE_V_F(PREC, LMUL)(a_tmp + 7 * inca, vl); + + if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + arow0 = VFMUL_VF(PREC, LMUL)(arow0, *kappa, vl); + arow1 = VFMUL_VF(PREC, LMUL)(arow1, *kappa, vl); + arow2 = VFMUL_VF(PREC, LMUL)(arow2, *kappa, vl); + arow3 = VFMUL_VF(PREC, LMUL)(arow3, *kappa, vl); + arow4 = VFMUL_VF(PREC, LMUL)(arow4, *kappa, vl); + arow5 = VFMUL_VF(PREC, LMUL)(arow5, *kappa, vl); + arow6 = VFMUL_VF(PREC, LMUL)(arow6, *kappa, vl); + arow7 = VFMUL_VF(PREC, LMUL)(arow7, *kappa, vl); + } + + RVV_TYPE_FX(PREC, LMUL, 8) ablock = VUNDEFINED_FX(PREC, LMUL, 8)(); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 0, arow0); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 1, arow1); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 2, arow2); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 3, arow3); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 4, arow4); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 5, arow5); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 6, arow6); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 7, arow7); + VSSSEG8_V_F(PREC, LMUL, 8)(p_tmp, FLT_SIZE * ldp, ablock, vl); + + a_tmp += 8 * inca; + p_tmp += 8; + cdim_tmp -= 8; + } + + if (cdim_tmp > 0) { + RVV_TYPE_F(PREC, LMUL) arow0, arow1, arow2, arow3, arow4, arow5, arow6; + switch (cdim_tmp) { + case 7: + arow6 = VLE_V_F(PREC, LMUL)(a_tmp + 6 * inca, vl); + case 6: + arow5 = VLE_V_F(PREC, LMUL)(a_tmp + 5 * inca, vl); + case 5: + arow4 = VLE_V_F(PREC, LMUL)(a_tmp 
+ 4 * inca, vl); + case 4: + arow3 = VLE_V_F(PREC, LMUL)(a_tmp + 3 * inca, vl); + case 3: + arow2 = VLE_V_F(PREC, LMUL)(a_tmp + 2 * inca, vl); + case 2: + arow1 = VLE_V_F(PREC, LMUL)(a_tmp + 1 * inca, vl); + case 1: + arow0 = VLE_V_F(PREC, LMUL)(a_tmp + 0 * inca, vl); + } + + if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + switch (cdim_tmp) { + case 7: + arow6 = VFMUL_VF(PREC, LMUL)(arow6, *kappa, vl); + case 6: + arow5 = VFMUL_VF(PREC, LMUL)(arow5, *kappa, vl); + case 5: + arow4 = VFMUL_VF(PREC, LMUL)(arow4, *kappa, vl); + case 4: + arow3 = VFMUL_VF(PREC, LMUL)(arow3, *kappa, vl); + case 3: + arow2 = VFMUL_VF(PREC, LMUL)(arow2, *kappa, vl); + case 2: + arow1 = VFMUL_VF(PREC, LMUL)(arow1, *kappa, vl); + case 1: + arow0 = VFMUL_VF(PREC, LMUL)(arow0, *kappa, vl); + } + } + + RVV_TYPE_FX(PREC, LMUL, 7) ablock7 = VUNDEFINED_FX(PREC, LMUL, 7)(); + RVV_TYPE_FX(PREC, LMUL, 6) ablock6 = VUNDEFINED_FX(PREC, LMUL, 6)(); + RVV_TYPE_FX(PREC, LMUL, 5) ablock5 = VUNDEFINED_FX(PREC, LMUL, 5)(); + RVV_TYPE_FX(PREC, LMUL, 4) ablock4 = VUNDEFINED_FX(PREC, LMUL, 4)(); + RVV_TYPE_FX(PREC, LMUL, 3) ablock3 = VUNDEFINED_FX(PREC, LMUL, 3)(); + RVV_TYPE_FX(PREC, LMUL, 2) ablock2 = VUNDEFINED_FX(PREC, LMUL, 2)(); + switch (cdim_tmp) { + case 7: + ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 0, arow0); + ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 1, arow1); + ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 2, arow2); + ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 3, arow3); + ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 4, arow4); + ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 5, arow5); + ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 6, arow6); + VSSSEG7_V_F(PREC, LMUL, 7)(p_tmp, FLT_SIZE * ldp, ablock7, vl); + break; + case 6: + ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 0, arow0); + ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 1, arow1); + ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 2, arow2); + ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 3, arow3); + ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 4, arow4); + ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 5, arow5); + VSSSEG6_V_F(PREC, LMUL, 6)(p_tmp, FLT_SIZE * ldp, ablock6, vl); + break; + case 5: + ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 0, arow0); + ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 1, arow1); + ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 2, arow2); + ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 3, arow3); + ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 4, arow4); + VSSSEG5_V_F(PREC, LMUL, 5)(p_tmp, FLT_SIZE * ldp, ablock5, vl); + break; + case 4: + ablock4 = VSET_V_F(PREC, LMUL, 4)(ablock4, 0, arow0); + ablock4 = VSET_V_F(PREC, LMUL, 4)(ablock4, 1, arow1); + ablock4 = VSET_V_F(PREC, LMUL, 4)(ablock4, 2, arow2); + ablock4 = VSET_V_F(PREC, LMUL, 4)(ablock4, 3, arow3); + VSSSEG4_V_F(PREC, LMUL, 4)(p_tmp, FLT_SIZE * ldp, ablock4, vl); + break; + case 3: + ablock3 = VSET_V_F(PREC, LMUL, 3)(ablock3, 0, arow0); + ablock3 = VSET_V_F(PREC, LMUL, 3)(ablock3, 1, arow1); + ablock3 = VSET_V_F(PREC, LMUL, 3)(ablock3, 2, arow2); + VSSSEG3_V_F(PREC, LMUL, 3)(p_tmp, FLT_SIZE * ldp, ablock3, vl); + break; + case 2: + ablock2 = VSET_V_F(PREC, LMUL, 2)(ablock2, 0, arow0); + ablock2 = VSET_V_F(PREC, LMUL, 2)(ablock2, 1, arow1); + VSSSEG2_V_F(PREC, LMUL, 2)(p_tmp, FLT_SIZE * ldp, ablock2, vl); + break; + case 1: + VSSE_V_F(PREC, LMUL)(p_tmp, FLT_SIZE * ldp, arow0, vl); + break; + } + p_tmp += cdim_tmp; + } + + for (size_t i = 0; i < vl; ++i) { + VSE_V_F(PREC, LMUL_NR)(p_tmp, zero_padding, cdim_max - cdim); + p_tmp += ldp; + } + + a += vl; + p += vl * ldp; + avl -= vl; + } + + for (size_t i = n; i < 
n_max; ++i) { + VSE_V_F(PREC, LMUL_NR)(p, zero_padding, cdim_max); + p += ldp; + } + } else { + RVV_TYPE_F(PREC, LMUL_NR) zero_padding = VFMV_V_F(PREC, LMUL_NR)(0., cdim_max); + for (size_t i = 0; i < n; ++i) { + RVV_TYPE_F(PREC, LMUL_NR) acol_vec; + if (inca == 1) + acol_vec = VLE_V_F_TU(PREC, LMUL_NR)(zero_padding, a, cdim); + else + acol_vec = VLSE_V_F_TU(PREC, LMUL_NR)(zero_padding, a, FLT_SIZE * inca, cdim); + + if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) + acol_vec = VFMUL_VF_TU(PREC, LMUL_NR)(acol_vec, acol_vec, *kappa, cdim); + + VSE_V_F(PREC, LMUL_NR)(p, acol_vec, cdim_max); + + a += lda; + p += ldp; + } + + for (size_t i = n; i < n_max; ++i) { + VSE_V_F(PREC, LMUL_NR)(p, zero_padding, cdim_max); + p += ldp; + } + } + } + // generic kernel + else + { + REF_KERNEL(PRECISION_CHAR) + ( + conja, + schema, + cdim, + cdim_max, + cdim_bcast, + n, + n_max, + kappa, + a, inca, lda, + p, ldp, + params, + cntx + ); + } + + return; +} + +#endif // PACKM diff --git a/kernels/sifive_x280/bli_kernels_sifive_x280.h b/kernels/sifive_x280/bli_kernels_sifive_x280.h index 0ee01041ea..9ffc2f7449 100644 --- a/kernels/sifive_x280/bli_kernels_sifive_x280.h +++ b/kernels/sifive_x280/bli_kernels_sifive_x280.h @@ -38,10 +38,10 @@ ADDV_KER_PROT(double, d, addv_sifive_x280_intr) ADDV_KER_PROT(scomplex, c, addv_sifive_x280_intr) ADDV_KER_PROT(dcomplex, z, addv_sifive_x280_intr) -AMAXV_KER_PROT(float, s, amaxv_sifive_x280_asm) -AMAXV_KER_PROT(double, d, amaxv_sifive_x280_asm) -AMAXV_KER_PROT(scomplex, c, amaxv_sifive_x280_asm) -AMAXV_KER_PROT(dcomplex, z, amaxv_sifive_x280_asm) +AMAXV_KER_PROT(float, s, amaxv_sifive_x280_intr) +AMAXV_KER_PROT(double, d, amaxv_sifive_x280_intr) +AMAXV_KER_PROT(scomplex, c, amaxv_sifive_x280_intr) +AMAXV_KER_PROT(dcomplex, z, amaxv_sifive_x280_intr) AXPBYV_KER_PROT(float, s, axpbyv_sifive_x280_intr) AXPBYV_KER_PROT(double, d, axpbyv_sifive_x280_intr) @@ -53,10 +53,10 @@ AXPYV_KER_PROT(double, d, axpyv_sifive_x280_intr) AXPYV_KER_PROT(scomplex, c, axpyv_sifive_x280_intr) AXPYV_KER_PROT(dcomplex, z, axpyv_sifive_x280_intr) -COPYV_KER_PROT(float, s, copyv_sifive_x280_asm) -COPYV_KER_PROT(double, d, copyv_sifive_x280_asm) -COPYV_KER_PROT(scomplex, c, copyv_sifive_x280_asm) -COPYV_KER_PROT(dcomplex, z, copyv_sifive_x280_asm) +COPYV_KER_PROT(float, s, copyv_sifive_x280_intr) +COPYV_KER_PROT(double, d, copyv_sifive_x280_intr) +COPYV_KER_PROT(scomplex, c, copyv_sifive_x280_intr) +COPYV_KER_PROT(dcomplex, z, copyv_sifive_x280_intr) DOTV_KER_PROT(float, s, dotv_sifive_x280_intr) DOTV_KER_PROT(double, d, dotv_sifive_x280_intr) @@ -68,15 +68,15 @@ DOTXV_KER_PROT(double, d, dotxv_sifive_x280_intr) DOTXV_KER_PROT(scomplex, c, dotxv_sifive_x280_intr) DOTXV_KER_PROT(dcomplex, z, dotxv_sifive_x280_intr) -INVERTV_KER_PROT(float, s, invertv_sifive_x280_asm) -INVERTV_KER_PROT(double, d, invertv_sifive_x280_asm) -INVERTV_KER_PROT(scomplex, c, invertv_sifive_x280_asm) -INVERTV_KER_PROT(dcomplex, z, invertv_sifive_x280_asm) +INVERTV_KER_PROT(float, s, invertv_sifive_x280_intr) +INVERTV_KER_PROT(double, d, invertv_sifive_x280_intr) +INVERTV_KER_PROT(scomplex, c, invertv_sifive_x280_intr) +INVERTV_KER_PROT(dcomplex, z, invertv_sifive_x280_intr) -INVSCALV_KER_PROT(float, s, invscalv_sifive_x280_asm) -INVSCALV_KER_PROT(double, d, invscalv_sifive_x280_asm) -INVSCALV_KER_PROT(scomplex, c, invscalv_sifive_x280_asm) -INVSCALV_KER_PROT(dcomplex, z, invscalv_sifive_x280_asm) +INVSCALV_KER_PROT(float, s, invscalv_sifive_x280_intr) +INVSCALV_KER_PROT(double, d, invscalv_sifive_x280_intr) 
+INVSCALV_KER_PROT(scomplex, c, invscalv_sifive_x280_intr) +INVSCALV_KER_PROT(dcomplex, z, invscalv_sifive_x280_intr) SCAL2V_KER_PROT(float, s, scal2v_sifive_x280_intr) SCAL2V_KER_PROT(double, d, scal2v_sifive_x280_intr) @@ -88,20 +88,20 @@ SCALV_KER_PROT(double, d, scalv_sifive_x280_intr) SCALV_KER_PROT(scomplex, c, scalv_sifive_x280_intr) SCALV_KER_PROT(dcomplex, z, scalv_sifive_x280_intr) -SETV_KER_PROT(float, s, setv_sifive_x280_asm) -SETV_KER_PROT(double, d, setv_sifive_x280_asm) -SETV_KER_PROT(scomplex, c, setv_sifive_x280_asm) -SETV_KER_PROT(dcomplex, z, setv_sifive_x280_asm) +SETV_KER_PROT(float, s, setv_sifive_x280_intr) +SETV_KER_PROT(double, d, setv_sifive_x280_intr) +SETV_KER_PROT(scomplex, c, setv_sifive_x280_intr) +SETV_KER_PROT(dcomplex, z, setv_sifive_x280_intr) SUBV_KER_PROT(float, s, subv_sifive_x280_intr) SUBV_KER_PROT(double, d, subv_sifive_x280_intr) SUBV_KER_PROT(scomplex, c, subv_sifive_x280_intr) SUBV_KER_PROT(dcomplex, z, subv_sifive_x280_intr) -SWAPV_KER_PROT(float, s, swapv_sifive_x280_asm) -SWAPV_KER_PROT(double, d, swapv_sifive_x280_asm) -SWAPV_KER_PROT(scomplex, c, swapv_sifive_x280_asm) -SWAPV_KER_PROT(dcomplex, z, swapv_sifive_x280_asm) +SWAPV_KER_PROT(float, s, swapv_sifive_x280_intr) +SWAPV_KER_PROT(double, d, swapv_sifive_x280_intr) +SWAPV_KER_PROT(scomplex, c, swapv_sifive_x280_intr) +SWAPV_KER_PROT(dcomplex, z, swapv_sifive_x280_intr) XPBYV_KER_PROT(float, s, xpbyv_sifive_x280_intr) XPBYV_KER_PROT(double, d, xpbyv_sifive_x280_intr) @@ -114,31 +114,31 @@ AXPY2V_KER_PROT(double, d, axpy2v_sifive_x280_intr) AXPY2V_KER_PROT(scomplex, c, axpy2v_sifive_x280_intr) AXPY2V_KER_PROT(dcomplex, z, axpy2v_sifive_x280_intr) -AXPYF_KER_PROT(float, s, axpyf_sifive_x280_asm) -AXPYF_KER_PROT(double, d, axpyf_sifive_x280_asm) -AXPYF_KER_PROT(scomplex, c, axpyf_sifive_x280_asm) -AXPYF_KER_PROT(dcomplex, z, axpyf_sifive_x280_asm) +AXPYF_KER_PROT(float, s, axpyf_sifive_x280_intr) +AXPYF_KER_PROT(double, d, axpyf_sifive_x280_intr) +AXPYF_KER_PROT(scomplex, c, axpyf_sifive_x280_intr) +AXPYF_KER_PROT(dcomplex, z, axpyf_sifive_x280_intr) -DOTXF_KER_PROT(float, s, dotxf_sifive_x280_asm) -DOTXF_KER_PROT(double, d, dotxf_sifive_x280_asm) -DOTXF_KER_PROT(scomplex, c, dotxf_sifive_x280_asm) -DOTXF_KER_PROT(dcomplex, z, dotxf_sifive_x280_asm) +DOTXF_KER_PROT(float, s, dotxf_sifive_x280_intr) +DOTXF_KER_PROT(double, d, dotxf_sifive_x280_intr) +DOTXF_KER_PROT(scomplex, c, dotxf_sifive_x280_intr) +DOTXF_KER_PROT(dcomplex, z, dotxf_sifive_x280_intr) DOTAXPYV_KER_PROT(float, s, dotaxpyv_sifive_x280_intr) DOTAXPYV_KER_PROT(double, d, dotaxpyv_sifive_x280_intr) DOTAXPYV_KER_PROT(scomplex, c, dotaxpyv_sifive_x280_intr) DOTAXPYV_KER_PROT(dcomplex, z, dotaxpyv_sifive_x280_intr) -DOTXAXPYF_KER_PROT(float, s, dotxaxpyf_sifive_x280_asm) -DOTXAXPYF_KER_PROT(double, d, dotxaxpyf_sifive_x280_asm) -DOTXAXPYF_KER_PROT(scomplex,c, dotxaxpyf_sifive_x280_asm) -DOTXAXPYF_KER_PROT(dcomplex,z, dotxaxpyf_sifive_x280_asm) +DOTXAXPYF_KER_PROT(float, s, dotxaxpyf_sifive_x280_intr) +DOTXAXPYF_KER_PROT(double, d, dotxaxpyf_sifive_x280_intr) +DOTXAXPYF_KER_PROT(scomplex,c, dotxaxpyf_sifive_x280_intr) +DOTXAXPYF_KER_PROT(dcomplex,z, dotxaxpyf_sifive_x280_intr) // Level 1m -PACKM_KER_PROT(float, s, packm_sifive_x280_asm_7m4) -PACKM_KER_PROT(double, d, packm_sifive_x280_asm_7m4) -PACKM_KER_PROT(scomplex, c, packm_sifive_x280_asm_6m2) -PACKM_KER_PROT(dcomplex, z, packm_sifive_x280_asm_6m2) +PACKM_KER_PROT(float, s, packm_sifive_x280_intr) +PACKM_KER_PROT(double, d, packm_sifive_x280_intr) +PACKM_KER_PROT(scomplex, c, 
packm_sifive_x280_intr) +PACKM_KER_PROT(dcomplex, z, packm_sifive_x280_intr) // Reference 1m PACKM_KER_PROT(float, ss, packm_sifive_x280_ref) diff --git a/kernels/sifive_x280/riscv_cmul_macros_intr.h b/kernels/sifive_x280/riscv_cmul_macros_intr.h new file mode 100644 index 0000000000..ea33fc5d10 --- /dev/null +++ b/kernels/sifive_x280/riscv_cmul_macros_intr.h @@ -0,0 +1,129 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "riscv_overloaded_intrinsics.h" + +// macros to emit complex multiplication +// caveat: the destination registers cannot overlap the source registers! 
+ +// vd = vs2 * f[rs1] +#define VCMUL_VF(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, RS1_R, RS1_I, VL) \ + do { \ + VD_R = VFMUL_VF(PREC, LMUL)(VS2_R, RS1_R, VL); \ + VD_I = VFMUL_VF(PREC, LMUL)(VS2_R, RS1_I, VL); \ + VD_R = VFNMSAC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL); \ + } while(0) + +// vd = conj(vs2) * f[rs1] +#define VCMUL_VF_CONJ(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, RS1_R, RS1_I, VL) \ + do { \ + VD_R = VFMUL_VF(PREC, LMUL)(VS2_R, RS1_R, VL); \ + VD_I = VFMUL_VF(PREC, LMUL)(VS2_R, RS1_I, VL); \ + VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFNMSAC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL); \ + } while(0) + +// vd = vs2 * f[rs1] +#define VCMUL_VF_TU(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, RS1_R, RS1_I, VL) \ + do { \ + VD_R = VFMUL_VF_TU(PREC, LMUL)(VS2_R, VS2_R, RS1_R, VL); \ + VD_I = VFMUL_VF_TU(PREC, LMUL)(VS2_I, VS2_I, RS1_R, VL); \ + VD_R = VFNMSAC_VF_TU(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFMACC_VF_TU(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL); \ + } while(0) + +// vd = conj(vs2) * f[rs1] +#define VCMUL_VF_CONJ_TU(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, RS1_R, RS1_I, VL) \ + do { \ + VD_R = VFMUL_VF_TU(PREC, LMUL)(VS2_R, VS2_R, RS1_R, VL); \ + VD_I = VFMUL_VF_TU(PREC, LMUL)(VS2_I, VS2_I, RS1_R, VL); \ + VD_R = VFMACC_VF_TU(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFMSAC_VF_TU(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL); \ + } while(0) + +// vd = vs2 * vs1 +#define VCMUL_VV(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, VS1_R, VS1_I, VL) \ + do { \ + VD_R = VFMUL_VV(PREC, LMUL)(VS2_R, VS1_R, VL); \ + VD_I = VFMUL_VV(PREC, LMUL)(VS2_R, VS1_I, VL); \ + VD_R = VFNMSAC_VV(PREC, LMUL)(VD_R, VS1_I, VS2_I, VL); \ + VD_I = VFMACC_VV(PREC, LMUL)(VD_I, VS1_R, VS2_I, VL); \ + } while(0) + +// vd = conj(vs2) * vs1 +#define VCMUL_VV_CONJ(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, VS1_R, VS1_I, VL) \ + do { \ + VD_R = VFMUL_VV(PREC, LMUL)(VS2_R, VS1_R, VL); \ + VD_I = VFMUL_VV(PREC, LMUL)(VS2_R, VS1_I, VL); \ + VD_R = VFMACC_VV(PREC, LMUL)(VD_R, VS1_I, VS2_I, VL); \ + VD_I = VFNMSAC_VV(PREC, LMUL)(VD_I, VS1_R, VS2_I, VL); \ + } while(0) + +// vd += vs2 * f[rs1] +#define VCMACC_VF(PREC, LMUL, VD_R, VD_I, RS1_R, RS1_I, VS2_R, VS2_I, VL) \ + do { \ + VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_R, VS2_R, VL); \ + VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL); \ + VD_R = VFNMSAC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL); \ + } while(0) + +// vd += conj(vs2) * f[rs1] +#define VCMACC_VF_CONJ(PREC, LMUL, VD_R, VD_I, RS1_R, RS1_I, VS2_R, VS2_I, VL) \ + do { \ + VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_R, VS2_R, VL); \ + VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL); \ + VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFNMSAC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL); \ + } while(0) + +// vd += vs2 * vs1 +#define VCMACC_VV_TU(PREC, LMUL, VD_R, VD_I, VS1_R, VS1_I, VS2_R, VS2_I, VL) \ + do { \ + VD_R = VFMACC_VV_TU(PREC, LMUL)(VD_R, VS1_R, VS2_R, VL); \ + VD_I = VFMACC_VV_TU(PREC, LMUL)(VD_I, VS1_I, VS2_R, VL); \ + VD_R = VFNMSAC_VV_TU(PREC, LMUL)(VD_R, VS1_I, VS2_I, VL); \ + VD_I = VFMACC_VV_TU(PREC, LMUL)(VD_I, VS1_R, VS2_I, VL); \ + } while(0) + +// vd += conj(vs2) * vs1 +#define VCMACC_VV_CONJ_TU(PREC, LMUL, VD_R, VD_I, VS1_R, VS1_I, VS2_R, VS2_I, VL) \ + do { \ + VD_R = VFMACC_VV_TU(PREC, LMUL)(VD_R, VS1_R, VS2_R, VL); \ + VD_I = VFMACC_VV_TU(PREC, LMUL)(VD_I, VS1_I, VS2_R, VL); \ + VD_R = VFMACC_VV_TU(PREC, LMUL)(VD_R, VS1_I, VS2_I, VL); \ + VD_I = 
VFNMSAC_VV_TU(PREC, LMUL)(VD_I, VS1_R, VS2_I, VL); \ + } while(0) + diff --git a/kernels/sifive_x280/riscv_overloaded_intrinsics.h b/kernels/sifive_x280/riscv_overloaded_intrinsics.h index 6a1d11b131..df33afc0ea 100644 --- a/kernels/sifive_x280/riscv_overloaded_intrinsics.h +++ b/kernels/sifive_x280/riscv_overloaded_intrinsics.h @@ -33,10 +33,16 @@ */ // 6. Configuration-Setting and Utility Functions +#define RVV_TYPE_B_(RATIO) vbool##RATIO##_t +#define RVV_TYPE_B(RATIO) RVV_TYPE_B_(RATIO) +#define RVV_TYPE_U_(PRECISION, LMUL) vuint##PRECISION##LMUL##_t +#define RVV_TYPE_U(PRECISION, LMUL) RVV_TYPE_U_(PRECISION, LMUL) #define RVV_TYPE_F_(PRECISION, LMUL) vfloat##PRECISION##LMUL##_t #define RVV_TYPE_F(PRECISION, LMUL) RVV_TYPE_F_(PRECISION, LMUL) #define RVV_TYPE_FX_(PRECISION, LMUL, NFIELDS) vfloat##PRECISION##LMUL##x##NFIELDS##_t #define RVV_TYPE_FX(PRECISION, LMUL, NFIELDS) RVV_TYPE_FX_(PRECISION, LMUL, NFIELDS) +#define VUNDEFINED_FX_(PRECISION, LMUL, NFIELDS) __riscv_vundefined_f##PRECISION##LMUL##x##NFIELDS +#define VUNDEFINED_FX(PRECISION, LMUL, NFIELDS) VUNDEFINED_FX_(PRECISION, LMUL, NFIELDS) #define VSETVL_(PRECISION, LMUL) __riscv_vsetvl_e##PRECISION##LMUL #define VSETVL(PRECISION, LMUL) VSETVL_(PRECISION, LMUL) @@ -50,6 +56,14 @@ #define VLSEG2_V_F(PRECISION, LMUL, NFIELDS) VLSEG2_V_F_(PRECISION, LMUL, NFIELDS) #define VLSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vlsseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS #define VLSSEG2_V_F(PRECISION, LMUL, NFIELDS) VLSSEG2_V_F_(PRECISION, LMUL, NFIELDS) +#define VLE_V_F_TU_(PRECISION, LMUL) __riscv_vle##PRECISION##_v_f##PRECISION##LMUL##_tu +#define VLE_V_F_TU(PRECISION, LMUL) VLE_V_F_TU_(PRECISION, LMUL) +#define VLSE_V_F_TU_(PRECISION, LMUL) __riscv_vlse##PRECISION##_v_f##PRECISION##LMUL##_tu +#define VLSE_V_F_TU(PRECISION, LMUL) VLSE_V_F_TU_(PRECISION, LMUL) +#define VLSEG2_V_F_TU_(PRECISION, LMUL, NFIELDS) __riscv_vlseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS##_tu +#define VLSEG2_V_F_TU(PRECISION, LMUL, NFIELDS) VLSEG2_V_F_TU_(PRECISION, LMUL, NFIELDS) +#define VLSSEG2_V_F_TU_(PRECISION, LMUL, NFIELDS) __riscv_vlsseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS##_tu +#define VLSSEG2_V_F_TU(PRECISION, LMUL, NFIELDS) VLSSEG2_V_F_TU_(PRECISION, LMUL, NFIELDS) // Stores #define VSE_V_F_(PRECISION, LMUL) __riscv_vse##PRECISION##_v_f##PRECISION##LMUL #define VSE_V_F(PRECISION, LMUL) VSE_V_F_(PRECISION, LMUL) @@ -59,6 +73,24 @@ #define VSSEG2_V_F(PRECISION, LMUL, NFIELDS) VSSEG2_V_F_(PRECISION, LMUL, NFIELDS) #define VSSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS #define VSSSEG2_V_F(PRECISION, LMUL, NFIELDS) VSSSEG2_V_F_(PRECISION, LMUL, NFIELDS) +#define VSSSEG3_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg3e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS +#define VSSSEG3_V_F(PRECISION, LMUL, NFIELDS) VSSSEG3_V_F_(PRECISION, LMUL, NFIELDS) +#define VSSSEG4_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg4e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS +#define VSSSEG4_V_F(PRECISION, LMUL, NFIELDS) VSSSEG4_V_F_(PRECISION, LMUL, NFIELDS) +#define VSSSEG5_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg5e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS +#define VSSSEG5_V_F(PRECISION, LMUL, NFIELDS) VSSSEG5_V_F_(PRECISION, LMUL, NFIELDS) +#define VSSSEG6_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg6e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS +#define VSSSEG6_V_F(PRECISION, LMUL, NFIELDS) VSSSEG6_V_F_(PRECISION, LMUL, NFIELDS) +#define VSSSEG7_V_F_(PRECISION, LMUL, NFIELDS) 
__riscv_vssseg7e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS +#define VSSSEG7_V_F(PRECISION, LMUL, NFIELDS) VSSSEG7_V_F_(PRECISION, LMUL, NFIELDS) +#define VSSSEG8_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg8e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS +#define VSSSEG8_V_F(PRECISION, LMUL, NFIELDS) VSSSEG8_V_F_(PRECISION, LMUL, NFIELDS) + +// Integer Operations +#define VADD_VX_U_(PRECISION, LMUL) __riscv_vadd_vx_u##PRECISION##LMUL +#define VADD_VX_U(PRECISION, LMUL) VADD_VX_U_(PRECISION, LMUL) +#define VMERGE_VVM_TU_U_(PRECISION, LMUL) __riscv_vmerge_vvm_u##PRECISION##LMUL##_tu +#define VMERGE_VVM_TU_U(PRECISION, LMUL) VMERGE_VVM_TU_U_(PRECISION, LMUL) // 13. Vector Floating-Point Operations #define VFADD_VV_(PRECISION, LMUL) __riscv_vfadd_vv_f##PRECISION##LMUL @@ -67,40 +99,88 @@ #define VFSUB_VV(PRECISION, LMUL) VFSUB_VV_(PRECISION, LMUL) #define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL #define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL) +#define VFMUL_VF_TU_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL##_tu +#define VFMUL_VF_TU(PRECISION, LMUL) VFMUL_VF_TU_(PRECISION, LMUL) #define VFMUL_VV_(PRECISION, LMUL) __riscv_vfmul_vv_f##PRECISION##LMUL #define VFMUL_VV(PRECISION, LMUL) VFMUL_VV_(PRECISION, LMUL) -#define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL -#define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL) +#define VFDIV_VV_(PRECISION, LMUL) __riscv_vfdiv_vv_f##PRECISION##LMUL +#define VFDIV_VV(PRECISION, LMUL) VFDIV_VV_(PRECISION, LMUL) +#define VFRDIV_VF_(PRECISION, LMUL) __riscv_vfrdiv_vf_f##PRECISION##LMUL +#define VFRDIV_VF(PRECISION, LMUL) VFRDIV_VF_(PRECISION, LMUL) #define VFMACC_VF_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL #define VFMACC_VF(PRECISION, LMUL) VFMACC_VF_(PRECISION, LMUL) +#define VFMACC_VF_TU_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL##_tu +#define VFMACC_VF_TU(PRECISION, LMUL) VFMACC_VF_TU_(PRECISION, LMUL) #define VFMACC_VV_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL #define VFMACC_VV(PRECISION, LMUL) VFMACC_VV_(PRECISION, LMUL) #define VFMACC_VV_TU_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL##_tu #define VFMACC_VV_TU(PRECISION, LMUL) VFMACC_VV_TU_(PRECISION, LMUL) #define VFMSAC_VF_(PRECISION, LMUL) __riscv_vfmsac_vf_f##PRECISION##LMUL #define VFMSAC_VF(PRECISION, LMUL) VFMSAC_VF_(PRECISION, LMUL) +#define VFMSAC_VF_TU_(PRECISION, LMUL) __riscv_vfmsac_vf_f##PRECISION##LMUL##_tu +#define VFMSAC_VF_TU(PRECISION, LMUL) VFMSAC_VF_TU_(PRECISION, LMUL) #define VFNMSAC_VF_(PRECISION, LMUL) __riscv_vfnmsac_vf_f##PRECISION##LMUL #define VFNMSAC_VF(PRECISION, LMUL) VFNMSAC_VF_(PRECISION, LMUL) +#define VFNMSAC_VF_TU_(PRECISION, LMUL) __riscv_vfnmsac_vf_f##PRECISION##LMUL##_tu +#define VFNMSAC_VF_TU(PRECISION, LMUL) VFNMSAC_VF_TU_(PRECISION, LMUL) +#define VFNMSAC_VV_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL +#define VFNMSAC_VV(PRECISION, LMUL) VFNMSAC_VV_(PRECISION, LMUL) #define VFNMSAC_VV_TU_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL##_tu #define VFNMSAC_VV_TU(PRECISION, LMUL) VFNMSAC_VV_TU_(PRECISION, LMUL) #define VFMADD_VF_(PRECISION, LMUL) __riscv_vfmadd_vf_f##PRECISION##LMUL #define VFMADD_VF(PRECISION, LMUL) VFMADD_VF_(PRECISION, LMUL) #define VFMSUB_VF_(PRECISION, LMUL) __riscv_vfmsub_vf_f##PRECISION##LMUL #define VFMSUB_VF(PRECISION, LMUL) VFMSUB_VF_(PRECISION, LMUL) +#define VFABS_V_(PRECISION, LMUL) __riscv_vfabs_v_f##PRECISION##LMUL +#define VFABS_V(PRECISION, LMUL) VFABS_V_(PRECISION, LMUL) #define 
VFNEG_VF_(PRECISION, LMUL) __riscv_vfneg_v_f##PRECISION##LMUL #define VFNEG_VF(PRECISION, LMUL) VFNEG_VF_(PRECISION, LMUL) +#define VFNEG_VF_TU_(PRECISION, LMUL) __riscv_vfneg_v_f##PRECISION##LMUL##_tu +#define VFNEG_VF_TU(PRECISION, LMUL) VFNEG_VF_TU_(PRECISION, LMUL) #define VFMV_V_V_(PRECISION, LMUL) VREINTERPRET_V_I_F(PRECISION, LMUL)( __riscv_vmv_v_v_i##PRECISION##LMUL( VREINTERPRET_V_F_I(PRECISION, LMUL) CURRY_1ARG #define VFMV_V_V(PRECISION, LMUL) VFMV_V_V_(PRECISION, LMUL) +#define VFMV_V_F_(PRECISION, LMUL) __riscv_vfmv_v_f_f##PRECISION##LMUL +#define VFMV_V_F(PRECISION, LMUL) VFMV_V_F_(PRECISION, LMUL) +#define VMFEQ_VV_(PRECISION, LMUL, RATIO) __riscv_vmfeq_vv_f##PRECISION##LMUL##_b##RATIO +#define VMFEQ_VV(PRECISION, LMUL, RATIO) VMFEQ_VV_(PRECISION, LMUL, RATIO) +#define VMFNE_VV_(PRECISION, LMUL, RATIO) __riscv_vmfne_vv_f##PRECISION##LMUL##_b##RATIO +#define VMFNE_VV(PRECISION, LMUL, RATIO) VMFNE_VV_(PRECISION, LMUL, RATIO) +#define VMFGT_VV_(PRECISION, LMUL, RATIO) __riscv_vmfgt_vv_f##PRECISION##LMUL##_b##RATIO +#define VMFGT_VV(PRECISION, LMUL, RATIO) VMFGT_VV_(PRECISION, LMUL, RATIO) +#define VMFGE_VV_(PRECISION, LMUL, RATIO) __riscv_vmfge_vv_f##PRECISION##LMUL##_b##RATIO +#define VMFGE_VV(PRECISION, LMUL, RATIO) VMFGE_VV_(PRECISION, LMUL, RATIO) +#define VFMAX_VV_(PRECISION, LMUL) __riscv_vfmax_vv_f##PRECISION##LMUL +#define VFMAX_VV(PRECISION, LMUL) VFMAX_VV_(PRECISION, LMUL) +#define VFMAX_VV_TU_(PRECISION, LMUL) __riscv_vfmax_vv_f##PRECISION##LMUL##_tu +#define VFMAX_VV_TU(PRECISION, LMUL) VFMAX_VV_TU_(PRECISION, LMUL) +#define VMERGE_VVM_F_(PRECISION, LMUL) __riscv_vmerge_vvm_f##PRECISION##LMUL +#define VMERGE_VVM_F(PRECISION, LMUL) VMERGE_VVM_F_(PRECISION, LMUL) + +// Mask Operations +#define VFIRST_M_(RATIO) __riscv_vfirst_m_b##RATIO +#define VFIRST_M(RATIO) VFIRST_M_(RATIO) +#define VID_V_(PRECISION, LMUL) __riscv_vid_v_u##PRECISION##LMUL +#define VID_V(PRECISION, LMUL) VID_V_(PRECISION, LMUL) // 14. Vector Reduction Operations #define VF_REDUSUM_VS_(PRECISION, LMUL) __riscv_vfredusum_vs_f##PRECISION##LMUL##_f##PRECISION##m1 #define VF_REDUSUM_VS(PRECISION, LMUL) VF_REDUSUM_VS_(PRECISION, LMUL) +#define VFREDMAX_VS_(PRECISION, LMUL) __riscv_vfredmax_vs_f##PRECISION##LMUL##_f##PRECISION##m1 +#define VFREDMAX_VS(PRECISION, LMUL) VFREDMAX_VS_(PRECISION, LMUL) +#define VREDMINU_VS_M_(PRECISION, LMUL) __riscv_vredminu_vs_u##PRECISION##LMUL##_u##PRECISION##m1_m +#define VREDMINU_VS_M(PRECISION, LMUL) VREDMINU_VS_M_(PRECISION, LMUL) // 16. 
Vector Permutation Operations +#define VMV_S_X_U_(PRECISION, LMUL) __riscv_vmv_s_x_u##PRECISION##LMUL +#define VMV_S_X_U(PRECISION, LMUL) VMV_S_X_U_(PRECISION, LMUL) #define VFMV_S_F_(PRECISION, LMUL) __riscv_vfmv_s_f_f##PRECISION##LMUL #define VFMV_S_F(PRECISION, LMUL) VFMV_S_F_(PRECISION, LMUL) +#define VMV_X_S_U_(PRECISION) __riscv_vmv_x_s_u##PRECISION##m1_u##PRECISION +#define VMV_X_S_U(PRECISION) VMV_X_S_U_(PRECISION) #define VFMV_F_S_(PRECISION) __riscv_vfmv_f_s_f##PRECISION##m1_f##PRECISION #define VFMV_F_S(PRECISION) VFMV_F_S_(PRECISION) +#define VRGATHER_VX_F_(PRECISION, LMUL) __riscv_vrgather_vx_f##PRECISION##LMUL +#define VRGATHER_VX_F(PRECISION, LMUL) VRGATHER_VX_F_(PRECISION, LMUL) // Miscellaneous Vector Function #define VREINTERPRET_V_I_F_(PRECISION, LMUL) __riscv_vreinterpret_v_i##PRECISION##LMUL##_f##PRECISION##LMUL @@ -111,6 +191,8 @@ #define VGET_V_F(PRECISION, LMUL, NFIELDS) VGET_V_F_(PRECISION, LMUL, NFIELDS) #define VSET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vset_v_f##PRECISION##LMUL##_f##PRECISION##LMUL##x##NFIELDS #define VSET_V_F(PRECISION, LMUL, NFIELDS) VSET_V_F_(PRECISION, LMUL, NFIELDS) +#define VLMUL_EXT_V_F_M1_(PRECISION, LMUL) __riscv_vlmul_ext_v_f##PRECISION##m1##_f##PRECISION##LMUL +#define VLMUL_EXT_V_F_M1(PRECISION, LMUL) VLMUL_EXT_V_F_M1_(PRECISION, LMUL) // Non-vector functions #define CURRY_1ARG(arg1, ...) (arg1), __VA_ARGS__)) From 436ec9d8fea346c807c4c7b01328afa978197211 Mon Sep 17 00:00:00 2001 From: Michael Yeh <111819036+myeh01@users.noreply.github.com> Date: Thu, 1 Aug 2024 14:39:00 -0700 Subject: [PATCH 2/8] Reorder intrinsics --- .../sifive_x280/riscv_overloaded_intrinsics.h | 97 ++++++++++--------- 1 file changed, 49 insertions(+), 48 deletions(-) diff --git a/kernels/sifive_x280/riscv_overloaded_intrinsics.h b/kernels/sifive_x280/riscv_overloaded_intrinsics.h index df33afc0ea..acf3cb5b47 100644 --- a/kernels/sifive_x280/riscv_overloaded_intrinsics.h +++ b/kernels/sifive_x280/riscv_overloaded_intrinsics.h @@ -41,8 +41,6 @@ #define RVV_TYPE_F(PRECISION, LMUL) RVV_TYPE_F_(PRECISION, LMUL) #define RVV_TYPE_FX_(PRECISION, LMUL, NFIELDS) vfloat##PRECISION##LMUL##x##NFIELDS##_t #define RVV_TYPE_FX(PRECISION, LMUL, NFIELDS) RVV_TYPE_FX_(PRECISION, LMUL, NFIELDS) -#define VUNDEFINED_FX_(PRECISION, LMUL, NFIELDS) __riscv_vundefined_f##PRECISION##LMUL##x##NFIELDS -#define VUNDEFINED_FX(PRECISION, LMUL, NFIELDS) VUNDEFINED_FX_(PRECISION, LMUL, NFIELDS) #define VSETVL_(PRECISION, LMUL) __riscv_vsetvl_e##PRECISION##LMUL #define VSETVL(PRECISION, LMUL) VSETVL_(PRECISION, LMUL) @@ -86,7 +84,7 @@ #define VSSSEG8_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg8e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS #define VSSSEG8_V_F(PRECISION, LMUL, NFIELDS) VSSSEG8_V_F_(PRECISION, LMUL, NFIELDS) -// Integer Operations +// 11. 
Vector Integer Arithmetic Operations #define VADD_VX_U_(PRECISION, LMUL) __riscv_vadd_vx_u##PRECISION##LMUL #define VADD_VX_U(PRECISION, LMUL) VADD_VX_U_(PRECISION, LMUL) #define VMERGE_VVM_TU_U_(PRECISION, LMUL) __riscv_vmerge_vvm_u##PRECISION##LMUL##_tu @@ -97,50 +95,34 @@ #define VFADD_VV(PRECISION, LMUL) VFADD_VV_(PRECISION, LMUL) #define VFSUB_VV_(PRECISION, LMUL) __riscv_vfsub_vv_f##PRECISION##LMUL #define VFSUB_VV(PRECISION, LMUL) VFSUB_VV_(PRECISION, LMUL) -#define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL -#define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL) -#define VFMUL_VF_TU_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL##_tu -#define VFMUL_VF_TU(PRECISION, LMUL) VFMUL_VF_TU_(PRECISION, LMUL) #define VFMUL_VV_(PRECISION, LMUL) __riscv_vfmul_vv_f##PRECISION##LMUL #define VFMUL_VV(PRECISION, LMUL) VFMUL_VV_(PRECISION, LMUL) +#define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL +#define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL) #define VFDIV_VV_(PRECISION, LMUL) __riscv_vfdiv_vv_f##PRECISION##LMUL #define VFDIV_VV(PRECISION, LMUL) VFDIV_VV_(PRECISION, LMUL) #define VFRDIV_VF_(PRECISION, LMUL) __riscv_vfrdiv_vf_f##PRECISION##LMUL #define VFRDIV_VF(PRECISION, LMUL) VFRDIV_VF_(PRECISION, LMUL) -#define VFMACC_VF_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL -#define VFMACC_VF(PRECISION, LMUL) VFMACC_VF_(PRECISION, LMUL) -#define VFMACC_VF_TU_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL##_tu -#define VFMACC_VF_TU(PRECISION, LMUL) VFMACC_VF_TU_(PRECISION, LMUL) #define VFMACC_VV_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL #define VFMACC_VV(PRECISION, LMUL) VFMACC_VV_(PRECISION, LMUL) -#define VFMACC_VV_TU_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL##_tu -#define VFMACC_VV_TU(PRECISION, LMUL) VFMACC_VV_TU_(PRECISION, LMUL) +#define VFMACC_VF_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL +#define VFMACC_VF(PRECISION, LMUL) VFMACC_VF_(PRECISION, LMUL) #define VFMSAC_VF_(PRECISION, LMUL) __riscv_vfmsac_vf_f##PRECISION##LMUL #define VFMSAC_VF(PRECISION, LMUL) VFMSAC_VF_(PRECISION, LMUL) -#define VFMSAC_VF_TU_(PRECISION, LMUL) __riscv_vfmsac_vf_f##PRECISION##LMUL##_tu -#define VFMSAC_VF_TU(PRECISION, LMUL) VFMSAC_VF_TU_(PRECISION, LMUL) -#define VFNMSAC_VF_(PRECISION, LMUL) __riscv_vfnmsac_vf_f##PRECISION##LMUL -#define VFNMSAC_VF(PRECISION, LMUL) VFNMSAC_VF_(PRECISION, LMUL) -#define VFNMSAC_VF_TU_(PRECISION, LMUL) __riscv_vfnmsac_vf_f##PRECISION##LMUL##_tu -#define VFNMSAC_VF_TU(PRECISION, LMUL) VFNMSAC_VF_TU_(PRECISION, LMUL) #define VFNMSAC_VV_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL #define VFNMSAC_VV(PRECISION, LMUL) VFNMSAC_VV_(PRECISION, LMUL) -#define VFNMSAC_VV_TU_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL##_tu -#define VFNMSAC_VV_TU(PRECISION, LMUL) VFNMSAC_VV_TU_(PRECISION, LMUL) +#define VFNMSAC_VF_(PRECISION, LMUL) __riscv_vfnmsac_vf_f##PRECISION##LMUL +#define VFNMSAC_VF(PRECISION, LMUL) VFNMSAC_VF_(PRECISION, LMUL) #define VFMADD_VF_(PRECISION, LMUL) __riscv_vfmadd_vf_f##PRECISION##LMUL #define VFMADD_VF(PRECISION, LMUL) VFMADD_VF_(PRECISION, LMUL) #define VFMSUB_VF_(PRECISION, LMUL) __riscv_vfmsub_vf_f##PRECISION##LMUL #define VFMSUB_VF(PRECISION, LMUL) VFMSUB_VF_(PRECISION, LMUL) -#define VFABS_V_(PRECISION, LMUL) __riscv_vfabs_v_f##PRECISION##LMUL -#define VFABS_V(PRECISION, LMUL) VFABS_V_(PRECISION, LMUL) +#define VFMAX_VV_(PRECISION, LMUL) __riscv_vfmax_vv_f##PRECISION##LMUL +#define VFMAX_VV(PRECISION, LMUL) VFMAX_VV_(PRECISION, 
LMUL) #define VFNEG_VF_(PRECISION, LMUL) __riscv_vfneg_v_f##PRECISION##LMUL #define VFNEG_VF(PRECISION, LMUL) VFNEG_VF_(PRECISION, LMUL) -#define VFNEG_VF_TU_(PRECISION, LMUL) __riscv_vfneg_v_f##PRECISION##LMUL##_tu -#define VFNEG_VF_TU(PRECISION, LMUL) VFNEG_VF_TU_(PRECISION, LMUL) -#define VFMV_V_V_(PRECISION, LMUL) VREINTERPRET_V_I_F(PRECISION, LMUL)( __riscv_vmv_v_v_i##PRECISION##LMUL( VREINTERPRET_V_F_I(PRECISION, LMUL) CURRY_1ARG -#define VFMV_V_V(PRECISION, LMUL) VFMV_V_V_(PRECISION, LMUL) -#define VFMV_V_F_(PRECISION, LMUL) __riscv_vfmv_v_f_f##PRECISION##LMUL -#define VFMV_V_F(PRECISION, LMUL) VFMV_V_F_(PRECISION, LMUL) +#define VFABS_V_(PRECISION, LMUL) __riscv_vfabs_v_f##PRECISION##LMUL +#define VFABS_V(PRECISION, LMUL) VFABS_V_(PRECISION, LMUL) #define VMFEQ_VV_(PRECISION, LMUL, RATIO) __riscv_vmfeq_vv_f##PRECISION##LMUL##_b##RATIO #define VMFEQ_VV(PRECISION, LMUL, RATIO) VMFEQ_VV_(PRECISION, LMUL, RATIO) #define VMFNE_VV_(PRECISION, LMUL, RATIO) __riscv_vmfne_vv_f##PRECISION##LMUL##_b##RATIO @@ -149,36 +131,53 @@ #define VMFGT_VV(PRECISION, LMUL, RATIO) VMFGT_VV_(PRECISION, LMUL, RATIO) #define VMFGE_VV_(PRECISION, LMUL, RATIO) __riscv_vmfge_vv_f##PRECISION##LMUL##_b##RATIO #define VMFGE_VV(PRECISION, LMUL, RATIO) VMFGE_VV_(PRECISION, LMUL, RATIO) -#define VFMAX_VV_(PRECISION, LMUL) __riscv_vfmax_vv_f##PRECISION##LMUL -#define VFMAX_VV(PRECISION, LMUL) VFMAX_VV_(PRECISION, LMUL) -#define VFMAX_VV_TU_(PRECISION, LMUL) __riscv_vfmax_vv_f##PRECISION##LMUL##_tu -#define VFMAX_VV_TU(PRECISION, LMUL) VFMAX_VV_TU_(PRECISION, LMUL) #define VMERGE_VVM_F_(PRECISION, LMUL) __riscv_vmerge_vvm_f##PRECISION##LMUL #define VMERGE_VVM_F(PRECISION, LMUL) VMERGE_VVM_F_(PRECISION, LMUL) +#define VFMV_V_V_(PRECISION, LMUL) VREINTERPRET_V_I_F(PRECISION, LMUL)( __riscv_vmv_v_v_i##PRECISION##LMUL( VREINTERPRET_V_F_I(PRECISION, LMUL) CURRY_1ARG +#define VFMV_V_V(PRECISION, LMUL) VFMV_V_V_(PRECISION, LMUL) +#define VFMV_V_F_(PRECISION, LMUL) __riscv_vfmv_v_f_f##PRECISION##LMUL +#define VFMV_V_F(PRECISION, LMUL) VFMV_V_F_(PRECISION, LMUL) -// Mask Operations -#define VFIRST_M_(RATIO) __riscv_vfirst_m_b##RATIO -#define VFIRST_M(RATIO) VFIRST_M_(RATIO) -#define VID_V_(PRECISION, LMUL) __riscv_vid_v_u##PRECISION##LMUL -#define VID_V(PRECISION, LMUL) VID_V_(PRECISION, LMUL) +#define VFMUL_VF_TU_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL##_tu +#define VFMUL_VF_TU(PRECISION, LMUL) VFMUL_VF_TU_(PRECISION, LMUL) +#define VFMACC_VV_TU_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL##_tu +#define VFMACC_VV_TU(PRECISION, LMUL) VFMACC_VV_TU_(PRECISION, LMUL) +#define VFMACC_VF_TU_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL##_tu +#define VFMACC_VF_TU(PRECISION, LMUL) VFMACC_VF_TU_(PRECISION, LMUL) +#define VFMSAC_VF_TU_(PRECISION, LMUL) __riscv_vfmsac_vf_f##PRECISION##LMUL##_tu +#define VFMSAC_VF_TU(PRECISION, LMUL) VFMSAC_VF_TU_(PRECISION, LMUL) +#define VFNMSAC_VV_TU_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL##_tu +#define VFNMSAC_VV_TU(PRECISION, LMUL) VFNMSAC_VV_TU_(PRECISION, LMUL) +#define VFNMSAC_VF_TU_(PRECISION, LMUL) __riscv_vfnmsac_vf_f##PRECISION##LMUL##_tu +#define VFNMSAC_VF_TU(PRECISION, LMUL) VFNMSAC_VF_TU_(PRECISION, LMUL) +#define VFMAX_VV_TU_(PRECISION, LMUL) __riscv_vfmax_vv_f##PRECISION##LMUL##_tu +#define VFMAX_VV_TU(PRECISION, LMUL) VFMAX_VV_TU_(PRECISION, LMUL) +#define VFNEG_VF_TU_(PRECISION, LMUL) __riscv_vfneg_v_f##PRECISION##LMUL##_tu +#define VFNEG_VF_TU(PRECISION, LMUL) VFNEG_VF_TU_(PRECISION, LMUL) // 14. 
Vector Reduction Operations +#define VREDMINU_VS_M_(PRECISION, LMUL) __riscv_vredminu_vs_u##PRECISION##LMUL##_u##PRECISION##m1_m +#define VREDMINU_VS_M(PRECISION, LMUL) VREDMINU_VS_M_(PRECISION, LMUL) #define VF_REDUSUM_VS_(PRECISION, LMUL) __riscv_vfredusum_vs_f##PRECISION##LMUL##_f##PRECISION##m1 #define VF_REDUSUM_VS(PRECISION, LMUL) VF_REDUSUM_VS_(PRECISION, LMUL) #define VFREDMAX_VS_(PRECISION, LMUL) __riscv_vfredmax_vs_f##PRECISION##LMUL##_f##PRECISION##m1 #define VFREDMAX_VS(PRECISION, LMUL) VFREDMAX_VS_(PRECISION, LMUL) -#define VREDMINU_VS_M_(PRECISION, LMUL) __riscv_vredminu_vs_u##PRECISION##LMUL##_u##PRECISION##m1_m -#define VREDMINU_VS_M(PRECISION, LMUL) VREDMINU_VS_M_(PRECISION, LMUL) + +// 15. Vector Mask Operations +#define VFIRST_M_(RATIO) __riscv_vfirst_m_b##RATIO +#define VFIRST_M(RATIO) VFIRST_M_(RATIO) +#define VID_V_(PRECISION, LMUL) __riscv_vid_v_u##PRECISION##LMUL +#define VID_V(PRECISION, LMUL) VID_V_(PRECISION, LMUL) // 16. Vector Permutation Operations -#define VMV_S_X_U_(PRECISION, LMUL) __riscv_vmv_s_x_u##PRECISION##LMUL -#define VMV_S_X_U(PRECISION, LMUL) VMV_S_X_U_(PRECISION, LMUL) -#define VFMV_S_F_(PRECISION, LMUL) __riscv_vfmv_s_f_f##PRECISION##LMUL -#define VFMV_S_F(PRECISION, LMUL) VFMV_S_F_(PRECISION, LMUL) #define VMV_X_S_U_(PRECISION) __riscv_vmv_x_s_u##PRECISION##m1_u##PRECISION #define VMV_X_S_U(PRECISION) VMV_X_S_U_(PRECISION) +#define VMV_S_X_U_(PRECISION, LMUL) __riscv_vmv_s_x_u##PRECISION##LMUL +#define VMV_S_X_U(PRECISION, LMUL) VMV_S_X_U_(PRECISION, LMUL) #define VFMV_F_S_(PRECISION) __riscv_vfmv_f_s_f##PRECISION##m1_f##PRECISION #define VFMV_F_S(PRECISION) VFMV_F_S_(PRECISION) +#define VFMV_S_F_(PRECISION, LMUL) __riscv_vfmv_s_f_f##PRECISION##LMUL +#define VFMV_S_F(PRECISION, LMUL) VFMV_S_F_(PRECISION, LMUL) #define VRGATHER_VX_F_(PRECISION, LMUL) __riscv_vrgather_vx_f##PRECISION##LMUL #define VRGATHER_VX_F(PRECISION, LMUL) VRGATHER_VX_F_(PRECISION, LMUL) @@ -187,12 +186,14 @@ #define VREINTERPRET_V_I_F(PRECISION, LMUL) VREINTERPRET_V_I_F_(PRECISION, LMUL) #define VREINTERPRET_V_F_I_(PRECISION, LMUL) __riscv_vreinterpret_v_f##PRECISION##LMUL##_i##PRECISION##LMUL #define VREINTERPRET_V_F_I(PRECISION, LMUL) VREINTERPRET_V_F_I_(PRECISION, LMUL) -#define VGET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vget_v_f##PRECISION##LMUL##x##NFIELDS##_f##PRECISION##LMUL -#define VGET_V_F(PRECISION, LMUL, NFIELDS) VGET_V_F_(PRECISION, LMUL, NFIELDS) -#define VSET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vset_v_f##PRECISION##LMUL##_f##PRECISION##LMUL##x##NFIELDS -#define VSET_V_F(PRECISION, LMUL, NFIELDS) VSET_V_F_(PRECISION, LMUL, NFIELDS) #define VLMUL_EXT_V_F_M1_(PRECISION, LMUL) __riscv_vlmul_ext_v_f##PRECISION##m1##_f##PRECISION##LMUL #define VLMUL_EXT_V_F_M1(PRECISION, LMUL) VLMUL_EXT_V_F_M1_(PRECISION, LMUL) +#define VUNDEFINED_FX_(PRECISION, LMUL, NFIELDS) __riscv_vundefined_f##PRECISION##LMUL##x##NFIELDS +#define VUNDEFINED_FX(PRECISION, LMUL, NFIELDS) VUNDEFINED_FX_(PRECISION, LMUL, NFIELDS) +#define VSET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vset_v_f##PRECISION##LMUL##_f##PRECISION##LMUL##x##NFIELDS +#define VSET_V_F(PRECISION, LMUL, NFIELDS) VSET_V_F_(PRECISION, LMUL, NFIELDS) +#define VGET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vget_v_f##PRECISION##LMUL##x##NFIELDS##_f##PRECISION##LMUL +#define VGET_V_F(PRECISION, LMUL, NFIELDS) VGET_V_F_(PRECISION, LMUL, NFIELDS) // Non-vector functions #define CURRY_1ARG(arg1, ...) 
(arg1), __VA_ARGS__)) From f16ad581839112fc66ba5ae3841157e2bf1256e8 Mon Sep 17 00:00:00 2001 From: Michael Yeh Date: Thu, 1 Aug 2024 14:44:28 -0700 Subject: [PATCH 3/8] Update toolchain --- travis/do_riscv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis/do_riscv.sh b/travis/do_riscv.sh index 56c2b85c26..560e3385be 100755 --- a/travis/do_riscv.sh +++ b/travis/do_riscv.sh @@ -3,7 +3,7 @@ set -e set -x -TAG=2023.10.18 +TAG=2024.04.12 # The prebuilt toolchains only support hardfloat, so we only # test these for now. From 4e5e5215d47c97e666f257882f94b35654cfa2f8 Mon Sep 17 00:00:00 2001 From: Michael Yeh Date: Mon, 5 Aug 2024 12:39:55 -0700 Subject: [PATCH 4/8] Update toolchain --- travis/do_riscv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis/do_riscv.sh b/travis/do_riscv.sh index 560e3385be..82b6afee62 100755 --- a/travis/do_riscv.sh +++ b/travis/do_riscv.sh @@ -3,7 +3,7 @@ set -e set -x -TAG=2024.04.12 +TAG=2024.08.03 # The prebuilt toolchains only support hardfloat, so we only # test these for now. From b4398151398ab2a925b1f5d2eaa08edb4d3e420e Mon Sep 17 00:00:00 2001 From: Michael Yeh <111819036+myeh01@users.noreply.github.com> Date: Mon, 5 Aug 2024 13:24:54 -0700 Subject: [PATCH 5/8] Fix scal2v hack --- .../bli_scal2v_sifive_x280_intr_complex.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c index 4a25ce3e32..2e946a2a4c 100644 --- a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c +++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c @@ -77,13 +77,9 @@ SCAL2V(PRECISION_CHAR, void) yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl); } - // FIXME: remove the #pragmas and change the __riscv_vset_v_f intrinsics to use - // __riscv_vcreate_v_f once they become available in LLVM. 
- #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wuninitialized" + yvec = VUNDEFINED_FX(PREC, LMUL, 2)(); yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real); yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag); - #pragma GCC diagnostic pop if (incy == 1) VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl); From d167216d2fbd192866428327a06e2450da6caac6 Mon Sep 17 00:00:00 2001 From: Michael Yeh Date: Tue, 10 Sep 2024 01:43:04 -0700 Subject: [PATCH 6/8] Use intrinsics for level 3 kernels --- .../sifive_x280/bli_cntx_init_sifive_x280.c | 26 +- .../sifive_x280/3/bli_gemm_sifive_x280_asm.c | 2406 ----------------- .../bli_gemm_sifive_x280_intr.c | 138 + .../bli_gemm_sifive_x280_intr_complex.c | 517 ++++ .../bli_gemm_sifive_x280_intr_real.c | 339 +++ .../bli_gemmtrsm_l_sifive_x280_asm_complex.c | 327 --- .../bli_gemmtrsm_l_sifive_x280_asm_real.c | 253 -- .../bli_gemmtrsm_u_sifive_x280_asm_complex.c | 331 --- .../bli_gemmtrsm_u_sifive_x280_asm_real.c | 260 -- .../bli_gemmtrsm_sifive_x280_intr.c} | 97 +- .../bli_gemmtrsm_sifive_x280_intr_complex.c | 437 +++ .../bli_gemmtrsm_sifive_x280_intr_real.c | 364 +++ kernels/sifive_x280/bli_kernels_sifive_x280.h | 26 +- kernels/sifive_x280/riscv_cmul_macros_intr.h | 18 + .../sifive_x280/riscv_overloaded_intrinsics.h | 2 + 15 files changed, 1877 insertions(+), 3664 deletions(-) delete mode 100644 kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c delete mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c delete mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c delete mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c rename kernels/sifive_x280/3/{bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c => bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c} (75%) create mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c diff --git a/config/sifive_x280/bli_cntx_init_sifive_x280.c b/config/sifive_x280/bli_cntx_init_sifive_x280.c index 0f5a39d104..668891cf3f 100644 --- a/config/sifive_x280/bli_cntx_init_sifive_x280.c +++ b/config/sifive_x280/bli_cntx_init_sifive_x280.c @@ -157,19 +157,19 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx ) BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_intr, // Level 3 - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sifive_x280_asm_7m4, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sifive_x280_asm_7m4, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sifive_x280_asm_6m2, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sifive_x280_asm_6m2, - - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_sifive_x280_asm, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_sifive_x280_asm, - BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_x280_asm, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_x280_asm, - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_sifive_x280_asm, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, 
bli_dgemmtrsm_u_sifive_x280_asm, - BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_x280_asm, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_x280_asm, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sifive_x280_intr, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sifive_x280_intr, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sifive_x280_intr, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sifive_x280_intr, + + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_sifive_x280_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_sifive_x280_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_x280_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_x280_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_sifive_x280_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_sifive_x280_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_x280_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_x280_intr, BLIS_VA_END ); diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c deleted file mode 100644 index f4a5a26caf..0000000000 --- a/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c +++ /dev/null @@ -1,2406 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -// clang-format off -#include "blis.h" -#include "../riscv_cmul_macros_asm.h" -#include "../bli_kernels_sifive_x280.h" -#include -#include -#include -#include - -// byte-size of the floating point type -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " -#define PACKMR 8 -#define PACKNR 64 - -void bli_sgemm_7m4 - ( - dim_t N, - dim_t K, - const float* restrict alpha, - const float* restrict a, - const float* restrict b, - const float* restrict beta, - float* restrict c, inc_t rsc, inc_t csc - ) -{ - // 7 x N x K sgemm, 0 < N <= 64 = vlmax, K > 0 - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - bool first = true; - // compute a*b - for (dim_t k = 0; k < K; ++k) { - __asm__(VLE "v28, (%0)" : : "r"(b)); - if (first) { - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmul.vf v0, v28, ft0"); - - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmul.vf v4, v28, ft1"); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmul.vf v8, v28, ft2"); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmul.vf v12, v28, ft3"); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmul.vf v16, v28, ft4"); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmul.vf v20, v28, ft5"); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__("vfmul.vf v24, v28, ft6"); - - first = false; - } - else { - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmacc.vf v0, ft0, v28"); - - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmacc.vf v4, ft1, v28"); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmacc.vf v8, ft2, v28"); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmacc.vf v12, ft3, v28"); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmacc.vf v16, ft4, v28"); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmacc.vf v20, ft5, v28"); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__("vfmacc.vf v24, ft6, v28"); - } - - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); - } - - rsc *= FLT_SIZE; - csc *= FLT_SIZE; - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - - // compute alpha*a*b + beta*c - if (*beta == 0.f) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("vfmul.vf v24, v24, ft10"); - } - else { // beta != 0.f - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - float *c_tmp = c; - if (csc == FLT_SIZE) { // c unit column stride - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v0, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - 
- __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v24, v24, ft10"); - __asm__("vfmacc.vf v24, ft11, v28"); - } // end c unit column stride - else { // c non-unit column stride - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v0, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v24, v24, ft10"); - __asm__("vfmacc.vf v24, ft11, v28"); - } // end c non-unit column stride - } // end beta != 0.f - - // store c - if (csc == FLT_SIZE) { - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v4, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v8, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v12, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v16, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v20, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v24, (%0)" : : "r"(c)); - } - else { - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc)); - } - - return; -} - -void bli_sgemm_7m4_cleanup - ( - dim_t M, - dim_t N, - dim_t K, - const float* restrict alpha, - const float* restrict a, - const float* restrict b, - const float* restrict beta, - 
float* restrict c, inc_t rsc, inc_t csc - ) -{ - // M x N x K sgemm, 0 < M < 6, 0 < N <= 64 = vlmax, K > 0 - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - bool first = true; - // compute a*b - for (dim_t k = 0; k < K; ++k) { - __asm__(VLE "v28, (%0)" : : "r"(b)); - if (first) { - switch (M) { - case 6: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmul.vf v20, v28, ft5"); - case 5: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmul.vf v16, v28, ft4"); - case 4: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmul.vf v12, v28, ft3"); - case 3: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmul.vf v8, v28, ft2"); - case 2: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmul.vf v4, v28, ft1"); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmul.vf v0, v28, ft0"); - } - first = false; - } - else { - switch (M) { - case 6: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmacc.vf v20, ft5, v28"); - case 5: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmacc.vf v16, ft4, v28"); - case 4: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmacc.vf v12, ft3, v28"); - case 3: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmacc.vf v8, ft2, v28"); - case 2: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmacc.vf v4, ft1, v28"); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmacc.vf v0, ft0, v28"); - } - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); - } - - c += (M - 1) * rsc; - rsc *= FLT_SIZE; - csc *= FLT_SIZE; - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - - // compute alpha*a*b + beta*c - if (*beta == 0.f) { - switch (M) { - case 6: - __asm__("vfmul.vf v20, v20, ft10"); - case 5: - __asm__("vfmul.vf v16, v16, ft10"); - case 4: - __asm__("vfmul.vf v12, v12, ft10"); - case 3: - __asm__("vfmul.vf v8, v8, ft10"); - case 2: - __asm__("vfmul.vf v4, v4, ft10"); - case 1: - __asm__("vfmul.vf v0, v0, ft10"); - } - } - else { // beta != 0.f - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - float *c_tmp = c; - if (csc == FLT_SIZE) { - switch (M) { - case 6: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - case 5: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - case 4: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - case 3: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - case 2: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - case 1: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("vfmacc.vf v0, ft11, v28"); - } - } // end c unit column stride - else { 
// c non-unit column stride - switch (M) { - case 6: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - case 5: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - case 4: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - case 3: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - case 2: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - case 1: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("vfmacc.vf v0, ft11, v28"); - } - } // end c non-unit column stride - } // end beta != 0.f - - // store c - if (csc == FLT_SIZE) { - switch (M) { - case 6: - __asm__(VSE "v20, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSE "v16, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSE "v12, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSE "v8, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSE "v4, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSE "v0, (%0)" : : "r"(c)); - } - } - else { - switch (M) { - case 6: - __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } - return; -} - -void bli_sgemm_7m4_k0 - ( - dim_t M, - dim_t N, - const float* restrict beta, - float* restrict c, inc_t rsc, inc_t csc - ) -{ - // 0 < M <= 7, 0 < N < 64 = vlmax, K = 0 - // This may not produce the same result as the reference kernel if alpha is infinite or NaN. 
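The caveat above about an infinite or NaN alpha exists because the K == 0 path computes only beta*C and never multiplies by alpha, whereas the reference kernel forms alpha*(A*B) with an all-zero product, so inf*0 or NaN*0 would propagate NaNs into C. A small standalone C illustration of the difference (values chosen arbitrarily):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float alpha = INFINITY, beta = 2.0f, c = 3.0f;
        float ab   = 0.0f;                    // K == 0: a*b is an empty sum
        float ref  = alpha * ab + beta * c;   // reference: inf * 0 -> NaN
        float fast = beta * c;                // K == 0 shortcut: alpha skipped
        printf("ref = %f, kernel = %f\n", ref, fast);
        return 0;
    }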
- __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - c += (M - 1) * rsc; - rsc *= FLT_SIZE; - csc *= FLT_SIZE; - if (*beta == 0.f) { - // set c to 0 - __asm__("vmv.v.i v0, 0"); - if (csc == FLT_SIZE) { // c unit column stride - switch (M) { - case 7: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSE "v0, (%0)" : : "r"(c)); - } - } // end c unit column stride - else { // c non-unit column stride - switch (M) { - case 7: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } // end c non-unit column stride - } // end beta == 0.f - else { // beta != 0.f - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta)); - if (csc == FLT_SIZE) { // c unit column stride - switch (M) { - case 7: - __asm__(VLE "v24, (%0)" : : "r"(c)); - __asm__("vfmul.vf v24, v24, ft0"); - __asm__(VSE "v24, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VLE "v20, (%0)" : : "r"(c)); - __asm__("vfmul.vf v20, v20, ft0"); - __asm__(VSE "v20, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VLE "v16, (%0)" : : "r"(c)); - __asm__("vfmul.vf v16, v16, ft0"); - __asm__(VSE "v16, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VLE "v12, (%0)" : : "r"(c)); - __asm__("vfmul.vf v12, v12, ft0"); - __asm__(VSE "v12, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VLE "v8, (%0)" : : "r"(c)); - __asm__("vfmul.vf v8, v8, ft0"); - __asm__(VSE "v8, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VLE "v4, (%0)" : : "r"(c)); - __asm__("vfmul.vf v4, v4, ft0"); - __asm__(VSE "v4, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VLE "v0, (%0)" : : "r"(c)); - __asm__("vfmul.vf v0, v0, ft0"); - __asm__(VSE "v0, (%0)" : : "r"(c)); - - } - } // end c unit column stride - else { // c non-unit column stride - switch (M) { - case 7: - __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v24, v24, ft0"); - __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v20, v20, ft0"); - __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - 
__asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v16, v16, ft0"); - __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v12, v12, ft0"); - __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v8, v8, ft0"); - __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v4, v4, ft0"); - __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v0, v0, ft0"); - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } // end c non-unit column stride - } // end beta != 0.f - return; -} - -void bli_sgemm_sifive_x280_asm_7m4 - ( - dim_t M, - dim_t N, - dim_t K, - const void* restrict alpha_, - const void* restrict a_, - const void* restrict b_, - const void* restrict beta_, - void* restrict c_, inc_t rsc, inc_t csc, - const auxinfo_t* restrict data, - const cntx_t* restrict cntx - ) -{ - (void) data; - (void) cntx; - const float* restrict alpha = alpha_; - const float* restrict beta = beta_; - const float* restrict a = a_; - const float* restrict b = b_; - float* restrict c = c_; - - // M x N x K sgemm - if (M <= 0 || N <= 0 || K < 0) - return; - else if (K == 0) - bli_sgemm_7m4_k0(M, N, beta, c, rsc, csc); - else if (M == 7) - bli_sgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc); - else - bli_sgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE -#undef PACKMR -#undef PACKNR - -// byte-size of the floating point type -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " -#define PACKMR 8 -#define PACKNR 32 - -void bli_dgemm_7m4 - ( - dim_t N, - dim_t K, - const double* restrict alpha, - const double* restrict a, - const double* restrict b, - const double* restrict beta, - double* restrict c, inc_t rsc, inc_t csc - ) -{ - // 7 x N x K dgemm, 0 < N <= 64 = vlmax, K > 0 - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - bool first = true; - // compute a*b - for (dim_t k = 0; k < K; ++k) { - __asm__(VLE "v28, (%0)" : : "r"(b)); - if (first) { - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmul.vf v0, v28, ft0"); - - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmul.vf v4, v28, ft1"); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmul.vf v8, v28, ft2"); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmul.vf v12, v28, ft3"); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmul.vf v16, v28, ft4"); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmul.vf v20, v28, ft5"); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__("vfmul.vf v24, v28, ft6"); - - first = false; - } - else { - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmacc.vf v0, ft0, v28"); - - 
__asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmacc.vf v4, ft1, v28"); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmacc.vf v8, ft2, v28"); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmacc.vf v12, ft3, v28"); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmacc.vf v16, ft4, v28"); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmacc.vf v20, ft5, v28"); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__("vfmacc.vf v24, ft6, v28"); - } - - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); - } - - rsc *= FLT_SIZE; - csc *= FLT_SIZE; - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - - // compute alpha*a*b + beta*c - if (*beta == 0.) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("vfmul.vf v24, v24, ft10"); - } - else { // beta != 0. - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - double *c_tmp = c; - if (csc == FLT_SIZE) { // c unit column stride - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v0, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v24, v24, ft10"); - __asm__("vfmacc.vf v24, ft11, v28"); - } // end c unit column stride - else { // c non-unit column stride - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v0, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); 
- __asm__("vfmul.vf v20, v20, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v24, v24, ft10"); - __asm__("vfmacc.vf v24, ft11, v28"); - } // end c non-unit column stride - } // end beta != 0. - - // store c - if (csc == FLT_SIZE) { - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v4, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v8, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v12, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v16, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v20, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v24, (%0)" : : "r"(c)); - } - else { - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc)); - } - - return; -} - -void bli_dgemm_7m4_cleanup - ( - dim_t M, - dim_t N, - dim_t K, - const double* restrict alpha, - const double* restrict a, - const double* restrict b, - const double* restrict beta, - double* restrict c, inc_t rsc, inc_t csc - ) -{ - // M x N x K dgemm, 0 < M < 6, 0 < N <= 64 = vlmax, K > 0 - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - bool first = true; - // compute a*b - for (dim_t k = 0; k < K; ++k) { - __asm__(VLE "v28, (%0)" : : "r"(b)); - if (first) { - switch (M) { - case 6: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmul.vf v20, v28, ft5"); - case 5: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmul.vf v16, v28, ft4"); - case 4: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmul.vf v12, v28, ft3"); - case 3: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmul.vf v8, v28, ft2"); - case 2: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmul.vf v4, v28, ft1"); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmul.vf v0, v28, ft0"); - } - first = false; - } - else { - switch (M) { - case 6: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmacc.vf v20, ft5, v28"); - case 5: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmacc.vf v16, ft4, v28"); - case 4: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmacc.vf v12, ft3, v28"); - case 3: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmacc.vf v8, ft2, v28"); - case 2: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmacc.vf v4, ft1, v28"); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmacc.vf v0, ft0, v28"); - } 
- } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); - } - - c += (M - 1) * rsc; - rsc *= FLT_SIZE; - csc *= FLT_SIZE; - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - - // compute alpha*a*b + beta*c - if (*beta == 0.) { - switch (M) { - case 6: - __asm__("vfmul.vf v20, v20, ft10"); - case 5: - __asm__("vfmul.vf v16, v16, ft10"); - case 4: - __asm__("vfmul.vf v12, v12, ft10"); - case 3: - __asm__("vfmul.vf v8, v8, ft10"); - case 2: - __asm__("vfmul.vf v4, v4, ft10"); - case 1: - __asm__("vfmul.vf v0, v0, ft10"); - } - } - else { // beta != 0. - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - double *c_tmp = c; - if (csc == FLT_SIZE) { - switch (M) { - case 6: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - case 5: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - case 4: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - case 3: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - case 2: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - case 1: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("vfmacc.vf v0, ft11, v28"); - } - } // end c unit column stride - else { // c non-unit column stride - switch (M) { - case 6: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - case 5: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - case 4: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - case 3: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - case 2: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - case 1: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("vfmacc.vf v0, ft11, v28"); - } - } // end c non-unit column stride - } // end beta != 0. 
- - // store c - if (csc == FLT_SIZE) { - switch (M) { - case 6: - __asm__(VSE "v20, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSE "v16, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSE "v12, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSE "v8, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSE "v4, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSE "v0, (%0)" : : "r"(c)); - } - } - else { - switch (M) { - case 6: - __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } - return; -} - -void bli_dgemm_7m4_k0 - ( - dim_t M, - dim_t N, - const double* restrict beta, - double* restrict c, inc_t rsc, inc_t csc - ) -{ - // 0 < M <= 7, 0 < N < 64 = vlmax, K = 0 - // This may not produce the same result as the reference kernel if alpha is infinite or NaN. - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - c += (M - 1) * rsc; - rsc *= FLT_SIZE; - csc *= FLT_SIZE; - if (*beta == 0.) { - // set c to 0 - __asm__("vmv.v.i v0, 0"); - if (csc == FLT_SIZE) { // c unit column stride - switch (M) { - case 7: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSE "v0, (%0)" : : "r"(c)); - } - } // end c unit column stride - else { // c non-unit column stride - switch (M) { - case 7: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } // end c non-unit column stride - } // end beta == 0. - else { // beta != 0. 
- __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta)); - if (csc == FLT_SIZE) { // c unit column stride - switch (M) { - case 7: - __asm__(VLE "v24, (%0)" : : "r"(c)); - __asm__("vfmul.vf v24, v24, ft0"); - __asm__(VSE "v24, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VLE "v20, (%0)" : : "r"(c)); - __asm__("vfmul.vf v20, v20, ft0"); - __asm__(VSE "v20, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VLE "v16, (%0)" : : "r"(c)); - __asm__("vfmul.vf v16, v16, ft0"); - __asm__(VSE "v16, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VLE "v12, (%0)" : : "r"(c)); - __asm__("vfmul.vf v12, v12, ft0"); - __asm__(VSE "v12, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VLE "v8, (%0)" : : "r"(c)); - __asm__("vfmul.vf v8, v8, ft0"); - __asm__(VSE "v8, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VLE "v4, (%0)" : : "r"(c)); - __asm__("vfmul.vf v4, v4, ft0"); - __asm__(VSE "v4, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VLE "v0, (%0)" : : "r"(c)); - __asm__("vfmul.vf v0, v0, ft0"); - __asm__(VSE "v0, (%0)" : : "r"(c)); - - } - } // end c unit column stride - else { // c non-unit column stride - switch (M) { - case 7: - __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v24, v24, ft0"); - __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v20, v20, ft0"); - __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v16, v16, ft0"); - __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v12, v12, ft0"); - __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v8, v8, ft0"); - __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v4, v4, ft0"); - __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v0, v0, ft0"); - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } // end c non-unit column stride - } // end beta != 0. 
- return; -} - -void bli_dgemm_sifive_x280_asm_7m4 - ( - dim_t M, - dim_t N, - dim_t K, - const void* restrict alpha_, - const void* restrict a_, - const void* restrict b_, - const void* restrict beta_, - void* restrict c_, inc_t rsc, inc_t csc, - const auxinfo_t* restrict data, - const cntx_t* restrict cntx - ) -{ - (void) data; - (void) cntx; - const double* restrict alpha = alpha_; - const double* restrict beta = beta_; - const double* restrict a = a_; - const double* restrict b = b_; - double* restrict c = c_; - - // M x N x K dgemm - if (M <= 0 || N <= 0 || K < 0) - return; - else if (K == 0) - bli_dgemm_7m4_k0(M, N, beta, c, rsc, csc); - else if (M == 7) - bli_dgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc); - else - bli_dgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE -#undef PACKMR -#undef PACKNR - -// byte-size of underlying floating point type -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define VLSEG2 "vlseg2e32.v " -#define VLSSEG2 "vlsseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " -#define PACKMR 8 -#define PACKNR 32 - -void bli_cgemm_6m2 - ( - dim_t N, - dim_t K, - const scomplex* restrict alpha, - const scomplex* restrict a, - const scomplex* restrict b, - const scomplex* restrict beta, - scomplex* restrict c, inc_t rsc, inc_t csc - ) -{ - // 6 x N x K cgemm, N <= 32 = vlmax, K > 0 - // pairs of register groups hold the real and imag. parts of rows of c and b - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmul_vf(v0, v2, v24, v26, ft0, ft1); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmul_vf(v4, v6, v24, v26, ft2, ft3); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmul_vf(v8, v10, v24, v26, ft4, ft5); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmul_vf(v12, v14, v24, v26, ft6, ft7); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmul_vf(v16, v18, v24, v26, ft8, ft9); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); - vcmul_vf(v20, v22, v24, v26, ft10, ft11); - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - while (K > 0) { - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v28, v30); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v28, v30); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - 
__asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v28, v30); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v28, v30); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v28, v30); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); - vcmacc_vf(v20, v22, ft10, ft11, v28, v30); - K -= 1; - - if (K == 0) { break; } - - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v24, v26); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v24, v26); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v24, v26); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v24, v26); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v24, v26); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); - vcmacc_vf(v20, v22, ft10, ft11, v24, v26); - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - } - - rsc *= 2 * FLT_SIZE; - csc *= 2 * FLT_SIZE; - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); - - __asm__("vfmul.vf v24, v2, ft1"); - __asm__("vfmul.vf v26, v0, ft1"); - __asm__("vfmul.vf v28, v6, ft1"); - __asm__("vfmul.vf v30, v4, ft1"); - - __asm__("vfmsub.vf v0, ft0, v24"); - __asm__("vfmadd.vf v2, ft0, v26"); - __asm__("vfmsub.vf v4, ft0, v28"); - __asm__("vfmadd.vf v6, ft0, v30"); - - __asm__("vfmul.vf v24, v10, ft1"); - __asm__("vfmul.vf v26, v8, ft1"); - __asm__("vfmul.vf v28, v14, ft1"); - __asm__("vfmul.vf v30, v12, ft1"); - - __asm__("vfmsub.vf v8, ft0, v24"); - __asm__("vfmadd.vf v10, ft0, v26"); - __asm__("vfmsub.vf v12, ft0, v28"); - __asm__("vfmadd.vf v14, ft0, v30"); - - __asm__("vfmul.vf v24, v18, ft1"); - __asm__("vfmul.vf v26, v16, ft1"); - __asm__("vfmul.vf v28, v22, ft1"); - __asm__("vfmul.vf v30, v20, ft1"); - - __asm__("vfmsub.vf v16, ft0, v24"); - __asm__("vfmadd.vf v18, ft0, v26"); - __asm__("vfmsub.vf v20, ft0, v28"); - __asm__("vfmadd.vf v22, ft0, v30"); - - scomplex beta_cast = *beta; - if (beta_cast.real != 0.f || beta_cast.imag != 0.f) { - if (csc == 2 * FLT_SIZE) { - scomplex *c_tmp = c; - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : 
"+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - - vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30); - } - else { - scomplex *c_tmp = c; - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - - vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30); - } - } - - if (csc == 2 * FLT_SIZE) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v8, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v20, (%0)" : : "r"(c)); - } - else { - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc)); - } - - return; -} - -void bli_cgemm_6m2_cleanup - ( - dim_t M, - dim_t N, - dim_t K, - const scomplex* restrict alpha, - const scomplex* restrict a, - const scomplex* restrict b, - const scomplex* restrict beta, - scomplex* restrict c, inc_t rsc, inc_t csc - ) -{ - // M x N x K cgemm, 0 < M < 6, N <= 32 = vlmax, K > 0 - // pairs of register groups hold the real and imag. 
parts of rows of c and b - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - - switch (M) { - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmul_vf(v16, v18, v24, v26, ft8, ft9); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmul_vf(v12, v14, v24, v26, ft6, ft7); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmul_vf(v8, v10, v24, v26, ft4, ft5); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmul_vf(v4, v6, v24, v26, ft2, ft3); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmul_vf(v0, v2, v24, v26, ft0, ft1); - } - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - while (K > 0) { - switch (M) { - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v28, v30); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v28, v30); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v28, v30); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v28, v30); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v28, v30); - } - K -= 1; - - if (K == 0) { break; } - - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - switch (M) { - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v24, v26); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v24, v26); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * 
FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v24, v26); - } - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - } - - c += (M - 1) * rsc; - rsc *= 2 * FLT_SIZE; - csc *= 2 * FLT_SIZE; - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); - - switch (M) { - case 5: - __asm__("vfmul.vf v24, v18, ft1"); - __asm__("vfmul.vf v26, v16, ft1"); - __asm__("vfmsub.vf v16, ft0, v24"); - __asm__("vfmadd.vf v18, ft0, v26"); - case 4: - __asm__("vfmul.vf v28, v14, ft1"); - __asm__("vfmul.vf v30, v12, ft1"); - __asm__("vfmsub.vf v12, ft0, v28"); - __asm__("vfmadd.vf v14, ft0, v30"); - case 3: - __asm__("vfmul.vf v24, v10, ft1"); - __asm__("vfmul.vf v26, v8, ft1"); - __asm__("vfmsub.vf v8, ft0, v24"); - __asm__("vfmadd.vf v10, ft0, v26"); - case 2: - __asm__("vfmul.vf v28, v6, ft1"); - __asm__("vfmul.vf v30, v4, ft1"); - __asm__("vfmsub.vf v4, ft0, v28"); - __asm__("vfmadd.vf v6, ft0, v30"); - case 1: - __asm__("vfmul.vf v24, v2, ft1"); - __asm__("vfmul.vf v26, v0, ft1"); - __asm__("vfmsub.vf v0, ft0, v24"); - __asm__("vfmadd.vf v2, ft0, v26"); - } - - scomplex beta_cast = *beta; - if (beta_cast.real != 0.f || beta_cast.imag != 0.f) { - if (csc == 2 * FLT_SIZE) { - scomplex *c_tmp = c; - switch (M) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - case 4: - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - case 2: - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - } - } - else { - scomplex *c_tmp = c; - switch (M) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - case 4: - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - case 2: - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - } - } - } - - if (csc == 2 * FLT_SIZE) { - switch (M) { - case 5: - __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSEG2 "v8, (%0)" : : "r"(c)); - __asm__("sub %0, 
%0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); - } - } - else { - switch (M) { - case 5: - __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } - - return; -} - -void bli_cgemm_6m2_k0 - ( - dim_t M, - dim_t N, - const scomplex* restrict beta, - scomplex* restrict c, inc_t rsc, inc_t csc - ) -{ - // 0 < M <= 6, 0 < N <= 32 = vlmax, K = 0 - // This may not produce the same result as the reference kernel if alpha is infinite or NaN. - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - csc *= 2 * FLT_SIZE; - - scomplex beta_cast = *beta; - if (beta_cast.real == 0.f && beta_cast.imag == 0.f) { - // set c to 0 - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - for (size_t i = 0; i < M; ++i) { - if (csc == 2 * FLT_SIZE) - __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); - else - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - c += rsc; - } - } - else { - // scale c by beta - for (size_t i = 0; i < M; ++i) { - if (csc == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(c)); - vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); - __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); - } - else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); - __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); - } - c += rsc; - } - } - return; -} - -void bli_cgemm_sifive_x280_asm_6m2 - ( - dim_t M, - dim_t N, - dim_t K, - const void* restrict alpha_, - const void* restrict a_, - const void* restrict b_, - const void* restrict beta_, - void* restrict c_, inc_t rsc, inc_t csc, - const auxinfo_t* restrict data, - const cntx_t* restrict cntx - ) -{ - // M x N x K cgemm - (void) data; - (void) cntx; - const scomplex* restrict alpha = alpha_; - const scomplex* restrict beta = beta_; - const scomplex* restrict a = a_; - const scomplex* restrict b = b_; - scomplex* restrict c = c_; - - if (M <= 0 || N <= 0 || K < 0) - return; - else if (K == 0) - bli_cgemm_6m2_k0(M, N, beta, c, rsc, csc); - else if (M == 6) - bli_cgemm_6m2(N, K, alpha, a, b, beta, c, rsc, csc); - else - bli_cgemm_6m2_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 -#undef PACKMR -#undef PACKNR - -// byte-size of underlying floating point type -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " -#define PACKMR 8 -#define PACKNR 16 - -void bli_zgemm_6m2 - ( - dim_t N, - dim_t K, - const dcomplex* restrict alpha, - const dcomplex* restrict a, - const dcomplex* restrict b, - const dcomplex* restrict beta, - dcomplex* restrict c, inc_t rsc, inc_t csc - ) -{ - // 6 x N x K zgemm, N <= 32 = vlmax, K > 0 - // pairs of register groups hold the real and imag. 
parts of rows of c and b - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmul_vf(v0, v2, v24, v26, ft0, ft1); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmul_vf(v4, v6, v24, v26, ft2, ft3); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmul_vf(v8, v10, v24, v26, ft4, ft5); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmul_vf(v12, v14, v24, v26, ft6, ft7); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmul_vf(v16, v18, v24, v26, ft8, ft9); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); - vcmul_vf(v20, v22, v24, v26, ft10, ft11); - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - while (K > 0) { - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v28, v30); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v28, v30); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v28, v30); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v28, v30); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v28, v30); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); - vcmacc_vf(v20, v22, ft10, ft11, v28, v30); - K -= 1; - - if (K == 0) { break; } - - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v24, v26); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v24, v26); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v24, v26); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * 
FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v24, v26); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v24, v26); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); - vcmacc_vf(v20, v22, ft10, ft11, v24, v26); - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - } - - rsc *= 2 * FLT_SIZE; - csc *= 2 * FLT_SIZE; - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); - - __asm__("vfmul.vf v24, v2, ft1"); - __asm__("vfmul.vf v26, v0, ft1"); - __asm__("vfmul.vf v28, v6, ft1"); - __asm__("vfmul.vf v30, v4, ft1"); - - __asm__("vfmsub.vf v0, ft0, v24"); - __asm__("vfmadd.vf v2, ft0, v26"); - __asm__("vfmsub.vf v4, ft0, v28"); - __asm__("vfmadd.vf v6, ft0, v30"); - - __asm__("vfmul.vf v24, v10, ft1"); - __asm__("vfmul.vf v26, v8, ft1"); - __asm__("vfmul.vf v28, v14, ft1"); - __asm__("vfmul.vf v30, v12, ft1"); - - __asm__("vfmsub.vf v8, ft0, v24"); - __asm__("vfmadd.vf v10, ft0, v26"); - __asm__("vfmsub.vf v12, ft0, v28"); - __asm__("vfmadd.vf v14, ft0, v30"); - - __asm__("vfmul.vf v24, v18, ft1"); - __asm__("vfmul.vf v26, v16, ft1"); - __asm__("vfmul.vf v28, v22, ft1"); - __asm__("vfmul.vf v30, v20, ft1"); - - __asm__("vfmsub.vf v16, ft0, v24"); - __asm__("vfmadd.vf v18, ft0, v26"); - __asm__("vfmsub.vf v20, ft0, v28"); - __asm__("vfmadd.vf v22, ft0, v30"); - - dcomplex beta_cast = *beta; - if (beta_cast.real != 0. || beta_cast.imag != 0.) 
{ - if (csc == 2 * FLT_SIZE) { - dcomplex *c_tmp = c; - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - - vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30); - } - else { - dcomplex *c_tmp = c; - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - - vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30); - } - } - - if (csc == 2 * FLT_SIZE) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v8, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v20, (%0)" : : "r"(c)); - } - else { - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc)); - } - - return; -} - -void bli_zgemm_6m2_cleanup - ( - dim_t M, - dim_t N, - dim_t K, - const dcomplex* restrict alpha, - const dcomplex* restrict a, - const dcomplex* restrict b, - const dcomplex* restrict beta, - dcomplex* restrict c, inc_t rsc, inc_t csc - ) -{ - // M x N x K zgemm, 0 < M < 6, N <= 32 = vlmax, K > 0 - // pairs of 
register groups hold the real and imag. parts of rows of c and b - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - - switch (M) { - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmul_vf(v16, v18, v24, v26, ft8, ft9); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmul_vf(v12, v14, v24, v26, ft6, ft7); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmul_vf(v8, v10, v24, v26, ft4, ft5); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmul_vf(v4, v6, v24, v26, ft2, ft3); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmul_vf(v0, v2, v24, v26, ft0, ft1); - } - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - while (K > 0) { - switch (M) { - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v28, v30); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v28, v30); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v28, v30); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v28, v30); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v28, v30); - } - K -= 1; - - if (K == 0) { break; } - - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - switch (M) { - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v24, v26); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v24, v26); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD 
"ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v24, v26); - } - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - } - - c += (M - 1) * rsc; - rsc *= 2 * FLT_SIZE; - csc *= 2 * FLT_SIZE; - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); - - switch (M) { - case 5: - __asm__("vfmul.vf v24, v18, ft1"); - __asm__("vfmul.vf v26, v16, ft1"); - __asm__("vfmsub.vf v16, ft0, v24"); - __asm__("vfmadd.vf v18, ft0, v26"); - case 4: - __asm__("vfmul.vf v28, v14, ft1"); - __asm__("vfmul.vf v30, v12, ft1"); - __asm__("vfmsub.vf v12, ft0, v28"); - __asm__("vfmadd.vf v14, ft0, v30"); - case 3: - __asm__("vfmul.vf v24, v10, ft1"); - __asm__("vfmul.vf v26, v8, ft1"); - __asm__("vfmsub.vf v8, ft0, v24"); - __asm__("vfmadd.vf v10, ft0, v26"); - case 2: - __asm__("vfmul.vf v28, v6, ft1"); - __asm__("vfmul.vf v30, v4, ft1"); - __asm__("vfmsub.vf v4, ft0, v28"); - __asm__("vfmadd.vf v6, ft0, v30"); - case 1: - __asm__("vfmul.vf v24, v2, ft1"); - __asm__("vfmul.vf v26, v0, ft1"); - __asm__("vfmsub.vf v0, ft0, v24"); - __asm__("vfmadd.vf v2, ft0, v26"); - } - - dcomplex beta_cast = *beta; - if (beta_cast.real != 0. || beta_cast.imag != 0.) { - if (csc == 2 * FLT_SIZE) { - dcomplex *c_tmp = c; - switch (M) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - case 4: - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - case 2: - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - } - } - else { - dcomplex *c_tmp = c; - switch (M) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - case 4: - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - case 2: - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - } - } - } - - if (csc == 2 * FLT_SIZE) { - switch (M) { - case 5: - __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSEG2 "v8, (%0)" : 
: "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); - } - } - else { - switch (M) { - case 5: - __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } - - return; -} - -void bli_zgemm_6m2_k0 - ( - dim_t M, - dim_t N, - const dcomplex* restrict beta, - dcomplex* restrict c, inc_t rsc, inc_t csc - ) -{ - // 0 < M <= 6, 0 < N <= 32 = vlmax, K = 0 - // This may not produce the same result as the reference kernel if alpha is infinite or NaN. - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - csc *= 2 * FLT_SIZE; - - dcomplex beta_cast = *beta; - if (beta_cast.real == 0. && beta_cast.imag == 0.) { - // set c to 0 - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - for (size_t i = 0; i < M; ++i) { - if (csc == 2 * FLT_SIZE) - __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); - else - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - c += rsc; - } - } - else { - // scale c by beta - for (size_t i = 0; i < M; ++i) { - if (csc == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(c)); - vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); - __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); - } - else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); - __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); - } - c += rsc; - } - } - return; -} - -void bli_zgemm_sifive_x280_asm_6m2 - ( - dim_t M, - dim_t N, - dim_t K, - const void* restrict alpha_, - const void* restrict a_, - const void* restrict b_, - const void* restrict beta_, - void* restrict c_, inc_t rsc, inc_t csc, - const auxinfo_t* restrict data, - const cntx_t* restrict cntx - ) -{ - // M x N x K zgemm - (void) data; - (void) cntx; - const dcomplex* restrict alpha = alpha_; - const dcomplex* restrict beta = beta_; - const dcomplex* restrict a = a_; - const dcomplex* restrict b = b_; - dcomplex* restrict c = c_; - - if (M <= 0 || N <= 0 || K < 0) - return; - else if (K == 0) - bli_zgemm_6m2_k0(M, N, beta, c, rsc, csc); - else if (M == 6) - bli_zgemm_6m2(N, K, alpha, a, b, beta, c, rsc, csc); - else - bli_zgemm_6m2_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 -#undef PACKMR -#undef PACKNR diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c new file mode 100644 index 0000000000..664d4616f3 --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c @@ -0,0 +1,138 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "../../riscv_cmul_macros_intr.h" +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include <math.h> +#include <riscv_vector.h> + +#define GEMM_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemm_sifive_x280_intr(\ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const void* restrict alpha_, \ + const void* restrict a_, \ + const void* restrict b_, \ + const void* restrict beta_, \ + void* restrict c_, inc_t rsc, inc_t csc, \ + const auxinfo_t* restrict data, \ + const cntx_t* restrict cntx \ +) + +#define GEMM(...)
GEMM_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) +#define PACKMR 8 +#define PACKNR 64 + +#include "./bli_gemm_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE +#undef PACKMR +#undef PACKNR + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) +#define PACKMR 8 +#define PACKNR 32 + +#include "./bli_gemm_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE +#undef PACKMR +#undef PACKNR + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m2 +#define FLT_SIZE sizeof(float) +#define PACKMR 8 +#define PACKNR 32 + +#include "./bli_gemm_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE +#undef PACKMR +#undef PACKNR + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m2 +#define FLT_SIZE sizeof(double) +#define PACKMR 8 +#define PACKNR 16 + +#include "./bli_gemm_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE +#undef PACKMR +#undef PACKNR + +#undef GEMM +#undef GEMM_ diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..15a19ab49d --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c @@ -0,0 +1,517 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef GEMM + +GEMM(PRECISION_CHAR, void) +{ + (void) data; // Suppress unused parameter warnings + (void) cntx; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict b = b_; + const DATATYPE* restrict beta = beta_; + DATATYPE* restrict c = c_; + + if (m <= 0 || n <= 0 || k < 0) + return; + else if (k == 0) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + RVV_TYPE_FX(PREC, LMUL, 2) zero_splat = VUNDEFINED_FX(PREC, LMUL, 2)(); + zero_splat = VSET_V_F(PREC, LMUL, 2)(zero_splat, 0, VFMV_V_F(PREC, LMUL)(0., n)); + zero_splat = VSET_V_F(PREC, LMUL, 2)(zero_splat, 1, VFMV_V_F(PREC, LMUL)(0., n)); + + for (dim_t i = 0; i < m; ++i) { + if (csc == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), zero_splat, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), 2 * FLT_SIZE * csc, zero_splat, n); + } + } + else { + for (dim_t i = 0; i < m; ++i) { + RVV_TYPE_FX(PREC, LMUL, 2) c0; + RVV_TYPE_F(PREC, LMUL) c0_r, c0_i; + RVV_TYPE_F(PREC, LMUL) beta_c0_r, beta_c0_i; + + if (csc == 1) + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), n); + else + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMUL_VF(PREC, LMUL, beta_c0_r, beta_c0_i, c0_r, c0_i, beta->real, beta->imag, n); + c0 = VSET_V_F(PREC, LMUL, 2)(c0, 0, beta_c0_r); + c0 = VSET_V_F(PREC, LMUL, 2)(c0, 1, beta_c0_i); + if (csc == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), c0, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), 2 * FLT_SIZE * csc, c0, n); + } + } + } + else if (m == 6) { + RVV_TYPE_F(PREC, LMUL) ab0_r, ab1_r, ab2_r, ab3_r, ab4_r, ab5_r; + RVV_TYPE_F(PREC, LMUL) ab0_i, ab1_i, ab2_i, ab3_i, ab4_i, ab5_i; + RVV_TYPE_FX(PREC, LMUL, 2) b0, b1; + RVV_TYPE_F(PREC, LMUL) b0_r, b1_r; + RVV_TYPE_F(PREC, LMUL) b0_i, b1_i; + + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += PACKNR; + if (k >= 2) { + b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0); + b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1); + b += PACKNR; + } + + VCMUL_VF(PREC, LMUL, ab0_r, ab0_i, b0_r, b0_i, a[0].real, a[0].imag, n); + VCMUL_VF(PREC, LMUL, ab1_r, ab1_i, b0_r, b0_i, a[1].real, a[1].imag, n); + VCMUL_VF(PREC, LMUL, ab2_r, ab2_i, b0_r, b0_i, a[2].real, a[2].imag, n); + VCMUL_VF(PREC, LMUL, ab3_r, ab3_i, b0_r, b0_i, a[3].real, a[3].imag, n); + VCMUL_VF(PREC, LMUL, ab4_r, ab4_i, b0_r, b0_i, a[4].real, a[4].imag, n); + VCMUL_VF(PREC, LMUL, ab5_r, ab5_i, b0_r, b0_i, a[5].real, a[5].imag, n); + + a += PACKMR; + k -= 1; + + if (k >= 2) { + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += PACKNR; + } + + while (k > 0) { + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0].real, a[0].imag, b1_r, b1_i, n); + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1].real, a[1].imag, b1_r, b1_i, n); + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2].real, a[2].imag, b1_r, b1_i, n); + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3].real, a[3].imag, b1_r, b1_i, n); + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4].real, a[4].imag, b1_r, b1_i, n); + VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, a[5].real, a[5].imag, b1_r, b1_i, n); + + a += PACKMR; + k -= 1; + + if (k == 0) { break; } + + if (k >= 2) { + b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0); 
+ b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1); + b += PACKNR; + } + + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0].real, a[0].imag, b0_r, b0_i, n); + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1].real, a[1].imag, b0_r, b0_i, n); + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2].real, a[2].imag, b0_r, b0_i, n); + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3].real, a[3].imag, b0_r, b0_i, n); + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4].real, a[4].imag, b0_r, b0_i, n); + VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, a[5].real, a[5].imag, b0_r, b0_i, n); + + a += PACKMR; + k -= 1; + + if (k == 0) { break; } + + if (k >= 2) { + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += PACKNR; + } + } + + RVV_TYPE_F(PREC, LMUL) temp0_r, temp1_r; + RVV_TYPE_F(PREC, LMUL) temp0_i, temp1_i; + temp0_r = VFMUL_VF(PREC, LMUL)(ab0_i, alpha->imag, n); + temp0_i = VFMUL_VF(PREC, LMUL)(ab0_r, alpha->imag, n); + temp1_r = VFMUL_VF(PREC, LMUL)(ab1_i, alpha->imag, n); + temp1_i = VFMUL_VF(PREC, LMUL)(ab1_r, alpha->imag, n); + + ab0_r = VFMSUB_VF(PREC, LMUL)(ab0_r, alpha->real, temp0_r, n); + ab0_i = VFMADD_VF(PREC, LMUL)(ab0_i, alpha->real, temp0_i, n); + ab1_r = VFMSUB_VF(PREC, LMUL)(ab1_r, alpha->real, temp1_r, n); + ab1_i = VFMADD_VF(PREC, LMUL)(ab1_i, alpha->real, temp1_i, n); + + temp0_r = VFMUL_VF(PREC, LMUL)(ab2_i, alpha->imag, n); + temp0_i = VFMUL_VF(PREC, LMUL)(ab2_r, alpha->imag, n); + temp1_r = VFMUL_VF(PREC, LMUL)(ab3_i, alpha->imag, n); + temp1_i = VFMUL_VF(PREC, LMUL)(ab3_r, alpha->imag, n); + + ab2_r = VFMSUB_VF(PREC, LMUL)(ab2_r, alpha->real, temp0_r, n); + ab2_i = VFMADD_VF(PREC, LMUL)(ab2_i, alpha->real, temp0_i, n); + ab3_r = VFMSUB_VF(PREC, LMUL)(ab3_r, alpha->real, temp1_r, n); + ab3_i = VFMADD_VF(PREC, LMUL)(ab3_i, alpha->real, temp1_i, n); + + temp0_r = VFMUL_VF(PREC, LMUL)(ab4_i, alpha->imag, n); + temp0_i = VFMUL_VF(PREC, LMUL)(ab4_r, alpha->imag, n); + temp1_r = VFMUL_VF(PREC, LMUL)(ab5_i, alpha->imag, n); + temp1_i = VFMUL_VF(PREC, LMUL)(ab5_r, alpha->imag, n); + + ab4_r = VFMSUB_VF(PREC, LMUL)(ab4_r, alpha->real, temp0_r, n); + ab4_i = VFMADD_VF(PREC, LMUL)(ab4_i, alpha->real, temp0_i, n); + ab5_r = VFMSUB_VF(PREC, LMUL)(ab5_r, alpha->real, temp1_r, n); + ab5_i = VFMADD_VF(PREC, LMUL)(ab5_i, alpha->real, temp1_i, n); + + if (!PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + RVV_TYPE_FX(PREC, LMUL, 2) c0; + RVV_TYPE_F(PREC, LMUL) c0_r, c0_i; + if (csc == 1) { + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, 
c0_r, c0_i, n); + + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, beta->real, beta->imag, c0_r, c0_i, n); + } + else { + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, beta->real, beta->imag, c0_r, c0_i, n); + } + } + + RVV_TYPE_FX(PREC, LMUL, 2) ab0 = VCREATE_V_FX(PREC, LMUL, 2)(ab0_r, ab0_i); + RVV_TYPE_FX(PREC, LMUL, 2) ab1 = VCREATE_V_FX(PREC, LMUL, 2)(ab1_r, ab1_i); + RVV_TYPE_FX(PREC, LMUL, 2) ab2 = VCREATE_V_FX(PREC, LMUL, 2)(ab2_r, ab2_i); + RVV_TYPE_FX(PREC, LMUL, 2) ab3 = VCREATE_V_FX(PREC, LMUL, 2)(ab3_r, ab3_i); + RVV_TYPE_FX(PREC, LMUL, 2) ab4 = VCREATE_V_FX(PREC, LMUL, 2)(ab4_r, ab4_i); + RVV_TYPE_FX(PREC, LMUL, 2) ab5 = VCREATE_V_FX(PREC, LMUL, 2)(ab5_r, ab5_i); + + if (csc == 1) { + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), ab0, n); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), ab1, n); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), ab2, n); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), ab3, n); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), ab4, n); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), ab5, n); + } + else { + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), 2 * FLT_SIZE * csc, ab0, n); + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), 2 * FLT_SIZE * csc, ab1, n); + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), 2 * FLT_SIZE * csc, ab2, n); + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), 2 * FLT_SIZE * csc, ab3, n); + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), 2 * FLT_SIZE * csc, ab4, n); + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), 2 * FLT_SIZE * csc, ab5, n); + } + } + else { + RVV_TYPE_F(PREC, LMUL) ab0_r, ab1_r, ab2_r, ab3_r, ab4_r; + RVV_TYPE_F(PREC, LMUL) ab0_i, ab1_i, ab2_i, ab3_i, ab4_i; + RVV_TYPE_FX(PREC, LMUL, 2) b0, b1; + RVV_TYPE_F(PREC, LMUL) b0_r, b1_r; + RVV_TYPE_F(PREC, LMUL) b0_i, b1_i; + + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += PACKNR; + if (k >= 2) { + b1 = 
VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0); + b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1); + b += PACKNR; + } + + switch (m) { + case 5: + VCMUL_VF(PREC, LMUL, ab4_r, ab4_i, b0_r, b0_i, a[4].real, a[4].imag, n); + case 4: + VCMUL_VF(PREC, LMUL, ab3_r, ab3_i, b0_r, b0_i, a[3].real, a[3].imag, n); + case 3: + VCMUL_VF(PREC, LMUL, ab2_r, ab2_i, b0_r, b0_i, a[2].real, a[2].imag, n); + case 2: + VCMUL_VF(PREC, LMUL, ab1_r, ab1_i, b0_r, b0_i, a[1].real, a[1].imag, n); + case 1: + VCMUL_VF(PREC, LMUL, ab0_r, ab0_i, b0_r, b0_i, a[0].real, a[0].imag, n); + } + + a += PACKMR; + k -= 1; + + if (k >= 2) { + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += PACKNR; + } + + while (k > 0) { + switch (m) { + case 5: + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4].real, a[4].imag, b1_r, b1_i, n); + case 4: + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3].real, a[3].imag, b1_r, b1_i, n); + case 3: + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2].real, a[2].imag, b1_r, b1_i, n); + case 2: + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1].real, a[1].imag, b1_r, b1_i, n); + case 1: + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0].real, a[0].imag, b1_r, b1_i, n); + } + + a += PACKMR; + k -= 1; + + if (k == 0) { break; } + + if (k >= 2) { + b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0); + b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1); + b += PACKNR; + } + + switch (m) { + case 5: + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4].real, a[4].imag, b0_r, b0_i, n); + case 4: + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3].real, a[3].imag, b0_r, b0_i, n); + case 3: + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2].real, a[2].imag, b0_r, b0_i, n); + case 2: + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1].real, a[1].imag, b0_r, b0_i, n); + case 1: + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0].real, a[0].imag, b0_r, b0_i, n); + } + + a += PACKMR; + k -= 1; + + if (k == 0) { break; } + + if (k >= 2) { + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += PACKNR; + } + } + + RVV_TYPE_F(PREC, LMUL) temp0_r, temp1_r; + RVV_TYPE_F(PREC, LMUL) temp0_i, temp1_i; + switch (m) { + case 5: + temp0_r = VFMUL_VF(PREC, LMUL)(ab4_i, alpha->imag, n); + temp0_i = VFMUL_VF(PREC, LMUL)(ab4_r, alpha->imag, n); + ab4_r = VFMSUB_VF(PREC, LMUL)(ab4_r, alpha->real, temp0_r, n); + ab4_i = VFMADD_VF(PREC, LMUL)(ab4_i, alpha->real, temp0_i, n); + case 4: + temp1_r = VFMUL_VF(PREC, LMUL)(ab3_i, alpha->imag, n); + temp1_i = VFMUL_VF(PREC, LMUL)(ab3_r, alpha->imag, n); + ab3_r = VFMSUB_VF(PREC, LMUL)(ab3_r, alpha->real, temp1_r, n); + ab3_i = VFMADD_VF(PREC, LMUL)(ab3_i, alpha->real, temp1_i, n); + case 3: + temp0_r = VFMUL_VF(PREC, LMUL)(ab2_i, alpha->imag, n); + temp0_i = VFMUL_VF(PREC, LMUL)(ab2_r, alpha->imag, n); + ab2_r = VFMSUB_VF(PREC, LMUL)(ab2_r, alpha->real, temp0_r, n); + ab2_i = VFMADD_VF(PREC, LMUL)(ab2_i, alpha->real, temp0_i, n); + case 2: + temp1_r = VFMUL_VF(PREC, LMUL)(ab1_i, alpha->imag, n); + temp1_i = VFMUL_VF(PREC, LMUL)(ab1_r, alpha->imag, n); + ab1_r = VFMSUB_VF(PREC, LMUL)(ab1_r, alpha->real, temp1_r, n); + ab1_i = VFMADD_VF(PREC, LMUL)(ab1_i, alpha->real, temp1_i, n); + case 1: + temp0_r = VFMUL_VF(PREC, LMUL)(ab0_i, alpha->imag, n); + temp0_i = VFMUL_VF(PREC, LMUL)(ab0_r, alpha->imag, n); + ab0_r = VFMSUB_VF(PREC, LMUL)(ab0_r, alpha->real, temp0_r, n); + ab0_i = VFMADD_VF(PREC, LMUL)(ab0_i, alpha->real, temp0_i, n); + } + + if 
(!PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + RVV_TYPE_FX(PREC, LMUL, 2) c0; + RVV_TYPE_F(PREC, LMUL) c0_r, c0_i; + if (csc == 1) { + switch (m) { + case 5: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, c0_r, c0_i, n); + case 4: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n); + case 3: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n); + case 2: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n); + case 1: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n); + } + + } + else { + switch (m) { + case 5: + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, c0_r, c0_i, n); + case 4: + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n); + case 3: + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n); + case 2: + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n); + case 1: + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n); + } + } + } + + RVV_TYPE_FX(PREC, LMUL, 2) ab0, ab1, ab2, ab3, ab4; + switch (m) { + case 5: + ab4 = VCREATE_V_FX(PREC, LMUL, 2)(ab4_r, ab4_i); + case 4: + ab3 = VCREATE_V_FX(PREC, LMUL, 2)(ab3_r, ab3_i); + case 3: + ab2 = VCREATE_V_FX(PREC, LMUL, 2)(ab2_r, ab2_i); + case 2: + ab1 = VCREATE_V_FX(PREC, LMUL, 2)(ab1_r, ab1_i); + case 1: + ab0 = VCREATE_V_FX(PREC, LMUL, 2)(ab0_r, ab0_i); + } + + if (csc == 1) { + switch (m) { + case 5: + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), ab4, n); + case 4: + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), ab3, n); + case 3: + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), ab2, n); + case 2: + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), ab1, n); + case 1: + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), ab0, n); + } + } + else { + switch (m) { + case 5: + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), 2 * FLT_SIZE * csc, ab4, n); + case 4: + 
VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), 2 * FLT_SIZE * csc, ab3, n); + case 3: + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), 2 * FLT_SIZE * csc, ab2, n); + case 2: + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), 2 * FLT_SIZE * csc, ab1, n); + case 1: + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), 2 * FLT_SIZE * csc, ab0, n); + } + } + } + + return; +} + +#endif // GEMM diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c new file mode 100644 index 0000000000..605b93fb79 --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c @@ -0,0 +1,339 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef GEMM + +GEMM(PRECISION_CHAR, void) +{ + (void) data; // Suppress unused parameter warnings + (void) cntx; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict b = b_; + const DATATYPE* restrict beta = beta_; + DATATYPE* restrict c = c_; + + if (m <= 0 || n <= 0 || k < 0) + return; + else if (k == 0) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + RVV_TYPE_F(PREC, LMUL) zero_splat = VFMV_V_F(PREC, LMUL)(0., n); + for (dim_t i = 0; i < m; ++i) { + if (csc == 1) + VSE_V_F(PREC, LMUL)(c + i * rsc, zero_splat, n); + else + VSSE_V_F(PREC, LMUL)(c + i * rsc, FLT_SIZE * csc, zero_splat, n); + } + } + else { + for (dim_t i = 0; i < m; ++i) { + RVV_TYPE_F(PREC, LMUL) c0; + if (csc == 1) + c0 = VLE_V_F(PREC, LMUL)(c + i * rsc, n); + else + c0 = VLSE_V_F(PREC, LMUL)(c + i * rsc, FLT_SIZE * csc, n); + c0 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + if (csc == 1) + VSE_V_F(PREC, LMUL)(c + i * rsc, c0, n); + else + VSSE_V_F(PREC, LMUL)(c + i * rsc, FLT_SIZE * csc, c0, n); + } + } + } + else if (m == 7) { + RVV_TYPE_F(PREC, LMUL) ab0, ab1, ab2, ab3, ab4, ab5, ab6; + bool first = true; + for (dim_t i = 0; i < k; ++i) { + RVV_TYPE_F(PREC, LMUL) b0 = VLE_V_F(PREC, LMUL)(b, n); + if (first) { + ab0 = VFMUL_VF(PREC, LMUL)(b0, a[0], n); + ab1 = VFMUL_VF(PREC, LMUL)(b0, a[1], n); + ab2 = VFMUL_VF(PREC, LMUL)(b0, a[2], n); + ab3 = VFMUL_VF(PREC, LMUL)(b0, a[3], n); + ab4 = VFMUL_VF(PREC, LMUL)(b0, a[4], n); + ab5 = VFMUL_VF(PREC, LMUL)(b0, a[5], n); + ab6 = VFMUL_VF(PREC, LMUL)(b0, a[6], n); + first = false; + } + else { + ab0 = VFMACC_VF(PREC, LMUL)(ab0, a[0], b0, n); + ab1 = VFMACC_VF(PREC, LMUL)(ab1, a[1], b0, n); + ab2 = VFMACC_VF(PREC, LMUL)(ab2, a[2], b0, n); + ab3 = VFMACC_VF(PREC, LMUL)(ab3, a[3], b0, n); + ab4 = VFMACC_VF(PREC, LMUL)(ab4, a[4], b0, n); + ab5 = VFMACC_VF(PREC, LMUL)(ab5, a[5], b0, n); + ab6 = VFMACC_VF(PREC, LMUL)(ab6, a[6], b0, n); + } + + a += PACKMR; + b += PACKNR; + } + + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n); + ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n); + ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n); + ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n); + ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n); + ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n); + ab6 = VFMUL_VF(PREC, LMUL)(ab6, *alpha, n); + } + else { + RVV_TYPE_F(PREC, LMUL) c0; + if (csc == 1) { + c0 = VLE_V_F(PREC, LMUL)(c + 0 * rsc, n); + ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n); + ab0 = VFMACC_VF(PREC, LMUL)(ab0, *beta, c0, n); + c0 = VLE_V_F(PREC, LMUL)(c + 1 * rsc, n); + ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n); + ab1 = VFMACC_VF(PREC, LMUL)(ab1, *beta, c0, n); + c0 = VLE_V_F(PREC, LMUL)(c + 2 * rsc, n); + ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n); + ab2 = VFMACC_VF(PREC, LMUL)(ab2, *beta, c0, n); + c0 = VLE_V_F(PREC, LMUL)(c + 3 * rsc, n); + ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n); + ab3 = VFMACC_VF(PREC, LMUL)(ab3, *beta, c0, n); + c0 = VLE_V_F(PREC, LMUL)(c + 4 * rsc, n); + ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n); + ab4 = VFMACC_VF(PREC, LMUL)(ab4, *beta, c0, n); + c0 = VLE_V_F(PREC, LMUL)(c + 5 * rsc, n); + ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n); + ab5 = VFMACC_VF(PREC, LMUL)(ab5, *beta, c0, n); + c0 = VLE_V_F(PREC, LMUL)(c + 6 * rsc, n); + ab6 = VFMUL_VF(PREC, LMUL)(ab6, *alpha, n); + ab6 = VFMACC_VF(PREC, LMUL)(ab6, *beta, c0, n); + } + else { + c0 = VLSE_V_F(PREC, LMUL)(c + 0 * rsc, FLT_SIZE * csc, n); + ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n); + ab0 = VFMACC_VF(PREC, 
LMUL)(ab0, *beta, c0, n); + c0 = VLSE_V_F(PREC, LMUL)(c + 1 * rsc, FLT_SIZE * csc, n); + ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n); + ab1 = VFMACC_VF(PREC, LMUL)(ab1, *beta, c0, n); + c0 = VLSE_V_F(PREC, LMUL)(c + 2 * rsc, FLT_SIZE * csc, n); + ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n); + ab2 = VFMACC_VF(PREC, LMUL)(ab2, *beta, c0, n); + c0 = VLSE_V_F(PREC, LMUL)(c + 3 * rsc, FLT_SIZE * csc, n); + ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n); + ab3 = VFMACC_VF(PREC, LMUL)(ab3, *beta, c0, n); + c0 = VLSE_V_F(PREC, LMUL)(c + 4 * rsc, FLT_SIZE * csc, n); + ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n); + ab4 = VFMACC_VF(PREC, LMUL)(ab4, *beta, c0, n); + c0 = VLSE_V_F(PREC, LMUL)(c + 5 * rsc, FLT_SIZE * csc, n); + ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n); + ab5 = VFMACC_VF(PREC, LMUL)(ab5, *beta, c0, n); + c0 = VLSE_V_F(PREC, LMUL)(c + 6 * rsc, FLT_SIZE * csc, n); + ab6 = VFMUL_VF(PREC, LMUL)(ab6, *alpha, n); + ab6 = VFMACC_VF(PREC, LMUL)(ab6, *beta, c0, n); + } + } + + if (csc == 1) { + VSE_V_F(PREC, LMUL)(c + 0 * rsc, ab0, n); + VSE_V_F(PREC, LMUL)(c + 1 * rsc, ab1, n); + VSE_V_F(PREC, LMUL)(c + 2 * rsc, ab2, n); + VSE_V_F(PREC, LMUL)(c + 3 * rsc, ab3, n); + VSE_V_F(PREC, LMUL)(c + 4 * rsc, ab4, n); + VSE_V_F(PREC, LMUL)(c + 5 * rsc, ab5, n); + VSE_V_F(PREC, LMUL)(c + 6 * rsc, ab6, n); + } + else { + VSSE_V_F(PREC, LMUL)(c + 0 * rsc, FLT_SIZE * csc, ab0, n); + VSSE_V_F(PREC, LMUL)(c + 1 * rsc, FLT_SIZE * csc, ab1, n); + VSSE_V_F(PREC, LMUL)(c + 2 * rsc, FLT_SIZE * csc, ab2, n); + VSSE_V_F(PREC, LMUL)(c + 3 * rsc, FLT_SIZE * csc, ab3, n); + VSSE_V_F(PREC, LMUL)(c + 4 * rsc, FLT_SIZE * csc, ab4, n); + VSSE_V_F(PREC, LMUL)(c + 5 * rsc, FLT_SIZE * csc, ab5, n); + VSSE_V_F(PREC, LMUL)(c + 6 * rsc, FLT_SIZE * csc, ab6, n); + } + } + else { + // 0 < m < 7 + RVV_TYPE_F(PREC, LMUL) ab0, ab1, ab2, ab3, ab4, ab5; + bool first = true; + for (dim_t i = 0; i < k; ++i) { + RVV_TYPE_F(PREC, LMUL) b0 = VLE_V_F(PREC, LMUL)(b, n); + if (first) { + switch (m) { + case 6: + ab5 = VFMUL_VF(PREC, LMUL)(b0, a[5], n); + case 5: + ab4 = VFMUL_VF(PREC, LMUL)(b0, a[4], n); + case 4: + ab3 = VFMUL_VF(PREC, LMUL)(b0, a[3], n); + case 3: + ab2 = VFMUL_VF(PREC, LMUL)(b0, a[2], n); + case 2: + ab1 = VFMUL_VF(PREC, LMUL)(b0, a[1], n); + case 1: + ab0 = VFMUL_VF(PREC, LMUL)(b0, a[0], n); + } + first = false; + } + else { + switch (m) { + case 6: + ab5 = VFMACC_VF(PREC, LMUL)(ab5, a[5], b0, n); + case 5: + ab4 = VFMACC_VF(PREC, LMUL)(ab4, a[4], b0, n); + case 4: + ab3 = VFMACC_VF(PREC, LMUL)(ab3, a[3], b0, n); + case 3: + ab2 = VFMACC_VF(PREC, LMUL)(ab2, a[2], b0, n); + case 2: + ab1 = VFMACC_VF(PREC, LMUL)(ab1, a[1], b0, n); + case 1: + ab0 = VFMACC_VF(PREC, LMUL)(ab0, a[0], b0, n); + } + } + + a += PACKMR; + b += PACKNR; + } + + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + switch (m) { + case 6: + ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n); + case 5: + ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n); + case 4: + ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n); + case 3: + ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n); + case 2: + ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n); + case 1: + ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n); + } + } + else { + RVV_TYPE_F(PREC, LMUL) c0; + if (csc == 1) { + switch (m) { + case 6: + c0 = VLE_V_F(PREC, LMUL)(c + 5 * rsc, n); + ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n); + ab5 = VFMACC_VF(PREC, LMUL)(ab5, *beta, c0, n); + case 5: + c0 = VLE_V_F(PREC, LMUL)(c + 4 * rsc, n); + ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n); + ab4 = VFMACC_VF(PREC, LMUL)(ab4, *beta, c0, n); + case 4: + c0 = VLE_V_F(PREC, LMUL)(c 
+ 3 * rsc, n); + ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n); + ab3 = VFMACC_VF(PREC, LMUL)(ab3, *beta, c0, n); + case 3: + c0 = VLE_V_F(PREC, LMUL)(c + 2 * rsc, n); + ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n); + ab2 = VFMACC_VF(PREC, LMUL)(ab2, *beta, c0, n); + case 2: + c0 = VLE_V_F(PREC, LMUL)(c + 1 * rsc, n); + ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n); + ab1 = VFMACC_VF(PREC, LMUL)(ab1, *beta, c0, n); + case 1: + c0 = VLE_V_F(PREC, LMUL)(c + 0 * rsc, n); + ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n); + ab0 = VFMACC_VF(PREC, LMUL)(ab0, *beta, c0, n); + } + } + else { + switch (m) { + case 6: + c0 = VLSE_V_F(PREC, LMUL)(c + 5 * rsc, FLT_SIZE * csc, n); + ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n); + ab5 = VFMACC_VF(PREC, LMUL)(ab5, *beta, c0, n); + case 5: + c0 = VLSE_V_F(PREC, LMUL)(c + 4 * rsc, FLT_SIZE * csc, n); + ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n); + ab4 = VFMACC_VF(PREC, LMUL)(ab4, *beta, c0, n); + case 4: + c0 = VLSE_V_F(PREC, LMUL)(c + 3 * rsc, FLT_SIZE * csc, n); + ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n); + ab3 = VFMACC_VF(PREC, LMUL)(ab3, *beta, c0, n); + case 3: + c0 = VLSE_V_F(PREC, LMUL)(c + 2 * rsc, FLT_SIZE * csc, n); + ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n); + ab2 = VFMACC_VF(PREC, LMUL)(ab2, *beta, c0, n); + case 2: + c0 = VLSE_V_F(PREC, LMUL)(c + 1 * rsc, FLT_SIZE * csc, n); + ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n); + ab1 = VFMACC_VF(PREC, LMUL)(ab1, *beta, c0, n); + case 1: + c0 = VLSE_V_F(PREC, LMUL)(c + 0 * rsc, FLT_SIZE * csc, n); + ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n); + ab0 = VFMACC_VF(PREC, LMUL)(ab0, *beta, c0, n); + } + } + } + + if (csc == 1) { + switch (m) { + case 6: + VSE_V_F(PREC, LMUL)(c + 5 * rsc, ab5, n); + case 5: + VSE_V_F(PREC, LMUL)(c + 4 * rsc, ab4, n); + case 4: + VSE_V_F(PREC, LMUL)(c + 3 * rsc, ab3, n); + case 3: + VSE_V_F(PREC, LMUL)(c + 2 * rsc, ab2, n); + case 2: + VSE_V_F(PREC, LMUL)(c + 1 * rsc, ab1, n); + case 1: + VSE_V_F(PREC, LMUL)(c + 0 * rsc, ab0, n); + } + } + else { + switch (m) { + case 6: + VSSE_V_F(PREC, LMUL)(c + 5 * rsc, FLT_SIZE * csc, ab5, n); + case 5: + VSSE_V_F(PREC, LMUL)(c + 4 * rsc, FLT_SIZE * csc, ab4, n); + case 4: + VSSE_V_F(PREC, LMUL)(c + 3 * rsc, FLT_SIZE * csc, ab3, n); + case 3: + VSSE_V_F(PREC, LMUL)(c + 2 * rsc, FLT_SIZE * csc, ab2, n); + case 2: + VSSE_V_F(PREC, LMUL)(c + 1 * rsc, FLT_SIZE * csc, ab1, n); + case 1: + VSSE_V_F(PREC, LMUL)(c + 0 * rsc, FLT_SIZE * csc, ab0, n); + } + } + } + + return; +} + +#endif // GEMM diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c deleted file mode 100644 index 18df010d05..0000000000 --- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c +++ /dev/null @@ -1,327 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -// clang-format off -#ifdef GEMMTRSM - -GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void) -{ - (void) data; - (void) cntx; - const DATATYPE* restrict alpha = alpha_; - const DATATYPE* restrict a10 = a10_; - const DATATYPE* restrict a11 = a11_; - const DATATYPE* restrict b01 = b01_; - const DATATYPE* restrict b11 = b11_; - DATATYPE* restrict c11 = c11_; - - if (m <= 0 || n <= 0) - return; - - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); - - DATATYPE alpha_cast = *alpha; - if (alpha_cast.real == 0 && alpha_cast.imag == 0) { - switch (m) { - case 6: - __asm__("vmv.v.i v20, 0"); - __asm__("vmv.v.i v22, 0"); - case 5: - __asm__("vmv.v.i v16, 0"); - __asm__("vmv.v.i v18, 0"); - case 4: - __asm__("vmv.v.i v12, 0"); - __asm__("vmv.v.i v14, 0"); - case 3: - __asm__("vmv.v.i v8, 0"); - __asm__("vmv.v.i v10, 0"); - case 2: - __asm__("vmv.v.i v4, 0"); - __asm__("vmv.v.i v6, 0"); - case 1: - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - } - } - else { - const DATATYPE* b11_tmp = b11 + (m - 1) * PACKNR; - switch (m) { - case 6: - __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v20, v22, v24, v26, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); - case 5: - __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v16, v18, v28, v30, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v12, v14, v24, v26, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); - case 3: - __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v8, v10, v28, v30, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v4, v6, v24, v26, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); - case 1: - __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v0, v2, v28, v30, alpha_cast.real, alpha_cast.imag); - } - } - - if (k >= 1) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b01)); - __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE)); - } - if (k >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b01)); - __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE)); - } - - while (k > 0) { - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a10), 
"I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a10), "I"(11 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a10), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a10), "I"(9 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a10), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a10), "I"(7 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a10), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a10), "I"(5 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a10), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a10), "I"(3 * FLT_SIZE)); - vcnmsac_vf(v4, v6, ft2, ft3, v24, v26); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a10), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a10), "I"(1 * FLT_SIZE)); - vcnmsac_vf(v0, v2, ft0, ft1, v24, v26); - } - k -= 1; - - if (k == 0) { break; } - - if (k >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b01)); - __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a10) : "I"(PACKMR * 2 * FLT_SIZE)); - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a10), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a10), "I"(11 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v28, v30); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a10), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a10), "I"(9 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v28, v30); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a10), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a10), "I"(7 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v28, v30); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a10), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a10), "I"(5 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v28, v30); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a10), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a10), "I"(3 * FLT_SIZE)); - vcnmsac_vf(v4, v6, ft2, ft3, v28, v30); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a10), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a10), "I"(1 * FLT_SIZE)); - vcnmsac_vf(v0, v2, ft0, ft1, v28, v30); - } - k -= 1; - - if (k >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b01)); - __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a10) : "I"(PACKMR * 2 * FLT_SIZE)); - } - - rsc *= 2 * FLT_SIZE; - csc *= 2 * FLT_SIZE; - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a11), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a11), "I"(1 * FLT_SIZE)); - vcmul_vf(v24, v26, v0, v2, ft0, ft1); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 1) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : 
: "r"(a11), "I"(7 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(3 * FLT_SIZE)); - vcnmsac_vf(v4, v6, ft2, ft3, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); - __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(3 * FLT_SIZE)); - vcmul_vf(v24, v26, v4, v6, ft2, ft3); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 2) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); - __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE)); - vcmul_vf(v24, v26, v8, v10, ft4, ft5); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 3) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); - __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE)); - vcmul_vf(v24, v26, v12, v14, ft6, ft7); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 4) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - 
__asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); - __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); - vcmul_vf(v24, v26, v16, v18, ft8, ft9); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 5) return; - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); - __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); - vcmul_vf(v24, v26, v20, v22, ft10, ft11); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - return; -} - -#endif diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c deleted file mode 100644 index a0f9134731..0000000000 --- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -// clang-format off -#ifdef GEMMTRSM - -GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void) -{ - const DATATYPE* restrict alpha = alpha_; - const DATATYPE* restrict a10 = a10_; - const DATATYPE* restrict a11 = a11_; - const DATATYPE* restrict b01 = b01_; - const DATATYPE* restrict b11 = b11_; - DATATYPE* restrict c11 = c11_; - - if (!(1 <= m && m <= PACKMR && 1 <= n && n <= PACKNR)) - return; - - dim_t b11_offset, temp; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma": "=r"(vl) : "r"(n), "i"(8*FLT_SIZE)); - - // Multiply step sizes by data size - __asm__("slli %0, %0, %1": "+r"(rsc) : "I"(LOG_FLT_SIZE)); - __asm__("slli %0, %0, %1": "+r"(csc) : "I"(LOG_FLT_SIZE)); - - __asm__("addi %0, %1, %2": "=r"(b11_offset): "r"(m), "I"(-1)); - __asm__("li %0, %1": "=r"(temp): "I"(PACKNR * FLT_SIZE)); - __asm__("mul %0, %0, %1": "+r"(b11_offset): "r"(temp)); - // b11_offset = (m-1)*PACKNR*FLT_SIZE - - __asm__("add %0, %0, %1": "+r"(b11): "r"(b11_offset)); - __asm__(FLT_LOAD " f0, (%0)" : : "r"(alpha)); // TO DO: optimize alpha = 1 case - switch (m){ // Vector loads from b11 with Duff device, multiplying by alpha - case 7: __asm__(VLE " v0, (%0)": : "r"(b11)); - __asm__("vfmul.vf v0, v0, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - case 6: __asm__(VLE " v4, (%0)": : "r"(b11)); - __asm__("vfmul.vf v4, v4, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - case 5: __asm__(VLE " v8, (%0)": : "r"(b11)); - __asm__("vfmul.vf v8, v8, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - case 4: __asm__(VLE " v12, (%0)": : "r"(b11)); - __asm__("vfmul.vf v12, v12, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - case 3: __asm__(VLE " v16, (%0)": : "r"(b11)); - __asm__("vfmul.vf v16, v16, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - case 2: __asm__(VLE " v20, (%0)": : "r"(b11)); - __asm__("vfmul.vf v20, v20, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - case 1: __asm__(VLE " v24, (%0)": : "r"(b11)); - __asm__("vfmul.vf v24, v24, f0"); - // no sub of b11 on final entry - } - // b11 now reset to original value - // v0 = row 6 of b11 - // v4 = row 5 of b11 - // v8 = row 4 of b11 - // v12 = row 3 of b11 - // v16 = row 2 of b11 - // v20 = row 1 of b11 - // v24 = row 0 of b11 - - // GEMM: B11 := alpha * B11 - A10 * B01 - for (dim_t i = 0; i < k; i++){ - __asm__(VLE " v28, (%0)": : "r"(b01)); // kth row of b01 - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)" : : "I"(6*FLT_SIZE), "r"(a10)); - __asm__("vfnmsac.vf v0, f6, v28"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)" : : "I"(5*FLT_SIZE), "r"(a10)); - __asm__("vfnmsac.vf v4, f5, v28"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)" : : "I"(4*FLT_SIZE), "r"(a10)); - __asm__("vfnmsac.vf v8, f4, v28"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)" : : "I"(3*FLT_SIZE), "r"(a10)); - __asm__("vfnmsac.vf v12, f3, v28"); - case 3: __asm__(FLT_LOAD " f2, %0(%1)" : : "I"(2*FLT_SIZE), "r"(a10)); - __asm__("vfnmsac.vf v16, f2, v28"); - case 2: __asm__(FLT_LOAD " f1, %0(%1)" : : "I"(1*FLT_SIZE), "r"(a10)); - __asm__("vfnmsac.vf v20, f1, v28"); - case 1: __asm__(FLT_LOAD " f0, %0(%1)" : : "I"(0*FLT_SIZE), "r"(a10)); - __asm__("vfnmsac.vf v24, f0, v28"); - } - __asm__("addi %0, %0, %1": "+r"(a10): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b01): "I"(PACKNR * FLT_SIZE)); - } - // TRSM: B11 := inv(A11) * B11 - // TO DO: Investigate code size reduction (loop rerolling) - - // Row 0 - __asm__(FLT_LOAD " f0, 
%0(%1)": : "I"(0*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v24, v24, f0"); - __asm__(VSE " v24, (%0)": : "r"(b11)); - __asm__(VSSE " v24, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 1) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v24"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v24"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v24"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v12, f3, v24"); - case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v16, f2, v24"); - case 2: __asm__(FLT_LOAD " f1, %0(%1)": : "I"(1*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v20, f1, v24"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 1 - __asm__(FLT_LOAD " f1, %0(%1)": : "I"(1*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v20, v20, f1"); - __asm__(VSE " v20, (%0)": : "r"(b11)); - __asm__(VSSE " v20, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 2) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v20"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v20"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v20"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v12, f3, v20"); - case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v16, f2, v20"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 2 - __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v16, v16, f2"); - __asm__(VSE " v16, (%0)": : "r"(b11)); - __asm__(VSSE " v16, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 3) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v16"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v16"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v16"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v12, f3, v16"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 3 - __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v12, v12, f3"); - __asm__(VSE " v12, (%0)": : "r"(b11)); - __asm__(VSSE " v12, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 4) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v12"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v12"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v12"); - } - // Pointer bumps 
- __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 4 - __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v8, v8, f4"); - __asm__(VSE " v8, (%0)": : "r"(b11)); - __asm__(VSSE " v8, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 5) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v8"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v8"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 5 - __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v4, v4, f5"); - __asm__(VSE " v4, (%0)": : "r"(b11)); - __asm__(VSSE " v4, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 6) return; - - __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v4"); - - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 6 - __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v0, v0, f6"); - __asm__(VSE " v0, (%0)": : "r"(b11)); - __asm__(VSSE " v0, (%0), %1": : "r"(c11), "r"(csc)); -} -#endif diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c deleted file mode 100644 index 9332fd0963..0000000000 --- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c +++ /dev/null @@ -1,331 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -// clang-format off -#ifdef GEMMTRSM - -GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void) -{ - (void) data; - (void) cntx; - const DATATYPE* restrict alpha = alpha_; - const DATATYPE* restrict a12 = a12_; - const DATATYPE* restrict a11 = a11_; - const DATATYPE* restrict b21 = b21_; - const DATATYPE* restrict b11 = b11_; - DATATYPE* restrict c11 = c11_; - - if (m <= 0 || n <= 0) - return; - - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); - - DATATYPE alpha_cast = *alpha; - if (alpha_cast.real == 0 && alpha_cast.imag == 0) { - switch (m) { - case 6: - __asm__("vmv.v.i v20, 0"); - __asm__("vmv.v.i v22, 0"); - case 5: - __asm__("vmv.v.i v16, 0"); - __asm__("vmv.v.i v18, 0"); - case 4: - __asm__("vmv.v.i v12, 0"); - __asm__("vmv.v.i v14, 0"); - case 3: - __asm__("vmv.v.i v8, 0"); - __asm__("vmv.v.i v10, 0"); - case 2: - __asm__("vmv.v.i v4, 0"); - __asm__("vmv.v.i v6, 0"); - case 1: - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - } - } - else { - const DATATYPE* b11_tmp = b11; - switch (m) { - case 6: - __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v20, v22, v24, v26, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); - case 5: - __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v16, v18, v28, v30, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v12, v14, v24, v26, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); - case 3: - __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v8, v10, v28, v30, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v4, v6, v24, v26, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); - case 1: - __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v0, v2, v28, v30, alpha_cast.real, alpha_cast.imag); - } - } - - if (k >= 1) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b21)); - __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE)); - } - if (k >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b21)); - __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE)); - } - - a12 += m - 1; - - while (k > 0) { - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a12), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a12), "I"(-9 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a12), "I"(-8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a12), "I"(-7 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a12), "I"(-6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a12), "I"(-5 * 
FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a12), "I"(-4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a12), "I"(-3 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a12), "I"(-2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a12), "I"(-1 * FLT_SIZE)); - vcnmsac_vf(v4, v6, ft2, ft3, v24, v26); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a12), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a12), "I"(1 * FLT_SIZE)); - vcnmsac_vf(v0, v2, ft0, ft1, v24, v26); - } - k -= 1; - - if (k == 0) { break; } - - if (k >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b21)); - __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a12) : "I"(PACKMR * 2 * FLT_SIZE)); - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a12), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a12), "I"(-9 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v28, v30); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a12), "I"(-8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a12), "I"(-7 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v28, v30); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a12), "I"(-6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a12), "I"(-5 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v28, v30); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a12), "I"(-4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a12), "I"(-3 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v28, v30); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a12), "I"(-2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a12), "I"(-1 * FLT_SIZE)); - vcnmsac_vf(v4, v6, ft2, ft3, v28, v30); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a12), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a12), "I"(1 * FLT_SIZE)); - vcnmsac_vf(v0, v2, ft0, ft1, v28, v30); - } - k -= 1; - - if (k >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b21)); - __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a12) : "I"(PACKMR * 2 * FLT_SIZE)); - } - - a11 += (m - 1) * (PACKMR + 1); // (m - 1) + (m - 1) * PACKMR - b11 += (m - 1) * PACKNR; - c11 += (m - 1) * rsc; - rsc *= 2 * FLT_SIZE; - csc *= 2 * FLT_SIZE; - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a11), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a11), "I"(1 * FLT_SIZE)); - vcmul_vf(v24, v26, v0, v2, ft0, ft1); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 1) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), 
"I"(-2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(-1 * FLT_SIZE)); - vcnmsac_vf(v4, v6, ft2, ft3, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE)); - __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(-2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(-1 * FLT_SIZE)); - vcmul_vf(v24, v26, v4, v6, ft2, ft3); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 2) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE)); - __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE)); - vcmul_vf(v24, v26, v8, v10, ft4, ft5); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 3) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE)); - __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE)); - vcmul_vf(v24, v26, v12, v14, ft6, ft7); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 4) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR 
* 2 * FLT_SIZE)); - __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); - vcmul_vf(v24, v26, v16, v18, ft8, ft9); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 5) return; - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE)); - __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); - vcmul_vf(v24, v26, v20, v22, ft10, ft11); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - return; -} -#endif diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c deleted file mode 100644 index 2d511a8ba6..0000000000 --- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c +++ /dev/null @@ -1,260 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
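The upper-triangular kernel that follows performs its TRSM step bottom-up ("Row k from bottom" in its comments) and multiplies by the packed diagonal entries, which BLIS pre-inverts by default. A scalar back-substitution model of just that step, assuming a plain row-major A11/B11 layout rather than the packed micro-panels the kernel actually reads:

    /* Model of B11 := inv(A11) * B11 after the GEMM update B11 := alpha*B11 - A12*B21.
       The diagonal of a11 is assumed pre-inverted, so the update multiplies
       instead of dividing, matching the kernel's vfmul.vf on the diagonal. */
    static void trsm_u_ref(int m, int n, const double *a11, int lda,
                           double *b11, int ldb)
    {
        for (int i = m - 1; i >= 0; --i) {          /* bottom row first (back substitution) */
            for (int j = 0; j < n; ++j) {
                double x = b11[i * ldb + j];
                for (int p = i + 1; p < m; ++p)     /* rows below i are already solved */
                    x -= a11[i * lda + p] * b11[p * ldb + j];
                b11[i * ldb + j] = x * a11[i * lda + i];
            }
        }
    }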
- -*/ - -// clang-format off -#ifdef GEMMTRSM - -GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void) -{ - const DATATYPE* restrict alpha = alpha_; - const DATATYPE* restrict a12 = a12_; - const DATATYPE* restrict a11 = a11_; - const DATATYPE* restrict b21 = b21_; - const DATATYPE* restrict b11 = b11_; - DATATYPE* restrict c11 = c11_; - - if (!(1 <= m && m <= PACKMR && 1 <= n && n <= PACKNR)) - return; - - dim_t m_sz, a11_offset, c11_offset, temp; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma": "=r"(vl) : "r"(n), "i"(8*FLT_SIZE)); - - // Multiply step sizes by data size - __asm__("slli %0, %0, %1": "+r"(rsc) : "I"(LOG_FLT_SIZE)); - __asm__("slli %0, %0, %1": "+r"(csc) : "I"(LOG_FLT_SIZE)); - __asm__("slli %0, %1, %2": "=r"(m_sz) : "r"(m), "I"(LOG_FLT_SIZE)); - - __asm__("li %0, %1": "=r"(temp): "I"((PACKMR+1)*FLT_SIZE)); - __asm__("mul %0, %1, %2": "=r"(a11_offset) : "r"(m), "r"(temp)); - __asm__("addi %0, %0, %1": "+r"(a11_offset) : "I"(-PACKMR * FLT_SIZE)); - __asm__("mul %0, %1, %2": "=r"(c11_offset) : "r"(m), "r"(rsc)); - __asm__("sub %0, %0, %1": "+r"(c11_offset) : "r"(rsc)); - // a11_offset = (PACKMR*(m-1)+m)*sz = m*(PACKMR+1)*FLT_SIZE - PACKMR*FLT_SIZE - // c11_offset = rsc*(m-1)*sz - - __asm__(FLT_LOAD " f0, (%0)" : : "r"(alpha)); - switch (m){ // Vector loads from b11 with Duff device, multiplying by alpha - case 7: __asm__(VLE " v0, (%0)": : "r"(b11)); - __asm__("vfmul.vf v0, v0, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - case 6: __asm__(VLE " v4, (%0)": : "r"(b11)); - __asm__("vfmul.vf v4, v4, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - case 5: __asm__(VLE " v8, (%0)": : "r"(b11)); - __asm__("vfmul.vf v8, v8, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - case 4: __asm__(VLE " v12, (%0)": : "r"(b11)); - __asm__("vfmul.vf v12, v12, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - case 3: __asm__(VLE " v16, (%0)": : "r"(b11)); - __asm__("vfmul.vf v16, v16, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - case 2: __asm__(VLE " v20, (%0)": : "r"(b11)); - __asm__("vfmul.vf v20, v20, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - case 1: __asm__(VLE " v24, (%0)": : "r"(b11)); - __asm__("vfmul.vf v24, v24, f0"); - // no add of b11 on final entry - } - // b11 now positioned at start of last row - // v24 = row 0 from bottom (bottom row) - // v20 = row 1 from bottom - // v16 = row 2 from bottom - // v12 = row 3 from bottom - // v8 = row 4 from bottom - // v4 = row 5 from bottom - // v0 = row 6 from bottom - - // GEMM: B11 := alpha * B11 - A12 * B21 - __asm__("add %0, %0, %1": "+r"(a12): "r"(m_sz)); - for (dim_t i = 0; i < k; i++){ - __asm__(VLE " v28, (%0)": : "r"(b21)); // kth row of b21 - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)" : : "I"(-7*FLT_SIZE), "r"(a12)); - __asm__("vfnmsac.vf v0, f6, v28"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)" : : "I"(-6*FLT_SIZE), "r"(a12)); - __asm__("vfnmsac.vf v4, f5, v28"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)" : : "I"(-5*FLT_SIZE), "r"(a12)); - __asm__("vfnmsac.vf v8, f4, v28"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)" : : "I"(-4*FLT_SIZE), "r"(a12)); - __asm__("vfnmsac.vf v12, f3, v28"); - case 3: __asm__(FLT_LOAD " f2, %0(%1)" : : "I"(-3*FLT_SIZE), "r"(a12)); - __asm__("vfnmsac.vf v16, f2, v28"); - case 2: __asm__(FLT_LOAD " f1, %0(%1)" : : "I"(-2*FLT_SIZE), "r"(a12)); - __asm__("vfnmsac.vf v20, f1, v28"); - case 1: __asm__(FLT_LOAD " f0, %0(%1)" : : "I"(-1*FLT_SIZE), 
"r"(a12)); - __asm__("vfnmsac.vf v24, f0, v28"); - } - __asm__("addi %0, %0, %1": "+r"(a12): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b21): "I"(PACKNR * FLT_SIZE)); - } - // TRSM: B11 := inv(A11) * B11 - // Move a11 to end of array and c11 to first entry in last row - __asm__("add %0, %0, %1": "+r"(a11): "r"(a11_offset)); - __asm__("add %0, %0, %1": "+r"(c11): "r"(c11_offset)); - - // Row 0 from bottom (bottom row) - __asm__(FLT_LOAD " f0, %0(%1)": : "I"(-1*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v24, v24, f0"); - __asm__(VSE " v24, (%0)": : "r"(b11)); - __asm__(VSSE " v24, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 1) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v24"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v24"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v24"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v12, f3, v24"); - case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v16, f2, v24"); - case 2: __asm__(FLT_LOAD " f1, %0(%1)": : "I"(-2*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v20, f1, v24"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 1 from bottom - __asm__(FLT_LOAD " f1, %0(%1)": : "I"(-2*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v20, v20, f1"); - __asm__(VSE " v20, (%0)": : "r"(b11)); - __asm__(VSSE " v20, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 2) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v20"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v20"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v20"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v12, f3, v20"); - case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v16, f2, v20"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 2 from bottom - __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v16, v16, f2"); - __asm__(VSE " v16, (%0)": : "r"(b11)); - __asm__(VSSE " v16, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 3) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v16"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v16"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v16"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v12, f3, v16"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 3 from bottom - __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), 
"r"(a11)); - __asm__("vfmul.vf v12, v12, f3"); - __asm__(VSE " v12, (%0)": : "r"(b11)); - __asm__(VSSE " v12, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 4) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v12"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v12"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v12"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 4 from bottom - __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v8, v8, f4"); - __asm__(VSE " v8, (%0)": : "r"(b11)); - __asm__(VSSE " v8, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 5) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v8"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v8"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 5 from bottom - __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v4, v4, f5"); - __asm__(VSE " v4, (%0)": : "r"(b11)); - __asm__(VSSE " v4, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 6) return; - - __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v4"); - - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 6 from bottom - __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v0, v0, f6"); - __asm__(VSE " v0, (%0)": : "r"(b11)); - __asm__(VSSE " v0, (%0), %1": : "r"(c11), "r"(csc)); - -} -#endif diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c similarity index 75% rename from kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c rename to kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c index 7cb8d9e070..687abec185 100644 --- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c +++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -34,12 +34,12 @@ // clang-format off #include "blis.h" -#include "../../riscv_cmul_macros_asm.h" +#include "../../riscv_cmul_macros_intr.h" #include "../../bli_kernels_sifive_x280.h" #include #include -#define GEMMTRSM_L(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_l_sifive_x280_asm(\ +#define GEMMTRSM_L(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_l_sifive_x280_intr(\ dim_t m, \ dim_t n, \ dim_t k, \ @@ -55,7 +55,7 @@ const cntx_t* restrict cntx \ ) -#define GEMMTRSM_U(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_u_sifive_x280_asm(\ +#define GEMMTRSM_U(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_u_sifive_x280_intr(\ dim_t m, \ dim_t n, \ dim_t k, \ @@ -76,108 +76,83 @@ // Single precision real #define DATATYPE float #define PRECISION_CHAR s +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) #define PACKMR 8 #define PACKNR 64 -#define VLE "vle32.v" -#define VSE "vse32.v" -#define VSSE "vsse32.v" -#define FLT_LOAD "flw" -#define FLT_SIZE sizeof(float) -#define LOG_FLT_SIZE 2 - -#include "./bli_gemmtrsm_l_sifive_x280_asm_real.c" -#include "./bli_gemmtrsm_u_sifive_x280_asm_real.c" +#include "./bli_gemmtrsm_sifive_x280_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE #undef PACKMR #undef PACKNR -#undef VLE -#undef VSE -#undef VSSE -#undef FLT_LOAD -#undef FLT_SIZE -#undef LOG_FLT_SIZE // Double precision real #define DATATYPE double #define PRECISION_CHAR d +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) #define PACKMR 8 #define PACKNR 32 -#define VLE "vle64.v" -#define VSE "vse64.v" -#define VSSE "vsse64.v" -#define FLT_LOAD "fld" -#define FLT_SIZE sizeof(double) -#define LOG_FLT_SIZE 3 -#include "./bli_gemmtrsm_l_sifive_x280_asm_real.c" -#include "./bli_gemmtrsm_u_sifive_x280_asm_real.c" +#include "./bli_gemmtrsm_sifive_x280_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE #undef PACKMR #undef PACKNR -#undef VLE -#undef VSE -#undef VSSE -#undef FLT_LOAD -#undef FLT_SIZE -#undef LOG_FLT_SIZE // Single precision complex #define DATATYPE scomplex +#define BASE_DT float #define PRECISION_CHAR c +#define PREC 32 +#define LMUL m2 +#define FLT_SIZE sizeof(float) #define PACKMR 8 #define PACKNR 32 -#define VLSEG2 "vlseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " -#define FLT_LOAD "flw " -#define FLT_SIZE sizeof(float) -#include "./bli_gemmtrsm_l_sifive_x280_asm_complex.c" -#include "./bli_gemmtrsm_u_sifive_x280_asm_complex.c" +#include "./bli_gemmtrsm_sifive_x280_intr_complex.c" #undef DATATYPE +#undef BASE_DT #undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE #undef PACKMR #undef PACKNR -#undef VLSEG2 -#undef VSSEG2 -#undef VSSSEG2 -#undef FLT_LOAD -#undef FLT_SIZE // Double precision complex #define DATATYPE dcomplex +#define BASE_DT double #define PRECISION_CHAR z +#define PREC 64 +#define LMUL m2 +#define FLT_SIZE sizeof(double) #define PACKMR 8 #define PACKNR 16 -#define VLSEG2 "vlseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " -#define FLT_LOAD "fld " -#define FLT_SIZE sizeof(double) -#include "./bli_gemmtrsm_l_sifive_x280_asm_complex.c" -#include "./bli_gemmtrsm_u_sifive_x280_asm_complex.c" +#include "./bli_gemmtrsm_sifive_x280_intr_complex.c" #undef DATATYPE +#undef BASE_DT #undef PRECISION_CHAR +#undef PREC +#undef LMUL 
+#undef FLT_SIZE #undef PACKMR #undef PACKNR -#undef VLSEG -#undef VSSEG -#undef VSSSEG -#undef FLT_LOAD -#undef FLT_SIZE - - #undef GEMMTRSM #undef GEMMTRSM_L #undef GEMMTRSM_U - - diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..88ea04b7a9 --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c @@ -0,0 +1,437 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef GEMMTRSM + +#define GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) bli_##PRECISION_CHAR##gemmtrsm_sifive_x280_intr +#define GEMMTRSM_IMPL_NAME(PRECISION_CHAR) GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) + +static void GEMMTRSM_IMPL_NAME(PRECISION_CHAR) + ( + dim_t m, + dim_t n, + dim_t k, + const DATATYPE* restrict beta, + const DATATYPE* restrict a, inc_t rsa, inc_t csa, + const DATATYPE* restrict b, inc_t rsb, + DATATYPE* restrict c, inc_t rsc, + const DATATYPE* restrict a11, inc_t rsa11, inc_t csa11, + DATATYPE* restrict c11, inc_t rsc11, inc_t csc11 + ) +{ + // This function computes inv(a11) * (beta * c - a * b) + // and stores the result in c and c11. 
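Per vector element, the complex helper macros used in the intrinsics kernel below are assumed to expand to the usual real/imaginary update rules; a scalar sketch with illustrative names (not the macros' actual definitions from riscv_cmul_macros_intr.h):

    typedef struct { double real, imag; } cplx_t;   /* stand-in for dcomplex */

    /* out = s * x       (VCMUL_VF) */
    static inline cplx_t cmul_sf(double sr, double si, cplx_t x)
    { return (cplx_t){ sr * x.real - si * x.imag, sr * x.imag + si * x.real }; }

    /* acc += s * x      (VCMACC_VF), the gemm accumulation */
    static inline cplx_t cmacc_sf(cplx_t acc, double sr, double si, cplx_t x)
    { acc.real += sr * x.real - si * x.imag; acc.imag += sr * x.imag + si * x.real; return acc; }

    /* acc = s * x - acc (VCMSAC_VF), used to form beta*c - a*b */
    static inline cplx_t cmsac_sf(cplx_t acc, double sr, double si, cplx_t x)
    { return (cplx_t){ sr * x.real - si * x.imag - acc.real,
                       sr * x.imag + si * x.real - acc.imag }; }

    /* acc -= s * x      (VCNMSAC_VF), the elimination update in the trsm step */
    static inline cplx_t cnmsac_sf(cplx_t acc, double sr, double si, cplx_t x)
    { acc.real -= sr * x.real - si * x.imag; acc.imag -= sr * x.imag + si * x.real; return acc; }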
+ + RVV_TYPE_F(PREC, LMUL) ab0_r, ab1_r, ab2_r, ab3_r, ab4_r, ab5_r; + RVV_TYPE_F(PREC, LMUL) ab0_i, ab1_i, ab2_i, ab3_i, ab4_i, ab5_i; + // gemm step + if (m <= 0 || n <= 0 || k < 0) + return; + else if (k == 0) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + RVV_TYPE_F(PREC, LMUL) zero_splat = VFMV_V_F(PREC, LMUL)(0., n); + switch (m) { + case 6: + ab5_r = zero_splat; + ab5_i = zero_splat; + case 5: + ab4_r = zero_splat; + ab4_i = zero_splat; + case 4: + ab3_r = zero_splat; + ab3_i = zero_splat; + case 3: + ab2_r = zero_splat; + ab2_i = zero_splat; + case 2: + ab1_r = zero_splat; + ab1_i = zero_splat; + case 1: + ab0_r = zero_splat; + ab0_i = zero_splat; + } + } + else { + RVV_TYPE_FX(PREC, LMUL, 2) c0; + RVV_TYPE_F(PREC, LMUL) c0_r, c0_i; + + switch (m) { + case 6: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 5 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMUL_VF(PREC, LMUL, ab5_r, ab5_i, c0_r, c0_i, beta->real, beta->imag, n); + case 5: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 4 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMUL_VF(PREC, LMUL, ab4_r, ab4_i, c0_r, c0_i, beta->real, beta->imag, n); + case 4: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 3 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMUL_VF(PREC, LMUL, ab3_r, ab3_i, c0_r, c0_i, beta->real, beta->imag, n); + case 3: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 2 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMUL_VF(PREC, LMUL, ab2_r, ab2_i, c0_r, c0_i, beta->real, beta->imag, n); + case 2: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 1 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMUL_VF(PREC, LMUL, ab1_r, ab1_i, c0_r, c0_i, beta->real, beta->imag, n); + case 1: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 0 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMUL_VF(PREC, LMUL, ab0_r, ab0_i, c0_r, c0_i, beta->real, beta->imag, n); + } + } + } + else { + RVV_TYPE_FX(PREC, LMUL, 2) b0, b1; + RVV_TYPE_F(PREC, LMUL) b0_r, b1_r; + RVV_TYPE_F(PREC, LMUL) b0_i, b1_i; + + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += rsb; + if (k >= 2) { + b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0); + b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1); + b += rsb; + } + + switch (m) { + case 6: + VCMUL_VF(PREC, LMUL, ab5_r, ab5_i, b0_r, b0_i, a[5 * rsa].real, a[5 * rsa].imag, n); + case 5: + VCMUL_VF(PREC, LMUL, ab4_r, ab4_i, b0_r, b0_i, a[4 * rsa].real, a[4 * rsa].imag, n); + case 4: + VCMUL_VF(PREC, LMUL, ab3_r, ab3_i, b0_r, b0_i, a[3 * rsa].real, a[3 * rsa].imag, n); + case 3: + VCMUL_VF(PREC, LMUL, ab2_r, ab2_i, b0_r, b0_i, a[2 * rsa].real, a[2 * rsa].imag, n); + case 2: + VCMUL_VF(PREC, LMUL, ab1_r, ab1_i, b0_r, b0_i, a[1 * rsa].real, a[1 * rsa].imag, n); + case 1: + VCMUL_VF(PREC, LMUL, ab0_r, ab0_i, b0_r, b0_i, a[0 * rsa].real, a[0 * rsa].imag, n); + } + + a += csa; + k -= 1; + + if (k >= 2) { + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += rsb; + } + + while (k > 0) { + switch (m) { + case 6: + VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, a[5 * rsa].real, a[5 * rsa].imag, b1_r, b1_i, n); + case 5: + 
VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4 * rsa].real, a[4 * rsa].imag, b1_r, b1_i, n); + case 4: + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3 * rsa].real, a[3 * rsa].imag, b1_r, b1_i, n); + case 3: + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2 * rsa].real, a[2 * rsa].imag, b1_r, b1_i, n); + case 2: + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1 * rsa].real, a[1 * rsa].imag, b1_r, b1_i, n); + case 1: + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0 * rsa].real, a[0 * rsa].imag, b1_r, b1_i, n); + } + + a += csa; + k -= 1; + + if (k == 0) { break; } + + if (k >= 2) { + b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0); + b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1); + b += rsb; + } + + switch (m) { + case 6: + VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, a[5 * rsa].real, a[5 * rsa].imag, b0_r, b0_i, n); + case 5: + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4 * rsa].real, a[4 * rsa].imag, b0_r, b0_i, n); + case 4: + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3 * rsa].real, a[3 * rsa].imag, b0_r, b0_i, n); + case 3: + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2 * rsa].real, a[2 * rsa].imag, b0_r, b0_i, n); + case 2: + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1 * rsa].real, a[1 * rsa].imag, b0_r, b0_i, n); + case 1: + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0 * rsa].real, a[0 * rsa].imag, b0_r, b0_i, n); + } + + a += csa; + k -= 1; + + if (k == 0) { break; } + + if (k >= 2) { + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += rsb; + } + } + + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + switch (m) { + case 6: + ab5_r = VFNEG_VF(PREC, LMUL)(ab5_r, n); + ab5_i = VFNEG_VF(PREC, LMUL)(ab5_i, n); + case 5: + ab4_r = VFNEG_VF(PREC, LMUL)(ab4_r, n); + ab4_i = VFNEG_VF(PREC, LMUL)(ab4_i, n); + case 4: + ab3_r = VFNEG_VF(PREC, LMUL)(ab3_r, n); + ab3_i = VFNEG_VF(PREC, LMUL)(ab3_i, n); + case 3: + ab2_r = VFNEG_VF(PREC, LMUL)(ab2_r, n); + ab2_i = VFNEG_VF(PREC, LMUL)(ab2_i, n); + case 2: + ab1_r = VFNEG_VF(PREC, LMUL)(ab1_r, n); + ab1_i = VFNEG_VF(PREC, LMUL)(ab1_i, n); + case 1: + ab0_r = VFNEG_VF(PREC, LMUL)(ab0_r, n); + ab0_i = VFNEG_VF(PREC, LMUL)(ab0_i, n); + } + } + else { + RVV_TYPE_FX(PREC, LMUL, 2) c0; + RVV_TYPE_F(PREC, LMUL) c0_r, c0_i; + switch (m) { + case 6: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMSAC_VF(PREC, LMUL, ab5_r, ab5_i, beta->real, beta->imag, c0_r, c0_i, n); + case 5: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMSAC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, c0_r, c0_i, n); + case 4: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMSAC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n); + case 3: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMSAC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n); + case 2: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMSAC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n); + case 1: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = 
VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMSAC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n); + } + } + } + + // trsm step + RVV_TYPE_FX(PREC, LMUL, 2) temp = VUNDEFINED_FX(PREC, LMUL, 2)(); + RVV_TYPE_F(PREC, LMUL) temp_r, temp_i; + + VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab0_r, ab0_i, a11[0 * rsa11].real, a11[0 * rsa11].imag, n); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 0 * rsc), temp, n); + if (csc11 == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 0 * rsc11), temp, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 0 * rsc11), 2 * FLT_SIZE * csc11, temp, n); + if (m == 1) return; + switch (m) { + case 6: + VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n); + case 5: + VCNMSAC_VF(PREC, LMUL, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, temp_r, temp_i, n); + case 4: + VCNMSAC_VF(PREC, LMUL, ab3_r, ab3_i, a11[3 * rsa11].real, a11[3 * rsa11].imag, temp_r, temp_i, n); + case 3: + VCNMSAC_VF(PREC, LMUL, ab2_r, ab2_i, a11[2 * rsa11].real, a11[2 * rsa11].imag, temp_r, temp_i, n); + case 2: + VCNMSAC_VF(PREC, LMUL, ab1_r, ab1_i, a11[1 * rsa11].real, a11[1 * rsa11].imag, temp_r, temp_i, n); + } + a11 += csa11; + + VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab1_r, ab1_i, a11[1 * rsa11].real, a11[1 * rsa11].imag, n); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 1 * rsc), temp, n); + if (csc11 == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 1 * rsc11), temp, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 1 * rsc11), 2 * FLT_SIZE * csc11, temp, n); + if (m == 2) return; + switch (m) { + case 6: + VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n); + case 5: + VCNMSAC_VF(PREC, LMUL, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, temp_r, temp_i, n); + case 4: + VCNMSAC_VF(PREC, LMUL, ab3_r, ab3_i, a11[3 * rsa11].real, a11[3 * rsa11].imag, temp_r, temp_i, n); + case 3: + VCNMSAC_VF(PREC, LMUL, ab2_r, ab2_i, a11[2 * rsa11].real, a11[2 * rsa11].imag, temp_r, temp_i, n); + } + a11 += csa11; + + VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab2_r, ab2_i, a11[2 * rsa11].real, a11[2 * rsa11].imag, n); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 2 * rsc), temp, n); + if (csc11 == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 2 * rsc11), temp, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 2 * rsc11), 2 * FLT_SIZE * csc11, temp, n); + if (m == 3) return; + switch (m) { + case 6: + VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n); + case 5: + VCNMSAC_VF(PREC, LMUL, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, temp_r, temp_i, n); + case 4: + VCNMSAC_VF(PREC, LMUL, ab3_r, ab3_i, a11[3 * rsa11].real, a11[3 * rsa11].imag, temp_r, temp_i, n); + } + a11 += csa11; + + VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab3_r, ab3_i, a11[3 * rsa11].real, a11[3 * rsa11].imag, n); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 3 * rsc), temp, n); + if (csc11 == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 3 * rsc11), temp, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 3 * rsc11), 2 * FLT_SIZE * csc11, temp, n); + if (m == 
4) return; + switch (m) { + case 6: + VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n); + case 5: + VCNMSAC_VF(PREC, LMUL, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, temp_r, temp_i, n); + } + a11 += csa11; + + VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, n); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 4 * rsc), temp, n); + if (csc11 == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 4 * rsc11), temp, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 4 * rsc11), 2 * FLT_SIZE * csc11, temp, n); + if (m == 5) return; + VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n); + a11 += csa11; + + VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, n); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 5 * rsc), temp, n); + if (csc11 == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 5 * rsc11), temp, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 5 * rsc11), 2 * FLT_SIZE * csc11, temp, n); + return; +} + +GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void) +{ + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a10 = a10_; + const DATATYPE* restrict a11 = a11_; + const DATATYPE* restrict b01 = b01_; + DATATYPE* restrict b11 = b11_; + DATATYPE* restrict c11 = c11_; + + GEMMTRSM_IMPL_NAME(PRECISION_CHAR) + ( + m, n, k, + alpha, + a10, 1, PACKMR, + b01, PACKNR, + b11, PACKNR, + a11, 1, PACKMR, + c11, rsc, csc + ); + + return; +} + +GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void) +{ + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a12 = a12_; + const DATATYPE* restrict a11 = a11_; + const DATATYPE* restrict b21 = b21_; + DATATYPE* restrict b11 = b11_; + DATATYPE* restrict c11 = c11_; + + GEMMTRSM_IMPL_NAME(PRECISION_CHAR) + ( + m, n, k, + alpha, + a12 + (m - 1), -1, PACKMR, + b21, PACKNR, + b11 + (m - 1) * PACKNR, -PACKNR, + a11 + (m - 1) + (m - 1) * PACKMR, -1, -PACKMR, + c11 + (m - 1) * rsc, -rsc, csc + ); + + return; +} + +#undef GEMMTRSM_IMPL_NAME_ +#undef GEMMTRSM_IMPL_NAME + +#endif // GEMMTRSM diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c new file mode 100644 index 0000000000..7c3c3b8b7b --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c @@ -0,0 +1,364 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef GEMMTRSM + +#define GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) bli_##PRECISION_CHAR##gemmtrsm_sifive_x280_intr +#define GEMMTRSM_IMPL_NAME(PRECISION_CHAR) GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) + +static void GEMMTRSM_IMPL_NAME(PRECISION_CHAR) + ( + dim_t m, + dim_t n, + dim_t k, + const DATATYPE* restrict beta, + const DATATYPE* restrict a, inc_t rsa, inc_t csa, + const DATATYPE* restrict b, inc_t rsb, + DATATYPE* restrict c, inc_t rsc, + const DATATYPE* restrict a11, inc_t rsa11, inc_t csa11, + DATATYPE* restrict c11, inc_t rsc11, inc_t csc11 + ) +{ + // This function computes inv(a11) * (beta * c - a * b) + // and stores the result in c and c11. + + RVV_TYPE_F(PREC, LMUL) ab0, ab1, ab2, ab3, ab4, ab5, ab6; + // gemm step + if (m <= 0 || n <= 0 || k < 0) + return; + else if (k == 0) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + RVV_TYPE_F(PREC, LMUL) zero_splat = VFMV_V_F(PREC, LMUL)(0., n); + switch (m) { + case 7: + ab6 = zero_splat; + case 6: + ab5 = zero_splat; + case 5: + ab4 = zero_splat; + case 4: + ab3 = zero_splat; + case 3: + ab2 = zero_splat; + case 2: + ab1 = zero_splat; + case 1: + ab0 = zero_splat; + } + } + else { + RVV_TYPE_F(PREC, LMUL) c0; + switch (m) { + case 7: + c0 = VLE_V_F(PREC, LMUL)(c + 6 * rsc, n); + ab6 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + case 6: + c0 = VLE_V_F(PREC, LMUL)(c + 5 * rsc, n); + ab5 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + case 5: + c0 = VLE_V_F(PREC, LMUL)(c + 4 * rsc, n); + ab4 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + case 4: + c0 = VLE_V_F(PREC, LMUL)(c + 3 * rsc, n); + ab3 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + case 3: + c0 = VLE_V_F(PREC, LMUL)(c + 2 * rsc, n); + ab2 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + case 2: + c0 = VLE_V_F(PREC, LMUL)(c + 1 * rsc, n); + ab1 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + case 1: + c0 = VLE_V_F(PREC, LMUL)(c + 0 * rsc, n); + ab0 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + } + } + } + else { + bool first = true; + for (dim_t i = 0; i < k; ++i) { + RVV_TYPE_F(PREC, LMUL) b0 = VLE_V_F(PREC, LMUL)(b, n); + if (first) { + switch (m) { + case 7: + ab6 = VFMUL_VF(PREC, LMUL)(b0, a[6 * rsa], n); + case 6: + ab5 = VFMUL_VF(PREC, LMUL)(b0, a[5 * rsa], n); + case 5: + ab4 = VFMUL_VF(PREC, LMUL)(b0, a[4 * rsa], n); + case 4: + ab3 = VFMUL_VF(PREC, LMUL)(b0, a[3 * rsa], n); + case 3: + ab2 = VFMUL_VF(PREC, LMUL)(b0, a[2 * rsa], n); + case 2: + ab1 = VFMUL_VF(PREC, LMUL)(b0, a[1 * rsa], n); + case 1: + ab0 = VFMUL_VF(PREC, LMUL)(b0, a[0 * rsa], n); + } + first = false; + } + else { + switch (m) { + case 7: + ab6 = VFMACC_VF(PREC, LMUL)(ab6, a[6 * rsa], b0, n); + case 6: + ab5 = VFMACC_VF(PREC, LMUL)(ab5, a[5 * rsa], b0, n); + case 5: + ab4 = VFMACC_VF(PREC, LMUL)(ab4, a[4 * rsa], b0, n); + case 4: + ab3 = VFMACC_VF(PREC, LMUL)(ab3, 
a[3 * rsa], b0, n); + case 3: + ab2 = VFMACC_VF(PREC, LMUL)(ab2, a[2 * rsa], b0, n); + case 2: + ab1 = VFMACC_VF(PREC, LMUL)(ab1, a[1 * rsa], b0, n); + case 1: + ab0 = VFMACC_VF(PREC, LMUL)(ab0, a[0 * rsa], b0, n); + } + } + + a += csa; + b += rsb; + } + + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + switch (m) { + case 7: + ab6 = VFNEG_VF(PREC, LMUL)(ab6, n); + case 6: + ab5 = VFNEG_VF(PREC, LMUL)(ab5, n); + case 5: + ab4 = VFNEG_VF(PREC, LMUL)(ab4, n); + case 4: + ab3 = VFNEG_VF(PREC, LMUL)(ab3, n); + case 3: + ab2 = VFNEG_VF(PREC, LMUL)(ab2, n); + case 2: + ab1 = VFNEG_VF(PREC, LMUL)(ab1, n); + case 1: + ab0 = VFNEG_VF(PREC, LMUL)(ab0, n); + } + } + else { + RVV_TYPE_F(PREC, LMUL) c0; + switch (m) { + case 7: + c0 = VLE_V_F(PREC, LMUL)(c + 6 * rsc, n); + ab6 = VFMSAC_VF(PREC, LMUL)(ab6, *beta, c0, n); + case 6: + c0 = VLE_V_F(PREC, LMUL)(c + 5 * rsc, n); + ab5 = VFMSAC_VF(PREC, LMUL)(ab5, *beta, c0, n); + case 5: + c0 = VLE_V_F(PREC, LMUL)(c + 4 * rsc, n); + ab4 = VFMSAC_VF(PREC, LMUL)(ab4, *beta, c0, n); + case 4: + c0 = VLE_V_F(PREC, LMUL)(c + 3 * rsc, n); + ab3 = VFMSAC_VF(PREC, LMUL)(ab3, *beta, c0, n); + case 3: + c0 = VLE_V_F(PREC, LMUL)(c + 2 * rsc, n); + ab2 = VFMSAC_VF(PREC, LMUL)(ab2, *beta, c0, n); + case 2: + c0 = VLE_V_F(PREC, LMUL)(c + 1 * rsc, n); + ab1 = VFMSAC_VF(PREC, LMUL)(ab1, *beta, c0, n); + case 1: + c0 = VLE_V_F(PREC, LMUL)(c + 0 * rsc, n); + ab0 = VFMSAC_VF(PREC, LMUL)(ab0, *beta, c0, n); + } + } + } + + // trsm step + ab0 = VFMUL_VF(PREC, LMUL)(ab0, a11[0 * rsa11], n); + VSE_V_F(PREC, LMUL)(c + 0 * rsc, ab0, n); + if (csc11 == 1) + VSE_V_F(PREC, LMUL)(c11 + 0 * rsc11, ab0, n); + else + VSSE_V_F(PREC, LMUL)(c11 + 0 * rsc11, FLT_SIZE * csc11, ab0, n); + if (m == 1) return; + switch (m) { + case 7: + ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab0, n); + case 6: + ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab0, n); + case 5: + ab4 = VFNMSAC_VF(PREC, LMUL)(ab4, a11[4 * rsa11], ab0, n); + case 4: + ab3 = VFNMSAC_VF(PREC, LMUL)(ab3, a11[3 * rsa11], ab0, n); + case 3: + ab2 = VFNMSAC_VF(PREC, LMUL)(ab2, a11[2 * rsa11], ab0, n); + case 2: + ab1 = VFNMSAC_VF(PREC, LMUL)(ab1, a11[1 * rsa11], ab0, n); + } + a11 += csa11; + + ab1 = VFMUL_VF(PREC, LMUL)(ab1, a11[1 * rsa11], n); + VSE_V_F(PREC, LMUL)(c + 1 * rsc, ab1, n); + if (csc11 == 1) + VSE_V_F(PREC, LMUL)(c11 + 1 * rsc11, ab1, n); + else + VSSE_V_F(PREC, LMUL)(c11 + 1 * rsc11, FLT_SIZE * csc11, ab1, n); + if (m == 2) return; + switch (m) { + case 7: + ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab1, n); + case 6: + ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab1, n); + case 5: + ab4 = VFNMSAC_VF(PREC, LMUL)(ab4, a11[4 * rsa11], ab1, n); + case 4: + ab3 = VFNMSAC_VF(PREC, LMUL)(ab3, a11[3 * rsa11], ab1, n); + case 3: + ab2 = VFNMSAC_VF(PREC, LMUL)(ab2, a11[2 * rsa11], ab1, n); + } + a11 += csa11; + + ab2 = VFMUL_VF(PREC, LMUL)(ab2, a11[2 * rsa11], n); + VSE_V_F(PREC, LMUL)(c + 2 * rsc, ab2, n); + if (csc11 == 1) + VSE_V_F(PREC, LMUL)(c11 + 2 * rsc11, ab2, n); + else + VSSE_V_F(PREC, LMUL)(c11 + 2 * rsc11, FLT_SIZE * csc11, ab2, n); + if (m == 3) return; + switch (m) { + case 7: + ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab2, n); + case 6: + ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab2, n); + case 5: + ab4 = VFNMSAC_VF(PREC, LMUL)(ab4, a11[4 * rsa11], ab2, n); + case 4: + ab3 = VFNMSAC_VF(PREC, LMUL)(ab3, a11[3 * rsa11], ab2, n); + } + a11 += csa11; + + ab3 = VFMUL_VF(PREC, LMUL)(ab3, a11[3 * rsa11], n); + VSE_V_F(PREC, LMUL)(c + 3 * rsc, ab3, n); + if (csc11 == 1) + 
VSE_V_F(PREC, LMUL)(c11 + 3 * rsc11, ab3, n); + else + VSSE_V_F(PREC, LMUL)(c11 + 3 * rsc11, FLT_SIZE * csc11, ab3, n); + if (m == 4) return; + switch (m) { + case 7: + ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab3, n); + case 6: + ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab3, n); + case 5: + ab4 = VFNMSAC_VF(PREC, LMUL)(ab4, a11[4 * rsa11], ab3, n); + } + a11 += csa11; + + ab4 = VFMUL_VF(PREC, LMUL)(ab4, a11[4 * rsa11], n); + VSE_V_F(PREC, LMUL)(c + 4 * rsc, ab4, n); + if (csc11 == 1) + VSE_V_F(PREC, LMUL)(c11 + 4 * rsc11, ab4, n); + else + VSSE_V_F(PREC, LMUL)(c11 + 4 * rsc11, FLT_SIZE * csc11, ab4, n); + if (m == 5) return; + switch (m) { + case 7: + ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab4, n); + case 6: + ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab4, n); + } + a11 += csa11; + + ab5 = VFMUL_VF(PREC, LMUL)(ab5, a11[5 * rsa11], n); + VSE_V_F(PREC, LMUL)(c + 5 * rsc, ab5, n); + if (csc11 == 1) + VSE_V_F(PREC, LMUL)(c11 + 5 * rsc11, ab5, n); + else + VSSE_V_F(PREC, LMUL)(c11 + 5 * rsc11, FLT_SIZE * csc11, ab5, n); + if (m == 6) return; + ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab5, n); + a11 += csa11; + + ab6 = VFMUL_VF(PREC, LMUL)(ab6, a11[6 * rsa11], n); + VSE_V_F(PREC, LMUL)(c + 6 * rsc, ab6, n); + if (csc11 == 1) + VSE_V_F(PREC, LMUL)(c11 + 6 * rsc11, ab6, n); + else + VSSE_V_F(PREC, LMUL)(c11 + 6 * rsc11, FLT_SIZE * csc11, ab6, n); + return; +} + +GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void) +{ + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a10 = a10_; + const DATATYPE* restrict a11 = a11_; + const DATATYPE* restrict b01 = b01_; + DATATYPE* restrict b11 = b11_; + DATATYPE* restrict c11 = c11_; + + GEMMTRSM_IMPL_NAME(PRECISION_CHAR) + ( + m, n, k, + alpha, + a10, 1, PACKMR, + b01, PACKNR, + b11, PACKNR, + a11, 1, PACKMR, + c11, rsc, csc + ); + + return; +} + +GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void) +{ + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a12 = a12_; + const DATATYPE* restrict a11 = a11_; + const DATATYPE* restrict b21 = b21_; + DATATYPE* restrict b11 = b11_; + DATATYPE* restrict c11 = c11_; + + GEMMTRSM_IMPL_NAME(PRECISION_CHAR) + ( + m, n, k, + alpha, + a12 + (m - 1), -1, PACKMR, + b21, PACKNR, + b11 + (m - 1) * PACKNR, -PACKNR, + a11 + (m - 1) + (m - 1) * PACKMR, -1, -PACKMR, + c11 + (m - 1) * rsc, -rsc, csc + ); + + return; +} + +#undef GEMMTRSM_IMPL_NAME_ +#undef GEMMTRSM_IMPL_NAME + +#endif // GEMMTRSM diff --git a/kernels/sifive_x280/bli_kernels_sifive_x280.h b/kernels/sifive_x280/bli_kernels_sifive_x280.h index 9ffc2f7449..ff7b445c47 100644 --- a/kernels/sifive_x280/bli_kernels_sifive_x280.h +++ b/kernels/sifive_x280/bli_kernels_sifive_x280.h @@ -147,16 +147,16 @@ PACKM_KER_PROT(scomplex, cc, packm_sifive_x280_ref) PACKM_KER_PROT(dcomplex, zz, packm_sifive_x280_ref) // Level 3 -GEMM_UKR_PROT(float, s, gemm_sifive_x280_asm_7m4) -GEMM_UKR_PROT(double, d, gemm_sifive_x280_asm_7m4) -GEMM_UKR_PROT(scomplex, c, gemm_sifive_x280_asm_6m2) -GEMM_UKR_PROT(dcomplex, z, gemm_sifive_x280_asm_6m2) - -GEMMTRSM_UKR_PROT(float, s, gemmtrsm_l_sifive_x280_asm) -GEMMTRSM_UKR_PROT(double, d, gemmtrsm_l_sifive_x280_asm) -GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_l_sifive_x280_asm) -GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_l_sifive_x280_asm) -GEMMTRSM_UKR_PROT(float, s, gemmtrsm_u_sifive_x280_asm) -GEMMTRSM_UKR_PROT(double, d, gemmtrsm_u_sifive_x280_asm) -GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_u_sifive_x280_asm) -GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_u_sifive_x280_asm) +GEMM_UKR_PROT(float, s, 
gemm_sifive_x280_intr) +GEMM_UKR_PROT(double, d, gemm_sifive_x280_intr) +GEMM_UKR_PROT(scomplex, c, gemm_sifive_x280_intr) +GEMM_UKR_PROT(dcomplex, z, gemm_sifive_x280_intr) + +GEMMTRSM_UKR_PROT(float, s, gemmtrsm_l_sifive_x280_intr) +GEMMTRSM_UKR_PROT(double, d, gemmtrsm_l_sifive_x280_intr) +GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_l_sifive_x280_intr) +GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_l_sifive_x280_intr) +GEMMTRSM_UKR_PROT(float, s, gemmtrsm_u_sifive_x280_intr) +GEMMTRSM_UKR_PROT(double, d, gemmtrsm_u_sifive_x280_intr) +GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_u_sifive_x280_intr) +GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_u_sifive_x280_intr) diff --git a/kernels/sifive_x280/riscv_cmul_macros_intr.h b/kernels/sifive_x280/riscv_cmul_macros_intr.h index ea33fc5d10..70a0a16124 100644 --- a/kernels/sifive_x280/riscv_cmul_macros_intr.h +++ b/kernels/sifive_x280/riscv_cmul_macros_intr.h @@ -109,6 +109,24 @@ VD_I = VFNMSAC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL); \ } while(0) +// vd = vs2 * f[rs1] - vd +#define VCMSAC_VF(PREC, LMUL, VD_R, VD_I, RS1_R, RS1_I, VS2_R, VS2_I, VL) \ + do { \ + VD_R = VFMSAC_VF(PREC, LMUL)(VD_R, RS1_R, VS2_R, VL); \ + VD_I = VFMSAC_VF(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL); \ + VD_R = VFNMSAC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL); \ + } while(0) + +// vd -= vs2 * f[rs1] +#define VCNMSAC_VF(PREC, LMUL, VD_R, VD_I, RS1_R, RS1_I, VS2_R, VS2_I, VL) \ + do { \ + VD_R = VFNMSAC_VF(PREC, LMUL)(VD_R, RS1_R, VS2_R, VL); \ + VD_I = VFNMSAC_VF(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL); \ + VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFNMSAC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL); \ + } while(0) + // vd += vs2 * vs1 #define VCMACC_VV_TU(PREC, LMUL, VD_R, VD_I, VS1_R, VS1_I, VS2_R, VS2_I, VL) \ do { \ diff --git a/kernels/sifive_x280/riscv_overloaded_intrinsics.h b/kernels/sifive_x280/riscv_overloaded_intrinsics.h index acf3cb5b47..44f70f2727 100644 --- a/kernels/sifive_x280/riscv_overloaded_intrinsics.h +++ b/kernels/sifive_x280/riscv_overloaded_intrinsics.h @@ -194,6 +194,8 @@ #define VSET_V_F(PRECISION, LMUL, NFIELDS) VSET_V_F_(PRECISION, LMUL, NFIELDS) #define VGET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vget_v_f##PRECISION##LMUL##x##NFIELDS##_f##PRECISION##LMUL #define VGET_V_F(PRECISION, LMUL, NFIELDS) VGET_V_F_(PRECISION, LMUL, NFIELDS) +#define VCREATE_V_FX_(PRECISION, LMUL, NFIELDS) __riscv_vcreate_v_f##PRECISION##LMUL##x##NFIELDS +#define VCREATE_V_FX(PRECISION, LMUL, NFIELDS) VCREATE_V_FX_(PRECISION, LMUL, NFIELDS) // Non-vector functions #define CURRY_1ARG(arg1, ...) 
(arg1), __VA_ARGS__)) From 4a76a4c8f447d549736ea870b497edfec9da3f8c Mon Sep 17 00:00:00 2001 From: Michael Yeh <111819036+myeh01@users.noreply.github.com> Date: Wed, 20 Nov 2024 19:12:16 -0800 Subject: [PATCH 7/8] Add the sifive_rvv configuration --- config/sifive_rvv/bli_cntx_init_sifive_rvv.c | 222 +++++++++++++++++ config/sifive_rvv/bli_family_sifive_rvv.h | 34 +++ .../sifive_rvv/bli_kernel_defs_sifive_rvv.h | 55 +++++ config/sifive_rvv/make_defs.mk | 80 ++++++ .../sifive_x280/bli_cntx_init_sifive_x280.c | 230 +++++++++--------- config/sifive_x280/make_defs.mk | 4 +- config_registry | 3 +- frame/base/bli_arch.c | 4 + frame/include/bli_arch_config.h | 6 + frame/include/bli_gentconf_macro_defs.h | 6 + frame/include/bli_type_defs.h | 1 + .../bli_addv_sifive_rvv_intr.c} | 12 +- .../bli_addv_sifive_rvv_intr_complex.c} | 2 +- .../bli_addv_sifive_rvv_intr_real.c} | 2 +- .../bli_amaxv_sifive_rvv_intr.c} | 28 +-- .../bli_amaxv_sifive_rvv_intr_complex.c} | 0 .../bli_amaxv_sifive_rvv_intr_real.c} | 0 .../bli_axpbyv_sifive_rvv_intr.c} | 18 +- .../bli_axpbyv_sifive_rvv_intr_complex.c} | 2 +- .../bli_axpbyv_sifive_rvv_intr_real.c} | 2 +- .../bli_axpyv_sifive_rvv_intr.c} | 12 +- .../bli_axpyv_sifive_rvv_intr_complex.c} | 2 +- .../bli_axpyv_sifive_rvv_intr_real.c} | 2 +- .../bli_copyv_sifive_rvv_intr.c} | 10 +- .../bli_copyv_sifive_rvv_intr_complex.c} | 0 .../bli_copyv_sifive_rvv_intr_real.c} | 0 .../bli_dotv_sifive_rvv_intr.c} | 12 +- .../bli_dotv_sifive_rvv_intr_complex.c} | 2 +- .../bli_dotv_sifive_rvv_intr_real.c} | 2 +- .../bli_dotxv_sifive_rvv_intr.c} | 12 +- .../bli_dotxv_sifive_rvv_intr_complex.c} | 2 +- .../bli_dotxv_sifive_rvv_intr_real.c} | 2 +- .../bli_invertv_sifive_rvv_intr.c} | 10 +- .../bli_invertv_sifive_rvv_intr_complex.c} | 0 .../bli_invertv_sifive_rvv_intr_real.c} | 0 .../bli_invscalv_sifive_rvv_intr.c} | 10 +- .../bli_invscalv_sifive_rvv_intr_complex.c} | 0 .../bli_invscalv_sifive_rvv_intr_real.c} | 0 .../bli_scal2v_sifive_rvv_intr.c} | 16 +- .../bli_scal2v_sifive_rvv_intr_complex.c} | 2 +- .../bli_scal2v_sifive_rvv_intr_real.c} | 2 +- .../bli_scalv_sifive_rvv_intr.c} | 14 +- .../bli_scalv_sifive_rvv_intr_complex.c} | 2 +- .../bli_scalv_sifive_rvv_intr_real.c} | 2 +- .../bli_setv_sifive_rvv_intr.c} | 10 +- .../bli_setv_sifive_rvv_intr_complex.c} | 0 .../bli_setv_sifive_rvv_intr_real.c} | 0 .../bli_subv_sifive_rvv_intr.c} | 12 +- .../bli_subv_sifive_rvv_intr_complex.c} | 2 +- .../bli_subv_sifive_rvv_intr_real.c} | 2 +- .../bli_swapv_sifive_rvv_intr.c} | 10 +- .../bli_swapv_sifive_rvv_intr_complex.c} | 0 .../bli_swapv_sifive_rvv_intr_real.c} | 0 .../bli_xpbyv_sifive_rvv_intr.c} | 14 +- .../bli_xpbyv_sifive_rvv_intr_complex.c} | 2 +- .../bli_xpbyv_sifive_rvv_intr_real.c} | 2 +- .../bli_axpy2v_sifive_rvv_intr.c} | 12 +- .../bli_axpy2v_sifive_rvv_intr_complex.c} | 2 +- .../bli_axpy2v_sifive_rvv_intr_real.c} | 2 +- .../bli_axpyf_sifive_rvv_intr.c} | 10 +- .../bli_axpyf_sifive_rvv_intr_complex.c} | 0 .../bli_axpyf_sifive_rvv_intr_real.c} | 0 .../bli_dotaxpyv_sifive_rvv_intr.c} | 12 +- .../bli_dotaxpyv_sifive_rvv_intr_complex.c} | 2 +- .../bli_dotaxpyv_sifive_rvv_intr_real.c} | 2 +- .../bli_dotxaxpyf_sifive_rvv_intr.c} | 14 +- .../bli_dotxaxpyf_sifive_rvv_intr_complex.c} | 134 +++++----- .../bli_dotxaxpyf_sifive_rvv_intr_real.c} | 86 +++---- .../bli_dotxf_sifive_rvv_intr.c} | 14 +- .../bli_dotxf_sifive_rvv_intr_complex.c} | 126 +++++----- .../bli_dotxf_sifive_rvv_intr_real.c} | 110 ++++----- .../bli_packm_sifive_rvv_intr.c} | 25 +- .../bli_packm_sifive_rvv_intr_complex.c} | 19 +- 
.../bli_packm_sifive_rvv_intr_real.c} | 22 +- .../bli_gemm_sifive_rvv_intr.c} | 18 +- .../bli_gemm_sifive_rvv_intr_complex.c} | 0 .../bli_gemm_sifive_rvv_intr_real.c} | 0 .../bli_gemmtrsm_sifive_rvv_intr.c} | 22 +- .../bli_gemmtrsm_sifive_rvv_intr_complex.c} | 2 +- .../bli_gemmtrsm_sifive_rvv_intr_real.c} | 2 +- kernels/sifive_rvv/bli_kernels_sifive_rvv.h | 162 ++++++++++++ .../riscv_cmul_macros_intr.h | 0 .../riscv_overloaded_intrinsics.h | 2 +- kernels/sifive_x280/bli_kernels_sifive_x280.h | 162 ------------ kernels/sifive_x280/riscv_cmul_macros_asm.h | 137 ----------- 85 files changed, 1129 insertions(+), 851 deletions(-) create mode 100644 config/sifive_rvv/bli_cntx_init_sifive_rvv.c create mode 100644 config/sifive_rvv/bli_family_sifive_rvv.h create mode 100644 config/sifive_rvv/bli_kernel_defs_sifive_rvv.h create mode 100644 config/sifive_rvv/make_defs.mk rename kernels/{sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c => sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr.c} (92%) rename kernels/{sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_complex.c} (98%) rename kernels/{sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c => sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_real.c} (98%) rename kernels/{sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c => sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr.c} (86%) rename kernels/{sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr_complex.c} (100%) rename kernels/{sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c => sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr_real.c} (100%) rename kernels/{sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c => sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr.c} (92%) rename kernels/{sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_complex.c} (99%) rename kernels/{sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c => sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_real.c} (98%) rename kernels/{sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c => sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr.c} (92%) rename kernels/{sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_complex.c} (99%) rename kernels/{sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c => sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_real.c} (98%) rename kernels/{sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c => sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr.c} (93%) rename kernels/{sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr_complex.c} (100%) rename kernels/{sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c => sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr_real.c} (100%) rename kernels/{sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c => 
sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr.c} (92%) rename kernels/{sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_complex.c} (99%) rename kernels/{sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c => sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_real.c} (98%) rename kernels/{sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c => sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr.c} (93%) rename kernels/{sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_complex.c} (99%) rename kernels/{sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c => sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_real.c} (98%) rename kernels/{sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c => sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr.c} (93%) rename kernels/{sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr_complex.c} (100%) rename kernels/{sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c => sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr_real.c} (100%) rename kernels/{sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c => sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr.c} (93%) rename kernels/{sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr_complex.c} (100%) rename kernels/{sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c => sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr_real.c} (100%) rename kernels/{sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c => sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr.c} (92%) rename kernels/{sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c => sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_complex.c} (99%) rename kernels/{sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c => sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_real.c} (98%) rename kernels/{sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c => sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr.c} (92%) rename kernels/{sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_complex.c} (98%) rename kernels/{sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c => sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_real.c} (98%) rename kernels/{sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c => sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr.c} (93%) rename kernels/{sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr_complex.c} (100%) rename kernels/{sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c => 
sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr_real.c} (100%) rename kernels/{sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c => sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr.c} (92%) rename kernels/{sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_complex.c} (98%) rename kernels/{sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c => sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_real.c} (98%) rename kernels/{sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c => sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr.c} (93%) rename kernels/{sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr_complex.c} (100%) rename kernels/{sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c => sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr_real.c} (100%) rename kernels/{sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c => sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr.c} (92%) rename kernels/{sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_complex.c} (99%) rename kernels/{sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c => sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_real.c} (98%) rename kernels/{sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c => sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr.c} (92%) rename kernels/{sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c => sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_complex.c} (99%) rename kernels/{sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c => sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_real.c} (98%) rename kernels/{sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c => sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr.c} (94%) rename kernels/{sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c => sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr_complex.c} (100%) rename kernels/{sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c => sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr_real.c} (100%) rename kernels/{sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c => sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr.c} (92%) rename kernels/{sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c => sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_complex.c} (99%) rename kernels/{sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c => sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_real.c} (99%) rename kernels/{sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c => sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr.c} (93%) rename kernels/{sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c => 
sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_complex.c} (76%) rename kernels/{sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c => sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_real.c} (79%) rename kernels/{sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c => sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr.c} (94%) rename kernels/{sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c => sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_complex.c} (74%) rename kernels/{sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c => sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_real.c} (72%) rename kernels/{sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c => sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr.c} (86%) rename kernels/{sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c => sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_complex.c} (99%) rename kernels/{sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c => sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_real.c} (98%) rename kernels/{sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c => sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr.c} (90%) rename kernels/{sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c => sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr_complex.c} (100%) rename kernels/{sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c => sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr_real.c} (100%) rename kernels/{sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c => sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr.c} (89%) rename kernels/{sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c => sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_complex.c} (99%) rename kernels/{sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c => sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_real.c} (99%) create mode 100644 kernels/sifive_rvv/bli_kernels_sifive_rvv.h rename kernels/{sifive_x280 => sifive_rvv}/riscv_cmul_macros_intr.h (100%) rename kernels/{sifive_x280 => sifive_rvv}/riscv_overloaded_intrinsics.h (99%) delete mode 100644 kernels/sifive_x280/bli_kernels_sifive_x280.h delete mode 100644 kernels/sifive_x280/riscv_cmul_macros_asm.h diff --git a/config/sifive_rvv/bli_cntx_init_sifive_rvv.c b/config/sifive_rvv/bli_cntx_init_sifive_rvv.c new file mode 100644 index 0000000000..222a837434 --- /dev/null +++ b/config/sifive_rvv/bli_cntx_init_sifive_rvv.c @@ -0,0 +1,222 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_cntx_init_sifive_rvv( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_sifive_rvv_ref( cntx ); + + // ------------------------------------------------------------------------- + + // Update the context with optimized native kernels. + bli_cntx_set_ukrs + ( + cntx, + + // Level 1 + BLIS_ADDV_KER, BLIS_FLOAT, bli_saddv_sifive_rvv_intr, + BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_sifive_rvv_intr, + BLIS_ADDV_KER, BLIS_SCOMPLEX, bli_caddv_sifive_rvv_intr, + BLIS_ADDV_KER, BLIS_DCOMPLEX, bli_zaddv_sifive_rvv_intr, + + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_sifive_rvv_intr, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_sifive_rvv_intr, + BLIS_AMAXV_KER, BLIS_SCOMPLEX, bli_camaxv_sifive_rvv_intr, + BLIS_AMAXV_KER, BLIS_DCOMPLEX, bli_zamaxv_sifive_rvv_intr, + + BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_sifive_rvv_intr, + BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_sifive_rvv_intr, + BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_sifive_rvv_intr, + BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_sifive_rvv_intr, + + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_sifive_rvv_intr, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_sifive_rvv_intr, + BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_sifive_rvv_intr, + BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_sifive_rvv_intr, + + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_sifive_rvv_intr, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_sifive_rvv_intr, + BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_sifive_rvv_intr, + BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_sifive_rvv_intr, + + BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_sifive_rvv_intr, + BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_sifive_rvv_intr, + BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_sifive_rvv_intr, + BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_sifive_rvv_intr, + + BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_sifive_rvv_intr, + BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_sifive_rvv_intr, + BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_sifive_rvv_intr, + BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_sifive_rvv_intr, + + BLIS_INVERTV_KER, BLIS_FLOAT, bli_sinvertv_sifive_rvv_intr, + BLIS_INVERTV_KER, BLIS_DOUBLE, bli_dinvertv_sifive_rvv_intr, + BLIS_INVERTV_KER, BLIS_SCOMPLEX, bli_cinvertv_sifive_rvv_intr, + BLIS_INVERTV_KER, BLIS_DCOMPLEX, bli_zinvertv_sifive_rvv_intr, + + BLIS_INVSCALV_KER, BLIS_FLOAT, 
bli_sinvscalv_sifive_rvv_intr, + BLIS_INVSCALV_KER, BLIS_DOUBLE, bli_dinvscalv_sifive_rvv_intr, + BLIS_INVSCALV_KER, BLIS_SCOMPLEX, bli_cinvscalv_sifive_rvv_intr, + BLIS_INVSCALV_KER, BLIS_DCOMPLEX, bli_zinvscalv_sifive_rvv_intr, + + BLIS_SCAL2V_KER, BLIS_FLOAT, bli_sscal2v_sifive_rvv_intr, + BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_sifive_rvv_intr, + BLIS_SCAL2V_KER, BLIS_SCOMPLEX, bli_cscal2v_sifive_rvv_intr, + BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_sifive_rvv_intr, + + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_sifive_rvv_intr, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_sifive_rvv_intr, + BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_sifive_rvv_intr, + BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_sifive_rvv_intr, + + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_sifive_rvv_intr, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_sifive_rvv_intr, + BLIS_SETV_KER, BLIS_SCOMPLEX, bli_csetv_sifive_rvv_intr, + BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_sifive_rvv_intr, + + BLIS_SUBV_KER, BLIS_FLOAT, bli_ssubv_sifive_rvv_intr, + BLIS_SUBV_KER, BLIS_DOUBLE, bli_dsubv_sifive_rvv_intr, + BLIS_SUBV_KER, BLIS_SCOMPLEX, bli_csubv_sifive_rvv_intr, + BLIS_SUBV_KER, BLIS_DCOMPLEX, bli_zsubv_sifive_rvv_intr, + + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_sifive_rvv_intr, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_sifive_rvv_intr, + BLIS_SWAPV_KER, BLIS_SCOMPLEX, bli_cswapv_sifive_rvv_intr, + BLIS_SWAPV_KER, BLIS_DCOMPLEX, bli_zswapv_sifive_rvv_intr, + + BLIS_XPBYV_KER, BLIS_FLOAT, bli_sxpbyv_sifive_rvv_intr, + BLIS_XPBYV_KER, BLIS_DOUBLE, bli_dxpbyv_sifive_rvv_intr, + BLIS_XPBYV_KER, BLIS_SCOMPLEX, bli_cxpbyv_sifive_rvv_intr, + BLIS_XPBYV_KER, BLIS_DCOMPLEX, bli_zxpbyv_sifive_rvv_intr, + + // Level 1f + BLIS_AXPY2V_KER, BLIS_FLOAT, bli_saxpy2v_sifive_rvv_intr, + BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_sifive_rvv_intr, + BLIS_AXPY2V_KER, BLIS_SCOMPLEX, bli_caxpy2v_sifive_rvv_intr, + BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_sifive_rvv_intr, + + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_sifive_rvv_intr, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_sifive_rvv_intr, + BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_sifive_rvv_intr, + BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_sifive_rvv_intr, + + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_sifive_rvv_intr, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_sifive_rvv_intr, + BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_sifive_rvv_intr, + BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_sifive_rvv_intr, + + BLIS_DOTAXPYV_KER, BLIS_FLOAT, bli_sdotaxpyv_sifive_rvv_intr, + BLIS_DOTAXPYV_KER, BLIS_DOUBLE, bli_ddotaxpyv_sifive_rvv_intr, + BLIS_DOTAXPYV_KER, BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_rvv_intr, + BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_rvv_intr, + + BLIS_DOTXAXPYF_KER, BLIS_FLOAT, bli_sdotxaxpyf_sifive_rvv_intr, + BLIS_DOTXAXPYF_KER, BLIS_DOUBLE, bli_ddotxaxpyf_sifive_rvv_intr, + BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_rvv_intr, + BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_rvv_intr, + + // Level 1m + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_sifive_rvv_intr, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_sifive_rvv_intr, + BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_rvv_intr, + BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_rvv_intr, + + // Level 3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sifive_rvv_intr, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sifive_rvv_intr, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sifive_rvv_intr, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sifive_rvv_intr, + + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_L_UKR, 
BLIS_DOUBLE, bli_dgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_rvv_intr, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR ], 7, 7, 6, 6, + 8, 8, 8, 8 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4 * __riscv_v_min_vlen / 32, 4 * __riscv_v_min_vlen / 64, 2 * __riscv_v_min_vlen / 32, 2 * __riscv_v_min_vlen / 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 7, 7, 6, 6 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4 * __riscv_v_min_vlen / 32, 4 * __riscv_v_min_vlen / 64, 2 * __riscv_v_min_vlen / 32, 2 * __riscv_v_min_vlen / 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 64, 64, 64, 64 ); + // Default BLIS_BBM_s = 1, but set here to ensure it's correct + bli_blksz_init_easy( &blkszs[ BLIS_BBM ], 1, 1, 1, 1 ); + bli_blksz_init_easy( &blkszs[ BLIS_BBN ], 1, 1, 1, 1 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + cntx, + + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + + // level-1m + BLIS_BBM, &blkszs[ BLIS_BBM ], BLIS_BBM, + BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_BBN, + + BLIS_VA_END + ); +} + diff --git a/config/sifive_rvv/bli_family_sifive_rvv.h b/config/sifive_rvv/bli_family_sifive_rvv.h new file mode 100644 index 0000000000..708c1960fd --- /dev/null +++ b/config/sifive_rvv/bli_family_sifive_rvv.h @@ -0,0 +1,34 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + diff --git a/config/sifive_rvv/bli_kernel_defs_sifive_rvv.h b/config/sifive_rvv/bli_kernel_defs_sifive_rvv.h new file mode 100644 index 0000000000..c6db9aceb7 --- /dev/null +++ b/config/sifive_rvv/bli_kernel_defs_sifive_rvv.h @@ -0,0 +1,55 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- +#define BLIS_MR_s 7 +#define BLIS_MR_d 7 +#define BLIS_MR_c 6 +#define BLIS_MR_z 6 + +#define BLIS_PACKMR_s 8 +#define BLIS_PACKMR_d 8 +#define BLIS_PACKMR_c 8 +#define BLIS_PACKMR_z 8 + +#define BLIS_NR_s ( 4 * __riscv_v_min_vlen / 32 ) +#define BLIS_NR_d ( 4 * __riscv_v_min_vlen / 64 ) +#define BLIS_NR_c ( 2 * __riscv_v_min_vlen / 32 ) +#define BLIS_NR_z ( 2 * __riscv_v_min_vlen / 64 ) +//#endif + diff --git a/config/sifive_rvv/make_defs.mk b/config/sifive_rvv/make_defs.mk new file mode 100644 index 0000000000..63c2d447fe --- /dev/null +++ b/config/sifive_rvv/make_defs.mk @@ -0,0 +1,80 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2024, SiFive, Inc. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := sifive_rvv +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. +CMISCFLAGS_SIFIVE := -mcmodel=medany -march=rv64gcv_zba_zbb_zvl128b -mabi=lp64d +CMISCFLAGS_SIFIVE_OTHER := +CPPROCFLAGS := +CMISCFLAGS := $(CMISCFLAGS_SIFIVE) $(CMISCFLAGS_SIFIVE_OTHER) \ + -fdata-sections -ffunction-sections \ + -fdiagnostics-color=always -fno-rtti -fno-exceptions +CPICFLAGS := -fPIC +CWARNFLAGS := -Wall -Wextra -Wno-unused-function -Wno-unused-parameter \ + -Wno-sign-compare -Wno-unused-variable + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O3 +endif + +# Flags specific to optimized kernels. +CKOPTFLAGS := $(COPTFLAGS) +CKVECFLAGS := + +# Flags specific to reference kernels. +CROPTFLAGS := $(CKOPTFLAGS) +CRVECFLAGS := $(CKVECFLAGS) + +# Store all of the variables here to new variables containing the +# configuration name. 
+$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config/sifive_x280/bli_cntx_init_sifive_x280.c b/config/sifive_x280/bli_cntx_init_sifive_x280.c index 668891cf3f..142ca19278 100644 --- a/config/sifive_x280/bli_cntx_init_sifive_x280.c +++ b/config/sifive_x280/bli_cntx_init_sifive_x280.c @@ -49,127 +49,127 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx ) cntx, // Level 1 - BLIS_ADDV_KER, BLIS_FLOAT, bli_saddv_sifive_x280_intr, - BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_sifive_x280_intr, - BLIS_ADDV_KER, BLIS_SCOMPLEX, bli_caddv_sifive_x280_intr, - BLIS_ADDV_KER, BLIS_DCOMPLEX, bli_zaddv_sifive_x280_intr, - - BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_sifive_x280_intr, - BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_sifive_x280_intr, - BLIS_AMAXV_KER, BLIS_SCOMPLEX, bli_camaxv_sifive_x280_intr, - BLIS_AMAXV_KER, BLIS_DCOMPLEX, bli_zamaxv_sifive_x280_intr, - - BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_sifive_x280_intr, - BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_sifive_x280_intr, - BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_sifive_x280_intr, - BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_sifive_x280_intr, - - BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_sifive_x280_intr, - BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_sifive_x280_intr, - BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_sifive_x280_intr, - BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_sifive_x280_intr, - - BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_sifive_x280_intr, - BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_sifive_x280_intr, - BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_sifive_x280_intr, - BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_sifive_x280_intr, - - BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_sifive_x280_intr, - BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_sifive_x280_intr, - BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_sifive_x280_intr, - BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_sifive_x280_intr, - - BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_sifive_x280_intr, - BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_sifive_x280_intr, - BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_sifive_x280_intr, - BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_sifive_x280_intr, - - BLIS_INVERTV_KER, BLIS_FLOAT, bli_sinvertv_sifive_x280_intr, - BLIS_INVERTV_KER, BLIS_DOUBLE, bli_dinvertv_sifive_x280_intr, - BLIS_INVERTV_KER, BLIS_SCOMPLEX, bli_cinvertv_sifive_x280_intr, - BLIS_INVERTV_KER, BLIS_DCOMPLEX, bli_zinvertv_sifive_x280_intr, - - BLIS_INVSCALV_KER, BLIS_FLOAT, bli_sinvscalv_sifive_x280_intr, - BLIS_INVSCALV_KER, BLIS_DOUBLE, bli_dinvscalv_sifive_x280_intr, - BLIS_INVSCALV_KER, BLIS_SCOMPLEX, bli_cinvscalv_sifive_x280_intr, - BLIS_INVSCALV_KER, BLIS_DCOMPLEX, bli_zinvscalv_sifive_x280_intr, - - BLIS_SCAL2V_KER, BLIS_FLOAT, bli_sscal2v_sifive_x280_intr, - BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_sifive_x280_intr, - BLIS_SCAL2V_KER, BLIS_SCOMPLEX, bli_cscal2v_sifive_x280_intr, - BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_sifive_x280_intr, - - BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_sifive_x280_intr, - BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_sifive_x280_intr, - BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_sifive_x280_intr, - BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_sifive_x280_intr, - - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_sifive_x280_intr, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_sifive_x280_intr, - BLIS_SETV_KER, BLIS_SCOMPLEX, bli_csetv_sifive_x280_intr, - BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_sifive_x280_intr, - - BLIS_SUBV_KER, BLIS_FLOAT, bli_ssubv_sifive_x280_intr, - BLIS_SUBV_KER, BLIS_DOUBLE, bli_dsubv_sifive_x280_intr, - BLIS_SUBV_KER, BLIS_SCOMPLEX, bli_csubv_sifive_x280_intr, - BLIS_SUBV_KER, 
BLIS_DCOMPLEX, bli_zsubv_sifive_x280_intr, - - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_sifive_x280_intr, - BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_sifive_x280_intr, - BLIS_SWAPV_KER, BLIS_SCOMPLEX, bli_cswapv_sifive_x280_intr, - BLIS_SWAPV_KER, BLIS_DCOMPLEX, bli_zswapv_sifive_x280_intr, - - BLIS_XPBYV_KER, BLIS_FLOAT, bli_sxpbyv_sifive_x280_intr, - BLIS_XPBYV_KER, BLIS_DOUBLE, bli_dxpbyv_sifive_x280_intr, - BLIS_XPBYV_KER, BLIS_SCOMPLEX, bli_cxpbyv_sifive_x280_intr, - BLIS_XPBYV_KER, BLIS_DCOMPLEX, bli_zxpbyv_sifive_x280_intr, + BLIS_ADDV_KER, BLIS_FLOAT, bli_saddv_sifive_rvv_intr, + BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_sifive_rvv_intr, + BLIS_ADDV_KER, BLIS_SCOMPLEX, bli_caddv_sifive_rvv_intr, + BLIS_ADDV_KER, BLIS_DCOMPLEX, bli_zaddv_sifive_rvv_intr, + + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_sifive_rvv_intr, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_sifive_rvv_intr, + BLIS_AMAXV_KER, BLIS_SCOMPLEX, bli_camaxv_sifive_rvv_intr, + BLIS_AMAXV_KER, BLIS_DCOMPLEX, bli_zamaxv_sifive_rvv_intr, + + BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_sifive_rvv_intr, + BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_sifive_rvv_intr, + BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_sifive_rvv_intr, + BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_sifive_rvv_intr, + + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_sifive_rvv_intr, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_sifive_rvv_intr, + BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_sifive_rvv_intr, + BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_sifive_rvv_intr, + + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_sifive_rvv_intr, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_sifive_rvv_intr, + BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_sifive_rvv_intr, + BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_sifive_rvv_intr, + + BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_sifive_rvv_intr, + BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_sifive_rvv_intr, + BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_sifive_rvv_intr, + BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_sifive_rvv_intr, + + BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_sifive_rvv_intr, + BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_sifive_rvv_intr, + BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_sifive_rvv_intr, + BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_sifive_rvv_intr, + + BLIS_INVERTV_KER, BLIS_FLOAT, bli_sinvertv_sifive_rvv_intr, + BLIS_INVERTV_KER, BLIS_DOUBLE, bli_dinvertv_sifive_rvv_intr, + BLIS_INVERTV_KER, BLIS_SCOMPLEX, bli_cinvertv_sifive_rvv_intr, + BLIS_INVERTV_KER, BLIS_DCOMPLEX, bli_zinvertv_sifive_rvv_intr, + + BLIS_INVSCALV_KER, BLIS_FLOAT, bli_sinvscalv_sifive_rvv_intr, + BLIS_INVSCALV_KER, BLIS_DOUBLE, bli_dinvscalv_sifive_rvv_intr, + BLIS_INVSCALV_KER, BLIS_SCOMPLEX, bli_cinvscalv_sifive_rvv_intr, + BLIS_INVSCALV_KER, BLIS_DCOMPLEX, bli_zinvscalv_sifive_rvv_intr, + + BLIS_SCAL2V_KER, BLIS_FLOAT, bli_sscal2v_sifive_rvv_intr, + BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_sifive_rvv_intr, + BLIS_SCAL2V_KER, BLIS_SCOMPLEX, bli_cscal2v_sifive_rvv_intr, + BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_sifive_rvv_intr, + + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_sifive_rvv_intr, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_sifive_rvv_intr, + BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_sifive_rvv_intr, + BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_sifive_rvv_intr, + + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_sifive_rvv_intr, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_sifive_rvv_intr, + BLIS_SETV_KER, BLIS_SCOMPLEX, bli_csetv_sifive_rvv_intr, + BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_sifive_rvv_intr, + + BLIS_SUBV_KER, BLIS_FLOAT, bli_ssubv_sifive_rvv_intr, + BLIS_SUBV_KER, BLIS_DOUBLE, 
bli_dsubv_sifive_rvv_intr, + BLIS_SUBV_KER, BLIS_SCOMPLEX, bli_csubv_sifive_rvv_intr, + BLIS_SUBV_KER, BLIS_DCOMPLEX, bli_zsubv_sifive_rvv_intr, + + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_sifive_rvv_intr, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_sifive_rvv_intr, + BLIS_SWAPV_KER, BLIS_SCOMPLEX, bli_cswapv_sifive_rvv_intr, + BLIS_SWAPV_KER, BLIS_DCOMPLEX, bli_zswapv_sifive_rvv_intr, + + BLIS_XPBYV_KER, BLIS_FLOAT, bli_sxpbyv_sifive_rvv_intr, + BLIS_XPBYV_KER, BLIS_DOUBLE, bli_dxpbyv_sifive_rvv_intr, + BLIS_XPBYV_KER, BLIS_SCOMPLEX, bli_cxpbyv_sifive_rvv_intr, + BLIS_XPBYV_KER, BLIS_DCOMPLEX, bli_zxpbyv_sifive_rvv_intr, // Level 1f - BLIS_AXPY2V_KER, BLIS_FLOAT, bli_saxpy2v_sifive_x280_intr, - BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_sifive_x280_intr, - BLIS_AXPY2V_KER, BLIS_SCOMPLEX, bli_caxpy2v_sifive_x280_intr, - BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_sifive_x280_intr, - - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_sifive_x280_intr, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_sifive_x280_intr, - BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_sifive_x280_intr, - BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_sifive_x280_intr, - - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_sifive_x280_intr, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_sifive_x280_intr, - BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_sifive_x280_intr, - BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_sifive_x280_intr, - - BLIS_DOTAXPYV_KER, BLIS_FLOAT, bli_sdotaxpyv_sifive_x280_intr, - BLIS_DOTAXPYV_KER, BLIS_DOUBLE, bli_ddotaxpyv_sifive_x280_intr, - BLIS_DOTAXPYV_KER, BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_x280_intr, - BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_x280_intr, - - BLIS_DOTXAXPYF_KER, BLIS_FLOAT, bli_sdotxaxpyf_sifive_x280_intr, - BLIS_DOTXAXPYF_KER, BLIS_DOUBLE, bli_ddotxaxpyf_sifive_x280_intr, - BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_x280_intr, - BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_x280_intr, + BLIS_AXPY2V_KER, BLIS_FLOAT, bli_saxpy2v_sifive_rvv_intr, + BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_sifive_rvv_intr, + BLIS_AXPY2V_KER, BLIS_SCOMPLEX, bli_caxpy2v_sifive_rvv_intr, + BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_sifive_rvv_intr, + + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_sifive_rvv_intr, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_sifive_rvv_intr, + BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_sifive_rvv_intr, + BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_sifive_rvv_intr, + + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_sifive_rvv_intr, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_sifive_rvv_intr, + BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_sifive_rvv_intr, + BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_sifive_rvv_intr, + + BLIS_DOTAXPYV_KER, BLIS_FLOAT, bli_sdotaxpyv_sifive_rvv_intr, + BLIS_DOTAXPYV_KER, BLIS_DOUBLE, bli_ddotaxpyv_sifive_rvv_intr, + BLIS_DOTAXPYV_KER, BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_rvv_intr, + BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_rvv_intr, + + BLIS_DOTXAXPYF_KER, BLIS_FLOAT, bli_sdotxaxpyf_sifive_rvv_intr, + BLIS_DOTXAXPYF_KER, BLIS_DOUBLE, bli_ddotxaxpyf_sifive_rvv_intr, + BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_rvv_intr, + BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_rvv_intr, // Level 1m - BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_sifive_x280_intr, - BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_sifive_x280_intr, - BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_intr, - BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_intr, + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_sifive_rvv_intr, + BLIS_PACKM_KER, BLIS_DOUBLE, 
bli_dpackm_sifive_rvv_intr, + BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_rvv_intr, + BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_rvv_intr, // Level 3 - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sifive_x280_intr, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sifive_x280_intr, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sifive_x280_intr, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sifive_x280_intr, - - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_sifive_x280_intr, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_sifive_x280_intr, - BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_x280_intr, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_x280_intr, - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_sifive_x280_intr, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_sifive_x280_intr, - BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_x280_intr, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_x280_intr, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sifive_rvv_intr, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sifive_rvv_intr, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sifive_rvv_intr, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sifive_rvv_intr, + + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_rvv_intr, BLIS_VA_END ); diff --git a/config/sifive_x280/make_defs.mk b/config/sifive_x280/make_defs.mk index 31b31e387a..5f19e4e442 100644 --- a/config/sifive_x280/make_defs.mk +++ b/config/sifive_x280/make_defs.mk @@ -47,8 +47,10 @@ THIS_CONFIG := sifive_x280 # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CMISCFLAGS_SIFIVE := -mcmodel=medany -march=rv64gcv_zba_zbb_zvl512b -mabi=lp64d +CMISCFLAGS_SIFIVE_OTHER := CPPROCFLAGS := -CMISCFLAGS := $(CMISCFLAGS_SIFIVE) -fdata-sections -ffunction-sections \ +CMISCFLAGS := $(CMISCFLAGS_SIFIVE) $(CMISCFLAGS_SIFIVE_OTHER) \ + -fdata-sections -ffunction-sections \ -fdiagnostics-color=always -fno-rtti -fno-exceptions CPICFLAGS := -fPIC CWARNFLAGS := -Wall -Wextra -Wno-unused-function -Wno-unused-parameter \ diff --git a/config_registry b/config_registry index 8c1f6f2542..8154393487 100644 --- a/config_registry +++ b/config_registry @@ -62,7 +62,8 @@ rv32iv: rv32iv/rviv rv64iv: rv64iv/rviv # SiFive architectures. -sifive_x280: sifive_x280 +sifive_rvv: sifive_rvv +sifive_x280: sifive_x280/sifive_rvv # Generic architectures. generic: generic diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index 135d410635..53d9bdefdd 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -287,6 +287,9 @@ arch_t bli_arch_query_id_impl( void ) #endif // SiFive microarchitectures. 
+ #ifdef BLIS_FAMILY_SIFIVE_RVV + id = BLIS_ARCH_SIFIVE_RVV; + #endif #ifdef BLIS_FAMILY_SIFIVE_X280 id = BLIS_ARCH_SIFIVE_X280; #endif @@ -356,6 +359,7 @@ static const char* config_name[ BLIS_NUM_ARCHS ] = "rv32iv", "rv64iv", + "sifive_rvv", "sifive_x280", "generic" diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h index a35bb7746b..49a8943024 100644 --- a/frame/include/bli_arch_config.h +++ b/frame/include/bli_arch_config.h @@ -180,6 +180,9 @@ INSERT_GENTCONF // -- SiFive families -- +#ifdef BLIS_FAMILY_SIFIVE_RVV +#include "bli_family_sifive_rvv.h" +#endif #ifdef BLIS_FAMILY_SIFIVE_X280 #include "bli_family_sifive_x280.h" #endif @@ -277,6 +280,9 @@ INSERT_GENTCONF // -- SiFive RISC-V architectures -- +#ifdef BLIS_KERNELS_SIFIVE_RVV +#include "bli_kernels_sifive_rvv.h" +#endif #ifdef BLIS_KERNELS_SIFIVE_X280 #include "bli_kernels_sifive_x280.h" #endif diff --git a/frame/include/bli_gentconf_macro_defs.h b/frame/include/bli_gentconf_macro_defs.h index 70414fb475..f6f3af20e8 100644 --- a/frame/include/bli_gentconf_macro_defs.h +++ b/frame/include/bli_gentconf_macro_defs.h @@ -222,6 +222,11 @@ // -- SiFive architectures ---------------------------------------------------- +#ifdef BLIS_CONFIG_SIFIVE_RVV +#define INSERT_GENTCONF_SIFIVE_RVV GENTCONF( SIFIVE_RVV, sifive_rvv ) +#else +#define INSERT_GENTCONF_SIFIVE_RVV +#endif #ifdef BLIS_CONFIG_SIFIVE_X280 #define INSERT_GENTCONF_SIFIVE_X280 GENTCONF( SIFIVE_X280, sifive_x280 ) #else @@ -280,6 +285,7 @@ INSERT_GENTCONF_RV64I \ INSERT_GENTCONF_RV32IV \ INSERT_GENTCONF_RV64IV \ \ +INSERT_GENTCONF_SIFIVE_RVV \ INSERT_GENTCONF_SIFIVE_X280 \ \ INSERT_GENTCONF_GENERIC diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 5bc96e8f24..baefeef57d 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -987,6 +987,7 @@ typedef enum BLIS_ARCH_RV64IV, // SiFive + BLIS_ARCH_SIFIVE_RVV, BLIS_ARCH_SIFIVE_X280, // Generic architecture/configuration diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr.c similarity index 92% rename from kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr.c index 2b7ad6fe7d..c917390f9c 100644 --- a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -40,7 +40,7 @@ #include "../../riscv_overloaded_intrinsics.h" -#define ADDV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##addv_sifive_x280_intr(\ +#define ADDV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##addv_sifive_rvv_intr(\ conj_t conjx, \ dim_t n, \ const T* restrict x_, inc_t incx, \ @@ -57,7 +57,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_addv_sifive_x280_intr_real.c" +#include "./bli_addv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -72,7 +72,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_addv_sifive_x280_intr_real.c" +#include "./bli_addv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -88,7 +88,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_addv_sifive_x280_intr_complex.c" +#include "./bli_addv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -105,7 +105,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_addv_sifive_x280_intr_complex.c" +#include "./bli_addv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_complex.c similarity index 98% rename from kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_complex.c index d5343befe0..ae4ff39b97 100644 --- a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_complex.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_real.c similarity index 98% rename from kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_real.c index d4e7d4a45e..bc928a5e6b 100644 --- a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_real.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr.c similarity index 86% rename from kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr.c index 4f7d546304..6014b860b6 100644 --- a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr.c @@ -40,7 +40,7 @@ #include #include -#define AMAXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##amaxv_sifive_x280_intr(\ +#define AMAXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##amaxv_sifive_rvv_intr(\ dim_t n, \ const T* restrict x_, inc_t incx, \ dim_t* index, \ @@ -52,20 +52,20 @@ // BLIS defines integers to be 32 or 64 bits according to BLIS_INT_TYPE_SIZE. // If BLIS_INT_TYPE_SIZE is any other value, integers are defined to be longs. #if BLIS_INT_TYPE_SIZE == 32 || BLIS_INT_TYPE_SIZE == 64 -#define AMAXV_SIFIVE_X280_INT_SIZE BLIS_INT_TYPE_SIZE +#define AMAXV_SIFIVE_RVV_INT_SIZE BLIS_INT_TYPE_SIZE #elif LONG_MAX == INT32_MAX -#define AMAXV_SIFIVE_X280_INT_SIZE 32 +#define AMAXV_SIFIVE_RVV_INT_SIZE 32 #elif LONG_MAX == INT64_MAX -#define AMAXV_SIFIVE_X280_INT_SIZE 64 +#define AMAXV_SIFIVE_RVV_INT_SIZE 64 #else -#error "Integers must be 32- or 64-bits for bli_?amaxv_sifive_x280_intr." +#error "Integers must be 32- or 64-bits for bli_?amaxv_sifive_rvv_intr." #endif // Single precision real #define DATATYPE float #define PRECISION_CHAR s #define PREC_X 32 -#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE +#define PREC_I AMAXV_SIFIVE_RVV_INT_SIZE #if PREC_I == 32 #define LMUL_X m4 #define LMUL_I m4 @@ -77,7 +77,7 @@ #endif #define FLT_SIZE sizeof(float) -#include "./bli_amaxv_sifive_x280_intr_real.c" +#include "./bli_amaxv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -92,7 +92,7 @@ #define DATATYPE double #define PRECISION_CHAR d #define PREC_X 64 -#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE +#define PREC_I AMAXV_SIFIVE_RVV_INT_SIZE #if PREC_I == 32 #define LMUL_X m8 #define LMUL_I m4 @@ -104,7 +104,7 @@ #endif #define FLT_SIZE sizeof(double) -#include "./bli_amaxv_sifive_x280_intr_real.c" +#include "./bli_amaxv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -120,7 +120,7 @@ #define BASE_DT float #define PRECISION_CHAR c #define PREC_X 32 -#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE +#define PREC_I AMAXV_SIFIVE_RVV_INT_SIZE #if PREC_I == 32 #define LMUL_X m4 #define LMUL_I m4 @@ -132,7 +132,7 @@ #endif #define FLT_SIZE sizeof(float) -#include "./bli_amaxv_sifive_x280_intr_complex.c" +#include "./bli_amaxv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -149,7 +149,7 @@ #define BASE_DT double #define PRECISION_CHAR z #define PREC_X 64 -#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE +#define PREC_I AMAXV_SIFIVE_RVV_INT_SIZE #if PREC_I == 32 #define LMUL_X m8 #define LMUL_I m4 @@ -161,7 +161,7 @@ #endif #define FLT_SIZE sizeof(double) -#include "./bli_amaxv_sifive_x280_intr_complex.c" +#include "./bli_amaxv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -173,7 +173,7 @@ #undef RATIO #undef FLT_SIZE -#undef AMAXV_SIFIVE_X280_INT_SIZE +#undef AMAXV_SIFIVE_RVV_INT_SIZE #undef AMAXV #undef AMAXV_ diff --git 
a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr_complex.c similarity index 100% rename from kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr_complex.c diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr_real.c similarity index 100% rename from kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr_real.c diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr.c similarity index 92% rename from kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr.c index 389292f90f..94e3272bc4 100644 --- a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -40,7 +40,7 @@ #include "../../riscv_overloaded_intrinsics.h" -#define AXPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpbyv_sifive_x280_intr(\ +#define AXPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpbyv_sifive_rvv_intr(\ conj_t conjx, \ dim_t n, \ const T* restrict alpha_, \ @@ -52,11 +52,11 @@ #define AXPBYV(...) 
AXPBYV_(__VA_ARGS__) -#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_rvv_intr #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) -#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr +#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_rvv_intr #define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR) -#define SCAL2V_(PRECISION_CHAR) bli_##PRECISION_CHAR##scal2v_sifive_x280_intr +#define SCAL2V_(PRECISION_CHAR) bli_##PRECISION_CHAR##scal2v_sifive_rvv_intr #define SCAL2V(PRECISION_CHAR) SCAL2V_(PRECISION_CHAR) // Single precision real @@ -66,7 +66,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_axpbyv_sifive_x280_intr_real.c" +#include "./bli_axpbyv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -81,7 +81,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_axpbyv_sifive_x280_intr_real.c" +#include "./bli_axpbyv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -97,7 +97,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_axpbyv_sifive_x280_intr_complex.c" +#include "./bli_axpbyv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -114,7 +114,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_axpbyv_sifive_x280_intr_complex.c" +#include "./bli_axpbyv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_complex.c similarity index 99% rename from kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_complex.c index 31fc584b97..af034824e1 100644 --- a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_complex.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_real.c similarity index 98% rename from kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_real.c index 33eafc5d12..b482189028 100644 --- a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_real.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr.c similarity index 92% rename from kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr.c index 3f9ebd3b04..07dc6a416b 100644 --- a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -40,7 +40,7 @@ #include "../../riscv_overloaded_intrinsics.h" -#define AXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyv_sifive_x280_intr(\ +#define AXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyv_sifive_rvv_intr(\ conj_t conjx, \ dim_t n, \ const T* restrict alpha_, \ @@ -58,7 +58,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_axpyv_sifive_x280_intr_real.c" +#include "./bli_axpyv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -73,7 +73,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_axpyv_sifive_x280_intr_real.c" +#include "./bli_axpyv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -89,7 +89,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_axpyv_sifive_x280_intr_complex.c" +#include "./bli_axpyv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -106,7 +106,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_axpyv_sifive_x280_intr_complex.c" +#include "./bli_axpyv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_complex.c similarity index 99% rename from kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_complex.c index dc520d2125..1b88f7d260 100644 --- a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_complex.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_real.c similarity index 98% rename from kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_real.c index 0c2cda842f..8ad0ac3fb6 100644 --- a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_real.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr.c similarity index 93% rename from kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr.c index e030d85ff3..ab9cf0f346 100644 --- a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr.c @@ -38,7 +38,7 @@ #include #include -#define COPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##copyv_sifive_x280_intr(\ +#define COPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##copyv_sifive_rvv_intr(\ conj_t conjx, \ dim_t n, \ const T* restrict x_, inc_t incx, \ @@ -55,7 +55,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_copyv_sifive_x280_intr_real.c" +#include "./bli_copyv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -70,7 +70,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_copyv_sifive_x280_intr_real.c" +#include "./bli_copyv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -86,7 +86,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_copyv_sifive_x280_intr_complex.c" +#include "./bli_copyv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -103,7 +103,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_copyv_sifive_x280_intr_complex.c" +#include "./bli_copyv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr_complex.c similarity index 100% rename from kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr_complex.c diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr_real.c similarity index 100% rename from kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr_real.c diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c 
b/kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr.c similarity index 92% rename from kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr.c index 0dc8565400..31ae4cc26b 100644 --- a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -40,7 +40,7 @@ #include "../../riscv_overloaded_intrinsics.h" -#define DOTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotv_sifive_x280_intr(\ +#define DOTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotv_sifive_rvv_intr(\ conj_t conjxt, \ conj_t conjy, \ dim_t n, \ @@ -59,7 +59,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_dotv_sifive_x280_intr_real.c" +#include "./bli_dotv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -74,7 +74,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_dotv_sifive_x280_intr_real.c" +#include "./bli_dotv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -90,7 +90,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_dotv_sifive_x280_intr_complex.c" +#include "./bli_dotv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -107,7 +107,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_dotv_sifive_x280_intr_complex.c" +#include "./bli_dotv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_complex.c similarity index 99% rename from kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_complex.c index 250fab46e6..14dbfc4e9a 100644 --- a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_complex.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_real.c similarity index 98% rename from kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_real.c index 0ec8e6328a..b7aec00fd1 100644 --- a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_real.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr.c similarity index 93% rename from kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr.c index 048f8d2983..ad405979ca 100644 --- a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -40,7 +40,7 @@ #include "../../riscv_overloaded_intrinsics.h" -#define DOTXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxv_sifive_x280_intr(\ +#define DOTXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxv_sifive_rvv_intr(\ conj_t conjxt, \ conj_t conjy, \ dim_t n, \ @@ -62,7 +62,7 @@ #define FLT_SIZE sizeof(float) #define FMA fmaf -#include "./bli_dotxv_sifive_x280_intr_real.c" +#include "./bli_dotxv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -79,7 +79,7 @@ #define FLT_SIZE sizeof(double) #define FMA fma -#include "./bli_dotxv_sifive_x280_intr_real.c" +#include "./bli_dotxv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -97,7 +97,7 @@ #define FLT_SIZE sizeof(float) #define FMA fmaf -#include "./bli_dotxv_sifive_x280_intr_complex.c" +#include "./bli_dotxv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -116,7 +116,7 @@ #define FLT_SIZE sizeof(double) #define FMA fma -#include "./bli_dotxv_sifive_x280_intr_complex.c" +#include "./bli_dotxv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_complex.c similarity index 99% rename from kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_complex.c index 8245e8e057..1c6d3d8f7a 100644 --- a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_complex.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_real.c similarity index 98% rename from kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_real.c index f9d9346973..1f84ae610f 100644 --- a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_real.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr.c similarity index 93% rename from kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr.c index fc8f8a76d7..7f4443479b 100644 --- a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr.c @@ -38,7 +38,7 @@ #include #include -#define INVERTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##invertv_sifive_x280_intr(\ +#define INVERTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##invertv_sifive_rvv_intr(\ dim_t n, \ T* restrict x_, inc_t incx, \ const cntx_t* cntx \ @@ -53,7 +53,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_invertv_sifive_x280_intr_real.c" +#include "./bli_invertv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -68,7 +68,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_invertv_sifive_x280_intr_real.c" +#include "./bli_invertv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -85,7 +85,7 @@ #define RATIO 8 #define FLT_SIZE sizeof(float) -#include "./bli_invertv_sifive_x280_intr_complex.c" +#include "./bli_invertv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -104,7 +104,7 @@ #define RATIO 16 #define FLT_SIZE sizeof(double) -#include "./bli_invertv_sifive_x280_intr_complex.c" +#include "./bli_invertv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr_complex.c similarity index 100% rename from kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr_complex.c diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr_real.c similarity index 100% rename from kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr_real.c diff --git 
a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr.c similarity index 93% rename from kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr.c index a5c7561bd8..0dc9c01aba 100644 --- a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr.c @@ -39,7 +39,7 @@ #include #include -#define INVSCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##invscalv_sifive_x280_intr(\ +#define INVSCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##invscalv_sifive_rvv_intr(\ conj_t conjalpha, \ dim_t n, \ const T* restrict alpha_, \ @@ -56,7 +56,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_invscalv_sifive_x280_intr_real.c" +#include "./bli_invscalv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -71,7 +71,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_invscalv_sifive_x280_intr_real.c" +#include "./bli_invscalv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -87,7 +87,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_invscalv_sifive_x280_intr_complex.c" +#include "./bli_invscalv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -104,7 +104,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_invscalv_sifive_x280_intr_complex.c" +#include "./bli_invscalv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr_complex.c similarity index 100% rename from kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr_complex.c diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr_real.c similarity index 100% rename from kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr_real.c diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr.c similarity index 92% rename from kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr.c index 4cae8257c3..b434f751eb 100644 --- a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -40,7 +40,7 @@ #include "../../riscv_overloaded_intrinsics.h" -#define SCAL2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scal2v_sifive_x280_intr(\ +#define SCAL2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scal2v_sifive_rvv_intr(\ conj_t conjx, \ dim_t n, \ const T* restrict alpha_, \ @@ -51,9 +51,9 @@ #define SCAL2V(...) SCAL2V_(__VA_ARGS__) -#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_intr +#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_rvv_intr #define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR) -#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_rvv_intr #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) // Single precision real @@ -63,7 +63,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_scal2v_sifive_x280_intr_real.c" +#include "./bli_scal2v_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -78,7 +78,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_scal2v_sifive_x280_intr_real.c" +#include "./bli_scal2v_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -94,7 +94,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_scal2v_sifive_x280_intr_complex.c" +#include "./bli_scal2v_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -111,7 +111,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_scal2v_sifive_x280_intr_complex.c" +#include "./bli_scal2v_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_complex.c similarity index 99% rename from kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_complex.c index 2e946a2a4c..c2272ae3bb 100644 --- a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_complex.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_real.c similarity index 98% rename from kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_real.c index 7084e15cf5..7b80882028 100644 --- a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_real.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr.c similarity index 92% rename from kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr.c index d1fb9940eb..c6b19ea00b 100644 --- a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -39,7 +39,7 @@ #include "blis.h" #include "../../riscv_overloaded_intrinsics.h" -#define SCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scalv_sifive_x280_intr(\ +#define SCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scalv_sifive_rvv_intr(\ conj_t conjalpha, \ dim_t n, \ const T* restrict alpha_, \ @@ -49,7 +49,7 @@ #define SCALV(...) SCALV_(__VA_ARGS__) -#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_rvv_intr #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) // Single precision real @@ -59,7 +59,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_scalv_sifive_x280_intr_real.c" +#include "./bli_scalv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -74,7 +74,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_scalv_sifive_x280_intr_real.c" +#include "./bli_scalv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -90,7 +90,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_scalv_sifive_x280_intr_complex.c" +#include "./bli_scalv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -107,7 +107,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_scalv_sifive_x280_intr_complex.c" +#include "./bli_scalv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_complex.c similarity index 98% rename from kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_complex.c index c6803c9676..20f49ebdf6 100644 --- a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_complex.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_real.c similarity index 98% rename from kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_real.c index 2b4e31d359..7cc2dd6b64 100644 --- a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_real.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr.c similarity index 93% rename from kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr.c index 8c2ba7c72a..33cfb4a573 100644 --- a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr.c @@ -38,7 +38,7 @@ #include #include -#define SETV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##setv_sifive_x280_intr(\ +#define SETV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##setv_sifive_rvv_intr(\ conj_t conjalpha, \ dim_t n, \ const T* restrict alpha_, \ @@ -55,7 +55,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_setv_sifive_x280_intr_real.c" +#include "./bli_setv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -70,7 +70,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_setv_sifive_x280_intr_real.c" +#include "./bli_setv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -86,7 +86,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_setv_sifive_x280_intr_complex.c" +#include "./bli_setv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -103,7 +103,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_setv_sifive_x280_intr_complex.c" +#include "./bli_setv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr_complex.c similarity index 100% rename from kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr_complex.c diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr_real.c similarity index 100% rename from kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr_real.c diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c 
b/kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr.c similarity index 92% rename from kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr.c index e6b483a3f8..0ba7c53041 100644 --- a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -40,7 +40,7 @@ #include "../../riscv_overloaded_intrinsics.h" -#define SUBV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##subv_sifive_x280_intr(\ +#define SUBV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##subv_sifive_rvv_intr(\ conj_t conjx, \ dim_t n, \ const T* restrict x_, inc_t incx, \ @@ -57,7 +57,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_subv_sifive_x280_intr_real.c" +#include "./bli_subv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -72,7 +72,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_subv_sifive_x280_intr_real.c" +#include "./bli_subv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -88,7 +88,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_subv_sifive_x280_intr_complex.c" +#include "./bli_subv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -105,7 +105,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_subv_sifive_x280_intr_complex.c" +#include "./bli_subv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_complex.c similarity index 98% rename from kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_complex.c index 2d4a1a017f..62eab516d4 100644 --- a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_complex.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_real.c similarity index 98% rename from kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_real.c index b158594319..5488007b2b 100644 --- a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_real.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr.c similarity index 93% rename from kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr.c index baf685d35f..ec14df9cb6 100644 --- a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr.c @@ -38,7 +38,7 @@ #include #include -#define SWAPV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##swapv_sifive_x280_intr(\ +#define SWAPV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##swapv_sifive_rvv_intr(\ dim_t n, \ T* restrict x_, inc_t incx, \ T* restrict y_, inc_t incy, \ @@ -54,7 +54,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_swapv_sifive_x280_intr_real.c" +#include "./bli_swapv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -69,7 +69,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_swapv_sifive_x280_intr_real.c" +#include "./bli_swapv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -85,7 +85,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_swapv_sifive_x280_intr_complex.c" +#include "./bli_swapv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -102,7 +102,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_swapv_sifive_x280_intr_complex.c" +#include "./bli_swapv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr_complex.c similarity index 100% rename from kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr_complex.c diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr_real.c similarity index 100% rename from kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr_real.c diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr.c similarity index 92% rename from kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c rename to kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr.c index da688851d0..0f6a6d550b 100644 --- a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -40,7 +40,7 @@ #include "../../riscv_overloaded_intrinsics.h" -#define XPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##xpbyv_sifive_x280_intr(\ +#define XPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##xpbyv_sifive_rvv_intr(\ conj_t conjx, \ dim_t n, \ const T* restrict x_, inc_t incx, \ @@ -51,7 +51,7 @@ #define XPBYV(...) XPBYV_(__VA_ARGS__) -#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_intr +#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_rvv_intr #define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR) // Single precision real @@ -61,7 +61,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_xpbyv_sifive_x280_intr_real.c" +#include "./bli_xpbyv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -76,7 +76,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_xpbyv_sifive_x280_intr_real.c" +#include "./bli_xpbyv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -92,7 +92,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_xpbyv_sifive_x280_intr_complex.c" +#include "./bli_xpbyv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -109,7 +109,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_xpbyv_sifive_x280_intr_complex.c" +#include "./bli_xpbyv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_complex.c similarity index 99% rename from kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_complex.c index 4c86e8b36a..1eb2fff234 100644 --- a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_complex.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_real.c similarity index 98% rename from kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_real.c index b23272fea4..f4a8aa72eb 100644 --- a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_real.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c b/kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr.c similarity index 92% rename from kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c rename to kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr.c index 1b5ce3b962..e9d4a8b5f5 100644 --- a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -40,7 +40,7 @@ #include "../../riscv_overloaded_intrinsics.h" -#define AXPY2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpy2v_sifive_x280_intr(\ +#define AXPY2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpy2v_sifive_rvv_intr(\ conj_t conjx, \ conj_t conjy, \ dim_t n, \ @@ -61,7 +61,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_axpy2v_sifive_x280_intr_real.c" +#include "./bli_axpy2v_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -76,7 +76,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_axpy2v_sifive_x280_intr_real.c" +#include "./bli_axpy2v_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -92,7 +92,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_axpy2v_sifive_x280_intr_complex.c" +#include "./bli_axpy2v_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -109,7 +109,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_axpy2v_sifive_x280_intr_complex.c" +#include "./bli_axpy2v_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_complex.c similarity index 99% rename from kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_complex.c index 9b57198272..de753d2249 100644 --- a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_complex.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c b/kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_real.c similarity index 98% rename from kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_real.c index cebb159973..b2e42155c8 100644 --- a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_real.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c b/kernels/sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr.c similarity index 94% rename from kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c rename to kernels/sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr.c index a5e0268467..ace31d7a8b 100644 --- a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr.c @@ -39,7 +39,7 @@ #include #include -#define AXPYF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyf_sifive_x280_intr(\ +#define AXPYF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyf_sifive_rvv_intr(\ conj_t conja, \ conj_t conjx, \ dim_t m, \ @@ -60,7 +60,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_axpyf_sifive_x280_intr_real.c" +#include "./bli_axpyf_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -75,7 +75,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_axpyf_sifive_x280_intr_real.c" +#include "./bli_axpyf_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -91,7 +91,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_axpyf_sifive_x280_intr_complex.c" +#include "./bli_axpyf_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -108,7 +108,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_axpyf_sifive_x280_intr_complex.c" +#include "./bli_axpyf_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr_complex.c similarity index 100% rename from kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr_complex.c diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c b/kernels/sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr_real.c similarity index 100% rename from kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr_real.c diff --git 
a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c b/kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr.c similarity index 92% rename from kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c rename to kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr.c index 9cd1071d7a..7d46f52b07 100644 --- a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -39,7 +39,7 @@ #include "blis.h" #include "../../riscv_overloaded_intrinsics.h" -#define DOTAXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotaxpyv_sifive_x280_intr(\ +#define DOTAXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotaxpyv_sifive_rvv_intr(\ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ @@ -61,7 +61,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(float) -#include "./bli_dotaxpyv_sifive_x280_intr_real.c" +#include "./bli_dotaxpyv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -76,7 +76,7 @@ #define LMUL m8 #define FLT_SIZE sizeof(double) -#include "./bli_dotaxpyv_sifive_x280_intr_real.c" +#include "./bli_dotaxpyv_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -92,7 +92,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_dotaxpyv_sifive_x280_intr_complex.c" +#include "./bli_dotaxpyv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -109,7 +109,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_dotaxpyv_sifive_x280_intr_complex.c" +#include "./bli_dotaxpyv_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_complex.c similarity index 99% rename from kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_complex.c index c3cd06c523..7529fb7584 100644 --- a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_complex.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_real.c similarity index 99% rename from kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_real.c index adaf3610b0..0b6b7e0164 100644 --- a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_real.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c b/kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr.c similarity index 93% rename from kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c rename to kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr.c index dc1bca9f6a..24052dd5c2 100644 --- a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr.c @@ -40,7 +40,7 @@ #include #include -#define DOTXAXPYF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxaxpyf_sifive_x280_intr(\ +#define DOTXAXPYF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxaxpyf_sifive_rvv_intr(\ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ @@ -59,9 +59,9 @@ #define DOTXAXPYF(...) 
DOTXAXPYF_(__VA_ARGS__) -#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_rvv_intr #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) -#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr +#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_rvv_intr #define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR) // Single precision real @@ -71,7 +71,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_dotxaxpyf_sifive_x280_intr_real.c" +#include "./bli_dotxaxpyf_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -86,7 +86,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_dotxaxpyf_sifive_x280_intr_real.c" +#include "./bli_dotxaxpyf_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -102,7 +102,7 @@ #define LMUL m2 #define FLT_SIZE sizeof(float) -#include "./bli_dotxaxpyf_sifive_x280_intr_complex.c" +#include "./bli_dotxaxpyf_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -119,7 +119,7 @@ #define LMUL m2 #define FLT_SIZE sizeof(double) -#include "./bli_dotxaxpyf_sifive_x280_intr_complex.c" +#include "./bli_dotxaxpyf_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_complex.c similarity index 76% rename from kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_complex.c index d8a984064d..67edd9db3e 100644 --- a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_complex.c @@ -35,89 +35,89 @@ // clang-format off #ifdef DOTXAXPYF -#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL(i) \ +#define DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL(i) \ do { \ acol_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), vl); \ acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0); \ acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1); \ } while (0) -#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED(i) \ +#define DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL_STRIDED(i) \ do { \ acol_vec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), 2 * FLT_SIZE * inca, vl); \ acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0); \ acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1); \ } while (0) -#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ +#define DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ do { \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc0_r, yacc0_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ VCMUL_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, acol_vec_r, acol_vec_i, x[0 * incx].real, x[0 * incx].imag, vl); \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc1_r, yacc1_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ 
VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc2_r, yacc2_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3); \ VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc3_r, yacc3_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[3 * incx].real, x[3 * incx].imag, acol_vec_r, acol_vec_i, vl); \ } while (0) -#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ +#define DOTXAXPYF_SIFIVE_RVV_LOOP_BODY(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ do { \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc0_r, yacc0_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ VCMUL_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, acol_vec_r, acol_vec_i, x[0 * incx].real, x[0 * incx].imag, vl); \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc1_r, yacc1_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc2_r, yacc2_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3); \ VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc3_r, yacc3_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[3 * incx].real, x[3 * incx].imag, acol_vec_r, acol_vec_i, vl); \ } while (0) -#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ +#define DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ do { \ switch (b) { \ case 3: \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc2_r, yacc2_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \ case 2: \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc1_r, yacc1_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \ case 1: \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc0_r, yacc0_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[0 * incx].real, x[0 * incx].imag, acol_vec_r, acol_vec_i, vl); \ } \ } while (0) -#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ +#define DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ do { \ switch (b) { \ case 3: \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc2_r, yacc2_i, 
wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \ case 2: \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc1_r, yacc1_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \ case 1: \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc0_r, yacc0_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[0 * incx].real, x[0 * incx].imag, acol_vec_r, acol_vec_i, vl); \ } \ } while (0) -#define DOTXAXPYF_SIFIVE_X280_REDUCE(i) \ +#define DOTXAXPYF_SIFIVE_RVV_REDUCE(i) \ do { \ RVV_TYPE_F(PREC, m1) dot##i##_r = VFMV_S_F(PREC, m1)(0., 1); \ RVV_TYPE_F(PREC, m1) dot##i##_i = VFMV_S_F(PREC, m1)(0., 1); \ @@ -205,29 +205,29 @@ DOTXAXPYF(PRECISION_CHAR, void) if (bli_is_conj(conjat)) { if (bli_is_conj(conja)) { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ, _CONJ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST( , _CONJ, _CONJ); else - DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ, _CONJ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED, _CONJ, _CONJ); } else { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ, ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST( , _CONJ, ); else - DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ, ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED, _CONJ, ); } } else { if (bli_is_conj(conja)) { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , , _CONJ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST( , , _CONJ); else - DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, , _CONJ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED, , _CONJ); } else { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , , ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST( , , ); else - DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, , ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED, , ); } } first = false; @@ -236,29 +236,29 @@ DOTXAXPYF(PRECISION_CHAR, void) if (bli_is_conj(conjat)) { if (bli_is_conj(conja)) { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , _CONJ, _CONJ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY( , _CONJ, _CONJ); else - DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ, _CONJ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY(_STRIDED, _CONJ, _CONJ); } else { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , _CONJ, ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY( , _CONJ, ); else - DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ, ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY(_STRIDED, _CONJ, ); } } else { if (bli_is_conj(conja)) { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , , _CONJ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY( , , _CONJ); else - DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, , _CONJ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY(_STRIDED, , _CONJ); } else { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , , ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY( , , ); else - DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, , ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY(_STRIDED, , ); } } } @@ -287,10 +287,10 @@ DOTXAXPYF(PRECISION_CHAR, void) avl -= vl; } - DOTXAXPYF_SIFIVE_X280_REDUCE(0); - DOTXAXPYF_SIFIVE_X280_REDUCE(1); - DOTXAXPYF_SIFIVE_X280_REDUCE(2); - DOTXAXPYF_SIFIVE_X280_REDUCE(3); + DOTXAXPYF_SIFIVE_RVV_REDUCE(0); + DOTXAXPYF_SIFIVE_RVV_REDUCE(1); + 
DOTXAXPYF_SIFIVE_RVV_REDUCE(2); + DOTXAXPYF_SIFIVE_RVV_REDUCE(3); a += 4 * lda; x += 4 * incx; @@ -322,29 +322,29 @@ DOTXAXPYF(PRECISION_CHAR, void) if (bli_is_conj(conjat)) { if (bli_is_conj(conja)) { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ, _CONJ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST( , _CONJ, _CONJ); else - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ, _CONJ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED, _CONJ, _CONJ); } else { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ, ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST( , _CONJ, ); else - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ, ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED, _CONJ, ); } } else { if (bli_is_conj(conja)) { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , , _CONJ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST( , , _CONJ); else - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, , _CONJ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED, , _CONJ); } else { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , , ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST( , , ); else - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, , ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED, , ); } } first = false; @@ -353,29 +353,29 @@ DOTXAXPYF(PRECISION_CHAR, void) if (bli_is_conj(conjat)) { if (bli_is_conj(conja)) { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , _CONJ, _CONJ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY( , _CONJ, _CONJ); else - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ, _CONJ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED, _CONJ, _CONJ); } else { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , _CONJ, ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY( , _CONJ, ); else - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ, ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED, _CONJ, ); } } else { if (bli_is_conj(conja)) { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , , _CONJ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY( , , _CONJ); else - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, , _CONJ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED, , _CONJ); } else { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , , ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY( , , ); else - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, , ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED, , ); } } } @@ -406,22 +406,22 @@ DOTXAXPYF(PRECISION_CHAR, void) switch (b) { case 3: - DOTXAXPYF_SIFIVE_X280_REDUCE(2); + DOTXAXPYF_SIFIVE_RVV_REDUCE(2); case 2: - DOTXAXPYF_SIFIVE_X280_REDUCE(1); + DOTXAXPYF_SIFIVE_RVV_REDUCE(1); case 1: - DOTXAXPYF_SIFIVE_X280_REDUCE(0); + DOTXAXPYF_SIFIVE_RVV_REDUCE(0); } } return; } -#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL -#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED -#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST -#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY -#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST -#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY -#undef DOTXAXPYF_SIFIVE_X280_REDUCE +#undef DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL +#undef DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL_STRIDED +#undef DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST +#undef DOTXAXPYF_SIFIVE_RVV_LOOP_BODY +#undef DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST +#undef DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY +#undef DOTXAXPYF_SIFIVE_RVV_REDUCE #endif // DOTXAXPYF diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c b/kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_real.c similarity index 79% rename 
from kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_real.c index 57ef4f7447..7143d3a974 100644 --- a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_real.c @@ -35,85 +35,85 @@ // clang-format off #ifdef DOTXAXPYF -#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL(i) \ +#define DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL(i) \ do { \ acol_vec = VLE_V_F(PREC, LMUL)(a_tmp + i * lda, vl); \ } while (0) -#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED(i) \ +#define DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL_STRIDED(i) \ do { \ acol_vec = VLSE_V_F(PREC, LMUL)(a_tmp + i * lda, FLT_SIZE * inca, vl); \ } while (0) -#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF) \ +#define DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST(LOAD_SUF) \ do { \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ yacc0 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ zacc = VFMUL_VF(PREC, LMUL)(acol_vec, x[0 * incx], vl); \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ yacc1 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ yacc2 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3); \ yacc3 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ zacc = VFMACC_VF(PREC, LMUL)(zacc, x[3 * incx], acol_vec, vl); \ } while (0) -#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY(LOAD_SUF) \ +#define DOTXAXPYF_SIFIVE_RVV_LOOP_BODY(LOAD_SUF) \ do { \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ yacc0 = VFMACC_VV_TU(PREC, LMUL)(yacc0, acol_vec, wvec, vl); \ zacc = VFMUL_VF(PREC, LMUL)(acol_vec, x[0 * incx], vl); \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ yacc1 = VFMACC_VV_TU(PREC, LMUL)(yacc1, acol_vec, wvec, vl); \ zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ yacc2 = VFMACC_VV_TU(PREC, LMUL)(yacc2, acol_vec, wvec, vl); \ zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3); \ yacc3 = VFMACC_VV_TU(PREC, LMUL)(yacc3, acol_vec, wvec, vl); \ zacc = VFMACC_VF(PREC, LMUL)(zacc, x[3 * incx], acol_vec, vl); \ } while (0) -#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF) \ +#define DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST(LOAD_SUF) \ do { \ switch (b) { \ case 3: \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ yacc2 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \ case 2: \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ yacc1 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \ case 1: \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + 
DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ yacc0 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ zacc = VFMACC_VF(PREC, LMUL)(zacc, x[0 * incx], acol_vec, vl); \ } \ } while (0) -#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF) \ +#define DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY(LOAD_SUF) \ do { \ switch (b) { \ case 3: \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ yacc2 = VFMACC_VV_TU(PREC, LMUL)(yacc2, acol_vec, wvec, vl); \ zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \ case 2: \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ yacc1 = VFMACC_VV_TU(PREC, LMUL)(yacc1, acol_vec, wvec, vl); \ zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \ case 1: \ - DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ yacc0 = VFMACC_VV_TU(PREC, LMUL)(yacc0, acol_vec, wvec, vl); \ zacc = VFMACC_VF(PREC, LMUL)(zacc, x[0 * incx], acol_vec, vl); \ } \ } while (0) -#define DOTXAXPYF_SIFIVE_X280_REDUCE(i) \ +#define DOTXAXPYF_SIFIVE_RVV_REDUCE(i) \ do { \ RVV_TYPE_F(PREC, m1) dot##i = VFMV_S_F(PREC, m1)(0., 1); \ dot##i = VF_REDUSUM_VS(PREC, LMUL)(yacc##i, dot##i, m); \ @@ -174,16 +174,16 @@ DOTXAXPYF(PRECISION_CHAR, void) wvec = VLSE_V_F(PREC, LMUL)(w_tmp, FLT_SIZE * incw, vl); if (first) { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST( ); else - DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED); first = false; } else { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_LOOP_BODY( ); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY( ); else - DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED); + DOTXAXPYF_SIFIVE_RVV_LOOP_BODY(_STRIDED); } RVV_TYPE_F(PREC, LMUL) zvec; @@ -203,10 +203,10 @@ DOTXAXPYF(PRECISION_CHAR, void) avl -= vl; } - DOTXAXPYF_SIFIVE_X280_REDUCE(0); - DOTXAXPYF_SIFIVE_X280_REDUCE(1); - DOTXAXPYF_SIFIVE_X280_REDUCE(2); - DOTXAXPYF_SIFIVE_X280_REDUCE(3); + DOTXAXPYF_SIFIVE_RVV_REDUCE(0); + DOTXAXPYF_SIFIVE_RVV_REDUCE(1); + DOTXAXPYF_SIFIVE_RVV_REDUCE(2); + DOTXAXPYF_SIFIVE_RVV_REDUCE(3); a += 4 * lda; x += 4 * incx; @@ -231,16 +231,16 @@ DOTXAXPYF(PRECISION_CHAR, void) wvec = VLSE_V_F(PREC, LMUL)(w_tmp, FLT_SIZE * incw, vl); if (first) { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST( ); else - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED); first = false; } else { if (inca == 1) - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( ); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY( ); else - DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED); + DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED); } RVV_TYPE_F(PREC, LMUL) zvec; @@ -262,22 +262,22 @@ DOTXAXPYF(PRECISION_CHAR, void) switch (b) { case 3: - DOTXAXPYF_SIFIVE_X280_REDUCE(2); + DOTXAXPYF_SIFIVE_RVV_REDUCE(2); case 2: - DOTXAXPYF_SIFIVE_X280_REDUCE(1); + DOTXAXPYF_SIFIVE_RVV_REDUCE(1); case 1: - DOTXAXPYF_SIFIVE_X280_REDUCE(0); + DOTXAXPYF_SIFIVE_RVV_REDUCE(0); } } return; } -#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL -#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED -#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST -#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY -#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST -#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY -#undef DOTXAXPYF_SIFIVE_X280_REDUCE +#undef DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL +#undef DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL_STRIDED +#undef DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST +#undef 
DOTXAXPYF_SIFIVE_RVV_LOOP_BODY +#undef DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST +#undef DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY +#undef DOTXAXPYF_SIFIVE_RVV_REDUCE #endif // DOTXAXPYF diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c b/kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr.c similarity index 94% rename from kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c rename to kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr.c index 9396515b30..e65f0637dd 100644 --- a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr.c @@ -39,7 +39,7 @@ #include #include -#define DOTXF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxf_sifive_x280_intr(\ +#define DOTXF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxf_sifive_rvv_intr(\ conj_t conjat, \ conj_t conjx, \ dim_t m, \ @@ -54,9 +54,9 @@ #define DOTXF(...) DOTXF_(__VA_ARGS__) -#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_rvv_intr #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) -#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr +#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_rvv_intr #define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR) // Single precision real @@ -66,7 +66,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) -#include "./bli_dotxf_sifive_x280_intr_real.c" +#include "./bli_dotxf_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -81,7 +81,7 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) -#include "./bli_dotxf_sifive_x280_intr_real.c" +#include "./bli_dotxf_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -97,7 +97,7 @@ #define LMUL m2 #define FLT_SIZE sizeof(float) -#include "./bli_dotxf_sifive_x280_intr_complex.c" +#include "./bli_dotxf_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -114,7 +114,7 @@ #define LMUL m2 #define FLT_SIZE sizeof(double) -#include "./bli_dotxf_sifive_x280_intr_complex.c" +#include "./bli_dotxf_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_complex.c similarity index 74% rename from kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_complex.c index 463a111f07..8cdc4b76e7 100644 --- a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_complex.c @@ -35,95 +35,95 @@ // clang-format off #ifdef DOTXF -#define DOTXF_SIFIVE_X280_LOAD_ACOL(i) \ +#define DOTXF_SIFIVE_RVV_LOAD_ACOL(i) \ do { \ acol_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), vl); \ acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0); \ acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1); \ } while (0) -#define DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED(i) \ +#define DOTXF_SIFIVE_RVV_LOAD_ACOL_STRIDED(i) \ do { \ acol_vec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), 2 * FLT_SIZE * inca, vl); \ acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0); \ acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1); \ } while 
(0) -#define DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF, CONJ_SUF) \ +#define DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST(LOAD_SUF, CONJ_SUF) \ do { \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ VCMUL_VV##CONJ_SUF(PREC, LMUL, acc0_r, acc0_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ VCMUL_VV##CONJ_SUF(PREC, LMUL, acc1_r, acc1_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ VCMUL_VV##CONJ_SUF(PREC, LMUL, acc2_r, acc2_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3); \ VCMUL_VV##CONJ_SUF(PREC, LMUL, acc3_r, acc3_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4); \ VCMUL_VV##CONJ_SUF(PREC, LMUL, acc4_r, acc4_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(5); \ VCMUL_VV##CONJ_SUF(PREC, LMUL, acc5_r, acc5_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ } while (0) -#define DOTXF_SIFIVE_X280_LOOP_BODY(LOAD_SUF, CONJ_SUF) \ +#define DOTXF_SIFIVE_RVV_LOOP_BODY(LOAD_SUF, CONJ_SUF) \ do { \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc0_r, acc0_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc1_r, acc1_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc2_r, acc2_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3); \ VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc3_r, acc3_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4); \ VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc4_r, acc4_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(5); \ VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc5_r, acc5_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ } while (0) -#define DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF, CONJ_SUF) \ +#define DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST(LOAD_SUF, CONJ_SUF) \ do { \ switch (b) { \ case 5: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4); \ VCMUL_VV##CONJ_SUF(PREC, LMUL, acc4_r, acc4_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ case 4: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3); \ VCMUL_VV##CONJ_SUF(PREC, LMUL, acc3_r, acc3_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ case 3: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ VCMUL_VV##CONJ_SUF(PREC, LMUL, acc2_r, acc2_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ case 2: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ VCMUL_VV##CONJ_SUF(PREC, LMUL, acc1_r, acc1_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ case 1: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + 
DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ VCMUL_VV##CONJ_SUF(PREC, LMUL, acc0_r, acc0_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ } \ } while (0) -#define DOTXF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF, CONJ_SUF) \ +#define DOTXF_SIFIVE_RVV_CLEANUP_BODY(LOAD_SUF, CONJ_SUF) \ do { \ switch (b) { \ case 5: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4); \ VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc4_r, acc4_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ case 4: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3); \ VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc3_r, acc3_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ case 3: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc2_r, acc2_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ case 2: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc1_r, acc1_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ case 1: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc0_r, acc0_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ } \ } while (0) -#define DOTXF_SIFIVE_X280_REDUCE(i) \ +#define DOTXF_SIFIVE_RVV_REDUCE(i) \ do { \ RVV_TYPE_F(PREC, m1) dot##i##_r = VFMV_S_F(PREC, m1)(0., 1); \ RVV_TYPE_F(PREC, m1) dot##i##_i = VFMV_S_F(PREC, m1)(0., 1); \ @@ -200,30 +200,30 @@ DOTXF(PRECISION_CHAR, void) if (first) { if (bli_is_conj(conjat)) { if (inca == 1) - DOTXF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ); + DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST( , _CONJ); else - DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ); + DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED, _CONJ); } else { if (inca == 1) - DOTXF_SIFIVE_X280_LOOP_BODY_FIRST( , ); + DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST( , ); else - DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, ); + DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED, ); } first = false; } else { if (bli_is_conj(conjat)) { if (inca == 1) - DOTXF_SIFIVE_X280_LOOP_BODY( , _CONJ); + DOTXF_SIFIVE_RVV_LOOP_BODY( , _CONJ); else - DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ); + DOTXF_SIFIVE_RVV_LOOP_BODY(_STRIDED, _CONJ); } else { if (inca == 1) - DOTXF_SIFIVE_X280_LOOP_BODY( , ); + DOTXF_SIFIVE_RVV_LOOP_BODY( , ); else - DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED, ); + DOTXF_SIFIVE_RVV_LOOP_BODY(_STRIDED, ); } } @@ -232,12 +232,12 @@ DOTXF(PRECISION_CHAR, void) avl -= vl; } - DOTXF_SIFIVE_X280_REDUCE(0); - DOTXF_SIFIVE_X280_REDUCE(1); - DOTXF_SIFIVE_X280_REDUCE(2); - DOTXF_SIFIVE_X280_REDUCE(3); - DOTXF_SIFIVE_X280_REDUCE(4); - DOTXF_SIFIVE_X280_REDUCE(5); + DOTXF_SIFIVE_RVV_REDUCE(0); + DOTXF_SIFIVE_RVV_REDUCE(1); + DOTXF_SIFIVE_RVV_REDUCE(2); + DOTXF_SIFIVE_RVV_REDUCE(3); + DOTXF_SIFIVE_RVV_REDUCE(4); + DOTXF_SIFIVE_RVV_REDUCE(5); a += 6 * lda; y += 6 * incy; @@ -265,30 +265,30 @@ DOTXF(PRECISION_CHAR, void) if (first) { if (bli_is_conj(conjat)) { if (inca == 1) - DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ); + DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST( , _CONJ); else - DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ); + DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED, _CONJ); } else { if (inca == 1) - DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST( , ); + DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST( , ); else - DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, ); + DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED, ); } first = false; } else { if (bli_is_conj(conjat)) { if (inca == 1) 
- DOTXF_SIFIVE_X280_CLEANUP_BODY( , _CONJ); + DOTXF_SIFIVE_RVV_CLEANUP_BODY( , _CONJ); else - DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ); + DOTXF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED, _CONJ); } else { if (inca == 1) - DOTXF_SIFIVE_X280_CLEANUP_BODY( , ); + DOTXF_SIFIVE_RVV_CLEANUP_BODY( , ); else - DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, ); + DOTXF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED, ); } } @@ -299,26 +299,26 @@ DOTXF(PRECISION_CHAR, void) switch (b) { case 5: - DOTXF_SIFIVE_X280_REDUCE(4); + DOTXF_SIFIVE_RVV_REDUCE(4); case 4: - DOTXF_SIFIVE_X280_REDUCE(3); + DOTXF_SIFIVE_RVV_REDUCE(3); case 3: - DOTXF_SIFIVE_X280_REDUCE(2); + DOTXF_SIFIVE_RVV_REDUCE(2); case 2: - DOTXF_SIFIVE_X280_REDUCE(1); + DOTXF_SIFIVE_RVV_REDUCE(1); case 1: - DOTXF_SIFIVE_X280_REDUCE(0); + DOTXF_SIFIVE_RVV_REDUCE(0); } } return; } -#undef DOTXF_SIFIVE_X280_LOAD_ACOL -#undef DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED -#undef DOTXF_SIFIVE_X280_LOOP_BODY_FIRST -#undef DOTXF_SIFIVE_X280_LOOP_BODY -#undef DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST -#undef DOTXF_SIFIVE_X280_CLEANUP_BODY -#undef DOTXF_SIFIVE_X280_REDUCE +#undef DOTXF_SIFIVE_RVV_LOAD_ACOL +#undef DOTXF_SIFIVE_RVV_LOAD_ACOL_STRIDED +#undef DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST +#undef DOTXF_SIFIVE_RVV_LOOP_BODY +#undef DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST +#undef DOTXF_SIFIVE_RVV_CLEANUP_BODY +#undef DOTXF_SIFIVE_RVV_REDUCE #endif // DOTXF diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c b/kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_real.c similarity index 72% rename from kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_real.c index 8286e2476f..cdc8f259e0 100644 --- a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_real.c @@ -35,91 +35,91 @@ // clang-format off #ifdef DOTXF -#define DOTXF_SIFIVE_X280_LOAD_ACOL(i) \ +#define DOTXF_SIFIVE_RVV_LOAD_ACOL(i) \ do { \ acol_vec = VLE_V_F(PREC, LMUL)(a_tmp + i * lda, vl); \ } while (0) -#define DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED(i) \ +#define DOTXF_SIFIVE_RVV_LOAD_ACOL_STRIDED(i) \ do { \ acol_vec = VLSE_V_F(PREC, LMUL)(a_tmp + i * lda, FLT_SIZE * inca, vl); \ } while (0) -#define DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF) \ +#define DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST(LOAD_SUF) \ do { \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ acc0 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ acc1 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ acc2 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3); \ acc3 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4); \ acc4 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(5); \ acc5 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ } while (0) -#define DOTXF_SIFIVE_X280_LOOP_BODY(LOAD_SUF) \ +#define DOTXF_SIFIVE_RVV_LOOP_BODY(LOAD_SUF) \ do { \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ 
acc0 = VFMACC_VV_TU(PREC, LMUL)(acc0, acol_vec, xvec, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ acc1 = VFMACC_VV_TU(PREC, LMUL)(acc1, acol_vec, xvec, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ acc2 = VFMACC_VV_TU(PREC, LMUL)(acc2, acol_vec, xvec, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3); \ acc3 = VFMACC_VV_TU(PREC, LMUL)(acc3, acol_vec, xvec, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4); \ acc4 = VFMACC_VV_TU(PREC, LMUL)(acc4, acol_vec, xvec, vl); \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(5); \ acc5 = VFMACC_VV_TU(PREC, LMUL)(acc5, acol_vec, xvec, vl); \ } while (0) -#define DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF) \ +#define DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST(LOAD_SUF) \ do { \ switch (b) { \ case 5: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4); \ acc4 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ case 4: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3); \ acc3 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ case 3: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ acc2 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ case 2: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ acc1 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ case 1: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ acc0 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ } \ } while (0) -#define DOTXF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF) \ +#define DOTXF_SIFIVE_RVV_CLEANUP_BODY(LOAD_SUF) \ do { \ switch (b) { \ case 5: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4); \ acc4 = VFMACC_VV_TU(PREC, LMUL)(acc4, acol_vec, xvec, vl); \ case 4: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3); \ acc3 = VFMACC_VV_TU(PREC, LMUL)(acc3, acol_vec, xvec, vl); \ case 3: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2); \ acc2 = VFMACC_VV_TU(PREC, LMUL)(acc2, acol_vec, xvec, vl); \ case 2: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1); \ acc1 = VFMACC_VV_TU(PREC, LMUL)(acc1, acol_vec, xvec, vl); \ case 1: \ - DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0); \ acc0 = VFMACC_VV_TU(PREC, LMUL)(acc0, acol_vec, xvec, vl); \ } \ } while (0) -#define DOTXF_SIFIVE_X280_REDUCE(i) \ +#define DOTXF_SIFIVE_RVV_REDUCE(i) \ do { \ RVV_TYPE_F(PREC, m1) dot##i = VFMV_S_F(PREC, m1)(0., 1); \ dot##i = VF_REDUSUM_VS(PREC, LMUL)(acc##i, dot##i, m); \ @@ -173,16 +173,16 @@ DOTXF(PRECISION_CHAR, void) xvec = VLSE_V_F(PREC, LMUL)(x_tmp, FLT_SIZE * incx, vl); if (first) { if (inca == 1) - DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(); + DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST(); else - DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED); + DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED); first = false; } else { if (inca == 1) - DOTXF_SIFIVE_X280_LOOP_BODY(); + DOTXF_SIFIVE_RVV_LOOP_BODY(); else - DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED); + DOTXF_SIFIVE_RVV_LOOP_BODY(_STRIDED); } a_tmp += vl * inca; @@ -190,12 +190,12 @@ DOTXF(PRECISION_CHAR, void) avl -= vl; } - DOTXF_SIFIVE_X280_REDUCE(0); - DOTXF_SIFIVE_X280_REDUCE(1); - 
DOTXF_SIFIVE_X280_REDUCE(2); - DOTXF_SIFIVE_X280_REDUCE(3); - DOTXF_SIFIVE_X280_REDUCE(4); - DOTXF_SIFIVE_X280_REDUCE(5); + DOTXF_SIFIVE_RVV_REDUCE(0); + DOTXF_SIFIVE_RVV_REDUCE(1); + DOTXF_SIFIVE_RVV_REDUCE(2); + DOTXF_SIFIVE_RVV_REDUCE(3); + DOTXF_SIFIVE_RVV_REDUCE(4); + DOTXF_SIFIVE_RVV_REDUCE(5); a += 6 * lda; y += 6 * incy; @@ -217,16 +217,16 @@ DOTXF(PRECISION_CHAR, void) xvec = VLSE_V_F(PREC, LMUL)(x_tmp, FLT_SIZE * incx, vl); if (first) { if (inca == 1) - DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(); + DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST(); else - DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED); + DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED); first = false; } else { if (inca == 1) - DOTXF_SIFIVE_X280_CLEANUP_BODY(); + DOTXF_SIFIVE_RVV_CLEANUP_BODY(); else - DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED); + DOTXF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED); } a_tmp += vl * inca; @@ -236,27 +236,27 @@ DOTXF(PRECISION_CHAR, void) switch (b) { case 5: - DOTXF_SIFIVE_X280_REDUCE(4); + DOTXF_SIFIVE_RVV_REDUCE(4); case 4: - DOTXF_SIFIVE_X280_REDUCE(3); + DOTXF_SIFIVE_RVV_REDUCE(3); case 3: - DOTXF_SIFIVE_X280_REDUCE(2); + DOTXF_SIFIVE_RVV_REDUCE(2); case 2: - DOTXF_SIFIVE_X280_REDUCE(1); + DOTXF_SIFIVE_RVV_REDUCE(1); case 1: - DOTXF_SIFIVE_X280_REDUCE(0); + DOTXF_SIFIVE_RVV_REDUCE(0); } } return; } -#undef DOTXF_SIFIVE_X280_LOAD_ACOL -#undef DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED -#undef DOTXF_SIFIVE_X280_LOOP_BODY_FIRST -#undef DOTXF_SIFIVE_X280_LOOP_BODY -#undef DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST -#undef DOTXF_SIFIVE_X280_CLEANUP_BODY -#undef DOTXF_SIFIVE_X280_REDUCE +#undef DOTXF_SIFIVE_RVV_LOAD_ACOL +#undef DOTXF_SIFIVE_RVV_LOAD_ACOL_STRIDED +#undef DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST +#undef DOTXF_SIFIVE_RVV_LOOP_BODY +#undef DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST +#undef DOTXF_SIFIVE_RVV_CLEANUP_BODY +#undef DOTXF_SIFIVE_RVV_REDUCE #endif // DOTXF diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c b/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr.c similarity index 86% rename from kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c rename to kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr.c index 119872197a..a0a4eb3c3d 100644 --- a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c +++ b/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr.c @@ -40,7 +40,7 @@ #include #include -#define PACKM_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##packm_sifive_x280_intr(\ +#define PACKM_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##packm_sifive_rvv_intr(\ conj_t conja, \ pack_t schema, \ dim_t cdim, \ @@ -57,8 +57,11 @@ #define PACKM(...) PACKM_(__VA_ARGS__) -#define REF_KERNEL_(PRECISION_CHAR) bli_##PRECISION_CHAR##PRECISION_CHAR##packm_sifive_x280_ref -#define REF_KERNEL(PRECISION_CHAR) REF_KERNEL_(PRECISION_CHAR) +#define BLI_SCAL2BBS_MXN_(PRECISION_CHAR) bli_##PRECISION_CHAR##scal2bbs_mxn +#define BLI_SCAL2BBS_MXN(PRECISION_CHAR) BLI_SCAL2BBS_MXN_(PRECISION_CHAR) + +#define BLI_SET0S_EDGE_(PRECISION_CHAR) bli_##PRECISION_CHAR##set0s_edge +#define BLI_SET0S_EDGE(PRECISION_CHAR) BLI_SET0S_EDGE_(PRECISION_CHAR) // LMUL is the LMUL used when a is "row major" (lda == 1). Since we use // segment stores with more than 4 fields, this is usually m1. 
@@ -74,9 +77,9 @@ #define LMUL_NR m4 #define FLT_SIZE sizeof(float) #define MR 7 -#define NR 64 +#define NR ( 4 * __riscv_v_min_vlen / 32 ) -#include "./bli_packm_sifive_x280_intr_real.c" +#include "./bli_packm_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -97,9 +100,9 @@ #define LMUL_NR m4 #define FLT_SIZE sizeof(double) #define MR 7 -#define NR 32 +#define NR ( 4 * __riscv_v_min_vlen / 64 ) -#include "./bli_packm_sifive_x280_intr_real.c" +#include "./bli_packm_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -121,9 +124,9 @@ #define LMUL_NR m2 #define FLT_SIZE sizeof(float) #define MR 6 -#define NR 32 +#define NR ( 2 * __riscv_v_min_vlen / 32 ) -#include "./bli_packm_sifive_x280_intr_complex.c" +#include "./bli_packm_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -146,9 +149,9 @@ #define LMUL_NR m2 #define FLT_SIZE sizeof(double) #define MR 6 -#define NR 16 +#define NR ( 2 * __riscv_v_min_vlen / 64 ) -#include "./bli_packm_sifive_x280_intr_complex.c" +#include "./bli_packm_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_complex.c similarity index 99% rename from kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_complex.c index ee49090dc9..2173be3a74 100644 --- a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_complex.c @@ -522,20 +522,21 @@ PACKM(PRECISION_CHAR, void) // generic kernel else { - REF_KERNEL(PRECISION_CHAR) + BLI_SCAL2BBS_MXN(PRECISION_CHAR) ( conja, - schema, cdim, - cdim_max, - cdim_bcast, n, - n_max, kappa, - a, inca, lda, - p, ldp, - params, - cntx + a, inca, lda, + p, cdim_bcast, ldp + ); + + BLI_SET0S_EDGE(PRECISION_CHAR) + ( + cdim*cdim_bcast, cdim_max*cdim_bcast, + n, n_max, + p, ldp ); } diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c b/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_real.c similarity index 98% rename from kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c rename to kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_real.c index 741714d60a..c853765a2f 100644 --- a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_real.c @@ -37,8 +37,7 @@ PACKM(PRECISION_CHAR, void) { - (void) conja; // Suppress unused parameter warnings - (void) schema; + (void) schema; // Suppress unused parameter warnings (void) params; (void) cntx; const DATATYPE* restrict kappa = kappa_; @@ -341,20 +340,21 @@ PACKM(PRECISION_CHAR, void) // generic kernel else { - REF_KERNEL(PRECISION_CHAR) + BLI_SCAL2BBS_MXN(PRECISION_CHAR) ( conja, - schema, cdim, - cdim_max, - cdim_bcast, n, - n_max, kappa, - a, inca, lda, - p, ldp, - params, - cntx + a, inca, lda, + p, cdim_bcast, ldp + ); + + BLI_SET0S_EDGE(PRECISION_CHAR) + ( + cdim*cdim_bcast, cdim_max*cdim_bcast, + n, n_max, + p, ldp ); } diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c b/kernels/sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr.c similarity index 90% rename from 
kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c rename to kernels/sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr.c index 664d4616f3..564ce25a19 100644 --- a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c +++ b/kernels/sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr.c @@ -39,7 +39,7 @@ #include #include -#define GEMM_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemm_sifive_x280_intr(\ +#define GEMM_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemm_sifive_rvv_intr(\ dim_t m, \ dim_t n, \ dim_t k, \ @@ -61,9 +61,9 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) #define PACKMR 8 -#define PACKNR 64 +#define PACKNR ( 4 * __riscv_v_min_vlen / 32 ) -#include "./bli_gemm_sifive_x280_intr_real.c" +#include "./bli_gemm_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -80,9 +80,9 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) #define PACKMR 8 -#define PACKNR 32 +#define PACKNR ( 4 * __riscv_v_min_vlen / 64 ) -#include "./bli_gemm_sifive_x280_intr_real.c" +#include "./bli_gemm_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -100,9 +100,9 @@ #define LMUL m2 #define FLT_SIZE sizeof(float) #define PACKMR 8 -#define PACKNR 32 +#define PACKNR ( 2 * __riscv_v_min_vlen / 32 ) -#include "./bli_gemm_sifive_x280_intr_complex.c" +#include "./bli_gemm_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -121,9 +121,9 @@ #define LMUL m2 #define FLT_SIZE sizeof(double) #define PACKMR 8 -#define PACKNR 16 +#define PACKNR ( 2 * __riscv_v_min_vlen / 64 ) -#include "./bli_gemm_sifive_x280_intr_complex.c" +#include "./bli_gemm_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c b/kernels/sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr_complex.c similarity index 100% rename from kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr_complex.c diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c b/kernels/sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr_real.c similarity index 100% rename from kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c rename to kernels/sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr_real.c diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c b/kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr.c similarity index 89% rename from kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c rename to kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr.c index 687abec185..9b2b4968f3 100644 --- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c +++ b/kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr.c @@ -35,11 +35,11 @@ // clang-format off #include "blis.h" #include "../../riscv_cmul_macros_intr.h" -#include "../../bli_kernels_sifive_x280.h" +#include "../../bli_kernels_sifive_rvv.h" #include #include -#define GEMMTRSM_L(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_l_sifive_x280_intr(\ +#define GEMMTRSM_L(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_l_sifive_rvv_intr(\ dim_t m, \ dim_t n, \ dim_t k, \ @@ -55,7 +55,7 @@ const cntx_t* restrict 
cntx \ ) -#define GEMMTRSM_U(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_u_sifive_x280_intr(\ +#define GEMMTRSM_U(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_u_sifive_rvv_intr(\ dim_t m, \ dim_t n, \ dim_t k, \ @@ -80,9 +80,9 @@ #define LMUL m4 #define FLT_SIZE sizeof(float) #define PACKMR 8 -#define PACKNR 64 +#define PACKNR ( 4 * __riscv_v_min_vlen / 32 ) -#include "./bli_gemmtrsm_sifive_x280_intr_real.c" +#include "./bli_gemmtrsm_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -99,9 +99,9 @@ #define LMUL m4 #define FLT_SIZE sizeof(double) #define PACKMR 8 -#define PACKNR 32 +#define PACKNR ( 4 * __riscv_v_min_vlen / 64 ) -#include "./bli_gemmtrsm_sifive_x280_intr_real.c" +#include "./bli_gemmtrsm_sifive_rvv_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR @@ -119,9 +119,9 @@ #define LMUL m2 #define FLT_SIZE sizeof(float) #define PACKMR 8 -#define PACKNR 32 +#define PACKNR ( 2 * __riscv_v_min_vlen / 32 ) -#include "./bli_gemmtrsm_sifive_x280_intr_complex.c" +#include "./bli_gemmtrsm_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT @@ -140,9 +140,9 @@ #define LMUL m2 #define FLT_SIZE sizeof(double) #define PACKMR 8 -#define PACKNR 16 +#define PACKNR ( 2 * __riscv_v_min_vlen / 64 ) -#include "./bli_gemmtrsm_sifive_x280_intr_complex.c" +#include "./bli_gemmtrsm_sifive_rvv_intr_complex.c" #undef DATATYPE #undef BASE_DT diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c b/kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_complex.c similarity index 99% rename from kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c rename to kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_complex.c index 88ea04b7a9..7f2fc1c893 100644 --- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c +++ b/kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_complex.c @@ -35,7 +35,7 @@ // clang-format off #ifdef GEMMTRSM -#define GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) bli_##PRECISION_CHAR##gemmtrsm_sifive_x280_intr +#define GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) bli_##PRECISION_CHAR##gemmtrsm_sifive_rvv_intr #define GEMMTRSM_IMPL_NAME(PRECISION_CHAR) GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) static void GEMMTRSM_IMPL_NAME(PRECISION_CHAR) diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c b/kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_real.c similarity index 99% rename from kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c rename to kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_real.c index 7c3c3b8b7b..b628e4cc11 100644 --- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c +++ b/kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_real.c @@ -35,7 +35,7 @@ // clang-format off #ifdef GEMMTRSM -#define GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) bli_##PRECISION_CHAR##gemmtrsm_sifive_x280_intr +#define GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) bli_##PRECISION_CHAR##gemmtrsm_sifive_rvv_intr #define GEMMTRSM_IMPL_NAME(PRECISION_CHAR) GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) static void GEMMTRSM_IMPL_NAME(PRECISION_CHAR) diff --git a/kernels/sifive_rvv/bli_kernels_sifive_rvv.h b/kernels/sifive_rvv/bli_kernels_sifive_rvv.h new file mode 100644 index 0000000000..f9f0f8995c 
--- /dev/null +++ b/kernels/sifive_rvv/bli_kernels_sifive_rvv.h @@ -0,0 +1,162 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// Level 1 +ADDV_KER_PROT(float, s, addv_sifive_rvv_intr) +ADDV_KER_PROT(double, d, addv_sifive_rvv_intr) +ADDV_KER_PROT(scomplex, c, addv_sifive_rvv_intr) +ADDV_KER_PROT(dcomplex, z, addv_sifive_rvv_intr) + +AMAXV_KER_PROT(float, s, amaxv_sifive_rvv_intr) +AMAXV_KER_PROT(double, d, amaxv_sifive_rvv_intr) +AMAXV_KER_PROT(scomplex, c, amaxv_sifive_rvv_intr) +AMAXV_KER_PROT(dcomplex, z, amaxv_sifive_rvv_intr) + +AXPBYV_KER_PROT(float, s, axpbyv_sifive_rvv_intr) +AXPBYV_KER_PROT(double, d, axpbyv_sifive_rvv_intr) +AXPBYV_KER_PROT(scomplex, c, axpbyv_sifive_rvv_intr) +AXPBYV_KER_PROT(dcomplex, z, axpbyv_sifive_rvv_intr) + +AXPYV_KER_PROT(float, s, axpyv_sifive_rvv_intr) +AXPYV_KER_PROT(double, d, axpyv_sifive_rvv_intr) +AXPYV_KER_PROT(scomplex, c, axpyv_sifive_rvv_intr) +AXPYV_KER_PROT(dcomplex, z, axpyv_sifive_rvv_intr) + +COPYV_KER_PROT(float, s, copyv_sifive_rvv_intr) +COPYV_KER_PROT(double, d, copyv_sifive_rvv_intr) +COPYV_KER_PROT(scomplex, c, copyv_sifive_rvv_intr) +COPYV_KER_PROT(dcomplex, z, copyv_sifive_rvv_intr) + +DOTV_KER_PROT(float, s, dotv_sifive_rvv_intr) +DOTV_KER_PROT(double, d, dotv_sifive_rvv_intr) +DOTV_KER_PROT(scomplex, c, dotv_sifive_rvv_intr) +DOTV_KER_PROT(dcomplex, z, dotv_sifive_rvv_intr) + +DOTXV_KER_PROT(float, s, dotxv_sifive_rvv_intr) +DOTXV_KER_PROT(double, d, dotxv_sifive_rvv_intr) +DOTXV_KER_PROT(scomplex, c, dotxv_sifive_rvv_intr) +DOTXV_KER_PROT(dcomplex, z, dotxv_sifive_rvv_intr) + +INVERTV_KER_PROT(float, s, invertv_sifive_rvv_intr) +INVERTV_KER_PROT(double, d, invertv_sifive_rvv_intr) +INVERTV_KER_PROT(scomplex, c, invertv_sifive_rvv_intr) +INVERTV_KER_PROT(dcomplex, z, invertv_sifive_rvv_intr) + +INVSCALV_KER_PROT(float, s, invscalv_sifive_rvv_intr) +INVSCALV_KER_PROT(double, d, invscalv_sifive_rvv_intr) +INVSCALV_KER_PROT(scomplex, c, invscalv_sifive_rvv_intr) 
+INVSCALV_KER_PROT(dcomplex, z, invscalv_sifive_rvv_intr) + +SCAL2V_KER_PROT(float, s, scal2v_sifive_rvv_intr) +SCAL2V_KER_PROT(double, d, scal2v_sifive_rvv_intr) +SCAL2V_KER_PROT(scomplex, c, scal2v_sifive_rvv_intr) +SCAL2V_KER_PROT(dcomplex, z, scal2v_sifive_rvv_intr) + +SCALV_KER_PROT(float, s, scalv_sifive_rvv_intr) +SCALV_KER_PROT(double, d, scalv_sifive_rvv_intr) +SCALV_KER_PROT(scomplex, c, scalv_sifive_rvv_intr) +SCALV_KER_PROT(dcomplex, z, scalv_sifive_rvv_intr) + +SETV_KER_PROT(float, s, setv_sifive_rvv_intr) +SETV_KER_PROT(double, d, setv_sifive_rvv_intr) +SETV_KER_PROT(scomplex, c, setv_sifive_rvv_intr) +SETV_KER_PROT(dcomplex, z, setv_sifive_rvv_intr) + +SUBV_KER_PROT(float, s, subv_sifive_rvv_intr) +SUBV_KER_PROT(double, d, subv_sifive_rvv_intr) +SUBV_KER_PROT(scomplex, c, subv_sifive_rvv_intr) +SUBV_KER_PROT(dcomplex, z, subv_sifive_rvv_intr) + +SWAPV_KER_PROT(float, s, swapv_sifive_rvv_intr) +SWAPV_KER_PROT(double, d, swapv_sifive_rvv_intr) +SWAPV_KER_PROT(scomplex, c, swapv_sifive_rvv_intr) +SWAPV_KER_PROT(dcomplex, z, swapv_sifive_rvv_intr) + +XPBYV_KER_PROT(float, s, xpbyv_sifive_rvv_intr) +XPBYV_KER_PROT(double, d, xpbyv_sifive_rvv_intr) +XPBYV_KER_PROT(scomplex, c, xpbyv_sifive_rvv_intr) +XPBYV_KER_PROT(dcomplex, z, xpbyv_sifive_rvv_intr) + +// Level 1f +AXPY2V_KER_PROT(float, s, axpy2v_sifive_rvv_intr) +AXPY2V_KER_PROT(double, d, axpy2v_sifive_rvv_intr) +AXPY2V_KER_PROT(scomplex, c, axpy2v_sifive_rvv_intr) +AXPY2V_KER_PROT(dcomplex, z, axpy2v_sifive_rvv_intr) + +AXPYF_KER_PROT(float, s, axpyf_sifive_rvv_intr) +AXPYF_KER_PROT(double, d, axpyf_sifive_rvv_intr) +AXPYF_KER_PROT(scomplex, c, axpyf_sifive_rvv_intr) +AXPYF_KER_PROT(dcomplex, z, axpyf_sifive_rvv_intr) + +DOTXF_KER_PROT(float, s, dotxf_sifive_rvv_intr) +DOTXF_KER_PROT(double, d, dotxf_sifive_rvv_intr) +DOTXF_KER_PROT(scomplex, c, dotxf_sifive_rvv_intr) +DOTXF_KER_PROT(dcomplex, z, dotxf_sifive_rvv_intr) + +DOTAXPYV_KER_PROT(float, s, dotaxpyv_sifive_rvv_intr) +DOTAXPYV_KER_PROT(double, d, dotaxpyv_sifive_rvv_intr) +DOTAXPYV_KER_PROT(scomplex, c, dotaxpyv_sifive_rvv_intr) +DOTAXPYV_KER_PROT(dcomplex, z, dotaxpyv_sifive_rvv_intr) + +DOTXAXPYF_KER_PROT(float, s, dotxaxpyf_sifive_rvv_intr) +DOTXAXPYF_KER_PROT(double, d, dotxaxpyf_sifive_rvv_intr) +DOTXAXPYF_KER_PROT(scomplex,c, dotxaxpyf_sifive_rvv_intr) +DOTXAXPYF_KER_PROT(dcomplex,z, dotxaxpyf_sifive_rvv_intr) + +// Level 1m +PACKM_KER_PROT(float, s, packm_sifive_rvv_intr) +PACKM_KER_PROT(double, d, packm_sifive_rvv_intr) +PACKM_KER_PROT(scomplex, c, packm_sifive_rvv_intr) +PACKM_KER_PROT(dcomplex, z, packm_sifive_rvv_intr) + +// Reference 1m +PACKM_KER_PROT(float, ss, packm_sifive_rvv_ref) +PACKM_KER_PROT(double, dd, packm_sifive_rvv_ref) +PACKM_KER_PROT(scomplex, cc, packm_sifive_rvv_ref) +PACKM_KER_PROT(dcomplex, zz, packm_sifive_rvv_ref) + +// Level 3 +GEMM_UKR_PROT(float, s, gemm_sifive_rvv_intr) +GEMM_UKR_PROT(double, d, gemm_sifive_rvv_intr) +GEMM_UKR_PROT(scomplex, c, gemm_sifive_rvv_intr) +GEMM_UKR_PROT(dcomplex, z, gemm_sifive_rvv_intr) + +GEMMTRSM_UKR_PROT(float, s, gemmtrsm_l_sifive_rvv_intr) +GEMMTRSM_UKR_PROT(double, d, gemmtrsm_l_sifive_rvv_intr) +GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_l_sifive_rvv_intr) +GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_l_sifive_rvv_intr) +GEMMTRSM_UKR_PROT(float, s, gemmtrsm_u_sifive_rvv_intr) +GEMMTRSM_UKR_PROT(double, d, gemmtrsm_u_sifive_rvv_intr) +GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_u_sifive_rvv_intr) +GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_u_sifive_rvv_intr) diff --git 
a/kernels/sifive_x280/riscv_cmul_macros_intr.h b/kernels/sifive_rvv/riscv_cmul_macros_intr.h similarity index 100% rename from kernels/sifive_x280/riscv_cmul_macros_intr.h rename to kernels/sifive_rvv/riscv_cmul_macros_intr.h diff --git a/kernels/sifive_x280/riscv_overloaded_intrinsics.h b/kernels/sifive_rvv/riscv_overloaded_intrinsics.h similarity index 99% rename from kernels/sifive_x280/riscv_overloaded_intrinsics.h rename to kernels/sifive_rvv/riscv_overloaded_intrinsics.h index 44f70f2727..794c44c092 100644 --- a/kernels/sifive_x280/riscv_overloaded_intrinsics.h +++ b/kernels/sifive_rvv/riscv_overloaded_intrinsics.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/sifive_x280/bli_kernels_sifive_x280.h b/kernels/sifive_x280/bli_kernels_sifive_x280.h deleted file mode 100644 index ff7b445c47..0000000000 --- a/kernels/sifive_x280/bli_kernels_sifive_x280.h +++ /dev/null @@ -1,162 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -// Level 1 -ADDV_KER_PROT(float, s, addv_sifive_x280_intr) -ADDV_KER_PROT(double, d, addv_sifive_x280_intr) -ADDV_KER_PROT(scomplex, c, addv_sifive_x280_intr) -ADDV_KER_PROT(dcomplex, z, addv_sifive_x280_intr) - -AMAXV_KER_PROT(float, s, amaxv_sifive_x280_intr) -AMAXV_KER_PROT(double, d, amaxv_sifive_x280_intr) -AMAXV_KER_PROT(scomplex, c, amaxv_sifive_x280_intr) -AMAXV_KER_PROT(dcomplex, z, amaxv_sifive_x280_intr) - -AXPBYV_KER_PROT(float, s, axpbyv_sifive_x280_intr) -AXPBYV_KER_PROT(double, d, axpbyv_sifive_x280_intr) -AXPBYV_KER_PROT(scomplex, c, axpbyv_sifive_x280_intr) -AXPBYV_KER_PROT(dcomplex, z, axpbyv_sifive_x280_intr) - -AXPYV_KER_PROT(float, s, axpyv_sifive_x280_intr) -AXPYV_KER_PROT(double, d, axpyv_sifive_x280_intr) -AXPYV_KER_PROT(scomplex, c, axpyv_sifive_x280_intr) -AXPYV_KER_PROT(dcomplex, z, axpyv_sifive_x280_intr) - -COPYV_KER_PROT(float, s, copyv_sifive_x280_intr) -COPYV_KER_PROT(double, d, copyv_sifive_x280_intr) -COPYV_KER_PROT(scomplex, c, copyv_sifive_x280_intr) -COPYV_KER_PROT(dcomplex, z, copyv_sifive_x280_intr) - -DOTV_KER_PROT(float, s, dotv_sifive_x280_intr) -DOTV_KER_PROT(double, d, dotv_sifive_x280_intr) -DOTV_KER_PROT(scomplex, c, dotv_sifive_x280_intr) -DOTV_KER_PROT(dcomplex, z, dotv_sifive_x280_intr) - -DOTXV_KER_PROT(float, s, dotxv_sifive_x280_intr) -DOTXV_KER_PROT(double, d, dotxv_sifive_x280_intr) -DOTXV_KER_PROT(scomplex, c, dotxv_sifive_x280_intr) -DOTXV_KER_PROT(dcomplex, z, dotxv_sifive_x280_intr) - -INVERTV_KER_PROT(float, s, invertv_sifive_x280_intr) -INVERTV_KER_PROT(double, d, invertv_sifive_x280_intr) -INVERTV_KER_PROT(scomplex, c, invertv_sifive_x280_intr) -INVERTV_KER_PROT(dcomplex, z, invertv_sifive_x280_intr) - -INVSCALV_KER_PROT(float, s, invscalv_sifive_x280_intr) -INVSCALV_KER_PROT(double, d, invscalv_sifive_x280_intr) -INVSCALV_KER_PROT(scomplex, c, invscalv_sifive_x280_intr) -INVSCALV_KER_PROT(dcomplex, z, invscalv_sifive_x280_intr) - -SCAL2V_KER_PROT(float, s, scal2v_sifive_x280_intr) -SCAL2V_KER_PROT(double, d, scal2v_sifive_x280_intr) -SCAL2V_KER_PROT(scomplex, c, scal2v_sifive_x280_intr) -SCAL2V_KER_PROT(dcomplex, z, scal2v_sifive_x280_intr) - -SCALV_KER_PROT(float, s, scalv_sifive_x280_intr) -SCALV_KER_PROT(double, d, scalv_sifive_x280_intr) -SCALV_KER_PROT(scomplex, c, scalv_sifive_x280_intr) -SCALV_KER_PROT(dcomplex, z, scalv_sifive_x280_intr) - -SETV_KER_PROT(float, s, setv_sifive_x280_intr) -SETV_KER_PROT(double, d, setv_sifive_x280_intr) -SETV_KER_PROT(scomplex, c, setv_sifive_x280_intr) -SETV_KER_PROT(dcomplex, z, setv_sifive_x280_intr) - -SUBV_KER_PROT(float, s, subv_sifive_x280_intr) -SUBV_KER_PROT(double, d, subv_sifive_x280_intr) -SUBV_KER_PROT(scomplex, c, subv_sifive_x280_intr) -SUBV_KER_PROT(dcomplex, z, subv_sifive_x280_intr) - -SWAPV_KER_PROT(float, s, swapv_sifive_x280_intr) -SWAPV_KER_PROT(double, d, swapv_sifive_x280_intr) -SWAPV_KER_PROT(scomplex, c, swapv_sifive_x280_intr) -SWAPV_KER_PROT(dcomplex, z, swapv_sifive_x280_intr) - -XPBYV_KER_PROT(float, s, xpbyv_sifive_x280_intr) -XPBYV_KER_PROT(double, d, xpbyv_sifive_x280_intr) -XPBYV_KER_PROT(scomplex, c, xpbyv_sifive_x280_intr) -XPBYV_KER_PROT(dcomplex, z, xpbyv_sifive_x280_intr) - -// Level 1f -AXPY2V_KER_PROT(float, s, axpy2v_sifive_x280_intr) -AXPY2V_KER_PROT(double, d, axpy2v_sifive_x280_intr) -AXPY2V_KER_PROT(scomplex, c, axpy2v_sifive_x280_intr) -AXPY2V_KER_PROT(dcomplex, z, axpy2v_sifive_x280_intr) - -AXPYF_KER_PROT(float, s, axpyf_sifive_x280_intr) -AXPYF_KER_PROT(double, d, axpyf_sifive_x280_intr) -AXPYF_KER_PROT(scomplex, c, 
axpyf_sifive_x280_intr) -AXPYF_KER_PROT(dcomplex, z, axpyf_sifive_x280_intr) - -DOTXF_KER_PROT(float, s, dotxf_sifive_x280_intr) -DOTXF_KER_PROT(double, d, dotxf_sifive_x280_intr) -DOTXF_KER_PROT(scomplex, c, dotxf_sifive_x280_intr) -DOTXF_KER_PROT(dcomplex, z, dotxf_sifive_x280_intr) - -DOTAXPYV_KER_PROT(float, s, dotaxpyv_sifive_x280_intr) -DOTAXPYV_KER_PROT(double, d, dotaxpyv_sifive_x280_intr) -DOTAXPYV_KER_PROT(scomplex, c, dotaxpyv_sifive_x280_intr) -DOTAXPYV_KER_PROT(dcomplex, z, dotaxpyv_sifive_x280_intr) - -DOTXAXPYF_KER_PROT(float, s, dotxaxpyf_sifive_x280_intr) -DOTXAXPYF_KER_PROT(double, d, dotxaxpyf_sifive_x280_intr) -DOTXAXPYF_KER_PROT(scomplex,c, dotxaxpyf_sifive_x280_intr) -DOTXAXPYF_KER_PROT(dcomplex,z, dotxaxpyf_sifive_x280_intr) - -// Level 1m -PACKM_KER_PROT(float, s, packm_sifive_x280_intr) -PACKM_KER_PROT(double, d, packm_sifive_x280_intr) -PACKM_KER_PROT(scomplex, c, packm_sifive_x280_intr) -PACKM_KER_PROT(dcomplex, z, packm_sifive_x280_intr) - -// Reference 1m -PACKM_KER_PROT(float, ss, packm_sifive_x280_ref) -PACKM_KER_PROT(double, dd, packm_sifive_x280_ref) -PACKM_KER_PROT(scomplex, cc, packm_sifive_x280_ref) -PACKM_KER_PROT(dcomplex, zz, packm_sifive_x280_ref) - -// Level 3 -GEMM_UKR_PROT(float, s, gemm_sifive_x280_intr) -GEMM_UKR_PROT(double, d, gemm_sifive_x280_intr) -GEMM_UKR_PROT(scomplex, c, gemm_sifive_x280_intr) -GEMM_UKR_PROT(dcomplex, z, gemm_sifive_x280_intr) - -GEMMTRSM_UKR_PROT(float, s, gemmtrsm_l_sifive_x280_intr) -GEMMTRSM_UKR_PROT(double, d, gemmtrsm_l_sifive_x280_intr) -GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_l_sifive_x280_intr) -GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_l_sifive_x280_intr) -GEMMTRSM_UKR_PROT(float, s, gemmtrsm_u_sifive_x280_intr) -GEMMTRSM_UKR_PROT(double, d, gemmtrsm_u_sifive_x280_intr) -GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_u_sifive_x280_intr) -GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_u_sifive_x280_intr) diff --git a/kernels/sifive_x280/riscv_cmul_macros_asm.h b/kernels/sifive_x280/riscv_cmul_macros_asm.h deleted file mode 100644 index 9c33fd7bc5..0000000000 --- a/kernels/sifive_x280/riscv_cmul_macros_asm.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -// macros to emit complex multiplication -// caveat: the destination registers cannot overlap the source registers! -// rd = rs1 * rs2 -#define cmul(rd_r, rd_i, rs1_r, rs1_i, rs2_r, rs2_i) \ - \ - __asm__(FMUL#rd_r", "#rs1_r", "#rs2_r);\ - __asm__(FMUL#rd_i", "#rs1_r", "#rs2_i);\ - __asm__(FNMSUB#rd_r", "#rs1_i", "#rs2_i", "#rd_r);\ - __asm__(FMADD#rd_i", "#rs1_i", "#rs2_r", "#rd_i) - -// vd = vs2 * f[rs1] -#define vcmul_vf(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \ - \ - __asm__("vfmul.vf "#vd_r", "#vs2_r", "#rs1_r);\ - __asm__("vfmul.vf "#vd_i", "#vs2_r", "#rs1_i);\ - __asm__("vfnmsac.vf "#vd_r", "#rs1_i", "#vs2_i);\ - __asm__("vfmacc.vf "#vd_i", "#rs1_r", "#vs2_i) - -#define vcmul_vf2(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \ - \ - __asm__("vfmul.vf "#vd_r", "#vs2_r", %0" : : "f"(rs1_r));\ - __asm__("vfmul.vf "#vd_i", "#vs2_r", %0" : : "f"(rs1_i));\ - __asm__("vfnmsac.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\ - __asm__("vfmacc.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r)) - -// vd = conj(vs2) * f[rs1] -#define vcmul_vf_conj(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \ - \ - __asm__("vfmul.vf "#vd_r", "#vs2_r", "#rs1_r);\ - __asm__("vfmul.vf "#vd_i", "#vs2_r", "#rs1_i);\ - __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\ - __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i) - -#define vcmul_vf_conj2(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \ - \ - __asm__("vfmul.vf "#vd_r", "#vs2_r", %0" : : "f"(rs1_r));\ - __asm__("vfmul.vf "#vd_i", "#vs2_r", %0" : : "f"(rs1_i));\ - __asm__("vfmacc.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\ - __asm__("vfnmsac.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r)) - -// vd += vs2 * f[rs1] -#define vcmacc_vf(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \ - \ - __asm__("vfmacc.vf "#vd_r", "#rs1_r", "#vs2_r);\ - __asm__("vfmacc.vf "#vd_i", "#rs1_i", "#vs2_r);\ - __asm__("vfnmsac.vf "#vd_r", "#rs1_i", "#vs2_i);\ - __asm__("vfmacc.vf "#vd_i", "#rs1_r", "#vs2_i) - -#define vcmacc_vf2(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \ - \ - __asm__("vfmacc.vf "#vd_r", %0, "#vs2_r : : "f"(rs1_r));\ - __asm__("vfmacc.vf "#vd_i", %0, "#vs2_r : : "f"(rs1_i));\ - __asm__("vfnmsac.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\ - __asm__("vfmacc.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r)) - -// vd += conj(vs2) * f[rs1] -#define vcmacc_vf_conj(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \ - \ - __asm__("vfmacc.vf "#vd_r", "#rs1_r", "#vs2_r);\ - __asm__("vfmacc.vf "#vd_i", "#rs1_i", "#vs2_r);\ - __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\ - __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i) - -// vd -= vs2 * f[rs1] -#define vcnmsac_vf(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \ - \ - __asm__("vfnmsac.vf "#vd_r", "#rs1_r", "#vs2_r);\ - __asm__("vfnmsac.vf "#vd_i", "#rs1_i", "#vs2_r);\ - __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\ - __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i) - -// vd = vs2 * vs1 -#define vcmul_vv(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \ - \ - __asm__("vfmul.vv "#vd_r", "#vs2_r", "#vs1_r);\ - __asm__("vfmul.vv "#vd_i", "#vs2_r", "#vs1_i);\ - __asm__("vfnmsac.vv "#vd_r", 
"#vs2_i", "#vs1_i);\ - __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r) - -// vd = vs2 * conj(vs1) -#define vcmul_vv_conj(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \ - \ - __asm__("vfmul.vv "#vd_r", "#vs2_r", "#vs1_r);\ - __asm__("vfmul.vv "#vd_i", "#vs2_r", "#vs1_i);\ - __asm__("vfmacc.vv "#vd_r", "#vs2_i", "#vs1_i);\ - __asm__("vfmsac.vv "#vd_i", "#vs2_i", "#vs1_r) - -// vd += vs2 * vs1 -#define vcmacc_vv(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \ - \ - __asm__("vfmacc.vv "#vd_r", "#vs2_r", "#vs1_r);\ - __asm__("vfmacc.vv "#vd_i", "#vs2_r", "#vs1_i);\ - __asm__("vfnmsac.vv "#vd_r", "#vs2_i", "#vs1_i);\ - __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r) - -// vd += vs2 * conj(vs1) -#define vcmacc_vv_conj(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \ - \ - __asm__("vfmacc.vv "#vd_r", "#vs2_r", "#vs1_r);\ - __asm__("vfnmsac.vv "#vd_i", "#vs2_r", "#vs1_i);\ - __asm__("vfmacc.vv "#vd_r", "#vs2_i", "#vs1_i);\ - __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r) - From 1235d6b291f248d54a834273a9d5900160450287 Mon Sep 17 00:00:00 2001 From: Michael Yeh Date: Thu, 21 Nov 2024 15:21:44 -0800 Subject: [PATCH 8/8] Fix undefs in packm --- .../bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr.c b/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr.c index a0a4eb3c3d..cdd5a4035b 100644 --- a/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr.c +++ b/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr.c @@ -164,8 +164,11 @@ #undef MR #undef NR -#undef REF_KERNEL_ -#undef REF_KERNEL +#undef BLI_SCAL2BBS_MXN_ +#undef BLI_SCAL2BBS_MXN + +#undef BLI_SET0S_EDGE_ +#undef BLI_SET0S_EDGE #undef PACKM #undef PACKM_