From d23eb3b93ec42eae92bfafffc50f1a6d1e0c0d25 Mon Sep 17 00:00:00 2001 From: Vaisakh K V Date: Thu, 5 Dec 2024 11:41:05 +0530 Subject: [PATCH 01/56] Support for SME1 based sgemm_direct kernel for cblas_sgemm level 3 API * Added ARMV9SME target * Added SGEMM_DIRECT kernel based on SME1 --- CMakeLists.txt | 3 +- Makefile.arm64 | 5 + Makefile.system | 8 + TargetList.txt | 1 + c_check | 19 ++ cmake/arch.cmake | 18 +- cmake/cc.cmake | 6 + cmake/prebuild.cmake | 2 +- cmake/system.cmake | 38 +++- cmake/system_check.cmake | 11 + common.h | 1 + common_arm64.h | 2 +- common_param.h | 6 + common_s.h | 4 +- driver/others/dynamic_arm64.c | 34 +++ getarch.c | 13 ++ interface/gemm.c | 70 ++++-- kernel/CMakeLists.txt | 16 +- kernel/Makefile | 4 + kernel/Makefile.L3 | 33 ++- kernel/arm64/KERNEL.ARMV9SME | 3 + kernel/arm64/sgemm_direct_arm64_sme1.c | 59 +++++ kernel/arm64/sgemm_direct_sme1.S | 228 ++++++++++++++++++++ kernel/arm64/sgemm_direct_sme1_preprocess.S | 133 ++++++++++++ kernel/setparam-ref.c | 5 + param.h | 8 +- 26 files changed, 694 insertions(+), 36 deletions(-) create mode 100644 kernel/arm64/KERNEL.ARMV9SME create mode 100644 kernel/arm64/sgemm_direct_arm64_sme1.c create mode 100644 kernel/arm64/sgemm_direct_sme1.S create mode 100644 kernel/arm64/sgemm_direct_sme1_preprocess.S diff --git a/CMakeLists.txt b/CMakeLists.txt index ddff73c2cd..8e99bd208f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,11 +4,12 @@ cmake_minimum_required(VERSION 3.16.0) +set (CMAKE_ASM_SOURCE_FILE_EXTENSIONS "S") project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 28.dev) +set(OpenBLAS_PATCH_VERSION 29.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") diff --git a/Makefile.arm64 b/Makefile.arm64 index fccc0d0d0f..46e4baefc4 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -30,6 +30,11 @@ FCOMMON_OPT += -march=armv8-a+sve endif endif +ifeq ($(CORE), ARMV9SME) +CCOMMON_OPT += -march=armv9-a+sve2+sme +FCOMMON_OPT += -march=armv9-a+sve2 +endif + ifeq ($(CORE), CORTEXA53) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 ifneq ($(F_COMPILER), NAG) diff --git a/Makefile.system b/Makefile.system index 29ea819f13..14830eb4e2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -420,6 +420,7 @@ ifeq ($(ARCH), arm64) export MACOSX_DEPLOYMENT_TARGET=11.0 ifeq ($(C_COMPILER), GCC) export NO_SVE = 1 +export NO_SME = 1 endif else export MACOSX_DEPLOYMENT_TARGET=10.8 @@ -709,6 +710,9 @@ DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += ARMV8SVE DYNAMIC_CORE += A64FX endif +ifneq ($(NO_SME), 1) +DYNAMIC_CORE += ARMV9SME +endif DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 @@ -1474,6 +1478,10 @@ ifeq ($(NO_SVE), 1) CCOMMON_OPT += -DNO_SVE endif +ifeq ($(NO_SME), 1) +CCOMMON_OPT += -DNO_SME +endif + ifdef SMP CCOMMON_OPT += -DSMP_SERVER diff --git a/TargetList.txt b/TargetList.txt index 25eeddfb00..232e12ffa6 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -111,6 +111,7 @@ THUNDERX3T110 VORTEX A64FX ARMV8SVE +ARMV9SME FT2000 9.System Z: diff --git a/c_check b/c_check index c2b52c81b0..0aea55fee7 100755 --- a/c_check +++ b/c_check @@ -331,6 +331,24 @@ if [ "$architecture" = "arm64" ]; then rm -rf "$tmpd" fi +no_sme=0 +if [ "$architecture" = "arm64" ]; then + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') + tmpf="$tmpd/a.S" + printf ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n">> "$tmpf" + args=" -march=armv9-a+sve2+sme -c -o $tmpf.o $tmpf" + no_sme=0 + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + args=" -march=armv9-a+sme -c -o $tmpf.o $tmpf" + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_sme=1 + } + rm -rf "$tmpd" +fi + c11_atomics=0 case "$data" in *HAVE_C11*) @@ -472,6 +490,7 @@ done printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a" [ "$no_msa" -eq 1 ] && printf "NO_MSA=1\n" [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n" + [ "$no_sme" -eq 1 ] && printf "NO_SME=1\n" [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 27ba6f8727..ec91a2d598 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,9 +44,21 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) - if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) - set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 10) # SVE ACLE supported in GCC >= 10 + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + endif () + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14 + set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) + endif() + elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11 + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + endif () + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19 + set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) + endif() endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 775239e1cd..5e9c5a8c42 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -238,6 +238,12 @@ if (${CORE} STREQUAL ARMV8SVE) endif () endif () +if (${CORE} STREQUAL ARMV9SME) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme") + endif () +endif () + if (${CORE} STREQUAL CORTEXA510) if (NOT DYNAMIC_ARCH) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 53a78d782f..f6ca73b7b6 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1014,7 +1014,7 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) - elseif ("${TCORE}" STREQUAL "NEOVERSEN2") + elseif ("${TCORE}" STREQUAL "NEOVERSEN2" or "${TCORE}" STREQUAL "ARMV9SME") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" "#define L1_CODE_LINESIZE\t64\n" diff --git a/cmake/system.cmake b/cmake/system.cmake index 6b891ca0ef..871fdb2e64 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -21,7 +21,15 @@ endif() # Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet? # It seems we are meant to use TARGET as input and CORE internally as kernel. if(NOT DEFINED CORE AND DEFINED TARGET) - set(CORE ${TARGET}) + if (${TARGET} STREQUAL "LOONGSON3R5") + set(CORE "LA464") + elseif (${TARGET} STREQUAL "LOONGSON2K1000") + set(CORE "LA264") + elseif (${TARGET} STREQUAL "LOONGSONGENERIC") + set(CORE "LA64_GENERIC)") + else () + set(CORE ${TARGET}) + endif() endif() # TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. @@ -310,6 +318,9 @@ if (${TARGET} STREQUAL NEOVERSEV1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") endif() endif() + if (${TARGET} STREQUAL ARMV9SME) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3") + endif() if (${TARGET} STREQUAL A64FX) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") @@ -382,6 +393,8 @@ if (NEED_PIC) if (NOT NOFORTRAN) if (${F_COMPILER} STREQUAL "SUN") set(FCOMMON_OPT "${FCOMMON_OPT} -pic") + elseif (${F_COMPILER} STREQUAL "NAGFOR") + set(FCOMMON_OPT "${FCOMMON_OPT} -PIC") else () set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC") endif () @@ -640,17 +653,17 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") endif () if (CMAKE_Fortran_COMPILER) -if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") - set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") - if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") -message(STATUS "removing fortran flags") - set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") + if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") + set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") + if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") + message(STATUS "removing fortran flags") + set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") + endif () + foreach (FILTER_FLAG ${FILTER_FLAGS}) + string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) + string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) + endforeach () endif () - foreach (FILTER_FLAG ${FILTER_FLAGS}) - string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) - string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) - endforeach () -endif () endif () if ("${F_COMPILER}" STREQUAL "GFORTRAN") @@ -670,6 +683,9 @@ endif () if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") endif () +if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE") +endif () if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") if ("${F_COMPILER}" STREQUAL "FLANG") diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 59a1358789..256ab336bb 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -135,6 +135,17 @@ endif() endif() endif() +if (ARM64) +if (NOT NO_SME) + file(WRITE ${PROJECT_BINARY_DIR}/sme.c ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n") + execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv9-a+sve2+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME) +if (NO_SME EQUAL 1) +set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_SME") +endif() + file(REMOVE "${PROJECT_BINARY_DIR}/sme.c" "${PROJECT_BINARY_DIR}/sme.o") +endif() +endif() + include(CheckIncludeFile) CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) if (HAVE_C11 EQUAL 1) diff --git a/common.h b/common.h index b8bac1ad27..766b89cf74 100644 --- a/common.h +++ b/common.h @@ -696,6 +696,7 @@ void gotoblas_profile_init(void); void gotoblas_profile_quit(void); int support_avx512(void); +int support_sme1(void); #ifdef USE_OPENMP diff --git a/common_arm64.h b/common_arm64.h index 595a01995a..5856898a2b 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -175,7 +175,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HUGE_PAGESIZE ( 4 << 20) #ifndef BUFFERSIZE -#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) +#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) || defined(ARMV9SME) #define BUFFER_SIZE (32 << 22) #else #define BUFFER_SIZE (32 << 20) diff --git a/common_param.h b/common_param.h index c082d248e8..e1a87f9693 100644 --- a/common_param.h +++ b/common_param.h @@ -221,6 +221,12 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); #endif +#ifdef ARCH_ARM64 +#ifdef HAVE_SME + void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); +#endif +#endif + int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); diff --git a/common_s.h b/common_s.h index fdd80b62f6..af9d940ae1 100644 --- a/common_s.h +++ b/common_s.h @@ -213,9 +213,9 @@ #ifdef ARCH_X86_64 #define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant #define SGEMM_DIRECT gotoblas -> sgemm_direct -#else +#elif ARCH_ARM64 #define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant -#define SGEMM_DIRECT sgemm_direct +#define SGEMM_DIRECT gotoblas -> sgemm_direct #endif #define SGEMM_ONCOPY gotoblas -> sgemm_oncopy diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index dc88d816fb..3d2bed4af8 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -115,6 +115,11 @@ extern gotoblas_t gotoblas_ARMV8SVE; #else #define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif +#ifdef DYN_ARMV9SME +extern gotoblas_t gotoblas_ARMV9SME; +#else +#define gotoblas_ARMV9SME gotoblas_ARMV8 +#endif #ifdef DYN_CORTEX_A55 extern gotoblas_t gotoblas_CORTEXA55; #else @@ -148,6 +153,13 @@ extern gotoblas_t gotoblas_A64FX; #define gotoblas_ARMV8SVE gotoblas_ARMV8 #define gotoblas_A64FX gotoblas_ARMV8 #endif + +#ifndef NO_SME +extern gotoblas_t gotoblas_ARMV9SME; +#else +#define gotoblas_ARMV9SME gotoblas_ARMV8SVE +#endif + extern gotoblas_t gotoblas_THUNDERX3T110; #endif #define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 @@ -168,6 +180,9 @@ extern void openblas_warning(int verbose, const char * msg); #ifndef HWCAP_SVE #define HWCAP_SVE (1 << 22) #endif +#ifndef HWCAP2_SME +#define HWCAP2_SME 1<<23 +#endif #define get_cpu_ftr(id, var) ({ \ __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ @@ -393,6 +408,13 @@ static gotoblas_t *get_coretype(void) { snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); } + +#if !defined(NO_SME) && defined(HWCAP2_SME) + if ((getauxval(AT_HWCAP2) & HWCAP2_SME)) { + return &gotoblas_ARMV9SME; + } +#endif + #ifndef NO_SVE if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { return &gotoblas_ARMV8SVE; @@ -443,3 +465,15 @@ void gotoblas_dynamic_init(void) { void gotoblas_dynamic_quit(void) { gotoblas = NULL; } + +int support_sme1(void) { + int ret = 0; + +#if (defined OS_LINUX || defined OS_ANDROID) + ret = getauxval(AT_HWCAP2) & HWCAP2_SME; + if(getauxval(AT_HWCAP2) & HWCAP2_SME){ + ret = 1; + } +#endif + return ret; +} diff --git a/getarch.c b/getarch.c index 826dd1ce0a..b51c3ed643 100644 --- a/getarch.c +++ b/getarch.c @@ -1289,6 +1289,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "ARMV8SVE" #endif +#ifdef FORCE_ARMV9SME +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV9SME" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DARMV9SME " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DHAVE_SME -DARMV8 -DARMV9" +#define LIBNAME "armv9sme" +#define CORENAME "ARMV9SME" +#endif #ifdef FORCE_ARMV8 #define FORCE diff --git a/interface/gemm.c b/interface/gemm.c index 576e94593c..2cd7d7b5c3 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -1,5 +1,5 @@ /*********************************************************************/ -/* Copyright 2024 The OpenBLAS Project */ +/* Copyright 2024, 2025 The OpenBLAS Project */ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ @@ -86,7 +86,7 @@ #endif static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, BLASLONG) = { -#ifndef GEMM3M +#if !defined(GEMM3M) || defined(GENERIC) GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN, GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT, GEMM_NR, GEMM_TR, GEMM_RR, GEMM_CR, @@ -177,6 +177,49 @@ static int init_amxtile_permission() { } #endif +#ifdef DYNAMIC_ARCH +extern char* gotoblas_corename(void); +#endif + +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) +static inline int get_gemm_optimal_nthreads_neoversev1(double MNK, int ncpu) { + return + MNK < 262144L ? 1 + : MNK < 1124864L ? MIN(ncpu, 6) + : MNK < 7880599L ? MIN(ncpu, 12) + : MNK < 17173512L ? MIN(ncpu, 16) + : MNK < 33386248L ? MIN(ncpu, 20) + : MNK < 57066625L ? MIN(ncpu, 24) + : MNK < 91733851L ? MIN(ncpu, 32) + : MNK < 265847707L ? MIN(ncpu, 40) + : MNK < 458314011L ? MIN(ncpu, 48) + : MNK < 729000000L ? MIN(ncpu, 56) + : ncpu; +} +#endif + +static inline int get_gemm_optimal_nthreads(double MNK) { + int ncpu = num_cpu_avail(3); +#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) + return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) + if (strcmp(gotoblas_corename(), "neoversev1") == 0) { + return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); + } +#endif + if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) { + return 1; + } + else { + if (MNK/ncpu < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) { + return MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); + } + else { + return ncpu; + } + } +} + #ifndef CBLAS void NAME(char *TRANSA, char *TRANSB, @@ -310,7 +353,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FLOAT *beta = (FLOAT*) vbeta; FLOAT *a = (FLOAT*) va; FLOAT *b = (FLOAT*) vb; - FLOAT *c = (FLOAT*) vc; + FLOAT *c = (FLOAT*) vc; #endif blas_arg_t args; @@ -350,14 +393,21 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) -#ifdef DYNAMIC_ARCH +#if defined(DYNAMIC_ARCH) && defined(ARCH_x86) if (support_avx512() ) -#endif if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); return; } - +#endif +#if defined(DYNAMIC_ARCH) && defined(ARCH_ARM64) + if (support_sme1()){ + if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) { + SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); + return; + } + } +#endif #endif #ifndef COMPLEX @@ -604,13 +654,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif MNK = (double) args.m * (double) args.n * (double) args.k; - if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) - args.nthreads = 1; - else { - args.nthreads = num_cpu_avail(3); - if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) - args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); - } + args.nthreads = get_gemm_optimal_nthreads(MNK); args.common = NULL; diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 74e6760c27..ad7dca9a7b 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTMKERNEL}" "" "rotm_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) @@ -125,6 +126,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SROTMKERNEL}" "" "rotm_k" false "" "" false "SINGLE") endif () if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") @@ -148,6 +150,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") @@ -204,19 +207,27 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) set(USE_TRMM true) endif () - set(USE_DIRECT_SGEMM false) - if (X86_64) + if (X86_64 OR (ARM64 AND (UC_TARGET_CORE MATCHES ARMV9SME))) set(USE_DIRECT_SGEMM true) endif() if (USE_DIRECT_SGEMM) # if (NOT DEFINED SGEMMDIRECTKERNEL) + if (X86_64) set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c) set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c) # endif() GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) + elseif (ARM64) + set (SGEMMDIRECTKERNEL sgemm_direct_arm64_sme1.c) + set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S) + set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S) + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE) + endif () endif() foreach (float_type SINGLE DOUBLE) @@ -1105,6 +1116,7 @@ endif () GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") diff --git a/kernel/Makefile b/kernel/Makefile index 3f9afd3fa1..84cd482a06 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -24,7 +24,11 @@ ifdef NO_AVX2 AVX2OPT= endif + ifdef TARGET_CORE +ifeq ($(TARGET_CORE), ARMV9SME) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv9-a+sve2+sme +endif ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index ed1c74ecff..41f16f9c91 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -24,6 +24,7 @@ endif ifeq ($(ARCH), arm64) USE_TRMM = 1 +USE_DIRECT_SGEMM = 1 endif ifeq ($(ARCH), riscv64) @@ -95,9 +96,17 @@ endif ifdef USE_DIRECT_SGEMM ifndef SGEMMDIRECTKERNEL +ifeq ($(ARCH), x86_64) SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c endif +ifeq ($(ARCH), arm64) +ifeq ($(TARGET_CORE), ARMV9SME) +HAVE_SME = 1 +SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c +endif +endif +endif endif ifeq ($(BUILD_BFLOAT16), 1) @@ -128,9 +137,19 @@ SKERNELOBJS += \ $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) ifdef USE_DIRECT_SGEMM +ifeq ($(ARCH), x86_64) +SKERNELOBJS += \ + sgemm_direct$(TSUFFIX).$(SUFFIX) \ + sgemm_direct_performant$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(ARCH), arm64) +ifdef HAVE_SME SKERNELOBJS += \ sgemm_direct$(TSUFFIX).$(SUFFIX) \ - sgemm_direct_performant$(TSUFFIX).$(SUFFIX) + sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \ + sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) +endif +endif endif endif @@ -809,11 +828,23 @@ else endif ifdef USE_DIRECT_SGEMM +ifeq ($(ARCH), x86_64) $(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif +ifeq ($(ARCH), arm64) +ifdef HAVE_SME +$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +$(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) : + $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@ +$(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) : + $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_preprocess.S -UDOUBLE -UCOMPLEX -o $@ +endif +endif +endif ifeq ($(BUILD_BFLOAT16), 1) diff --git a/kernel/arm64/KERNEL.ARMV9SME b/kernel/arm64/KERNEL.ARMV9SME new file mode 100644 index 0000000000..dc333d8298 --- /dev/null +++ b/kernel/arm64/KERNEL.ARMV9SME @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.ARMV8SVE + + diff --git a/kernel/arm64/sgemm_direct_arm64_sme1.c b/kernel/arm64/sgemm_direct_arm64_sme1.c new file mode 100644 index 0000000000..bd7e548894 --- /dev/null +++ b/kernel/arm64/sgemm_direct_arm64_sme1.c @@ -0,0 +1,59 @@ +/* + Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. + SPDX-License-Identifier: BSD-3-Clause-Clear +*/ + +#include "common.h" +#include +#include +#include + +#if defined(HAVE_SME) + +/* Function prototypes */ +extern void sgemm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc,\ + const float * restrict a, float * a_mod) __asm__("sgemm_direct_sme1_preprocess"); +extern void sgemm_direct_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n,\ + const float * matLeft,\ + const float * restrict matRight,\ + const float * restrict matResult) __asm__("sgemm_direct_sme1_2VLx2VL"); + +/* Function Definitions */ +uint64_t sve_cntw() { + uint64_t cnt; + asm volatile( + "rdsvl %[res], #1\n" + "lsr %[res], %[res], #2\n" + : [res] "=r" (cnt) :: + ); + return cnt; +} + +/*void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K,\ + float * __restrict A, BLASLONG strideA, float * __restrict B,\ + BLASLONG strideB , float * __restrict R, BLASLONG strideR) +*/ +void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ + BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ + float * __restrict R, BLASLONG strideR){ + + uint64_t m_mod, vl_elms; + + vl_elms = sve_cntw(); + + m_mod = ceil((double)M/(double)vl_elms) * vl_elms; + + float *A_mod = (float *) malloc(m_mod*K*sizeof(float)); + + /* Pre-process the left matrix to make it suitable for + matrix sum of outer-product calculation + */ + sgemm_direct_sme1_preprocess(M, K, A, A_mod); + + /* Calculate C = A*B */ + sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R); + + free(A_mod); +} + +#endif diff --git a/kernel/arm64/sgemm_direct_sme1.S b/kernel/arm64/sgemm_direct_sme1.S new file mode 100644 index 0000000000..8c0a173f3d --- /dev/null +++ b/kernel/arm64/sgemm_direct_sme1.S @@ -0,0 +1,228 @@ +/* + Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. + SPDX-License-Identifier: BSD-3-Clause-Clear +*/ + +/*-------------------------------------------------------------------------- + * SME1 based Matrix multiplication code for FP32 input matrices to FP32 + * output matrix + * C = A*B + * A: Left input matrix of dimension M x K + * B: Right input matrix of dimension K x N + * C: Result matrix of dimension M x N + * + * Usage of function: + * sgemm_direct_sme1_2VLx2VL( uint64_t M , uint64_t K, uint64_t N,\ + const float * restrict A_base,\ + const float * restrict B_base,\ + const float * restrict C_base); +----------------------------------------------------------------------------*/ + +#define M x0 //M dimension +#define K x1 //K dimension +#define N x2 //N dimension +#define A_base x3 //Pointer to left matrix(A) +#define B_base x4 //Pointer to right matrix(B) +#define C_base x5 //Pointer to result matrix(C) +#define Aptr x6 //Pointer to traverse A +#define Aptr_end x7 //Pointer to end of row of A +#define Cptr x8 //Pointer to traverse C +#define Cptr0 x9 //2nd Pointer to traverse C +#define Cptr1 x10 //3rd Pointer to traverse C +#define Bptr x11 //Pointer to traverse B +#define Bptr0 x12 //2nd Pointer to traverse B +#define N_exit x14 //Exit condition for N loop +#define K_exit x15 //Exit condition for K loop +#define M_cntr x16 //M loop counter +#define C1 x17 //Constant1: N*(SVLs+1);SVLs-No. of 32-bit elements +#define C2 x18 //Constant2: N + SVLs +#define C3 x19 //Constant3: K*SVLs + SVLs +#define C4 x20 //Constant4: SVLs-2 +#define C5 x21 //Constant5: K*SVLs +#define C6 x22 //Constant6: N*SVLs + + .text + .global sgemm_direct_sme1_2VLx2VL + + sgemm_direct_sme1_2VLx2VL: + + stp x19, x20, [sp, #-48]! + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + + smstart + + cntw C4 //SVLs + mul C5, C4, K //K*SVLs + mul C6, C4, N //N*SVLs + add C1, C6, N //N*SVLs + N + add N_exit, B_base, N, lsl #2 //N_Loop exit conditon + mov M_cntr, #0 + add C2, N, C4 //N + SVLs + add C3, C5, C4 //K*SVLs + SVLs + whilelt p2.s, M_cntr, M //Tile 0,1 predicate (M dimension) + sub w20, w20, #2 //SVLs-2 + +.M_Loop: + incw M_cntr + whilelt p3.s, M_cntr, M //Tile 2,3 predicate (M dimension) + mov Bptr, B_base //B_base + mov Cptr, C_base //C_base + whilelt p0.b, Bptr, N_exit //Tile 0/2 predicate (N dimension) + +.N_Loop: + mov Aptr, A_base //Aptr = A_base + mov Bptr0, Bptr //Bptr = B_base + mov Cptr0, Cptr //Cptr0 = C_base + addvl Cptr1, Cptr, #1 //Cptr1 = C_base + SVLb + addvl Bptr, Bptr, #1 + whilelt p1.b, Bptr, N_exit //Tile 1,3 predicate (N dimension) + add Aptr_end, A_base, C5, lsl #2 //A_base + K*SVLs + addvl K_exit, Aptr_end, #-1 //Exit condition for K loop + //Load 1st vector from Aptr + ld1w {z1.s}, p2/z, [Aptr] + zero {za} + // Load 1st vector from Bptr + ld1w {z2.s}, p0/z, [Bptr0] + // ZA0 += 1st Aptr vector OP 1st Bptr vector + fmopa za0.s, p2/m, p0/m, z1.s, z2.s + // Load 2nd vector from Aptr + ld1w {z5.s}, p3/z, [Aptr, C5, lsl #2] + // Aptr += SVLb + addvl Aptr, Aptr, #1 + +.K_Loop: + // ZA2 += 2nd Aptr vector OP 1st Bptr vector + fmopa za2.s, p3/m, p0/m, z5.s, z2.s + // Load 2nd vector from Bptr + ld1w {z3.s}, p1/z, [Bptr0, #1, MUL VL] + // ZA1 += 1st Aptr vector OP 2nd Bptr vector + fmopa za1.s, p2/m, p1/m, z1.s, z3.s + // Load next 1st vector from Aptr + ld1w {z0.s}, p2/z, [Aptr] + // ZA3 += 2nd Aptr vector OP 2nd Bptr vector + fmopa za3.s, p3/m, p1/m, z5.s, z3.s + cmp K, #2 + b.le process_K_less_than_equal_2 + // Load next 1st vector from Bptr + ld1w {z6.s}, p0/z, [Bptr0, N, lsl #2] + // ZA0 += 1st Aptr vector OP 1st Bptr vector + fmopa za0.s, p2/m, p0/m, z0.s, z6.s + // Load next 2nd vector from Aptr + ld1w {z4.s}, p3/z, [Aptr, C5, lsl #2] + // ZA2 += 2nd Aptr vector OP 1st Bptr vector + fmopa za2.s, p3/m, p0/m, z4.s, z6.s + // Load next 2nd vector from Bptr + ld1w {z7.s}, p1/z, [Bptr0, C2, lsl #2] + // Bptr += 2*ldb FP32 elms [Bytes] + add Bptr0, Bptr0, N, lsl #3 + // ZA1 += 1st Aptr vector OP 2nd Bptr vector + fmopa za1.s, p2/m, p1/m, z0.s, z7.s + // Load next 2nd vector from Aptr + ld1w {z1.s}, p2/z, [Aptr, #1, MUL VL] + // ZA3 += 2nd Aptr vector OP 2nd Bptr vector + fmopa za3.s, p3/m, p1/m, z4.s, z7.s + // Load next 1st vector from Bptr + ld1w {z2.s}, p0/z, [Bptr0] + // ZA0 += 1st Aptr vector OP 1st Bptr vector + fmopa za0.s, p2/m, p0/m, z1.s, z2.s + // Load next 2nd vector from Aptr + ld1w {z5.s}, p3/z, [Aptr, C3, lsl #2] + // Aptr += 2*SVLb [Bytes] + addvl Aptr, Aptr, #2 + cmp Aptr, K_exit + b.mi .K_Loop + // ZA2 += 2nd Aptr vector OP 1st Bptr vector + fmopa za2.s, p3/m, p0/m, z5.s, z2.s + // Load next 2nd vector from Bptr + ld1w {z3.s}, p1/z, [Bptr0, #1, MUL VL] + // ZA1 += 1st Aptr vector OP 2nd Bptr vector + fmopa za1.s, p2/m, p1/m, z1.s, z3.s + // ZA3 += 2nd Aptr vector OP 2nd Bptr vector + fmopa za3.s, p3/m, p1/m, z5.s, z3.s + +process_K_less_than_equal_2: + // Bptr += 2*ldb FP32 elements + add Bptr0, Bptr0, N, lsl #2 + cmp Aptr, Aptr_end + b.pl .Ktail_end + +.Ktail_start: + ld1w {z1.s}, p2/z, [Aptr] + ld1w {z2.s}, p0/z, [Bptr0] + ld1w {z3.s}, p1/z, [Bptr0, #1, MUL VL] + fmopa za0.s, p2/m, p0/m, z1.s, z2.s + ld1w {z5.s}, p3/z, [Aptr, C5, lsl #2] + fmopa za2.s, p3/m, p0/m, z5.s, z2.s + fmopa za1.s, p2/m, p1/m, z1.s, z3.s + fmopa za3.s, p3/m, p1/m, z5.s, z3.s + +.Ktail_end: + mov w13, #0 + psel p4, p0, p2.s[w13, 0] + psel p5, p1, p2.s[w13, 0] + psel p6, p0, p3.s[w13, 0] + psel p7, p1, p3.s[w13, 0] + // Store to Cptr0 + st1w {za0h.s[w13, #0]}, p4, [Cptr0] + // Store to Cptr1 + st1w {za1h.s[w13, #0]}, p5, [Cptr1] + // Store to Cptr0 + N*SVLs + st1w {za2h.s[w13, #0]}, p6, [Cptr0, C6, lsl #2] + // Store to Cptr1 + N*SVLs + st1w {za3h.s[w13, #0]}, p7, [Cptr1, C6, lsl #2] + +.Loop_store_ZA: + psel p4, p0, p2.s[w13, 1] + psel p5, p1, p2.s[w13, 1] + psel p6, p0, p3.s[w13, 1] + psel p7, p1, p3.s[w13, 1] + // Store to Cptr0 + N + st1w {za0h.s[w13, #1]}, p4, [Cptr0, N, lsl #2] + // Store to Cptr1 + N + st1w {za1h.s[w13, #1]}, p5, [Cptr1, N, lsl #2] + // Store to Cptr0 + N*(SVLs+1) + st1w {za2h.s[w13, #1]}, p6, [Cptr0, C1, lsl #2] + // Store to Cptr1 + N*(SVLs+1) + st1w {za3h.s[w13, #1]}, p7, [Cptr1, C1, lsl #2] + + add Cptr0, Cptr0, N, lsl #3 //Cptr0 += 2*N FP32 elements + add Cptr1, Cptr1, N, lsl #3 //Cptr1 += 2*N FP32 elements + add w13, w13, #2 + + psel p4, p0, p2.s[w13, 0] + psel p5, p1, p2.s[w13, 0] + psel p6, p0, p3.s[w13, 0] + psel p7, p1, p3.s[w13, 0] + st1w {za0h.s[w13, #0]}, p4, [Cptr0] + st1w {za1h.s[w13, #0]}, p5, [Cptr1] + st1w {za2h.s[w13, #0]}, p6, [Cptr0, C6, lsl #2] + st1w {za3h.s[w13, #0]}, p7, [Cptr1, C6, lsl #2] + cmp w13, w20 + b.mi .Loop_store_ZA + psel p4, p0, p2.s[w13, 1] + psel p5, p1, p2.s[w13, 1] + psel p6, p0, p3.s[w13, 1] + psel p7, p1, p3.s[w13, 1] + st1w {za0h.s[w13, #1]}, p4, [Cptr0, N, lsl #2] + st1w {za1h.s[w13, #1]}, p5, [Cptr1, N, lsl #2] + st1w {za2h.s[w13, #1]}, p6, [Cptr0, C1, lsl #2] + st1w {za3h.s[w13, #1]}, p7, [Cptr1, C1, lsl #2] + addvl Cptr, Cptr, #2 + addvl Bptr, Bptr, #1 + whilelt p0.b, Bptr, N_exit //1st Tile predicate (N dimension) + b.first .N_Loop + add A_base, A_base, C5, lsl #3 //A_base += 2*K*SVLs FP32 elements + add C_base, C_base, C6, lsl #3 //C_base += 2*N*SVLs FP32 elements + incw M_cntr + whilelt p2.s, M_cntr, M //1st Tile predicate (M dimension) + b.first .M_Loop + + smstop + + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #48 + + ret + diff --git a/kernel/arm64/sgemm_direct_sme1_preprocess.S b/kernel/arm64/sgemm_direct_sme1_preprocess.S new file mode 100644 index 0000000000..fa13620751 --- /dev/null +++ b/kernel/arm64/sgemm_direct_sme1_preprocess.S @@ -0,0 +1,133 @@ +/* + Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. + SPDX-License-Identifier: BSD-3-Clause-Clear +*/ + +/*---------------------------------------------------------------------------- + * This function is used to re-arrange the elements of input matrix to + * make it suitable for matrix outer product computation using SME for matrix + * multiplication. It should be used to pre-process the leftmatrix(A) in the + * matrix muliplication (C= A*B) using sgemm_direct_sme1_2VLx2VL() + * + * The pre-processing transposes a block of SVLs rows of the input matrix and + * stores it contiguously. The same is applied to remaining blocks of SVLs + * rows. The last block of SVLs rows is zero-padded to SVLs rows if needed. + * + * Usage of function: + * sgemm_direct_sme1_preprocess(uint64_t nrow, uint64_t ncol, \ + * const float * restrict mat, float * mat_mod); + * + ----------------------------------------------------------------------------*/ + + +#define nrow x0 //Number of rows of input matrix +#define ncol x1 //Number of coulumns of input matrix +#define mat x2 //Input matrix base address +#define mat_mod x3 //Output matrix (re-arranged matrix) base address +#define mat_mod_ptr x4 //Pointer to output matrix +#define mat_ptr0 x5 //Pointer to input matrix +#define mat_ptr1 x6 //2nd pointer to input matrix +#define outer_loop_cntr x7 //Outer loop counter +#define inner_loop_exit x8 //Inner loop exit condition +#define C1 x9 //Constant1: SVLs - No. of 32-bit elements +#define C2 x10 //Constant2: 3*SVLs +#define C3 x11 //Constant3: ncol*SVLs +#define C4 x13 //Constant4: 2*SVLs +#define C5 x14 //Constant5: 2*ncol +#define C6 x15 //Constant6: 3*ncol + + .text + .global sgemm_direct_sme1_preprocess + + sgemm_direct_sme1_preprocess: + + stp x19, x20, [sp, #-48]! + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + + smstart + + cntw C1 //SVLs + mul C3, C1, ncol //SVLs*ncol + lsl C5, ncol, #1 //2*ncol + add C6, C5, ncol //3*ncol + cnth C4 //2*SVLs + add C2, C1, C1, lsl #1 //3*SVLs + + mov outer_loop_cntr, #0 + //Tile predicate (M dimension) + whilelt p0.s, outer_loop_cntr, nrow + //Predicate for stores + ptrue p9.s + +.M_Loop: + mov mat_ptr0, mat //Load base address of mat + mov mat_mod_ptr, mat_mod //a_mod store base address + add inner_loop_exit, mat, ncol, lsl #2 //Exit condition for inner loop + whilelt p8.b, mat_ptr0, inner_loop_exit //Tile predicate (K dimension) + +.Loop_process: + mov mat_ptr1, mat_ptr0 + //Load_to_tile loop counter + mov w12, #0 + +.Load_to_tile: + psel p2, p8, p0.s[w12, 0] + psel p3, p8, p0.s[w12, 1] + psel p4, p8, p0.s[w12, 2] + psel p5, p8, p0.s[w12, 3] + //Load 1st row from mat_ptr1 + ld1w {za0h.s[w12, #0]}, p2/z, [mat_ptr1] + //Load 2nd row from mat_ptr1 + ncol + ld1w {za0h.s[w12, #1]}, p3/z, [mat_ptr1, ncol, lsl #2] + //Load 3rd row from mat_ptr1 + 2*ncol + ld1w {za0h.s[w12, #2]}, p4/z, [mat_ptr1, C5, lsl #2] + //Load 4th row from mat_ptr1 + 3*ncol + ld1w {za0h.s[w12, #3]}, p5/z, [mat_ptr1, C6, lsl #2] + //mat_ptr1+=4*ncol FP32 elements + add mat_ptr1, mat_ptr1, ncol, lsl #4 + //Increment counter + add w12, w12, #4 + cmp w12, w9 + b.mi .Load_to_tile + // Store_from_tile loop counter + mov w12, #0 + +.Store_from_tile: + psel p2, p9, p8.s[w12, 0] + psel p3, p9, p8.s[w12, 1] + psel p4, p9, p8.s[w12, 2] + psel p5, p9, p8.s[w12, 3] + //Store 1st col to mat_mod + st1w {za0v.s[w12, #0]}, p2, [mat_mod_ptr] + //Store 2nd col to mat_mod + SVLs + st1w {za0v.s[w12, #1]}, p3, [mat_mod_ptr, C1, lsl #2] + //Store 3rd col to mat_mod + 2*SVLs + st1w {za0v.s[w12, #2]}, p4, [mat_mod_ptr, C4, lsl #2] + //Store 4th col to mat_mod + 3*SVLs + st1w {za0v.s[w12, #3]}, p5, [mat_mod_ptr, C2, lsl #2] + + addvl mat_mod_ptr, mat_mod_ptr, #4 //mat_mod_ptr += 4*SVLb + add w12, w12, #4 //Increment counter + cmp w12, w9 + b.mi .Store_from_tile + + addvl mat_ptr0, mat_ptr0, #1 //mat_ptr0 += SVLb + whilelt p8.b, mat_ptr0, inner_loop_exit + b.first .Loop_process + + add mat_mod, mat_mod, C3, lsl #2 //mat_mod+=SVLs*nbc FP32 elements + add mat, mat, C3, lsl #2 //mat+=SVLs*nbc FP32 elements + incw outer_loop_cntr + + whilelt p0.s, outer_loop_cntr, nrow + b.first .M_Loop + + smstop + + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #48 + + ret + diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index fa61a209e1..dece71a66a 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -178,6 +178,11 @@ gotoblas_t TABLE_NAME = { #ifdef ARCH_X86_64 sgemm_directTS, sgemm_direct_performantTS, +#endif +#ifdef ARCH_ARM64 +#ifdef HAVE_SME + sgemm_directTS, +#endif #endif sgemm_kernelTS, sgemm_betaTS, diff --git a/param.h b/param.h index fee9195d02..51ebcbabbe 100644 --- a/param.h +++ b/param.h @@ -3303,6 +3303,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 + + #ifdef _WIN64 /* Use explicit casting for win64 as LLP64 datamodel is used */ #define GEMM_DEFAULT_ALIGN (BLASULONG)0x03fffUL @@ -3667,7 +3669,7 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE +#elif defined(ARMV8SVE) || defined(ARMV9SME) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 @@ -3738,6 +3740,10 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #endif /* ARMv8 */ +#if defined(ARMV9SME) /* ARMv9 SME */ +#define USE_SGEMM_KERNEL_DIRECT 1 +#endif /* ARMv9 SME */ + #if defined(ARMV5) #define SNUMOPT 2 #define DNUMOPT 2 From c1bb90a823eebb2aed4bde424941536c900e2fe1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Feb 2025 14:23:07 +0100 Subject: [PATCH 02/56] remove the express NeoverseN2 target from the Cobalt100 job --- .github/workflows/dynamic_arch.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index b388cb1b26..2d5c7b6123 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -372,7 +372,7 @@ jobs: - name: Build OpenBLAS run: | - make -j${nproc} TARGET=NEOVERSEN2 - make -j${nproc} TARGET=NEOVERSEN2 lapack-test + make -j${nproc} + make -j${nproc} lapack-test From b9ae246f205909c14561d4e8a52b21d72c7f5e8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Feb 2025 23:18:04 +0100 Subject: [PATCH 03/56] define USE_TRMM for RISCV64 targets as well --- kernel/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index b43cda2c14..55daa6d1e8 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -201,7 +201,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.L3 set(USE_TRMM false) string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) - if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) + if (ARM OR ARM64 OR RISCV64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) set(USE_TRMM true) endif () if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) From ebcab9097674f35c1fd7d87fee79875e4703ccab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Feb 2025 23:12:58 +0100 Subject: [PATCH 04/56] Handle flang-new runtime library linking on Linux like classic-flang --- exports/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exports/Makefile b/exports/Makefile index 668a4866e8..04fc64cfe0 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -197,7 +197,7 @@ ifeq ($(F_COMPILER), INTEL) -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. -else ifeq ($(F_COMPILER), FLANG) +else ifeq ($(F_COMPILER), $(filter $(F_COMPILER),FLANG FLANGNEW)) $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) From 6d1444be3ab6617944d1e54c29e46bc90ea5e01f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Feb 2025 14:26:43 +0100 Subject: [PATCH 05/56] Add ARM64 options for NVIDIA HPC --- cmake/cc.cmake | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 5e9c5a8c42..bffc7a7d6a 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -84,7 +84,7 @@ endif () if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC") if (POWER) set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8") - else () + elseif (X86_64) set(CCOMMON_OPT "${CCOMMON_OPT} -tp px") endif () endif () @@ -182,7 +182,9 @@ endif () if (${CORE} STREQUAL A64FX) if (NOT DYNAMIC_ARCH) - if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) + if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=a64fx") + elseif (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") else () set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") @@ -194,6 +196,8 @@ if (${CORE} STREQUAL NEOVERSEN2) if (NOT DYNAMIC_ARCH) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v2") else () if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") @@ -208,6 +212,8 @@ if (${CORE} STREQUAL NEOVERSEV1) if (NOT DYNAMIC_ARCH) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v1") else () if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") @@ -220,7 +226,9 @@ endif () if (${CORE} STREQUAL NEOVERSEN1) if (NOT DYNAMIC_ARCH) - if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) + if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-n1") + elseif (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1") else () set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") @@ -232,6 +240,8 @@ if (${CORE} STREQUAL ARMV8SVE) if (NOT DYNAMIC_ARCH) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve") + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") else () set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") endif () @@ -240,6 +250,9 @@ endif () if (${CORE} STREQUAL ARMV9SME) if (NOT DYNAMIC_ARCH) + if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") + else () set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme") endif () endif () From f1fa370579aa2505975f85327d84b3b169a1228a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Feb 2025 15:22:26 +0100 Subject: [PATCH 06/56] fix missing endif --- cmake/cc.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index bffc7a7d6a..f292f1c577 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -254,6 +254,7 @@ if (${CORE} STREQUAL ARMV9SME) set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") else () set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme") + endif () endif () endif () From b723c1b7b79663583e303e43132ee2e79ed1592c Mon Sep 17 00:00:00 2001 From: Marek Michalowski Date: Thu, 20 Feb 2025 10:18:47 +0000 Subject: [PATCH 07/56] Add thread throttling profile for SGEMM on `NEOVERSEV2` --- CONTRIBUTORS.md | 3 ++- interface/gemm.c | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index f4a93aa1bc..80dd9211fe 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -237,8 +237,9 @@ In chronological order: * [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1 * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel -* Marek Michalowski +* Marek Michalowski * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` + * [2025-02-18] Add thread throttling profile for SGEMM on `NEOVERSEV2` * Ye Tao * [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 diff --git a/interface/gemm.c b/interface/gemm.c index 2cd7d7b5c3..67ab42b484 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -198,14 +198,37 @@ static inline int get_gemm_optimal_nthreads_neoversev1(double MNK, int ncpu) { } #endif +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV2) +static inline int get_gemm_optimal_nthreads_neoversev2(double MNK, int ncpu) { + return + MNK < 125000L ? 1 + : MNK < 1092727L ? MIN(ncpu, 6) + : MNK < 2628072L ? MIN(ncpu, 8) + : MNK < 8000000L ? MIN(ncpu, 12) + : MNK < 20346417L ? MIN(ncpu, 16) + : MNK < 57066625L ? MIN(ncpu, 24) + : MNK < 91125000L ? MIN(ncpu, 28) + : MNK < 238328000L ? MIN(ncpu, 40) + : MNK < 454756609L ? MIN(ncpu, 48) + : MNK < 857375000L ? MIN(ncpu, 56) + : MNK < 1073741824L ? MIN(ncpu, 64) + : ncpu; +} +#endif + static inline int get_gemm_optimal_nthreads(double MNK) { int ncpu = num_cpu_avail(3); #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); +#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) + return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) if (strcmp(gotoblas_corename(), "neoversev1") == 0) { return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); } + if (strcmp(gotoblas_corename(), "neoversev2") == 0) { + return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); + } #endif if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) { return 1; From 650a062e19e452cf1eb77617b14af4d8a838fc27 Mon Sep 17 00:00:00 2001 From: Marek Michalowski Date: Thu, 20 Feb 2025 10:19:40 +0000 Subject: [PATCH 08/56] Add thread throttling profile for SGEMV on `NEOVERSEV2` --- CONTRIBUTORS.md | 1 + interface/gemv.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 80dd9211fe..99166f5203 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -240,6 +240,7 @@ In chronological order: * Marek Michalowski * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` * [2025-02-18] Add thread throttling profile for SGEMM on `NEOVERSEV2` + * [2025-02-19] Add thread throttling profile for SGEMV on `NEOVERSEV2` * Ye Tao * [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 diff --git a/interface/gemv.c b/interface/gemv.c index f91f364eed..4bcdf07c49 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -77,14 +77,30 @@ static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { } #endif +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV2) +static inline int get_gemv_optimal_nthreads_neoversev2(BLASLONG MN, int ncpu) { + return + MN < 24964L ? 1 + : MN < 65536L ? MIN(ncpu, 8) + : MN < 262144L ? MIN(ncpu, 32) + : MN < 1638400L ? MIN(ncpu, 64) + : ncpu; +} +#endif + static inline int get_gemv_optimal_nthreads(BLASLONG MN) { int ncpu = num_cpu_avail(3); #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); +#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) + return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) if (strcmp(gotoblas_corename(), "neoversev1") == 0) { return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); } + if (strcmp(gotoblas_corename(), "neoversev2") == 0) { + return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); + } #endif if ( MN < 115200L * GEMM_MULTITHREAD_THRESHOLD ) From 75b958a0184a614c75ea41446d23d1b066acb2fc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 20 Feb 2025 23:54:12 +0100 Subject: [PATCH 09/56] Transform the B array back if necessary before returning --- interface/gemmt.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/interface/gemmt.c b/interface/gemmt.c index bcccf5a747..aa65f81edf 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -688,5 +688,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, IDEBUG_END; +/* transform B back if necessary */ +#if defined(COMPLEX) + if (transb > 1){ +#ifndef CBLAS + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#else + if (order == CblasColMajor) + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); + if (order == CblasRowMajor) + IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#endif + } +#endif + return; } From f0bea79a6e151299cecdfb191969dbfe88ece814 Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Fri, 21 Feb 2025 10:03:50 +0000 Subject: [PATCH 10/56] dispatch NEOVERSEV2 to NEOVERSEN2 under dynamic setting --- driver/others/dynamic_arm64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 37991184ac..3174e22843 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -150,7 +150,7 @@ extern gotoblas_t gotoblas_A64FX; #endif extern gotoblas_t gotoblas_THUNDERX3T110; #endif -#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 +#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEN2 extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 From 77fba0f400beedc73ac817b2d5b3f86abefa9d7b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Feb 2025 20:09:21 +0100 Subject: [PATCH 11/56] Fix "dummy2" flag handling --- kernel/power/scal.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/power/scal.S b/kernel/power/scal.S index eceb9fe8ed..8fd175d18e 100644 --- a/kernel/power/scal.S +++ b/kernel/power/scal.S @@ -51,7 +51,7 @@ #else #define X r7 #define INCX r8 -#define FLAG r12 +#define FLAG r11 #endif #endif @@ -63,7 +63,7 @@ #else #define X r7 #define INCX r8 -#define FLAG r12 +#define FLAG r11 #endif #endif @@ -91,7 +91,7 @@ fcmpu cr0, FZERO, ALPHA bne- cr0, LL(A1I1) - LDLONG FLAG, 48+64+8(SP) + LDLONG FLAG, 104(SP) cmpwi cr0, FLAG, 1 beq- cr0, LL(A1I1) From 030ae1fd97f04c0ff4536e4e35567147409fb985 Mon Sep 17 00:00:00 2001 From: Harishmcw Date: Tue, 25 Feb 2025 15:40:39 +0530 Subject: [PATCH 12/56] Redefined threading logic for WoA --- interface/gemv.c | 5 +++++ interface/lapack/gesv.c | 10 ++++++---- interface/zgemv.c | 15 ++++++++++----- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/interface/gemv.c b/interface/gemv.c index f91f364eed..0f8fe66782 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -79,6 +79,11 @@ static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { static inline int get_gemv_optimal_nthreads(BLASLONG MN) { int ncpu = num_cpu_avail(3); +#if defined(_WIN64) && defined(_M_ARM64) + if (MN > 100000000L) + return num_cpu_avail(4); + return 1; +#endif #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c index 51a38de60d..21fcc20970 100644 --- a/interface/lapack/gesv.c +++ b/interface/lapack/gesv.c @@ -117,13 +117,15 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, #if defined(_WIN64) && defined(_M_ARM64) #ifdef COMPLEX - if (args.m * args.n > 600) + if (args.m * args.n <= 300) #else - if (args.m * args.n > 1000) + if (args.m * args.n <= 500) #endif - args.nthreads = num_cpu_avail(4); - else args.nthreads = 1; + else if (args.m * args.n <= 1000) + args.nthreads = 4; + else + args.nthreads = num_cpu_avail(4); #else #ifndef DOUBLE if (args.m * args.n < 40000) diff --git a/interface/zgemv.c b/interface/zgemv.c index 3e98dba7ff..3438575b90 100644 --- a/interface/zgemv.c +++ b/interface/zgemv.c @@ -252,25 +252,30 @@ void CNAME(enum CBLAS_ORDER order, #ifdef SMP - if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD ) +#if defined(_WIN64) && defined(_M_ARM64) + if (m*n > 25000000L) + nthreads = num_cpu_avail(4); + else + nthreads = 1; +#else + if (1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD) nthreads = 1; else nthreads = num_cpu_avail(2); +#endif if (nthreads == 1) { -#endif +#endif (gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); #ifdef SMP - } else { - (gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); - } #endif + STACK_FREE(buffer); FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); From 09ba0994615c2e78570ca400bf13da0a8a45d873 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 25 Feb 2025 12:10:48 +0100 Subject: [PATCH 13/56] make throttling code conditional on SMP --- interface/gemm.c | 2 ++ interface/gemv.c | 9 ++------- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index 67ab42b484..d369256291 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -177,6 +177,7 @@ static int init_amxtile_permission() { } #endif +#ifdef SMP #ifdef DYNAMIC_ARCH extern char* gotoblas_corename(void); #endif @@ -242,6 +243,7 @@ static inline int get_gemm_optimal_nthreads(double MNK) { } } } +#endif #ifndef CBLAS diff --git a/interface/gemv.c b/interface/gemv.c index 4bcdf07c49..533ea3a56d 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -63,6 +63,7 @@ static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT }; #endif +#ifdef SMP #ifdef DYNAMIC_ARCH extern char* gotoblas_corename(void); #endif @@ -108,6 +109,7 @@ static inline int get_gemv_optimal_nthreads(BLASLONG MN) { else return num_cpu_avail(2); } +#endif #ifndef CBLAS @@ -248,13 +250,6 @@ void CNAME(enum CBLAS_ORDER order, if (alpha == ZERO) return; -#if 0 -/* this optimization causes stack corruption on x86_64 under OSX, Windows and FreeBSD */ - if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { - GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL); - return; - } -#endif IDEBUG_START; FUNCTION_PROFILE_START(); From edaf51dd99bb979f15fa4f2774ba068cfec0c09e Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Wed, 26 Feb 2025 12:47:11 +0000 Subject: [PATCH 14/56] Add sbgemv_t_bfdot kernel for ARM64 This improves performance for sbgemv_t by up to 100x on NEOVERSEV1. The geometric mean speedup is ~61x for M=N=[2,512]. --- CONTRIBUTORS.md | 1 + kernel/arm64/KERNEL.NEOVERSEN2 | 1 + kernel/arm64/KERNEL.NEOVERSEV1 | 1 + kernel/arm64/KERNEL.NEOVERSEV2 | 4 + kernel/arm64/sbgemv_t_bfdot.c | 207 +++++++++++++++++++++++++++++++++ 5 files changed, 214 insertions(+) create mode 100644 kernel/arm64/sbgemv_t_bfdot.c diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 99166f5203..9ce5e37de3 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -236,6 +236,7 @@ In chronological order: * Annop Wongwathanarat * [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1 * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel + * [2025-02-26] Add sbgemv_t_bfdot kernel * Marek Michalowski * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index 2f7400113b..e4e1cfde31 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -198,3 +198,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +SBGEMVTKERNEL = sbgemv_t_bfdot.c \ No newline at end of file diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index 8845e6860a..374acb35b8 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -15,4 +15,5 @@ SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +SBGEMVTKERNEL = sbgemv_t_bfdot.c endif \ No newline at end of file diff --git a/kernel/arm64/KERNEL.NEOVERSEV2 b/kernel/arm64/KERNEL.NEOVERSEV2 index bc59990979..4d866f8584 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV2 +++ b/kernel/arm64/KERNEL.NEOVERSEV2 @@ -1 +1,5 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE + +ifeq ($(BUILD_BFLOAT16), 1) +SBGEMVTKERNEL = sbgemv_t_bfdot.c +endif \ No newline at end of file diff --git a/kernel/arm64/sbgemv_t_bfdot.c b/kernel/arm64/sbgemv_t_bfdot.c new file mode 100644 index 0000000000..0751690fcd --- /dev/null +++ b/kernel/arm64/sbgemv_t_bfdot.c @@ -0,0 +1,207 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +static inline float bf16_to_fp32(bfloat16 bf16) { + uint32_t fp32 = (uint32_t)bf16 << 16; + return *((float*)&fp32); +} + +int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) +{ + if (m < 1 || n < 1) return(0); + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + bfloat16_t *a_ptr; + bfloat16_t *x_ptr; + float *y_ptr; + float temp; + + iy = 0; + a_ptr = (bfloat16_t*)(a); + x_ptr = (bfloat16_t*)(x); + + if (incx == 1) { + BLASLONG width = n / 4; + + bfloat16_t *a0_ptr = a_ptr + lda * width * 0; + bfloat16_t *a1_ptr = a_ptr + lda * width * 1; + bfloat16_t *a2_ptr = a_ptr + lda * width * 2; + bfloat16_t *a3_ptr = a_ptr + lda * width * 3; + + float *y0_ptr = y + incy * width * 0; + float *y1_ptr = y + incy * width * 1; + float *y2_ptr = y + incy * width * 2; + float *y3_ptr = y + incy * width * 3; + + for (j = 0; j < width; j++) { + float32x4_t temp0_vec = vdupq_n_f32(0.0f); + float32x4_t temp1_vec = vdupq_n_f32(0.0f); + float32x4_t temp2_vec = vdupq_n_f32(0.0f); + float32x4_t temp3_vec = vdupq_n_f32(0.0f); + + i = 0; + while (i + 7 < m) { + bfloat16x8_t x_vec = vld1q_bf16(x_ptr + i); + + bfloat16x8_t a0_vec = vld1q_bf16(a0_ptr + i); + bfloat16x8_t a1_vec = vld1q_bf16(a1_ptr + i); + bfloat16x8_t a2_vec = vld1q_bf16(a2_ptr + i); + bfloat16x8_t a3_vec = vld1q_bf16(a3_ptr + i); + + temp0_vec = vbfdotq_f32(temp0_vec, a0_vec, x_vec); + temp1_vec = vbfdotq_f32(temp1_vec, a1_vec, x_vec); + temp2_vec = vbfdotq_f32(temp2_vec, a2_vec, x_vec); + temp3_vec = vbfdotq_f32(temp3_vec, a3_vec, x_vec); + + i += 8; + } + if (i + 3 < m) { + float32x2_t t0 = vdup_n_f32(0.0f); + float32x2_t t1 = vdup_n_f32(0.0f); + float32x2_t t2 = vdup_n_f32(0.0f); + float32x2_t t3 = vdup_n_f32(0.0f); + + bfloat16x4_t x_vec = vld1_bf16(x_ptr + i); + + bfloat16x4_t a0_vec = vld1_bf16(a0_ptr + i); + bfloat16x4_t a1_vec = vld1_bf16(a1_ptr + i); + bfloat16x4_t a2_vec = vld1_bf16(a2_ptr + i); + bfloat16x4_t a3_vec = vld1_bf16(a3_ptr + i); + + t0 = vbfdot_f32(t0, a0_vec, x_vec); + t1 = vbfdot_f32(t1, a1_vec, x_vec); + t2 = vbfdot_f32(t2, a2_vec, x_vec); + t3 = vbfdot_f32(t3, a3_vec, x_vec); + + float32x2_t temp0_vec_low = vget_low_f32(temp0_vec); + float32x2_t temp1_vec_low = vget_low_f32(temp1_vec); + float32x2_t temp2_vec_low = vget_low_f32(temp2_vec); + float32x2_t temp3_vec_low = vget_low_f32(temp3_vec); + + temp0_vec = vcombine_f32(vadd_f32(t0, temp0_vec_low), vget_high_f32(temp0_vec)); + temp1_vec = vcombine_f32(vadd_f32(t1, temp1_vec_low), vget_high_f32(temp1_vec)); + temp2_vec = vcombine_f32(vadd_f32(t2, temp2_vec_low), vget_high_f32(temp2_vec)); + temp3_vec = vcombine_f32(vadd_f32(t3, temp3_vec_low), vget_high_f32(temp3_vec)); + + i += 4; + } + if (beta == 0.0f) { + y0_ptr[iy] = alpha * vaddvq_f32(temp0_vec); + y1_ptr[iy] = alpha * vaddvq_f32(temp1_vec); + y2_ptr[iy] = alpha * vaddvq_f32(temp2_vec); + y3_ptr[iy] = alpha * vaddvq_f32(temp3_vec); + } + else { + y0_ptr[iy] = alpha * vaddvq_f32(temp0_vec) + beta * y0_ptr[iy]; + y1_ptr[iy] = alpha * vaddvq_f32(temp1_vec) + beta * y1_ptr[iy]; + y2_ptr[iy] = alpha * vaddvq_f32(temp2_vec) + beta * y2_ptr[iy]; + y3_ptr[iy] = alpha * vaddvq_f32(temp3_vec) + beta * y3_ptr[iy]; + } + + for (; i < m; ++i) { + y0_ptr[iy] += alpha * a0_ptr[i] * x_ptr[i]; + y1_ptr[iy] += alpha * a1_ptr[i] * x_ptr[i]; + y2_ptr[iy] += alpha * a2_ptr[i] * x_ptr[i]; + y3_ptr[iy] += alpha * a3_ptr[i] * x_ptr[i]; + } + + iy += incy; + + a0_ptr += lda; + a1_ptr += lda; + a2_ptr += lda; + a3_ptr += lda; + } + + a_ptr = a3_ptr; + y_ptr = y3_ptr; + for (j = width * 4; j < n; j++) { + float32x4_t temp0_vec = vdupq_n_f32(0.0f); + i = 0; + while (i + 7 < m) { + bfloat16x8_t x_vec = vld1q_bf16(x_ptr + i); + bfloat16x8_t a0_vec = vld1q_bf16(a_ptr + i); + temp0_vec = vbfdotq_f32(temp0_vec, a0_vec, x_vec); + + i += 8; + } + if (i + 3 < m) { + float32x2_t t0 = vdup_n_f32(0.0f); + bfloat16x4_t x_vec = vld1_bf16(x_ptr + i); + bfloat16x4_t a0_vec = vld1_bf16(a_ptr + i); + + t0 = vbfdot_f32(t0, a0_vec, x_vec); + float32x2_t temp0_vec_low = vget_low_f32(temp0_vec); + temp0_vec = vcombine_f32(vadd_f32(t0, temp0_vec_low), vget_high_f32(temp0_vec)); + + i += 4; + } + if (beta == 0.0f) { + y_ptr[iy] = alpha * vaddvq_f32(temp0_vec); + } + else { + y_ptr[iy] = alpha * vaddvq_f32(temp0_vec) + beta * y_ptr[iy]; + } + + for (; i < m; ++i) { + y_ptr[iy] += alpha * a_ptr[i] * x_ptr[i]; + } + + iy += incy; + + a_ptr += lda; + } + return(0); + } + + for (j = 0; j < n; j++) { + temp = 0.0; + ix = 0; + for (i = 0; i < m; i++) { + temp += bf16_to_fp32(a[i]) * bf16_to_fp32(x[ix]); + ix += incx; + } + if (beta == 0.0f) { + y[iy] = alpha * temp; + } + else { + y[iy] = alpha * temp + beta * y[iy]; + } + iy += incy; + a += lda; + } + return (0); +} From 35bdbca1535c3297a64d304977c6aa28e2e57e49 Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Thu, 27 Feb 2025 18:15:17 +0000 Subject: [PATCH 15/56] Add sbgemv_n_neon kernel for arm64. --- CONTRIBUTORS.md | 1 + kernel/arm64/KERNEL.NEOVERSEV1 | 2 + kernel/arm64/sbgemv_n_neon.c | 542 +++++++++++++++++++++++++++++++++ 3 files changed, 545 insertions(+) create mode 100644 kernel/arm64/sbgemv_n_neon.c diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index f4a93aa1bc..9edf3d6eae 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -242,3 +242,4 @@ In chronological order: * Ye Tao * [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 + * [2025-02-27] Add sbgemv_n_neon kernel \ No newline at end of file diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index 8845e6860a..d14993544c 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -15,4 +15,6 @@ SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) + +SBGEMVNKERNEL = sbgemv_n_neon.c endif \ No newline at end of file diff --git a/kernel/arm64/sbgemv_n_neon.c b/kernel/arm64/sbgemv_n_neon.c new file mode 100644 index 0000000000..cdb54298c0 --- /dev/null +++ b/kernel/arm64/sbgemv_n_neon.c @@ -0,0 +1,542 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if (defined(__GNUC__) && __GNUC__ >= 13) +#define BF16_TO_FP32(bf16) ((float)(bf16)) +#else +static inline float bf16_to_fp32(bfloat16_t bf16) { + uint32_t fp32 = (uint32_t)(*((u_int16_t*)(&bf16))) << 16; + return *((float*)&fp32); +} +#define BF16_TO_FP32(bf16) bf16_to_fp32(bf16) +#endif + +static void beta_op(float *x, BLASLONG n, FLOAT beta) { + if (beta == 0) { + memset(x, 0, n * sizeof(float)); + return; + } + + float32x4_t y0, y1, y2, y3; + + for (BLASLONG i = 0; i < n / 16; i++) { + y0 = vld1q_f32(x); + y1 = vld1q_f32(x + 4); + y2 = vld1q_f32(x + 8); + y3 = vld1q_f32(x + 12); + + y0 = vmulq_n_f32(y0, beta); + y1 = vmulq_n_f32(y1, beta); + y2 = vmulq_n_f32(y2, beta); + y3 = vmulq_n_f32(y3, beta); + + vst1q_f32(x, y0); + vst1q_f32(x + 4, y1); + vst1q_f32(x + 8, y2); + vst1q_f32(x + 12, y3); + + x += 16; + } + + if (n & 15) { + BLASLONG rest_n = n & 15; + for (BLASLONG i = 0; i < (rest_n) / 4; i++) { + y0 = vld1q_f32(x); + y0 = vmulq_n_f32(y0, beta); + vst1q_f32(x, y0); + x += 4; + } + + if (rest_n & 3) { + x[0] *= beta; + if ((rest_n & 3) > 1) + x[1] *= beta; + if ((rest_n & 3) > 2) + x[2] *= beta; + } + } + return; +} + +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, + bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) { + BLASLONG i, j; + bfloat16_t *a_ptr, *x_ptr; + FLOAT *y_ptr; + + bfloat16x8_t a0, a1, a2, a3, a4, a5, a6, a7; + bfloat16x8_t t0, t1, t2, t3, t4, t5, t6, t7; + bfloat16x8_t x_vec; + float32x4_t y1_vec, y2_vec; + float32x4_t fp32_low, fp32_high; + + float x0, x1, x2, x3, x4, x5, x6, x7; + bfloat16_t *a_ptr0, *a_ptr1, *a_ptr2, *a_ptr3, *a_ptr4, *a_ptr5, *a_ptr6, + *a_ptr7; + + a_ptr = (bfloat16_t *)a; + x_ptr = (bfloat16_t *)x; + + BLASLONG rest_m = m & 3; + + bfloat16x4_t bf16_zero = vreinterpret_bf16_u16(vdup_n_u16(0)); + bfloat16x8_t bf16_zero_q = vreinterpretq_bf16_u16(vdupq_n_u16(0)); + + if (incx == 1 && incy == 1) { + if (beta != 1) { + beta_op(y, n, beta); + } + + for (i = 0; i < n / 8; i++) { + a_ptr0 = a_ptr; + a_ptr1 = a_ptr0 + lda; + a_ptr2 = a_ptr1 + lda; + a_ptr3 = a_ptr2 + lda; + a_ptr4 = a_ptr3 + lda; + a_ptr5 = a_ptr4 + lda; + a_ptr6 = a_ptr5 + lda; + a_ptr7 = a_ptr6 + lda; + + a_ptr += 8 * lda; + + y_ptr = y; + + x_vec = vld1q_bf16(x_ptr); + + if (alpha != 1) { + fp32_low = vreinterpretq_f32_u16( + vzip1q_u16(vreinterpretq_u16_bf16(bf16_zero_q), + vreinterpretq_u16_bf16(x_vec))); + fp32_high = vreinterpretq_f32_u16( + vzip2q_u16(vreinterpretq_u16_bf16(bf16_zero_q), + vreinterpretq_u16_bf16(x_vec))); + + fp32_low = vmulq_n_f32(fp32_low, alpha); + fp32_high = vmulq_n_f32(fp32_high, alpha); + + x_vec = + vcombine_bf16(vcvt_bf16_f32(fp32_low), vcvt_bf16_f32(fp32_high)); + } + + for (j = 0; j < m / 8; j++) { + a0 = vld1q_bf16(a_ptr0); + a1 = vld1q_bf16(a_ptr1); + a2 = vld1q_bf16(a_ptr2); + a3 = vld1q_bf16(a_ptr3); + a4 = vld1q_bf16(a_ptr4); + a5 = vld1q_bf16(a_ptr5); + a6 = vld1q_bf16(a_ptr6); + a7 = vld1q_bf16(a_ptr7); + + y1_vec = vld1q_f32(y_ptr); + y2_vec = vld1q_f32(y_ptr + 4); + + t0 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t1 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + t2 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); + t3 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); + + t4 = vreinterpretq_bf16_u16( + vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t5 = vreinterpretq_bf16_u16( + vzip2q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + t6 = vreinterpretq_bf16_u16( + vzip2q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); + t7 = vreinterpretq_bf16_u16( + vzip2q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); + + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t0, x_vec, 0); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t0, x_vec, 1); + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t1, x_vec, 2); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t1, x_vec, 3); + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t2, x_vec, 4); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t2, x_vec, 5); + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t3, x_vec, 6); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t3, x_vec, 7); + + y2_vec = vbfmlalbq_laneq_f32(y2_vec, t4, x_vec, 0); + y2_vec = vbfmlaltq_laneq_f32(y2_vec, t4, x_vec, 1); + y2_vec = vbfmlalbq_laneq_f32(y2_vec, t5, x_vec, 2); + y2_vec = vbfmlaltq_laneq_f32(y2_vec, t5, x_vec, 3); + y2_vec = vbfmlalbq_laneq_f32(y2_vec, t6, x_vec, 4); + y2_vec = vbfmlaltq_laneq_f32(y2_vec, t6, x_vec, 5); + y2_vec = vbfmlalbq_laneq_f32(y2_vec, t7, x_vec, 6); + y2_vec = vbfmlaltq_laneq_f32(y2_vec, t7, x_vec, 7); + + vst1q_f32(y_ptr, y1_vec); + vst1q_f32(y_ptr + 4, y2_vec); + + a_ptr0 += 8; + a_ptr1 += 8; + a_ptr2 += 8; + a_ptr3 += 8; + a_ptr4 += 8; + a_ptr5 += 8; + a_ptr6 += 8; + a_ptr7 += 8; + + y_ptr += 8; + } + + if (m & 4) { + bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); + bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); + bfloat16x4_t a2x4 = vld1_bf16(a_ptr2); + bfloat16x4_t a3x4 = vld1_bf16(a_ptr3); + bfloat16x4_t a4x4 = vld1_bf16(a_ptr4); + bfloat16x4_t a5x4 = vld1_bf16(a_ptr5); + bfloat16x4_t a6x4 = vld1_bf16(a_ptr6); + bfloat16x4_t a7x4 = vld1_bf16(a_ptr7); + + y1_vec = vld1q_f32(y_ptr); + + a0 = vcombine_bf16(a0x4, bf16_zero); + a1 = vcombine_bf16(a1x4, bf16_zero); + a2 = vcombine_bf16(a2x4, bf16_zero); + a3 = vcombine_bf16(a3x4, bf16_zero); + a4 = vcombine_bf16(a4x4, bf16_zero); + a5 = vcombine_bf16(a5x4, bf16_zero); + a6 = vcombine_bf16(a6x4, bf16_zero); + a7 = vcombine_bf16(a7x4, bf16_zero); + + t0 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t1 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + t2 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); + t3 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); + + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t0, x_vec, 0); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t0, x_vec, 1); + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t1, x_vec, 2); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t1, x_vec, 3); + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t2, x_vec, 4); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t2, x_vec, 5); + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t3, x_vec, 6); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t3, x_vec, 7); + + vst1q_f32(y_ptr, y1_vec); + + a_ptr0 += 4; + a_ptr1 += 4; + a_ptr2 += 4; + a_ptr3 += 4; + a_ptr4 += 4; + a_ptr5 += 4; + a_ptr6 += 4; + a_ptr7 += 4; + + y_ptr += 4; + } + + if (rest_m) { + x0 = alpha * BF16_TO_FP32(x_ptr[0]); + x1 = alpha * BF16_TO_FP32(x_ptr[1]); + x2 = alpha * BF16_TO_FP32(x_ptr[2]); + x3 = alpha * BF16_TO_FP32(x_ptr[3]); + x4 = alpha * BF16_TO_FP32(x_ptr[4]); + x5 = alpha * BF16_TO_FP32(x_ptr[5]); + x6 = alpha * BF16_TO_FP32(x_ptr[6]); + x7 = alpha * BF16_TO_FP32(x_ptr[7]); + + for (BLASLONG j = 0; j < rest_m; j++) { + y_ptr[j] += x0 * BF16_TO_FP32(a_ptr0[j]); + y_ptr[j] += x1 * BF16_TO_FP32(a_ptr1[j]); + y_ptr[j] += x2 * BF16_TO_FP32(a_ptr2[j]); + y_ptr[j] += x3 * BF16_TO_FP32(a_ptr3[j]); + y_ptr[j] += x4 * BF16_TO_FP32(a_ptr4[j]); + y_ptr[j] += x5 * BF16_TO_FP32(a_ptr5[j]); + y_ptr[j] += x6 * BF16_TO_FP32(a_ptr6[j]); + y_ptr[j] += x7 * BF16_TO_FP32(a_ptr7[j]); + } + } + + x_ptr += 8; + } + + if (n & 4) { + a_ptr0 = a_ptr; + a_ptr1 = a_ptr0 + lda; + a_ptr2 = a_ptr1 + lda; + a_ptr3 = a_ptr2 + lda; + + a_ptr += 4 * lda; + + bfloat16x4_t x_vecx4 = vld1_bf16(x_ptr); + if (alpha != 1) { + x_vec = vcombine_bf16(x_vecx4, bf16_zero); + fp32_low = vreinterpretq_f32_u16( + vzip1q_u16(vreinterpretq_u16_bf16(bf16_zero_q), + vreinterpretq_u16_bf16(x_vec))); + fp32_low = vmulq_n_f32(fp32_low, alpha); + x_vecx4 = vcvt_bf16_f32(fp32_low); + } + + y_ptr = y; + for (j = 0; j < m / 8; j++) { + a0 = vld1q_bf16(a_ptr0); + a1 = vld1q_bf16(a_ptr1); + a2 = vld1q_bf16(a_ptr2); + a3 = vld1q_bf16(a_ptr3); + + y1_vec = vld1q_f32(y_ptr); + y2_vec = vld1q_f32(y_ptr + 4); + + t0 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t1 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + t4 = vreinterpretq_bf16_u16( + vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t5 = vreinterpretq_bf16_u16( + vzip2q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + + y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); + y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); + y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2); + y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3); + + y2_vec = vbfmlalbq_lane_f32(y2_vec, t4, x_vecx4, 0); + y2_vec = vbfmlaltq_lane_f32(y2_vec, t4, x_vecx4, 1); + y2_vec = vbfmlalbq_lane_f32(y2_vec, t5, x_vecx4, 2); + y2_vec = vbfmlaltq_lane_f32(y2_vec, t5, x_vecx4, 3); + + vst1q_f32(y_ptr, y1_vec); + vst1q_f32(y_ptr + 4, y2_vec); + + a_ptr0 += 8; + a_ptr1 += 8; + a_ptr2 += 8; + a_ptr3 += 8; + + y_ptr += 8; + } + + if (m & 4) { + bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); + bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); + bfloat16x4_t a2x4 = vld1_bf16(a_ptr2); + bfloat16x4_t a3x4 = vld1_bf16(a_ptr3); + + y1_vec = vld1q_f32(y_ptr); + + a0 = vcombine_bf16(a0x4, bf16_zero); + a1 = vcombine_bf16(a1x4, bf16_zero); + a2 = vcombine_bf16(a2x4, bf16_zero); + a3 = vcombine_bf16(a3x4, bf16_zero); + + t0 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t1 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + + y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); + y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); + y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2); + y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3); + + vst1q_f32(y_ptr, y1_vec); + + a_ptr0 += 4; + a_ptr1 += 4; + a_ptr2 += 4; + a_ptr3 += 4; + + y_ptr += 4; + } + + if (rest_m) { + x0 = alpha * BF16_TO_FP32(x_ptr[0]); + x1 = alpha * BF16_TO_FP32(x_ptr[1]); + x2 = alpha * BF16_TO_FP32(x_ptr[2]); + x3 = alpha * BF16_TO_FP32(x_ptr[3]); + + for (BLASLONG j = 0; j < rest_m; j++) { + y_ptr[j] += x0 * BF16_TO_FP32(a_ptr0[j]); + y_ptr[j] += x1 * BF16_TO_FP32(a_ptr1[j]); + y_ptr[j] += x2 * BF16_TO_FP32(a_ptr2[j]); + y_ptr[j] += x3 * BF16_TO_FP32(a_ptr3[j]); + } + } + + x_ptr += 4; + } + + if (n & 2) { + a_ptr0 = a_ptr; + a_ptr1 = a_ptr0 + lda; + + a_ptr += 2 * lda; + + bfloat16_t tmp_buffer[4]; + memset((void*)tmp_buffer, 0, sizeof(bfloat16_t)); + + tmp_buffer[0] = x_ptr[0]; + tmp_buffer[1] = x_ptr[1]; + + bfloat16x4_t x_vecx4 = vld1_bf16(tmp_buffer); + if (alpha != 1) { + x_vec = vcombine_bf16(x_vecx4, bf16_zero); + fp32_low = vreinterpretq_f32_u16( + vzip1q_u16(vreinterpretq_u16_bf16(bf16_zero_q), + vreinterpretq_u16_bf16(x_vec))); + fp32_low = vmulq_n_f32(fp32_low, alpha); + x_vecx4 = vcvt_bf16_f32(fp32_low); + } + + y_ptr = y; + for (j = 0; j < m / 8; j++) { + a0 = vld1q_bf16(a_ptr0); + a1 = vld1q_bf16(a_ptr1); + + y1_vec = vld1q_f32(y_ptr); + y2_vec = vld1q_f32(y_ptr + 4); + + t0 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t4 = vreinterpretq_bf16_u16( + vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + + y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); + y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); + + y2_vec = vbfmlalbq_lane_f32(y2_vec, t4, x_vecx4, 0); + y2_vec = vbfmlaltq_lane_f32(y2_vec, t4, x_vecx4, 1); + + vst1q_f32(y_ptr, y1_vec); + vst1q_f32(y_ptr + 4, y2_vec); + + a_ptr0 += 8; + a_ptr1 += 8; + + y_ptr += 8; + } + + if (m & 4) { + bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); + bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); + + y1_vec = vld1q_f32(y_ptr); + + a0 = vcombine_bf16(a0x4, bf16_zero); + a1 = vcombine_bf16(a1x4, bf16_zero); + + t0 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t1 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + + y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); + y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); + y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2); + y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3); + + vst1q_f32(y_ptr, y1_vec); + + a_ptr0 += 4; + a_ptr1 += 4; + a_ptr2 += 4; + a_ptr3 += 4; + + y_ptr += 4; + } + + if (m & 2) { + float x0, x1; + x0 = alpha * (BF16_TO_FP32(x_ptr[0])); + x1 = alpha * (BF16_TO_FP32(x_ptr[1])); + + y_ptr[0] += x0 * BF16_TO_FP32(a_ptr0[0]); + y_ptr[0] += x1 * BF16_TO_FP32(a_ptr1[0]); + y_ptr[1] += x0 * BF16_TO_FP32(a_ptr0[1]); + y_ptr[1] += x1 * BF16_TO_FP32(a_ptr1[1]); + + a_ptr0 += 2; + a_ptr1 += 2; + + y_ptr += 2; + } + + if (m & 1) { + float x0, x1; + x0 = alpha * BF16_TO_FP32(x_ptr[0]); + x1 = alpha * BF16_TO_FP32(x_ptr[1]); + + y_ptr[0] += x0 * BF16_TO_FP32(a_ptr0[0]); + y_ptr[0] += x1 * BF16_TO_FP32(a_ptr1[0]); + } + + x_ptr += 2; + } + + if (n & 1) { + x0 = BF16_TO_FP32(x_ptr[0]) * alpha; + y_ptr = y; + a_ptr0 = a_ptr; + + for (j = 0; j < m; j++) { + y_ptr[j] += x0 * BF16_TO_FP32(a_ptr0[j]); + } + } + + return (0); + } + + BLASLONG iy = 0; + for (i = 0; i < m; i++) { + y[iy] *= beta; + iy += incy; + } + + for (j = 0; j < n; j++) { + x0 = alpha * BF16_TO_FP32(*x_ptr); + iy = 0; + for (i = 0; i < m; i++) { + y[iy] += x0 * BF16_TO_FP32(a_ptr[i]); + iy += incy; + } + + a_ptr += lda; + x_ptr += incx; + } + + return (0); +} From 4346b9155970dc80b794f88583411549778fde74 Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Fri, 28 Feb 2025 13:17:46 +0000 Subject: [PATCH 16/56] add beta and alpha testcase for sbgemv --- test/compare_sgemm_sbgemm.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 05d9b33aba..ae109c1a56 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2020, The OpenBLAS Project +Copyright (c) 2020,2025 The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -202,6 +202,8 @@ main (int argc, char *argv[]) return ret; } + for (beta = 0; beta < 3; beta += 1) { + for (alpha = 0; alpha < 3; alpha += 1) { for (l = 0; l < 2; l++) { // l = 1 to test inc_x & inc_y not equal to one. for (x = 1; x <= loop; x++) { @@ -230,7 +232,10 @@ main (int argc, char *argv[]) B[j << l] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; sbstobf16_(&one, &B[j << l], &one, &btmp, &one); BB[j << l].v = btmp; + + CC[j << l] = C[j << l] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; } + for (y = 0; y < 2; y++) { if (y == 0) { @@ -246,12 +251,14 @@ main (int argc, char *argv[]) SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k); SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k); + for (int i = 0; i < x; i ++) DD[i] *= beta; + for (j = 0; j < x; j++) for (i = 0; i < x; i++) if (transA == 'N') { - DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j << l]); + DD[i] += alpha * float16to32 (AA[j * x + i]) * float16to32 (BB[j << l]); } else if (transA == 'T') { - DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i << l]); + DD[j] += alpha * float16to32 (AA[j * x + i]) * float16to32 (BB[i << l]); } for (j = 0; j < x; j++) { @@ -268,8 +275,10 @@ main (int argc, char *argv[]) free(BB); free(DD); free(CC); - } - } + } // x + } // l + } // alpha + } // beta if (ret != 0) fprintf (stderr, "FATAL ERROR SBGEMV - Return code: %d\n", ret); From 35914aa9a2a862d61e391bc40a1620e11df64ad3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Mar 2025 22:54:59 +0100 Subject: [PATCH 17/56] Expose the option to build without LAPACKE to ccmake --- CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e99bd208f..f8b63041ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,6 +21,8 @@ include(CMakePackageConfigHelpers) ####### option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF) +option(BUILD_WITHOUT_LAPACKE "Do not build the C interface to LAPACK)" OFF) + option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)") @@ -81,6 +83,10 @@ if(BUILD_WITHOUT_LAPACK) set(NO_LAPACKE 1) endif() +if (BUILD_WITHOUT_LAPACKE) + set(NO_LAPACKE 1) +endif() + if(BUILD_WITHOUT_CBLAS) set(NO_CBLAS 1) endif() From 38ee7c93011946ac3cb5231600e09a0a583d93b0 Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Mon, 3 Mar 2025 11:30:45 +0000 Subject: [PATCH 18/56] Add dispatch of SBGEMVNKERNEL for NEOVERSEN2 and NEOVERSEV2 --- kernel/arm64/KERNEL.NEOVERSEN2 | 3 ++- kernel/arm64/KERNEL.NEOVERSEV2 | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index e4e1cfde31..fc7fe69302 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -198,4 +198,5 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) -SBGEMVTKERNEL = sbgemv_t_bfdot.c \ No newline at end of file +SBGEMVTKERNEL = sbgemv_t_bfdot.c +SBGEMVNKERNEL = sbgemv_n_neon.c \ No newline at end of file diff --git a/kernel/arm64/KERNEL.NEOVERSEV2 b/kernel/arm64/KERNEL.NEOVERSEV2 index 4d866f8584..e08efdb9d8 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV2 +++ b/kernel/arm64/KERNEL.NEOVERSEV2 @@ -2,4 +2,5 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE ifeq ($(BUILD_BFLOAT16), 1) SBGEMVTKERNEL = sbgemv_t_bfdot.c +SBGEMVNKERNEL = sbgemv_n_neon.c endif \ No newline at end of file From 6b8b35cdf2de638d67396fe624cf624087fb9d0a Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Mon, 3 Mar 2025 11:55:27 +0000 Subject: [PATCH 19/56] fix minior issues of redeclaration of float x0,x1 in sbgemv_n_neon.c --- kernel/arm64/sbgemv_n_neon.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/arm64/sbgemv_n_neon.c b/kernel/arm64/sbgemv_n_neon.c index cdb54298c0..9e7ea03c8b 100644 --- a/kernel/arm64/sbgemv_n_neon.c +++ b/kernel/arm64/sbgemv_n_neon.c @@ -480,7 +480,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (m & 2) { - float x0, x1; x0 = alpha * (BF16_TO_FP32(x_ptr[0])); x1 = alpha * (BF16_TO_FP32(x_ptr[1])); @@ -496,7 +495,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (m & 1) { - float x0, x1; x0 = alpha * BF16_TO_FP32(x_ptr[0]); x1 = alpha * BF16_TO_FP32(x_ptr[1]); From 39eb43d4410d0745ca2c78d1162d973abf3aa35b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Mar 2025 13:48:28 +0100 Subject: [PATCH 20/56] Improve thread safety of pthreads builds that rely on C11 atomic operations for locking (#5170) * Tighten memory orders for C11 atomic operations --- driver/others/blas_server.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 7306a3ecd8..4b79136ec7 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -146,8 +146,8 @@ typedef struct { } thread_status_t; #ifdef HAVE_C11 -#define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED) -#define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) +#define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_ACQUIRE) +#define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELEASE) #else #define atomic_load_queue(p) (blas_queue_t*)(*(volatile blas_queue_t**)(p)) #define atomic_store_queue(p, v) (*(volatile blas_queue_t* volatile*)(p) = (v)) @@ -637,7 +637,9 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ #ifdef SMP_SERVER // Handle lazy re-init of the thread-pool after a POSIX fork + LOCK_COMMAND(&server_lock); if (unlikely(blas_server_avail == 0)) blas_thread_init(); + UNLOCK_COMMAND(&server_lock); #endif BLASLONG i = 0; blas_queue_t *current = queue; From 5c4e38ab17eb530e950e68e1d45ea7a2fcd25cea Mon Sep 17 00:00:00 2001 From: manjam01 Date: Thu, 27 Feb 2025 09:39:06 +0000 Subject: [PATCH 21/56] Optimize gemv_n_sve kernel --- kernel/arm64/KERNEL.ARMV8SVE | 2 +- kernel/arm64/gemv_n_sve.c | 83 ++++++++++++++++++++++++++++++------ 2 files changed, 72 insertions(+), 13 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index dc58e329fc..9adacce632 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -74,7 +74,7 @@ DSCALKERNEL = scal.S CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S -SGEMVNKERNEL = gemv_n.S +SGEMVNKERNEL = gemv_n_sve.c DGEMVNKERNEL = gemv_n.S CGEMVNKERNEL = zgemv_n.S ZGEMVNKERNEL = zgemv_n.S diff --git a/kernel/arm64/gemv_n_sve.c b/kernel/arm64/gemv_n_sve.c index 2950555615..59a5c85572 100644 --- a/kernel/arm64/gemv_n_sve.c +++ b/kernel/arm64/gemv_n_sve.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2024, The OpenBLAS Project +Copyright (c) 2024-2025, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -59,23 +59,82 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr = a; if (inc_y == 1) { + BLASLONG width = n / 3; uint64_t sve_size = SV_COUNT(); - for (j = 0; j < n; j++) { - SV_TYPE temp_vec = SV_DUP(alpha * x[ix]); - i = 0; - svbool_t pg = SV_WHILE(i, m); - while (svptest_any(SV_TRUE(), pg)) { - SV_TYPE a_vec = svld1(pg, a_ptr + i); + svbool_t pg_true = SV_TRUE(); + svbool_t pg = SV_WHILE(0, m % sve_size); + + FLOAT *a0_ptr = a + lda * width * 0; + FLOAT *a1_ptr = a + lda * width * 1; + FLOAT *a2_ptr = a + lda * width * 2; + + for (j = 0; j < width; j++) { + for (i = 0; (i + sve_size - 1) < m; i += sve_size) { + ix = j * inc_x; + + SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); + SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); + SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); + + SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i); + SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i); + SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i); + + SV_TYPE y_vec = svld1(pg_true, y + i); + y_vec = svmla_lane(y_vec, a00_vec, x0_vec, 0); + y_vec = svmla_lane(y_vec, a01_vec, x1_vec, 0); + y_vec = svmla_lane(y_vec, a02_vec, x2_vec, 0); + + svst1(pg_true, y + i, y_vec); + } + + if (i < m) { + SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); + SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); + SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); + + SV_TYPE a00_vec = svld1(pg, a0_ptr + i); + SV_TYPE a01_vec = svld1(pg, a1_ptr + i); + SV_TYPE a02_vec = svld1(pg, a2_ptr + i); + SV_TYPE y_vec = svld1(pg, y + i); - y_vec = svmla_x(pg, y_vec, temp_vec, a_vec); + y_vec = svmla_m(pg, y_vec, a00_vec, x0_vec); + y_vec = svmla_m(pg, y_vec, a01_vec, x1_vec); + y_vec = svmla_m(pg, y_vec, a02_vec, x2_vec); + + ix += inc_x; + svst1(pg, y + i, y_vec); - i += sve_size; - pg = SV_WHILE(i, m); } + + a0_ptr += lda; + a1_ptr += lda; + a2_ptr += lda; + } + + a_ptr = a2_ptr; + for (j = width * 3; j < n; j++) { + ix = j * inc_x; + for (i = 0; (i + sve_size - 1) < m; i += sve_size) { + SV_TYPE y_vec = svld1(pg_true, y + i); + SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); + SV_TYPE a_vec = svld1(pg_true, a_ptr + i); + y_vec = svmla_x(pg_true, y_vec, a_vec, x_vec); + svst1(pg_true, y + i, y_vec); + } + + if (i < m) { + SV_TYPE y_vec = svld1(pg, y + i); + SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); + SV_TYPE a_vec = svld1(pg, a_ptr + i); + y_vec = svmla_m(pg, y_vec, a_vec, x_vec); + svst1(pg, y + i, y_vec); + } + a_ptr += lda; ix += inc_x; } - return(0); + return (0); } for (j = 0; j < n; j++) { @@ -89,4 +148,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ix += inc_x; } return (0); -} +} \ No newline at end of file From 80d3c2ad95781211b77272a1cfc9d77ba7ec402d Mon Sep 17 00:00:00 2001 From: Masato Nakagawa Date: Tue, 11 Mar 2025 20:18:20 +0900 Subject: [PATCH 22/56] Add Improving Load Imbalance in Thread-Parallel GEMM --- driver/level3/level3_thread.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 9b1aadf7dc..77aaeee6b9 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -591,7 +591,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG BLASLONG nthreads = args -> nthreads; - BLASLONG width, i, j, k, js; + BLASLONG width, width_n, i, j, k, js; BLASLONG m, n, n_from, n_to; int mode; #if defined(DYNAMIC_ARCH) @@ -740,18 +740,25 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG /* Partition (a step of) n into nthreads regions */ range_N[0] = js; num_parts = 0; - while (n > 0){ - width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); - if (width < switch_ratio) { - width = switch_ratio; + for(j = 0; j < nthreads_n; j++){ + width_n = blas_quickdivide(n + nthreads_n - j - 1, nthreads_n - j); + n -= width_n; + for(i = 0; i < nthreads_m; i++){ + width = blas_quickdivide(width_n + nthreads_m - i - 1, nthreads_m - i); + if (width < switch_ratio) { + width = switch_ratio; + } + width = round_up(width_n, width, GEMM_PREFERED_SIZE); + + width_n -= width; + if (width_n < 0) { + width = width + width_n; + width_n = 0; + } + range_N[num_parts + 1] = range_N[num_parts] + width; + + num_parts ++; } - width = round_up(n, width, GEMM_PREFERED_SIZE); - - n -= width; - if (n < 0) width = width + n; - range_N[num_parts + 1] = range_N[num_parts] + width; - - num_parts ++; } for (j = num_parts; j < MAX_CPU_NUMBER; j++) { range_N[j + 1] = range_N[num_parts]; From a085b6c9ec38de7109fe95322db677fc18c31696 Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Wed, 12 Mar 2025 14:49:10 +0000 Subject: [PATCH 23/56] Fix aarch64 sbgemv_t compilation error for GCC < 13 --- CONTRIBUTORS.md | 1 + kernel/arm64/sbgemv_t_bfdot.c | 17 ++++++----------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 041582892d..938a3bf918 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -237,6 +237,7 @@ In chronological order: * [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1 * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel * [2025-02-26] Add sbgemv_t_bfdot kernel + * [2025-03-12] Fix aarch64 sbgemv_t compilation error for GCC < 13 * Marek Michalowski * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` diff --git a/kernel/arm64/sbgemv_t_bfdot.c b/kernel/arm64/sbgemv_t_bfdot.c index 0751690fcd..fc4ae019e9 100644 --- a/kernel/arm64/sbgemv_t_bfdot.c +++ b/kernel/arm64/sbgemv_t_bfdot.c @@ -33,11 +33,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -static inline float bf16_to_fp32(bfloat16 bf16) { - uint32_t fp32 = (uint32_t)bf16 << 16; - return *((float*)&fp32); -} - int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) { if (m < 1 || n < 1) return(0); @@ -132,10 +127,10 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat } for (; i < m; ++i) { - y0_ptr[iy] += alpha * a0_ptr[i] * x_ptr[i]; - y1_ptr[iy] += alpha * a1_ptr[i] * x_ptr[i]; - y2_ptr[iy] += alpha * a2_ptr[i] * x_ptr[i]; - y3_ptr[iy] += alpha * a3_ptr[i] * x_ptr[i]; + y0_ptr[iy] += alpha * vcvtah_f32_bf16(a0_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); + y1_ptr[iy] += alpha * vcvtah_f32_bf16(a1_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); + y2_ptr[iy] += alpha * vcvtah_f32_bf16(a2_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); + y3_ptr[iy] += alpha * vcvtah_f32_bf16(a3_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); } iy += incy; @@ -177,7 +172,7 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat } for (; i < m; ++i) { - y_ptr[iy] += alpha * a_ptr[i] * x_ptr[i]; + y_ptr[iy] += alpha * vcvtah_f32_bf16(a_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); } iy += incy; @@ -191,7 +186,7 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat temp = 0.0; ix = 0; for (i = 0; i < m; i++) { - temp += bf16_to_fp32(a[i]) * bf16_to_fp32(x[ix]); + temp += vcvtah_f32_bf16(a_ptr[i]) * vcvtah_f32_bf16(x_ptr[ix]); ix += incx; } if (beta == 0.0f) { From 4c00099ed65af573912065a69d83ce42a9aa0cba Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Wed, 12 Mar 2025 16:20:15 +0000 Subject: [PATCH 24/56] replace customize bf16_to_fp32 with arm neon vcvtah_f32_bf16 --- kernel/arm64/sbgemv_n_neon.c | 86 ++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 48 deletions(-) diff --git a/kernel/arm64/sbgemv_n_neon.c b/kernel/arm64/sbgemv_n_neon.c index 9e7ea03c8b..489d4d22cb 100644 --- a/kernel/arm64/sbgemv_n_neon.c +++ b/kernel/arm64/sbgemv_n_neon.c @@ -33,16 +33,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if (defined(__GNUC__) && __GNUC__ >= 13) -#define BF16_TO_FP32(bf16) ((float)(bf16)) -#else -static inline float bf16_to_fp32(bfloat16_t bf16) { - uint32_t fp32 = (uint32_t)(*((u_int16_t*)(&bf16))) << 16; - return *((float*)&fp32); -} -#define BF16_TO_FP32(bf16) bf16_to_fp32(bf16) -#endif - static void beta_op(float *x, BLASLONG n, FLOAT beta) { if (beta == 0) { memset(x, 0, n * sizeof(float)); @@ -268,24 +258,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (rest_m) { - x0 = alpha * BF16_TO_FP32(x_ptr[0]); - x1 = alpha * BF16_TO_FP32(x_ptr[1]); - x2 = alpha * BF16_TO_FP32(x_ptr[2]); - x3 = alpha * BF16_TO_FP32(x_ptr[3]); - x4 = alpha * BF16_TO_FP32(x_ptr[4]); - x5 = alpha * BF16_TO_FP32(x_ptr[5]); - x6 = alpha * BF16_TO_FP32(x_ptr[6]); - x7 = alpha * BF16_TO_FP32(x_ptr[7]); + x0 = alpha * vcvtah_f32_bf16(x_ptr[0]); + x1 = alpha * vcvtah_f32_bf16(x_ptr[1]); + x2 = alpha * vcvtah_f32_bf16(x_ptr[2]); + x3 = alpha * vcvtah_f32_bf16(x_ptr[3]); + x4 = alpha * vcvtah_f32_bf16(x_ptr[4]); + x5 = alpha * vcvtah_f32_bf16(x_ptr[5]); + x6 = alpha * vcvtah_f32_bf16(x_ptr[6]); + x7 = alpha * vcvtah_f32_bf16(x_ptr[7]); for (BLASLONG j = 0; j < rest_m; j++) { - y_ptr[j] += x0 * BF16_TO_FP32(a_ptr0[j]); - y_ptr[j] += x1 * BF16_TO_FP32(a_ptr1[j]); - y_ptr[j] += x2 * BF16_TO_FP32(a_ptr2[j]); - y_ptr[j] += x3 * BF16_TO_FP32(a_ptr3[j]); - y_ptr[j] += x4 * BF16_TO_FP32(a_ptr4[j]); - y_ptr[j] += x5 * BF16_TO_FP32(a_ptr5[j]); - y_ptr[j] += x6 * BF16_TO_FP32(a_ptr6[j]); - y_ptr[j] += x7 * BF16_TO_FP32(a_ptr7[j]); + y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); + y_ptr[j] += x1 * vcvtah_f32_bf16(a_ptr1[j]); + y_ptr[j] += x2 * vcvtah_f32_bf16(a_ptr2[j]); + y_ptr[j] += x3 * vcvtah_f32_bf16(a_ptr3[j]); + y_ptr[j] += x4 * vcvtah_f32_bf16(a_ptr4[j]); + y_ptr[j] += x5 * vcvtah_f32_bf16(a_ptr5[j]); + y_ptr[j] += x6 * vcvtah_f32_bf16(a_ptr6[j]); + y_ptr[j] += x7 * vcvtah_f32_bf16(a_ptr7[j]); } } @@ -384,16 +374,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (rest_m) { - x0 = alpha * BF16_TO_FP32(x_ptr[0]); - x1 = alpha * BF16_TO_FP32(x_ptr[1]); - x2 = alpha * BF16_TO_FP32(x_ptr[2]); - x3 = alpha * BF16_TO_FP32(x_ptr[3]); + x0 = alpha * vcvtah_f32_bf16(x_ptr[0]); + x1 = alpha * vcvtah_f32_bf16(x_ptr[1]); + x2 = alpha * vcvtah_f32_bf16(x_ptr[2]); + x3 = alpha * vcvtah_f32_bf16(x_ptr[3]); for (BLASLONG j = 0; j < rest_m; j++) { - y_ptr[j] += x0 * BF16_TO_FP32(a_ptr0[j]); - y_ptr[j] += x1 * BF16_TO_FP32(a_ptr1[j]); - y_ptr[j] += x2 * BF16_TO_FP32(a_ptr2[j]); - y_ptr[j] += x3 * BF16_TO_FP32(a_ptr3[j]); + y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); + y_ptr[j] += x1 * vcvtah_f32_bf16(a_ptr1[j]); + y_ptr[j] += x2 * vcvtah_f32_bf16(a_ptr2[j]); + y_ptr[j] += x3 * vcvtah_f32_bf16(a_ptr3[j]); } } @@ -480,13 +470,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (m & 2) { - x0 = alpha * (BF16_TO_FP32(x_ptr[0])); - x1 = alpha * (BF16_TO_FP32(x_ptr[1])); + x0 = alpha * (vcvtah_f32_bf16(x_ptr[0])); + x1 = alpha * (vcvtah_f32_bf16(x_ptr[1])); - y_ptr[0] += x0 * BF16_TO_FP32(a_ptr0[0]); - y_ptr[0] += x1 * BF16_TO_FP32(a_ptr1[0]); - y_ptr[1] += x0 * BF16_TO_FP32(a_ptr0[1]); - y_ptr[1] += x1 * BF16_TO_FP32(a_ptr1[1]); + y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]); + y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]); + y_ptr[1] += x0 * vcvtah_f32_bf16(a_ptr0[1]); + y_ptr[1] += x1 * vcvtah_f32_bf16(a_ptr1[1]); a_ptr0 += 2; a_ptr1 += 2; @@ -495,23 +485,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (m & 1) { - x0 = alpha * BF16_TO_FP32(x_ptr[0]); - x1 = alpha * BF16_TO_FP32(x_ptr[1]); + x0 = alpha * vcvtah_f32_bf16(x_ptr[0]); + x1 = alpha * vcvtah_f32_bf16(x_ptr[1]); - y_ptr[0] += x0 * BF16_TO_FP32(a_ptr0[0]); - y_ptr[0] += x1 * BF16_TO_FP32(a_ptr1[0]); + y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]); + y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]); } x_ptr += 2; } if (n & 1) { - x0 = BF16_TO_FP32(x_ptr[0]) * alpha; + x0 = vcvtah_f32_bf16(x_ptr[0]) * alpha; y_ptr = y; a_ptr0 = a_ptr; for (j = 0; j < m; j++) { - y_ptr[j] += x0 * BF16_TO_FP32(a_ptr0[j]); + y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); } } @@ -525,10 +515,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } for (j = 0; j < n; j++) { - x0 = alpha * BF16_TO_FP32(*x_ptr); + x0 = alpha * vcvtah_f32_bf16(*x_ptr); iy = 0; for (i = 0; i < m; i++) { - y[iy] += x0 * BF16_TO_FP32(a_ptr[i]); + y[iy] += x0 * vcvtah_f32_bf16(a_ptr[i]); iy += incy; } From b34235ca66a0116990a28abd3408252fde4a3d7e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Mar 2025 22:41:50 +0100 Subject: [PATCH 25/56] Fix inclusion of deprecated interfaces and cgesvdq/strsyl3 --- cmake/lapacke.cmake | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index f43bf10d08..94224d8baf 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -98,6 +98,8 @@ set(CSRC lapacke_cgesv_work.c lapacke_cgesvd.c lapacke_cgesvd_work.c + lapacke_cgesvdq.c + lapacke_cgesvdq_work.c lapacke_cgesvdx.c lapacke_cgesvdx_work.c lapacke_cgesvj.c @@ -1766,8 +1768,8 @@ set(SSRC lapacke_strsna_work.c lapacke_strsyl.c lapacke_strsyl_work.c - lapacke_ctrsyl3.c - lapacke_ctrsyl3_work.c + lapacke_strsyl3.c + lapacke_strsyl3_work.c lapacke_strtri.c lapacke_strtri_work.c lapacke_strtrs.c @@ -2410,10 +2412,10 @@ set(ZSRC lapacke_ilaver.c ) if (BUILD_LAPACK_DEPRECATED) -set(SRCS $SRCS lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c) -set(SRCD $SRCD lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c) -set(SRCC $SRCC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c) -set(SRCZ $SRCZ lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c) + list(APPEND SSRC lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c) + list(APPEND DSRC lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c) + list(APPEND CSRC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c) + list(APPEND ZSRC lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c) endif() set(SRCX From 8a418b1aab753c4101d7b5129398417c2ee18c87 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Mar 2025 23:20:16 +0100 Subject: [PATCH 26/56] Add dummy implementations for the LAPACK_COMPLEX_CUSTOM case --- lapack-netlib/LAPACKE/utils/lapacke_make_complex_double.c | 2 ++ lapack-netlib/LAPACKE/utils/lapacke_make_complex_float.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/lapack-netlib/LAPACKE/utils/lapacke_make_complex_double.c b/lapack-netlib/LAPACKE/utils/lapacke_make_complex_double.c index f6fb74b18f..274ece33c7 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_make_complex_double.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_make_complex_double.c @@ -48,4 +48,6 @@ lapack_complex_double lapack_make_complex_double( double re, double im ) { #endif return z; } +#else +lapack_complex_double lapack_make_complex_double( double re, double im ) {} #endif diff --git a/lapack-netlib/LAPACKE/utils/lapacke_make_complex_float.c b/lapack-netlib/LAPACKE/utils/lapacke_make_complex_float.c index c04eb084cd..c76e59c916 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_make_complex_float.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_make_complex_float.c @@ -48,4 +48,6 @@ lapack_complex_float lapack_make_complex_float( float re, float im ) { #endif return z; } +#else +lapack_complex_float lapack_make_complex_float( float re, float im ) {} #endif From 9807f56580fca7f06dd9ef1e14673748cf025e31 Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Wed, 12 Mar 2025 21:26:27 +0000 Subject: [PATCH 27/56] Optimize aarch64 sgemm_ncopy --- CONTRIBUTORS.md | 1 + kernel/arm64/sgemm_ncopy_4.S | 47 +++++------ kernel/arm64/sgemm_ncopy_8.S | 149 ++++++++++++++--------------------- 3 files changed, 78 insertions(+), 119 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 938a3bf918..2e2979accb 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -238,6 +238,7 @@ In chronological order: * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel * [2025-02-26] Add sbgemv_t_bfdot kernel * [2025-03-12] Fix aarch64 sbgemv_t compilation error for GCC < 13 + * [2025-03-12] Optimize aarch64 sgemm_ncopy * Marek Michalowski * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` diff --git a/kernel/arm64/sgemm_ncopy_4.S b/kernel/arm64/sgemm_ncopy_4.S index c819ee6fb1..de8c8eca6f 100644 --- a/kernel/arm64/sgemm_ncopy_4.S +++ b/kernel/arm64/sgemm_ncopy_4.S @@ -88,28 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL1KEEP, [A04, #A_PREFETCH] ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - ldr q2, [A03], #16 - ins v8.s[2], v2.s[0] - ins v9.s[2], v2.s[1] - ins v10.s[2], v2.s[2] - ins v11.s[2], v2.s[3] - ldr q3, [A04], #16 - ins v8.s[3], v3.s[0] - ins v9.s[3], v3.s[1] - ins v10.s[3], v3.s[2] - ins v11.s[3], v3.s[3] + + zip1 v12.4s, v0.4s, v1.4s + zip1 v13.4s, v2.4s, v3.4s + zip2 v14.4s, v0.4s, v1.4s + zip2 v15.4s, v2.4s, v3.4s + + zip1 v8.2d, v12.2d, v13.2d + zip2 v9.2d, v12.2d, v13.2d + zip1 v10.2d, v14.2d, v15.2d + zip2 v11.2d, v14.2d, v15.2d st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] add B00, B00, #64 @@ -138,16 +129,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL1KEEP, [A02, #A_PREFETCH] ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] + + zip1 v12.4s, v0.4s, v1.4s + zip2 v13.4s, v0.4s, v1.4s + + dup v8.2d, v12.d[0] + dup v9.2d, v12.d[1] + dup v10.2d, v13.d[0] + dup v11.2d , v13.d[1] st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] add B00, B00, #32 @@ -330,4 +320,3 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ret EPILOGUE - diff --git a/kernel/arm64/sgemm_ncopy_8.S b/kernel/arm64/sgemm_ncopy_8.S index f99b1d992e..d941eb3eb4 100644 --- a/kernel/arm64/sgemm_ncopy_8.S +++ b/kernel/arm64/sgemm_ncopy_8.S @@ -86,47 +86,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x8 ldr q0, [A01], #16 ldr q1, [A02], #16 - ins v8.s[0], v0.s[0] - ins v10.s[0], v0.s[1] - ins v12.s[0], v0.s[2] - ins v14.s[0], v0.s[3] - ins v8.s[1], v1.s[0] - ins v10.s[1], v1.s[1] - ins v12.s[1], v1.s[2] - ins v14.s[1], v1.s[3] - ldr q2, [A03], #16 ldr q3, [A04], #16 - ins v8.s[2], v2.s[0] - ins v10.s[2], v2.s[1] - ins v12.s[2], v2.s[2] - ins v14.s[2], v2.s[3] - ins v8.s[3], v3.s[0] - ins v10.s[3], v3.s[1] - ins v12.s[3], v3.s[2] - ins v14.s[3], v3.s[3] + + zip1 v16.4s, v0.4s, v1.4s + zip1 v17.4s, v2.4s, v3.4s + zip2 v18.4s, v0.4s, v1.4s + zip2 v19.4s, v2.4s, v3.4s + + zip1 v8.2d, v16.2d, v17.2d + zip2 v10.2d, v16.2d, v17.2d + zip1 v12.2d, v18.2d, v19.2d + zip2 v14.2d, v18.2d, v19.2d ldr q4, [A05], #16 ldr q5, [A06], #16 - ins v9.s[0], v4.s[0] - ins v11.s[0], v4.s[1] - ins v13.s[0], v4.s[2] - ins v15.s[0], v4.s[3] - ins v9.s[1], v5.s[0] - ins v11.s[1], v5.s[1] - ins v13.s[1], v5.s[2] - ins v15.s[1], v5.s[3] - ldr q6, [A07], #16 ldr q7, [A08], #16 - ins v9.s[2], v6.s[0] - ins v11.s[2], v6.s[1] - ins v13.s[2], v6.s[2] - ins v15.s[2], v6.s[3] - ins v9.s[3], v7.s[0] - ins v11.s[3], v7.s[1] - ins v13.s[3], v7.s[2] - ins v15.s[3], v7.s[3] + + zip1 v16.4s, v4.4s, v5.4s + zip1 v17.4s, v6.4s, v7.4s + zip2 v18.4s, v4.4s, v5.4s + zip2 v19.4s, v6.4s, v7.4s + + zip1 v9.2d, v16.2d, v17.2d + zip2 v11.2d, v16.2d, v17.2d + zip1 v13.2d, v18.2d, v19.2d + zip2 v15.2d, v18.2d, v19.2d st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B00], #64 @@ -135,31 +121,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x8 ldr d0, [A01], #8 ldr d1, [A02], #8 - ins v8.s[0], v0.s[0] - ins v10.s[0], v0.s[1] - ins v8.s[1], v1.s[0] - ins v10.s[1], v1.s[1] - ldr d2, [A03], #8 ldr d3, [A04], #8 - ins v8.s[2], v2.s[0] - ins v10.s[2], v2.s[1] - ins v8.s[3], v3.s[0] - ins v10.s[3], v3.s[1] + + zip1 v12.4s, v0.4s, v1.4s + zip1 v13.4s, v2.4s, v3.4s + + zip1 v8.2d, v12.2d, v13.2d + zip2 v10.2d, v12.2d, v13.2d ldr d4, [A05], #8 ldr d5, [A06], #8 - ins v9.s[0], v4.s[0] - ins v11.s[0], v4.s[1] - ins v9.s[1], v5.s[0] - ins v11.s[1], v5.s[1] - ldr d6, [A07], #8 ldr d7, [A08], #8 - ins v9.s[2], v6.s[0] - ins v11.s[2], v6.s[1] - ins v9.s[3], v7.s[0] - ins v11.s[3], v7.s[1] + + zip1 v12.4s, v4.4s, v5.4s + zip1 v13.4s, v6.4s, v7.4s + + zip1 v9.2d, v12.2d, v13.2d + zip2 v11.2d, v12.2d, v13.2d st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 .endm @@ -191,25 +171,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4 ldr q0, [A01], #16 ldr q1, [A02], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - ldr q2, [A03], #16 ldr q3, [A04], #16 - ins v8.s[2], v2.s[0] - ins v9.s[2], v2.s[1] - ins v10.s[2], v2.s[2] - ins v11.s[2], v2.s[3] - ins v8.s[3], v3.s[0] - ins v9.s[3], v3.s[1] - ins v10.s[3], v3.s[2] - ins v11.s[3], v3.s[3] + + zip1 v12.4s, v0.4s, v1.4s + zip1 v13.4s, v2.4s, v3.4s + zip2 v14.4s, v0.4s, v1.4s + zip2 v15.4s, v2.4s, v3.4s + + zip1 v8.2d, v12.2d, v13.2d + zip2 v9.2d, v12.2d, v13.2d + zip1 v10.2d, v14.2d, v15.2d + zip2 v11.2d, v14.2d, v15.2d st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 .endm @@ -217,17 +190,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x4 ldr d0, [A01], #8 ldr d1, [A02], #8 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ldr d2, [A03], #8 ldr d3, [A04], #8 - ins v8.s[2], v2.s[0] - ins v9.s[2], v2.s[1] - ins v8.s[3], v3.s[0] - ins v9.s[3], v3.s[1] + + zip1 v10.4s, v0.4s, v1.4s + zip1 v11.4s, v2.4s, v3.4s + + zip1 v8.2d, v10.2d, v11.2d + zip2 v9.2d, v10.2d, v11.2d st1 {v8.4s, v9.4s}, [B00], #32 .endm @@ -249,14 +219,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x2 ldr q0, [A01], #16 ldr q1, [A02], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] + + zip1 v12.4s, v0.4s, v1.4s + zip2 v13.4s, v0.4s, v1.4s + + dup v8.2d, v12.d[0] + dup v9.2d, v12.d[1] + dup v10.2d, v13.d[0] + dup v11.2d , v13.d[1] st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32 .endm @@ -264,10 +234,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x2 ldr d0, [A01], #8 ldr d1, [A02], #8 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] + + zip1 v8.2s, v0.2s, v1.2s + zip2 v9.2s, v0.2s, v1.2s st1 {v8.2s, v9.2s}, [B00], #16 .endm From edef2e4441e50e3a2da1920fdbde09101087c43d Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Thu, 13 Mar 2025 20:55:31 +0000 Subject: [PATCH 28/56] Fix bug in ARM64 sbgemv_t --- kernel/arm64/sbgemv_t_bfdot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/sbgemv_t_bfdot.c b/kernel/arm64/sbgemv_t_bfdot.c index fc4ae019e9..672f70acf2 100644 --- a/kernel/arm64/sbgemv_t_bfdot.c +++ b/kernel/arm64/sbgemv_t_bfdot.c @@ -196,7 +196,7 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat y[iy] = alpha * temp + beta * y[iy]; } iy += incy; - a += lda; + a_ptr += lda; } return (0); } From f27ba5efd15fd7ab94543b551e60325edca728d7 Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Fri, 14 Mar 2025 17:55:40 +0000 Subject: [PATCH 29/56] fix bugs in aarch64 sbgemv_n kernel --- kernel/arm64/sbgemv_n_neon.c | 83 +++++++++++++++--------------------- 1 file changed, 34 insertions(+), 49 deletions(-) diff --git a/kernel/arm64/sbgemv_n_neon.c b/kernel/arm64/sbgemv_n_neon.c index 489d4d22cb..ff730407fd 100644 --- a/kernel/arm64/sbgemv_n_neon.c +++ b/kernel/arm64/sbgemv_n_neon.c @@ -69,12 +69,8 @@ static void beta_op(float *x, BLASLONG n, FLOAT beta) { x += 4; } - if (rest_n & 3) { - x[0] *= beta; - if ((rest_n & 3) > 1) - x[1] *= beta; - if ((rest_n & 3) > 2) - x[2] *= beta; + for (BLASLONG i = 0; i < (rest_n & 3); i ++) { + x[i] *= beta; } } return; @@ -88,7 +84,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, bfloat16x8_t a0, a1, a2, a3, a4, a5, a6, a7; bfloat16x8_t t0, t1, t2, t3, t4, t5, t6, t7; + bfloat16x8_t x_vec; + bfloat16x4_t x_vecx4; + float32x4_t y1_vec, y2_vec; float32x4_t fp32_low, fp32_high; @@ -106,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, if (incx == 1 && incy == 1) { if (beta != 1) { - beta_op(y, n, beta); + beta_op(y, m, beta); } for (i = 0; i < n / 8; i++) { @@ -290,12 +289,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, a_ptr += 4 * lda; - bfloat16x4_t x_vecx4 = vld1_bf16(x_ptr); + x_vecx4 = vld1_bf16(x_ptr); if (alpha != 1) { - x_vec = vcombine_bf16(x_vecx4, bf16_zero); - fp32_low = vreinterpretq_f32_u16( - vzip1q_u16(vreinterpretq_u16_bf16(bf16_zero_q), - vreinterpretq_u16_bf16(x_vec))); + fp32_low = vcvt_f32_bf16(x_vecx4); fp32_low = vmulq_n_f32(fp32_low, alpha); x_vecx4 = vcvt_bf16_f32(fp32_low); } @@ -348,15 +344,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, y1_vec = vld1q_f32(y_ptr); - a0 = vcombine_bf16(a0x4, bf16_zero); - a1 = vcombine_bf16(a1x4, bf16_zero); - a2 = vcombine_bf16(a2x4, bf16_zero); - a3 = vcombine_bf16(a3x4, bf16_zero); + a0 = vcombine_bf16(a0x4, a2x4); + a1 = vcombine_bf16(a1x4, a3x4); - t0 = vreinterpretq_bf16_u16( - vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); - t1 = vreinterpretq_bf16_u16( - vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + t0 = vreinterpretq_bf16_u16(vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t1 = vreinterpretq_bf16_u16(vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); @@ -374,10 +366,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (rest_m) { - x0 = alpha * vcvtah_f32_bf16(x_ptr[0]); - x1 = alpha * vcvtah_f32_bf16(x_ptr[1]); - x2 = alpha * vcvtah_f32_bf16(x_ptr[2]); - x3 = alpha * vcvtah_f32_bf16(x_ptr[3]); + fp32_low = vcvt_f32_bf16(x_vecx4); + + x0 = vgetq_lane_f32(fp32_low, 0); + x1 = vgetq_lane_f32(fp32_low, 1); + x2 = vgetq_lane_f32(fp32_low, 2); + x3 = vgetq_lane_f32(fp32_low, 3); for (BLASLONG j = 0; j < rest_m; j++) { y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); @@ -396,18 +390,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, a_ptr += 2 * lda; - bfloat16_t tmp_buffer[4]; - memset((void*)tmp_buffer, 0, sizeof(bfloat16_t)); - - tmp_buffer[0] = x_ptr[0]; - tmp_buffer[1] = x_ptr[1]; + x_vecx4 = vreinterpret_bf16_u16(vzip1_u16( + vreinterpret_u16_bf16(vdup_n_bf16(x_ptr[0])), + vreinterpret_u16_bf16(vdup_n_bf16(x_ptr[1])) + )); - bfloat16x4_t x_vecx4 = vld1_bf16(tmp_buffer); if (alpha != 1) { - x_vec = vcombine_bf16(x_vecx4, bf16_zero); - fp32_low = vreinterpretq_f32_u16( - vzip1q_u16(vreinterpretq_u16_bf16(bf16_zero_q), - vreinterpretq_u16_bf16(x_vec))); + fp32_low = vcvt_f32_bf16(x_vecx4); fp32_low = vmulq_n_f32(fp32_low, alpha); x_vecx4 = vcvt_bf16_f32(fp32_low); } @@ -422,14 +411,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, t0 = vreinterpretq_bf16_u16( vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); - t4 = vreinterpretq_bf16_u16( + t1 = vreinterpretq_bf16_u16( vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); - y2_vec = vbfmlalbq_lane_f32(y2_vec, t4, x_vecx4, 0); - y2_vec = vbfmlaltq_lane_f32(y2_vec, t4, x_vecx4, 1); + y2_vec = vbfmlalbq_lane_f32(y2_vec, t1, x_vecx4, 0); + y2_vec = vbfmlaltq_lane_f32(y2_vec, t1, x_vecx4, 1); vst1q_f32(y_ptr, y1_vec); vst1q_f32(y_ptr + 4, y2_vec); @@ -449,29 +438,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, a0 = vcombine_bf16(a0x4, bf16_zero); a1 = vcombine_bf16(a1x4, bf16_zero); - t0 = vreinterpretq_bf16_u16( - vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); - t1 = vreinterpretq_bf16_u16( - vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + t0 = vreinterpretq_bf16_u16(vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); - y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2); - y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3); vst1q_f32(y_ptr, y1_vec); a_ptr0 += 4; a_ptr1 += 4; - a_ptr2 += 4; - a_ptr3 += 4; y_ptr += 4; } if (m & 2) { - x0 = alpha * (vcvtah_f32_bf16(x_ptr[0])); - x1 = alpha * (vcvtah_f32_bf16(x_ptr[1])); + fp32_low = vcvt_f32_bf16(x_vecx4); + x0 = vgetq_lane_f32(fp32_low, 0); + x1 = vgetq_lane_f32(fp32_low, 1); + y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]); y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]); @@ -485,8 +469,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (m & 1) { - x0 = alpha * vcvtah_f32_bf16(x_ptr[0]); - x1 = alpha * vcvtah_f32_bf16(x_ptr[1]); + fp32_low = vcvt_f32_bf16(x_vecx4); + x0 = vgetq_lane_f32(fp32_low, 0); + x1 = vgetq_lane_f32(fp32_low, 1); y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]); y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]); From b6cb5ece5845512c1598aaca03831f8e6f63756a Mon Sep 17 00:00:00 2001 From: "shubham.chaudhari" Date: Fri, 28 Feb 2025 13:10:40 +0530 Subject: [PATCH 30/56] Add thread throttling profile for DGEMV on NEOVERSEV1 --- interface/gemv.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/interface/gemv.c b/interface/gemv.c index d031339463..360b82dcd2 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -89,6 +89,24 @@ static inline int get_gemv_optimal_nthreads_neoversev2(BLASLONG MN, int ncpu) { } #endif +//thread throttling for dgemv +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) +static inline int get_dgemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { + + return + MN < 8100L ? 1 +: MN < 12100L ? MIN(ncpu, 2) +: MN < 36100L ? MIN(ncpu, 4) +: MN < 84100L ? MIN(ncpu, 8) +: MN < 348100L ? MIN(ncpu, 16) +: MN < 435600L ? MIN(ncpu, 24) +: MN < 810000L ? MIN(ncpu, 32) +: MN < 1050625 ? MIN(ncpu, 40) +: ncpu; + +} +#endif + static inline int get_gemv_optimal_nthreads(BLASLONG MN) { int ncpu = num_cpu_avail(3); #if defined(_WIN64) && defined(_M_ARM64) @@ -98,6 +116,8 @@ static inline int get_gemv_optimal_nthreads(BLASLONG MN) { #endif #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); +#elif defined(NEOVERSEV1) && !defined(COMPLEX) && defined(DOUBLE) && !defined(BFLOAT16) + return get_dgemv_optimal_nthreads_neoversev1(MN, ncpu); #elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) From 189dbbc04ff6fb4d58168fd1aef11c21ed9d14c4 Mon Sep 17 00:00:00 2001 From: "shubham.chaudhari" Date: Tue, 4 Mar 2025 16:08:55 +0530 Subject: [PATCH 31/56] Add thread throttling for dynamic arch neoversev1 --- interface/gemv.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/interface/gemv.c b/interface/gemv.c index 360b82dcd2..22409649e4 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -127,6 +127,12 @@ static inline int get_gemv_optimal_nthreads(BLASLONG MN) { if (strcmp(gotoblas_corename(), "neoversev2") == 0) { return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); } +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && defined(DOUBLE) && !defined(BFLOAT16) + if (strcmp(gotoblas_corename(), "neoversev1") == 0) { + return get_dgemv_optimal_nthreads_neoversev1(MN, ncpu); + } + + #endif if ( MN < 115200L * GEMM_MULTITHREAD_THRESHOLD ) From 8e289ecddc7f51913d3fafcb11957d780c7d3d7e Mon Sep 17 00:00:00 2001 From: "shubham.chaudhari" Date: Tue, 18 Mar 2025 13:24:05 +0530 Subject: [PATCH 32/56] Simplified thread throttling function in gemv --- interface/gemv.c | 51 +++++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/interface/gemv.c b/interface/gemv.c index 22409649e4..34b6addd3d 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -70,11 +70,22 @@ static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { - return - MN < 25600L ? 1 - : MN < 63001L ? MIN(ncpu, 4) - : MN < 459684L ? MIN(ncpu, 16) - : ncpu; + #ifdef DOUBLE + return (MN < 8100L) ? 1 + : (MN < 12100L) ? MIN(ncpu, 2) + : (MN < 36100L) ? MIN(ncpu, 4) + : (MN < 84100L) ? MIN(ncpu, 8) + : (MN < 348100L) ? MIN(ncpu, 16) + : (MN < 435600L) ? MIN(ncpu, 24) + : (MN < 810000L) ? MIN(ncpu, 32) + : (MN < 1050625L) ? MIN(ncpu, 40) + : ncpu; + #else + return (MN < 25600L) ? 1 + : (MN < 63001L) ? MIN(ncpu, 4) + : (MN < 459684L) ? MIN(ncpu, 16) + : ncpu; + #endif } #endif @@ -89,24 +100,6 @@ static inline int get_gemv_optimal_nthreads_neoversev2(BLASLONG MN, int ncpu) { } #endif -//thread throttling for dgemv -#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) -static inline int get_dgemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { - - return - MN < 8100L ? 1 -: MN < 12100L ? MIN(ncpu, 2) -: MN < 36100L ? MIN(ncpu, 4) -: MN < 84100L ? MIN(ncpu, 8) -: MN < 348100L ? MIN(ncpu, 16) -: MN < 435600L ? MIN(ncpu, 24) -: MN < 810000L ? MIN(ncpu, 32) -: MN < 1050625 ? MIN(ncpu, 40) -: ncpu; - -} -#endif - static inline int get_gemv_optimal_nthreads(BLASLONG MN) { int ncpu = num_cpu_avail(3); #if defined(_WIN64) && defined(_M_ARM64) @@ -114,25 +107,17 @@ static inline int get_gemv_optimal_nthreads(BLASLONG MN) { return num_cpu_avail(4); return 1; #endif -#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); -#elif defined(NEOVERSEV1) && !defined(COMPLEX) && defined(DOUBLE) && !defined(BFLOAT16) - return get_dgemv_optimal_nthreads_neoversev1(MN, ncpu); #elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); -#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16) if (strcmp(gotoblas_corename(), "neoversev1") == 0) { return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); } if (strcmp(gotoblas_corename(), "neoversev2") == 0) { return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); } -#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && defined(DOUBLE) && !defined(BFLOAT16) - if (strcmp(gotoblas_corename(), "neoversev1") == 0) { - return get_dgemv_optimal_nthreads_neoversev1(MN, ncpu); - } - - #endif if ( MN < 115200L * GEMM_MULTITHREAD_THRESHOLD ) From c0a5c9655ed3e8f7c3903ccb33ea96c6ae9b80ad Mon Sep 17 00:00:00 2001 From: Harishmcw Date: Mon, 24 Mar 2025 13:49:55 +0530 Subject: [PATCH 33/56] Fix missing commas in gensymbol.pl --- exports/gensymbol.pl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/exports/gensymbol.pl b/exports/gensymbol.pl index dd79e924d3..91892395ef 100644 --- a/exports/gensymbol.pl +++ b/exports/gensymbol.pl @@ -59,7 +59,7 @@ cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, cblas_cgeadd, cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, cblas_scnrm2, cblas_scasum, - cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy + cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy, cblas_cgemmt); @cblasobjsd = ( cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, @@ -68,7 +68,7 @@ cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2, cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, - cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy + cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy, cblas_dgemmt); @cblasobjss = ( @@ -79,7 +79,7 @@ cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, cblas_strsv, cblas_sgeadd, - cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy + cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy, cblas_sgemmt); @cblasobjsz = ( cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, @@ -89,7 +89,7 @@ cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, cblas_zaxpby, cblas_zgeadd, - cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy + cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy, cblas_zgemmt); @cblasobjs = ( cblas_xerbla ); @@ -1584,7 +1584,7 @@ LAPACKE_cgetsqrhrt, LAPACKE_cgetsqrhrt_work, LAPACKE_cungtsqr_row, - LAPACKE_cungtsqr_row_work + LAPACKE_cungtsqr_row_work, ); @lapackeobjsd = ( @@ -2197,7 +2197,7 @@ LAPACKE_dgetsqrhrt, LAPACKE_dgetsqrhrt_work, LAPACKE_dorgtsqr_row, - LAPACKE_dorgtsqr_row_work + LAPACKE_dorgtsqr_row_work, ); @lapackeobjss = ( @@ -2802,7 +2802,7 @@ LAPACKE_sgetsqrhrt, LAPACKE_sgetsqrhrt_work, LAPACKE_sorgtsqr_row, - LAPACKE_sorgtsqr_row_work + LAPACKE_sorgtsqr_row_work, ); @lapackeobjsz = ( @@ -3345,7 +3345,7 @@ LAPACKE_zgetsqrhrt, LAPACKE_zgetsqrhrt_work, LAPACKE_zungtsqr_row, - LAPACKE_zungtsqr_row_work + LAPACKE_zungtsqr_row_work, ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the @@ -3551,7 +3551,7 @@ LAPACKE_zsytrs_aa_2stage_work, # new functions from 3.9.0 LAPACKE_zgesvdq, - LAPACKE_zgesvdq_work + LAPACKE_zgesvdq_work, ); #These function may need 2 underscores. @@ -3639,7 +3639,7 @@ zhesv_aa_2stage, zhetrf_aa_2stage, zhetrs_aa_2stage, zsysv_aa_2stage, zsytrf_aa_2stage, zsytrs_aa_2stage, - zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col + zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col, ); From c2e7ab5351cd402edfed96225c88ca5d0d753780 Mon Sep 17 00:00:00 2001 From: Harishmcw Date: Wed, 26 Mar 2025 10:50:29 +0530 Subject: [PATCH 34/56] DLL symbol pre/postfixing in CMake builds --- CMakeLists.txt | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f8b63041ad..c140bf5bac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,6 +77,16 @@ set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) +if (CMAKE_SYSTEM_NAME MATCHES "Windows" AND BUILD_SHARED_LIBS AND NOT ("${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "")) +if (NOT BUILD_STATIC_LIBS) + message (STATUS "forcing build of a temporary static library for symbol renaming") + set (BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared library" FORCE) + set (BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) + set (DELETE_STATIC_LIBS 1) +endif () +endif() + + ####### if(BUILD_WITHOUT_LAPACK) set(NO_LAPACK 1) @@ -379,7 +389,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) endif() endif() -if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") +if (BUILD_SHARED_LIBS OR DELETE_STATIC_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") if (NOT DEFINED ARCH) set(ARCH_IN "x86_64") else() @@ -467,10 +477,26 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") else () set (BZ 0) endif() + + if (CMAKE_SYSTEM_NAME MATCHES "Windows") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) + #if (USE_PERL) +message(STATUS "adding postbuild instruction to rename syms") + add_custom_command(TARGET ${OpenBLAS_LIBNAME}_static POST_BUILD + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "win2k" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/renamesyms.def + COMMAND ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} -c -o ${PROJECT_BINARY_DIR}/dllinit.o ${PROJECT_SOURCE_DIR}/exports/dllinit.c + COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $ -wholearchive:$ -dll -out:$/${OpenBLAS_LIBNAME}.dll -implib:$/${OpenBLAS_LIBNAME}.dll.a + #if (${REMOVE_STATIC_LIB}) + #file (REMOVE ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.lib) + #endif () + ) + #endif () + else () if (NOT USE_PERL) add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD - COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def - COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so + COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so COMMENT "renaming symbols" ) else() @@ -481,6 +507,7 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") ) endif() endif() +endif() if (BUILD_BENCHMARKS) #find_package(OpenMP REQUIRED) @@ -650,4 +677,4 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake DESTINATION ${CMAKECONFIG_INSTALL_DIR}) install(EXPORT "${PN}${SUFFIX64}Targets" NAMESPACE "${PN}${SUFFIX64}::" - DESTINATION ${CMAKECONFIG_INSTALL_DIR}) + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) \ No newline at end of file From 1724b3f10497bc4c4c62e9019e6bd169796d0cda Mon Sep 17 00:00:00 2001 From: Harishmcw Date: Wed, 26 Mar 2025 10:55:50 +0530 Subject: [PATCH 35/56] DLL symbol pre/postfixing in CMake builds --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c140bf5bac..df9c631d44 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -488,7 +488,7 @@ message(STATUS "adding postbuild instruction to rename syms") COMMAND ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} -c -o ${PROJECT_BINARY_DIR}/dllinit.o ${PROJECT_SOURCE_DIR}/exports/dllinit.c COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $ -wholearchive:$ -dll -out:$/${OpenBLAS_LIBNAME}.dll -implib:$/${OpenBLAS_LIBNAME}.dll.a #if (${REMOVE_STATIC_LIB}) - #file (REMOVE ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.lib) + #file (REMOVE $/${OpenBLAS_LIBNAME}.lib) #endif () ) #endif () From 3ca1ba1be3865169803d6bb2c10bacad5196e7ad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 Mar 2025 18:37:11 +0100 Subject: [PATCH 36/56] resynchronize with the posix shell version --- exports/gensymbol.pl | 165 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 145 insertions(+), 20 deletions(-) diff --git a/exports/gensymbol.pl b/exports/gensymbol.pl index 91892395ef..5597306343 100644 --- a/exports/gensymbol.pl +++ b/exports/gensymbol.pl @@ -21,7 +21,7 @@ chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax, chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2, csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, - ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt); + ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt,cgemmtr); @blasobjsd = ( damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm, @@ -29,7 +29,7 @@ dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy, dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv, - idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt); + idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt,dgemmtr); @blasobjss = ( isamax,isamin,ismax,ismin, @@ -38,7 +38,7 @@ smax,smin,snrm2,simatcopy,somatcopy, srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, - strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt); + strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt,sgemmtr); @blasobjsz = ( izamax,izamin,, @@ -48,28 +48,29 @@ zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, - zgeadd, dzsum, zgemmt); + zgeadd, dzsum, zgemmt,zgemmtr); @blasobjs = (lsame, xerbla); -@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); +@bfblasobjs = (sbgemm, sbgemmt, sbgemmtr, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, cblas_caxpby, cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, cblas_cgeadd, cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, - cblas_scnrm2, cblas_scasum, + cblas_scnrm2, cblas_scasum, cblas_cgemmt, cblas_cgemmtr, cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy, - cblas_cgemmt); + cblas_caxpyc, cblas_crotg, cblas_csrot, cblas_scamax, cblas_scamin, cblas_cgemm_batch); + @cblasobjsd = ( cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, cblas_drot, cblas_drotg, cblas_drotm, cblas_drotmg, cblas_dsbmv, cblas_dscal, cblas_dsdot, cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2, cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, - cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, + cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, cblas_dgemmt, cblas_dgemmtr, cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy, - cblas_dgemmt); + cblas_damax, cblas_damin, cblas_dgemm_batch); @cblasobjss = ( cblas_sasum, cblas_saxpy, cblas_saxpby, @@ -78,9 +79,10 @@ cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, - cblas_strsv, cblas_sgeadd, + cblas_strsv, cblas_sgeadd, cblas_sgemmt, cblas_sgemmtr, cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy, - cblas_sgemmt); + cblas_samax, cblas_samin, cblas_sgemm_batch); + @cblasobjsz = ( cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, @@ -88,13 +90,13 @@ cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, - cblas_zaxpby, cblas_zgeadd, + cblas_zaxpby, cblas_zgeadd, cblas_zgemmt, cblas_zgemmtr, cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy, - cblas_zgemmt); + cblas_zaxpyc, cblas_zdrot, cblas_zrotg, cblas_dzamax, cblas_dzamin, cblas_zgemm_batch); @cblasobjs = ( cblas_xerbla ); -@bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); +@bfcblasobjs = (cblas_sbgemm, cblas_sbgemmt, cblas_sbgemmtr, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod, cblas_sbgemm_batch); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, @@ -709,6 +711,7 @@ # functions added for lapack-3.7.0 @lapackobjs2s = (@lapackobjs2s, slarfy, + ssyconvf, strevc3, sgelqt, sgelqt3, @@ -832,12 +835,82 @@ zungtsqr_row ); +#functions added for lapack-3.11 +@lapackobjs2c = (@lapackobjs2c, + cgedmd, + cgedmdq + ); +@lapackobjs2d = (@lapackobjs2d, + dgedmd, + dgedmdq + ); +@lapackobjs2s = (@lapackobjs2s, + sgedmd, + sgedmdq + ); +@lapackobjs2z = (@lapackobjs2z, + zgedmd, + zgedmdq + ); + +#functions added post 3.11 + +@lapackobjs2c = (@lapackobjs2c, + cgelst, + cgeqp3rk, + claqp2rk, + claqp3rk, + clatrs3, + crscl, + ctrsyl3 + ); +# claqz0 +# claqz1 +# claqz2 +# claqz3 +# clatrs3 + +@lapackobjs2d = (@lapackobjs2d, + dgelst, + dgeqp3rk, + dlaqp2rk, + dlaqp3rk, + dlarmm, + dlatrs3, + dtrsyl3 + ); + +@lapackobjs2s = (@lapackobjs2s, + sgelst, + sgeqp3rk, + slaqp2rk, + slaqp3rk, + slarmm, + slatrs3, + strsyl3 + ); + +@lapackobjs2z = (@lapackobjs2z, + zgelst, + zgeqp3rk, + zlaqp2rk, + zlaqp3rk, + zlatrs3, + zrscl, + ztrsyl3 + ); +# zlaqz0 +# zlaqz1 +# zlaqz2 +# zlaqz3 + @lapack_extendedprecision_objs = ( zposvxx, clagge, clatms, chesvxx, cposvxx, cgesvxx, ssyrfssx, csyrfsx, dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx, ); @lapack_deprecated_objsc = ( + cgelqs, cgeqrs, cgegs, cggsvd, cgegv, cggsvp, cgelsx, clahrd, @@ -845,13 +918,16 @@ ctzrqf, ); @lapack_deprecated_objsd = ( + dgelqs, dgeqrs, dgegs, dgeqpf, dgegv, dggsvd, dgelsx, dggsvp, dlahrd, dlatzm, dtzrqf); -@lapack_deprecated_objss = ( +@lapack_deprecated_objss = ( + sgelqs, + sgeqrs, sgelsx, sgegs, sgegv, @@ -864,6 +940,8 @@ ); @lapack_deprecated_objsz = ( + zgelqs, + zgeqrs, zgegs, zgegv, zgelsx, @@ -997,6 +1075,10 @@ LAPACKE_cgebrd_work, LAPACKE_cgecon, LAPACKE_cgecon_work, + LAPACKE_cgedmd, + LAPACKE_cgedmd_work, + LAPACKE_cgedmdq, + LAPACKE_cgedmdq_work, LAPACKE_cgeequ, LAPACKE_cgeequ_work, LAPACKE_cgeequb, @@ -1585,7 +1667,14 @@ LAPACKE_cgetsqrhrt_work, LAPACKE_cungtsqr_row, LAPACKE_cungtsqr_row_work, - + LAPACKE_clangb, + LAPACKE_clangb_work, + LAPACKE_ctrsyl3, + LAPACKE_ctrsyl3_work, + LAPACKE_ctz_nancheck, + LAPACKE_ctz_trans, + LAPACKE_cunhr_col, + LAPACKE_cunhr_col_work ); @lapackeobjsd = ( LAPACKE_dgb_nancheck, @@ -1656,6 +1745,10 @@ LAPACKE_dgebrd_work, LAPACKE_dgecon, LAPACKE_dgecon_work, + LAPACKE_dgedmd, + LAPACKE_dgedmd_work, + LAPACKE_dgedmdq, + LAPACKE_dgedmdq_work, LAPACKE_dgeequ, LAPACKE_dgeequ_work, LAPACKE_dgeequb, @@ -2198,6 +2291,14 @@ LAPACKE_dgetsqrhrt_work, LAPACKE_dorgtsqr_row, LAPACKE_dorgtsqr_row_work, + LAPACKE_dlangb, + LAPACKE_dlangb_work, + LAPACKE_dorhr_col, + LAPACKE_dorhr_col_work, + LAPACKE_dtrsyl3, + LAPACKE_dtrsyl3_work, + LAPACKE_dtz_nancheck, + LAPACKE_dtz_trans, ); @lapackeobjss = ( @@ -2269,6 +2370,10 @@ LAPACKE_sgebrd_work, LAPACKE_sgecon, LAPACKE_sgecon_work, + LAPACKE_sgedmd, + LAPACKE_sgedmd_work, + LAPACKE_sgedmdq, + LAPACKE_sgedmdq_work, LAPACKE_sgeequ, LAPACKE_sgeequ_work, LAPACKE_sgeequb, @@ -2803,6 +2908,14 @@ LAPACKE_sgetsqrhrt_work, LAPACKE_sorgtsqr_row, LAPACKE_sorgtsqr_row_work, + LAPACKE_slangb, + LAPACKE_slangb_work, + LAPACKE_sorhr_col, + LAPACKE_sorhr_col_work, + LAPACKE_strsyl3, + LAPACKE_strsyl3_work, + LAPACKE_stz_nancheck, + LAPACKE_stz_trans, ); @lapackeobjsz = ( @@ -2878,6 +2991,10 @@ LAPACKE_zgebrd_work, LAPACKE_zgecon, LAPACKE_zgecon_work, + LAPACKE_zgedmd, + LAPACKE_zgedmd_work, + LAPACKE_zgedmdq, + LAPACKE_zgedmdq_work, LAPACKE_zgeequ, LAPACKE_zgeequ_work, LAPACKE_zgeequb, @@ -3346,6 +3463,14 @@ LAPACKE_zgetsqrhrt_work, LAPACKE_zungtsqr_row, LAPACKE_zungtsqr_row_work, + LAPACKE_zlangb, + LAPACKE_zlangb_work, + LAPACKE_zunhr_col, + LAPACKE_zunhr_col_work, + LAPACKE_ztrsyl3, + LAPACKE_ztrsyl3_work, + LAPACKE_ztz_nancheck, + LAPACKE_ztz_trans, ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the @@ -3573,7 +3698,7 @@ ssygv_2stage, ssysv_aa_2stage, ssytrf_aa_2stage, ssytrs_aa_2stage, - slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, + slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, slarfb_gett ); @lapack_embeded_underscore_objs_c=( chetf2_rook, chetrf_rook, chetri_rook, @@ -3598,7 +3723,7 @@ chetrf_aa_2stage, chetrs_aa_2stage, csysv_aa_2stage, csytrf_aa_2stage, csytrs_aa_2stage, - claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, + claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, clarfb_gett ); @lapack_embeded_underscore_objs_d=( dlasyf_rook, @@ -3615,7 +3740,7 @@ dsbevd_2stage, dsygv_2stage, dsysv_aa_2stage, dsytrf_aa_2stage, dsytrs_aa_2stage, - dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, + dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, dlarfb_gett ); @lapack_embeded_underscore_objs_z=( zhetf2_rook, zhetrf_rook, zhetri_rook, @@ -3639,7 +3764,7 @@ zhesv_aa_2stage, zhetrf_aa_2stage, zhetrs_aa_2stage, zsysv_aa_2stage, zsytrf_aa_2stage, zsytrs_aa_2stage, - zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col, + zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col, zlarfb_gett ); From 51c1fb1f93fd6d7f7fd24dd22c954ba9527b4a05 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 Mar 2025 23:36:49 +0100 Subject: [PATCH 37/56] Fix ?spmv build and misinterpretation of NO_LAPACK=0 --- interface/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index c0d5896e17..393b7cdd3d 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -30,17 +30,17 @@ set(BLAS2_SOURCES gemv.c ger.c trsv.c trmv.c syr2.c gbmv.c - sbmv.c + sbmv.c spmv.c spr2.c tbsv.c tbmv.c tpsv.c tpmv.c ) set(BLAS2_REAL_ONLY_SOURCES - symv.c syr.c spmv.c spr.c + symv.c syr.c spr.c ) set(BLAS2_COMPLEX_LAPACK_SOURCES - symv.c syr.c spmv.c spr.c + symv.c syr.c spr.c ) set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES @@ -195,7 +195,7 @@ if (NOT DEFINED NO_CBLAS) endforeach () endif() -if (NOT DEFINED NO_LAPACK) +if (NOT NO_LAPACK) set(LAPACK_SOURCES lapack/gesv.c ) From 02fd1df10b4a2e36a848d852589a5ba25214926c Mon Sep 17 00:00:00 2001 From: Ruiyang Wu Date: Wed, 12 Mar 2025 20:41:55 -0400 Subject: [PATCH 38/56] CMake: Pass `OpenMP` compiler and linker flags through CMake targets Using `OpenMP::OpenMP_LANG` targets for CMake is less error-prone than passing the compiler and linker flags manually. Furthermore, it allows the user to customize those flags by setting `OpenMP_LANG_FLAGS`, `OpenMP_LANG_LIB_NAMES`, and `OpenMP_omp_LIBRARY`. --- CMakeLists.txt | 22 ++++++++++++++++------ cmake/arch.cmake | 11 ----------- cmake/fc.cmake | 24 ++++++++++++------------ cmake/system.cmake | 17 ++++++++--------- driver/level2/CMakeLists.txt | 4 ++++ driver/level3/CMakeLists.txt | 4 ++++ driver/others/CMakeLists.txt | 4 ++++ interface/CMakeLists.txt | 4 ++++ kernel/CMakeLists.txt | 3 +++ lapack/CMakeLists.txt | 4 ++++ 10 files changed, 59 insertions(+), 38 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index df9c631d44..6a8d36c409 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,10 +119,6 @@ endif() message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") -if (USE_OPENMP) - find_package(OpenMP REQUIRED) -endif () - include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") @@ -240,6 +236,12 @@ endif () # add objects to the openblas lib if(NOT NO_LAPACK) add_library(LAPACK_OVERRIDES OBJECT ${LA_SOURCES}) + if (USE_OPENMP AND (NOT NOFORTRAN)) + # Disable OpenMP for LAPACK Fortran codes on Windows. + if(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + target_link_libraries(LAPACK_OVERRIDES OpenMP::OpenMP_Fortran) + endif() + endif() list(APPEND TARGET_OBJS "$") endif() if(NOT NO_LAPACKE) @@ -281,10 +283,18 @@ endif() if (USE_OPENMP) if(BUILD_STATIC_LIBS) - target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C) + if(NOFORTRAN) + target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C) + else() + target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C OpenMP::OpenMP_Fortran) + endif() endif() if(BUILD_SHARED_LIBS) - target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C) + if(NOFORTRAN) + target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C) + else() + target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C OpenMP::OpenMP_Fortran) + endif() endif() endif() diff --git a/cmake/arch.cmake b/cmake/arch.cmake index ec91a2d598..d9a7aafd62 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -31,17 +31,6 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel") set(CCOMMON_OPT "${CCOMMON_OPT} -wd981") endif () -if (USE_OPENMP) - # USE_SIMPLE_THREADED_LEVEL3 = 1 - # NO_AFFINITY = 1 - find_package(OpenMP REQUIRED) - if (OpenMP_FOUND) - set(CCOMMON_OPT "${CCOMMON_OPT} ${OpenMP_C_FLAGS} -DUSE_OPENMP") - set(FCOMMON_OPT "${FCOMMON_OPT} ${OpenMP_Fortran_FLAGS}") - endif() -endif () - - if (DYNAMIC_ARCH) if (ARM64) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 38bd406a3a..3aa6a151da 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -7,7 +7,7 @@ if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "L # This is for classic Flang. LLVM Flang is handled with gfortran below. set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") + set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") endif () set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee") endif () @@ -117,7 +117,7 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F endif () if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") + set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -128,14 +128,14 @@ if (${F_COMPILER} STREQUAL "INTEL" OR CMAKE_Fortran_COMPILER_ID MATCHES "Intel") endif () set(FCOMMON_OPT "${FCOMMON_OPT} -recursive -fp-model=consistent") if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () if (${F_COMPILER} STREQUAL "FUJITSU") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU") if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -151,7 +151,7 @@ if (${F_COMPILER} STREQUAL "IBM") set(FCOMMON_OPT "${FCOMMON_OPT} -q32") endif () if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -168,7 +168,7 @@ if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95") endif () set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive") if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -mp") + set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -195,7 +195,7 @@ if (${F_COMPILER} STREQUAL "PATHSCALE") endif () if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -mp") + set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -233,7 +233,7 @@ if (${F_COMPILER} STREQUAL "OPEN64") if (USE_OPENMP) set(FEXTRALIB "${FEXTRALIB} -lstdc++") - set(FCOMMON_OPT "${FCOMMON_OPT} -mp") + set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -245,14 +245,14 @@ if (${F_COMPILER} STREQUAL "SUN") set(FCOMMON_OPT "${FCOMMON_OPT} -m64") endif () if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -xopenmp=parallel") + set(OpenMP_Fortran_FLAGS "-xopenmp=parallel" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () if (${F_COMPILER} STREQUAL "COMPAQ") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ") if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -265,7 +265,7 @@ if (${F_COMPILER} STREQUAL "CRAY") if (NOT USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -fno-openmp") else () - set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") + set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -290,7 +290,7 @@ if (${F_COMPILER} STREQUAL "NAGFOR") # -w=unused: Suppress warning messages about unused variables set(FCOMMON_OPT "${FCOMMON_OPT} -w=x77 -w=ques -w=unused") if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () diff --git a/cmake/system.cmake b/cmake/system.cmake index efaafee400..4c9d9198ce 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -372,6 +372,14 @@ else () endif () endif () +if (USE_OPENMP) + find_package(OpenMP COMPONENTS C REQUIRED) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_OPENMP") + if (NOT NOFORTRAN) + find_package(OpenMP COMPONENTS Fortran REQUIRED) + endif () +endif () + if (BINARY64) if (INTERFACE64) # CCOMMON_OPT += -DUSE64BITINT @@ -655,15 +663,6 @@ if (LAPACK_STRLEN) endif() set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") -#Disable -fopenmp for LAPACK Fortran codes on Windows. -if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - set(FILTER_FLAGS "-fopenmp;-mp;-openmp;-xopenmp=parallel") - foreach (FILTER_FLAG ${FILTER_FLAGS}) - string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) - string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) - endforeach () -endif () - if (CMAKE_Fortran_COMPILER) if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 3e9964ab1a..c52b461a7e 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -223,3 +223,7 @@ if (USE_THREAD) endif () add_library(driver_level2 OBJECT ${OPENBLAS_SRC}) + +if (USE_OPENMP) + target_link_libraries(driver_level2 OpenMP::OpenMP_C) +endif() diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index b1ec94c233..eabfeed24a 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -171,3 +171,7 @@ endforeach () # add_library(driver_level3 OBJECT ${OPENBLAS_SRC}) + +if (USE_OPENMP) + target_link_libraries(driver_level3 OpenMP::OpenMP_C) +endif() diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 139f329ecf..ebcc0aa781 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -88,3 +88,7 @@ endif () #endif add_library(driver_others OBJECT ${OPENBLAS_SRC} ${MEMORY} ${SMP_SOURCES} ${COMMON_SOURCES}) + +if (USE_OPENMP) + target_link_libraries(driver_others OpenMP::OpenMP_C) +endif() diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index c0d5896e17..12b679c84e 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -250,3 +250,7 @@ if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) endif () add_library(interface OBJECT ${OPENBLAS_SRC}) + +if (USE_OPENMP) + target_link_libraries(interface OpenMP::OpenMP_C) +endif() diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d3262e07d2..81185f6030 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -1364,6 +1364,9 @@ endif () if (USE_GEMM3M) target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M) endif() + if (USE_OPENMP) + target_link_libraries(kernel${TSUFFIX} OpenMP::OpenMP_C) + endif() endfunction () diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 1d44e94904..bbaacb2be5 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -117,3 +117,7 @@ GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "" 4) GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "" 0 "" "" 3) add_library(lapack OBJECT ${OPENBLAS_SRC}) + +if (USE_OPENMP) + target_link_libraries(lapack OpenMP::OpenMP_C) +endif() From 1b0c0f00e9448a4e2866cc3449b02cc4d727abf7 Mon Sep 17 00:00:00 2001 From: Ruiyang Wu Date: Thu, 13 Mar 2025 02:25:52 -0400 Subject: [PATCH 39/56] CMake: Avoid mixed OpenMP linkage --- cmake/system.cmake | 6 ++++++ ctest/CMakeLists.txt | 16 ---------------- lapack-netlib/TESTING/EIG/CMakeLists.txt | 6 ------ lapack-netlib/TESTING/LIN/CMakeLists.txt | 4 ---- test/CMakeLists.txt | 4 ---- 5 files changed, 6 insertions(+), 30 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 4c9d9198ce..14b2c65b11 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -377,6 +377,12 @@ if (USE_OPENMP) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_OPENMP") if (NOT NOFORTRAN) find_package(OpenMP COMPONENTS Fortran REQUIRED) + # Avoid mixed OpenMP linkage + get_target_property(OMP_C_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) + get_target_property(OMP_Fortran_LIB OpenMP::OpenMP_Fortran INTERFACE_LINK_LIBRARIES) + if (NOT OMP_C_LIB STREQUAL OMP_Fortran_LIB) + message(FATAL_ERROR "Multiple OpenMP runtime libraries detected. Mixed OpenMP runtime linkage is dangerous. You may pass -DOpenMP_LANG_LIB_NAMES and -DOpenMP_omp_LIBRARY to manually choose the OpenMP library.") + endif() endif () endif () diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 4496eff825..03b157843c 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -44,10 +44,6 @@ else() c_${float_char}blas1.c) endif() target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) - if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) - string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") - target_link_libraries(x${float_char}cblat1 omp pthread) - endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat1 m) endif() @@ -73,10 +69,6 @@ else() constant.c) endif() target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) - if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) - string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") - target_link_libraries(x${float_char}cblat2 omp pthread) - endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat2 m) endif() @@ -124,20 +116,12 @@ else() endif() endif() target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) - if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) - string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") - target_link_libraries(x${float_char}cblat3 omp pthread) - endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat3 m) endif() if (USE_GEMM3M) if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME}) - if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) - string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") - target_link_libraries(x${float_char}cblat3 omp pthread) - endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat3_3m m) endif() diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index e4c4181b2c..d9c34fe985 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -107,12 +107,6 @@ set(ZDMDEIGTST zchkdmd.f90) macro(add_eig_executable name) add_executable(${name} ${ARGN}) target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) - - if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) - string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") - target_link_libraries(${name} omp pthread) - endif() - #${TMGLIB} ../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index e406570e1f..95baa31229 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -240,10 +240,6 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr macro(add_lin_executable name) add_executable(${name} ${ARGN}) target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) - if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) - string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") - target_link_libraries(${name} omp pthread) - endif() #${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4ebd5348cd..f874fa5eaa 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -34,10 +34,6 @@ endif () foreach(test_bin ${OpenBLAS_Tests}) add_executable(${test_bin} ${test_bin}.f) target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) -if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) - string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") -target_link_libraries(${test_bin} omp pthread) -endif() endforeach() # $1 exec, $2 input, $3 output_result From 251c3f857dc5148a61c4475f1bbfad2de6046a31 Mon Sep 17 00:00:00 2001 From: Ruiyang Wu Date: Wed, 26 Mar 2025 23:19:40 -0400 Subject: [PATCH 40/56] gh m1: fix mixed linkage when built with OpenMP and clang+gfortran --- .github/workflows/apple_m.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/apple_m.yml b/.github/workflows/apple_m.yml index e34eada86b..81952dabd7 100644 --- a/.github/workflows/apple_m.yml +++ b/.github/workflows/apple_m.yml @@ -102,6 +102,7 @@ jobs: mkdir build && cd build cmake -DDYNAMIC_ARCH=1 \ -DUSE_OPENMP=${{matrix.openmp}} \ + -DOpenMP_Fortran_LIB_NAMES=omp \ -DINTERFACE64=${{matrix.ilp64}} \ -DNOFORTRAN=0 \ -DBUILD_WITHOUT_LAPACK=0 \ From ea6515c4b33800ca6c91148a5cdb4452937c87fc Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Wed, 26 Mar 2025 17:35:21 +0100 Subject: [PATCH 41/56] On zarch don't produce objects from assembler with a writable stack section On z-series, the current version of the GNU toolchain produces warnings such as: ``` /usr/lib64/gcc/[...]/s390x-suse-linux/bin/ld: warning: ztrmm_kernel_RC_Z14.o: missing .note.GNU-stack section implies executable stack /usr/lib64/[...]/s390x-suse-linux/bin/ld: NOTE: This behaviour is deprecated and will be removed in a future version of the linker ``` To prevent this message and make sure we are future proof, add ``` .section .note.GNU-stack,"",@progbits ``` Also add the `.size` bit to give the asm defined functions a proper size in the symbol table. Signed-off-by: Egbert Eich --- common_zarch.h | 11 +++++++++-- kernel/zarch/ctrmm4x4V.S | 2 ++ kernel/zarch/gemm8x4V.S | 2 ++ kernel/zarch/strmm8x4V.S | 2 ++ kernel/zarch/trmm8x4V.S | 2 ++ kernel/zarch/ztrmm4x4V.S | 2 ++ 6 files changed, 19 insertions(+), 2 deletions(-) diff --git a/common_zarch.h b/common_zarch.h index 7911f11ae7..035bcd27c0 100644 --- a/common_zarch.h +++ b/common_zarch.h @@ -103,9 +103,16 @@ static inline int blas_quickdivide(blasint x, blasint y){ .global REALNAME ;\ .type REALNAME, %function ;\ REALNAME: - -#define EPILOGUE +#if defined(__ELF__) && defined(__linux__) +# define GNUSTACK .section .note.GNU-stack,"",@progbits +#else +# define GNUSTACK +#endif + +#define EPILOGUE \ + .size REALNAME, .-REALNAME; \ + GNUSTACK #define PROFCODE diff --git a/kernel/zarch/ctrmm4x4V.S b/kernel/zarch/ctrmm4x4V.S index 123f2ead0c..dd997fbdf8 100644 --- a/kernel/zarch/ctrmm4x4V.S +++ b/kernel/zarch/ctrmm4x4V.S @@ -714,6 +714,8 @@ ld %f10,136(%r15) ld %f11,144(%r15) ld %f12,152(%r15) br %r14 + +EPILOGUE .end diff --git a/kernel/zarch/gemm8x4V.S b/kernel/zarch/gemm8x4V.S index 633e60ea6d..47ce5f8dd7 100644 --- a/kernel/zarch/gemm8x4V.S +++ b/kernel/zarch/gemm8x4V.S @@ -604,6 +604,8 @@ ALIGN_2 /*end*/ lmg %r6,%r12,48(%r15) br %r14 + +EPILOGUE .end diff --git a/kernel/zarch/strmm8x4V.S b/kernel/zarch/strmm8x4V.S index e34a7a05ae..c93c928cb6 100644 --- a/kernel/zarch/strmm8x4V.S +++ b/kernel/zarch/strmm8x4V.S @@ -845,6 +845,8 @@ ALIGN_2 lmg %r6,%r12,48(%r15) #endif br %r14 + +EPILOGUE .end diff --git a/kernel/zarch/trmm8x4V.S b/kernel/zarch/trmm8x4V.S index 4da113ff30..de337e351f 100644 --- a/kernel/zarch/trmm8x4V.S +++ b/kernel/zarch/trmm8x4V.S @@ -864,6 +864,8 @@ ALIGN_2 lmg %r6,%r12,48(%r15) #endif br %r14 + +EPILOGUE .end diff --git a/kernel/zarch/ztrmm4x4V.S b/kernel/zarch/ztrmm4x4V.S index 6fd7f25099..fa99daee50 100644 --- a/kernel/zarch/ztrmm4x4V.S +++ b/kernel/zarch/ztrmm4x4V.S @@ -719,6 +719,8 @@ ld %f10,136(%r15) ld %f11,144(%r15) ld %f12,152(%r15) br %r14 + +EPILOGUE .end From 61b9339d3a1fd7a4c4d91fce92ac55e41f80a08a Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Fri, 28 Mar 2025 08:59:26 +0100 Subject: [PATCH 42/56] getarch/cpuid.S: Fix warning about executable stack When using the GNU toolchain a warning is printed about an executible stack: /usr/lib64/gcc/.../x86_64-suse-linux/bin/ld: warning: /tmp/ccyG3xBB.o: missing .note.GNU-stack section implies executable stack [ 15s] /usr/lib64/gcc/.../x86_64-suse-linux/bin/ld: NOTE: This behaviour is deprecated and will be removed in a future version of the linker to prevent this warning, add: ``` .section .note.GNU-stack,"",@progbits ``` Signed-off-by: Egbert Eich --- cpuid.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpuid.S b/cpuid.S index 851fe34d27..295917bdbe 100644 --- a/cpuid.S +++ b/cpuid.S @@ -65,3 +65,6 @@ _cpuid: .subsections_via_symbols #endif +#if defined(__ELF__) && defined(__linux__) + .section .note.GNU-stack,"",@progbits +#endif From 3fc15ad81cabf3f847bfa99c1fcb3e8039543068 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Mar 2025 23:22:09 +0200 Subject: [PATCH 43/56] Fix pdb file creation in debug dll builds with CMake on Windows/WoA --- CMakeLists.txt | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index df9c631d44..9bcfd38adc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,11 +78,12 @@ set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) if (CMAKE_SYSTEM_NAME MATCHES "Windows" AND BUILD_SHARED_LIBS AND NOT ("${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "")) +set (DELETE_STATIC_LIBS "") if (NOT BUILD_STATIC_LIBS) message (STATUS "forcing build of a temporary static library for symbol renaming") set (BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared library" FORCE) set (BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) - set (DELETE_STATIC_LIBS 1) + set (DELETE_STATIC_LIBS file (REMOVE $/${OpenBLAS_LIBNAME}.lib)) endif () endif() @@ -481,15 +482,22 @@ if (BUILD_SHARED_LIBS OR DELETE_STATIC_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFI if (CMAKE_SYSTEM_NAME MATCHES "Windows") set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) - #if (USE_PERL) +if (CMAKE_BUILD_TYPE MATCHES "Debug") +set (CRTLIB msvcrtd) +set (PDBOPT -debug -pdb:$/${OpenBLAS_LIBNAME}.pdb) +set (PDB_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) +else () +set (CRTLIB msvcrt) +set (PDBOPT "") +endif() + #if (USE_PERL) message(STATUS "adding postbuild instruction to rename syms") add_custom_command(TARGET ${OpenBLAS_LIBNAME}_static POST_BUILD - COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "win2k" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/renamesyms.def + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "win2k" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/renamesyms.def COMMAND ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} -c -o ${PROJECT_BINARY_DIR}/dllinit.o ${PROJECT_SOURCE_DIR}/exports/dllinit.c - COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $ -wholearchive:$ -dll -out:$/${OpenBLAS_LIBNAME}.dll -implib:$/${OpenBLAS_LIBNAME}.dll.a - #if (${REMOVE_STATIC_LIB}) - #file (REMOVE $/${OpenBLAS_LIBNAME}.lib) - #endif () + COMMAND lld-link -nodefaultlib:libcmt -defaultlib:${CRTLIB} ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $ -wholearchive:$ -dll -out:$/${OpenBLAS_LIBNAME}.dll -implib:$/${OpenBLAS_LIBNAME}.dll.a ${PDBOPT} + #COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $ -wholearchive:$ -dll -out:$/${OpenBLAS_LIBNAME}.dll -implib:$/${OpenBLAS_LIBNAME}.dll.a + ${REMOVE_STATIC_LIB} VERBATIM ) #endif () else () From 0aa5ef29ec27ad6b0c5858d352ba1b4ea0b35b50 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Apr 2025 23:54:56 +0200 Subject: [PATCH 44/56] Repeat the libs target's "ln" in the all target to ensure completeness --- Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile b/Makefile index 4c72177343..2083c3c789 100644 --- a/Makefile +++ b/Makefile @@ -93,6 +93,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ fi endif + +ifeq ($(OSNAME), WINNT) + @-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) +endif + ifneq ($(OSNAME), AIX) @echo -n " Library Name ... $(LIBNAME)" else From 7bf848454ddfd8713c50de6c55138388c2823a33 Mon Sep 17 00:00:00 2001 From: ColumbusAI <75283809+ColumbusAI@users.noreply.github.com> Date: Sat, 5 Apr 2025 09:57:53 -0700 Subject: [PATCH 45/56] Update zsum.c -- fixed spelling error to successfully compile spelling error where zsum_kernel is used and it should be zasum_kernel. Will not compile without fix. --- kernel/x86_64/zsum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zsum.c b/kernel/x86_64/zsum.c index 5973c12538..974d1b2f15 100644 --- a/kernel/x86_64/zsum.c +++ b/kernel/x86_64/zsum.c @@ -54,7 +54,7 @@ static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(sumf); if (inc_x == 1) { - sumf = zsum_kernel(n, x); + sumf = zasum_kernel(n, x); } else { inc_x2 = 2 * inc_x; From 1ed962d25975ff9fba9ca08f5f5d5101bd6426b7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Apr 2025 10:44:48 -0700 Subject: [PATCH 46/56] Fix compilation with xcode16.3/clang17/gcc14 --- Makefile.system | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Makefile.system b/Makefile.system index d6dd9e9608..79544276b6 100644 --- a/Makefile.system +++ b/Makefile.system @@ -435,6 +435,15 @@ ifeq (x$(XCVER), x 15) CCOMMON_OPT += -Wl,-ld_classic FCOMMON_OPT += -Wl,-ld_classic endif +ifeq (x$(XCVER), x 16) +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -Wl,-ld_classic +FCOMMON_OPT += -Wl,-ld_classic +endif +ifeq ($(F_COMPILER), GFORTRAN) +override CEXTRALIB := $(filter-out(-lto_library, $(CEXTRALIB))) +endif +endif endif ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) From 67c5bdd639b7c64b764bf5a279e8f58a567742ab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 7 Apr 2025 12:20:43 -0700 Subject: [PATCH 47/56] Azure CI: Update flang call in OSX_LLVM_flangnew job (#5208) * Update flang call in OSX_LLVM_flangnew job --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 26f4c2af3d..7941bf4633 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -175,7 +175,7 @@ jobs: - script: | brew update brew install llvm flang - make TARGET=NEHALEM CC=/usr/local/opt/llvm/bin/clang FC=/usr/local/Cellar/flang/19.1.7_1/bin/flang-new NO_SHARED=1 + make TARGET=NEHALEM CC=/usr/local/opt/llvm/bin/clang FC=/usr/local/opt/flang/bin/flang NO_SHARED=1 - job: OSX_OpenMP_Clang pool: From 1c5d0d5539d2d2dfb5d2cc431a9cba1a214f59df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Apr 2025 10:44:36 +0200 Subject: [PATCH 48/56] move libomp to extralib --- test/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/Makefile b/test/Makefile index 65576d3dd1..9ba88988b2 100644 --- a/test/Makefile +++ b/test/Makefile @@ -299,18 +299,18 @@ CLDFLAGS = $(CFLAGS) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) -CEXTRALIB += -lomp +EXTRALIB += -lomp endif endif ifeq ($(F_COMPILER), NAG) -CEXTRALIB = -lgomp +EXTRALIB = -lgomp endif ifeq ($(F_COMPILER), IBM) ifeq ($(C_COMPILER), GCC) -CEXTRALIB += -lgomp +EXTRALIB += -lgomp endif ifeq ($(C_COMPILER), CLANG) -CEXTRALIB += -lomp +EXTRALIB += -lomp endif endif endif From fc8090b60774447a2d205bda363f14408e14a780 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Apr 2025 11:54:36 +0200 Subject: [PATCH 49/56] Move additional omp dependency to EXTRALIB --- ctest/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ctest/Makefile b/ctest/Makefile index 877a190c19..e6f683bd87 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -235,18 +235,18 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) -CEXTRALIB += -lomp +EXTRALIB += -lomp endif endif ifeq ($(F_COMPILER), NAG) -CEXTRALIB = -lgomp +EXTRALIB = -lgomp endif ifeq ($(F_COMPILER), IBM) ifeq ($(C_COMPILER), GCC) -CEXTRALIB += -lgomp +EXTRALIB += -lgomp endif ifeq ($(C_COMPILER), CLANG) -CEXTRALIB += -lomp +EXTRALIB += -lomp endif endif endif From 1ff303f36e54da80e459aad3556b561ee70b78b0 Mon Sep 17 00:00:00 2001 From: lglglglgy <2661896437@qq.com> Date: Tue, 8 Apr 2025 21:18:00 +0800 Subject: [PATCH 50/56] Optimizing the Implementation of GEMV on the RISC-V V Extension Specialized some scenarios, performed loop unrolling, and reduced the number of multiplications. --- kernel/riscv64/gemv_n_vector.c | 304 ++++++++++++++++++++++----------- 1 file changed, 207 insertions(+), 97 deletions(-) diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index aa13fc87d3..64ed532cbe 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -27,13 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) -#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) -#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) -#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) -#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m8) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m8) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m8) +#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m8) +#define VFILL_ZERO_FLOAT RISCV_RVV(vfsub_vv_f32m8) #else #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t @@ -42,103 +44,211 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFILL_ZERO_FLOAT RISCV_RVV(vfsub_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i = 0, j = 0, k = 0; - BLASLONG ix = 0, iy = 0; - - if(n < 0) return(0); - FLOAT *a_ptr = a; - FLOAT temp = 0.0; - FLOAT_V_T va0, va1, vy0, vy1; - unsigned int gvl = 0; - if(inc_y == 1){ - gvl = VSETVL(m); - if(gvl <= m/2){ - for(k=0,j=0; k Date: Tue, 8 Apr 2025 07:03:11 -0700 Subject: [PATCH 51/56] Fix incomplete error message (Reference-LAPACK PR 1119) --- lapack-netlib/TESTING/EIG/cerred.f | 12 ++++++------ lapack-netlib/TESTING/EIG/derred.f | 12 ++++++------ lapack-netlib/TESTING/EIG/serred.f | 12 ++++++------ lapack-netlib/TESTING/EIG/zerred.f | 12 ++++++------ 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cerred.f b/lapack-netlib/TESTING/EIG/cerred.f index 98d1570805..7514a32418 100644 --- a/lapack-netlib/TESTING/EIG/cerred.f +++ b/lapack-netlib/TESTING/EIG/cerred.f @@ -332,7 +332,7 @@ SUBROUTINE CERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test CGESDD @@ -367,7 +367,7 @@ SUBROUTINE CERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test CGEJSV @@ -433,7 +433,7 @@ SUBROUTINE CERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test CGESVDX @@ -492,7 +492,7 @@ SUBROUTINE CERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test CGESVDQ @@ -547,7 +547,7 @@ SUBROUTINE CERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * @@ -558,7 +558,7 @@ SUBROUTINE CERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * diff --git a/lapack-netlib/TESTING/EIG/derred.f b/lapack-netlib/TESTING/EIG/derred.f index 11a9320526..faa716f8b0 100644 --- a/lapack-netlib/TESTING/EIG/derred.f +++ b/lapack-netlib/TESTING/EIG/derred.f @@ -329,7 +329,7 @@ SUBROUTINE DERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test DGESDD @@ -358,7 +358,7 @@ SUBROUTINE DERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test DGEJSV @@ -424,7 +424,7 @@ SUBROUTINE DERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test DGESVDX @@ -483,7 +483,7 @@ SUBROUTINE DERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test DGESVDQ @@ -538,7 +538,7 @@ SUBROUTINE DERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * @@ -549,7 +549,7 @@ SUBROUTINE DERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * diff --git a/lapack-netlib/TESTING/EIG/serred.f b/lapack-netlib/TESTING/EIG/serred.f index b52aa16246..a55097ebad 100644 --- a/lapack-netlib/TESTING/EIG/serred.f +++ b/lapack-netlib/TESTING/EIG/serred.f @@ -329,7 +329,7 @@ SUBROUTINE SERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test SGESDD @@ -358,7 +358,7 @@ SUBROUTINE SERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test SGEJSV @@ -424,7 +424,7 @@ SUBROUTINE SERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test SGESVDX @@ -483,7 +483,7 @@ SUBROUTINE SERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test SGESVDQ @@ -538,7 +538,7 @@ SUBROUTINE SERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * @@ -549,7 +549,7 @@ SUBROUTINE SERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * diff --git a/lapack-netlib/TESTING/EIG/zerred.f b/lapack-netlib/TESTING/EIG/zerred.f index 1876c1f1d7..f325dcdc3a 100644 --- a/lapack-netlib/TESTING/EIG/zerred.f +++ b/lapack-netlib/TESTING/EIG/zerred.f @@ -332,7 +332,7 @@ SUBROUTINE ZERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test ZGESDD @@ -367,7 +367,7 @@ SUBROUTINE ZERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test ZGEJSV @@ -433,7 +433,7 @@ SUBROUTINE ZERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test ZGESVDX @@ -492,7 +492,7 @@ SUBROUTINE ZERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test ZGESVDQ @@ -547,7 +547,7 @@ SUBROUTINE ZERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * @@ -558,7 +558,7 @@ SUBROUTINE ZERRED( PATH, NUNIT ) WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * From a34b487f225e219670e95ad89cc7ea98121a9439 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Apr 2025 17:25:46 +0200 Subject: [PATCH 52/56] Remove spurious cast from Alpha and Cell's DEFAULT_ALIGN --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index 229554f337..48b64fd2ae 100644 --- a/param.h +++ b/param.h @@ -2146,7 +2146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 512 #define GEMM_DEFAULT_OFFSET_B 512 -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2214,7 +2214,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 8192 -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 From ec146157d380c2a1003bac4d25b6fc39d55d4c0e Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Wed, 2 Apr 2025 09:11:58 +0000 Subject: [PATCH 53/56] Use SVE kernel for S/DGEMVT for SVE machines --- CONTRIBUTORS.md | 3 +++ kernel/arm64/KERNEL.ARMV8SVE | 4 ++-- kernel/arm64/KERNEL.NEOVERSEN2 | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 2e2979accb..6b0814dcc3 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -26,6 +26,9 @@ * Chris Sidebottom * Optimizations and other improvements targeting AArch64 +* Annop Wongwathanarat + * Optimizations and other improvements targeting AArch64 + ## Previous Developers * Zaheer Chothia diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 9adacce632..4ff53c6d0c 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -79,8 +79,8 @@ DGEMVNKERNEL = gemv_n.S CGEMVNKERNEL = zgemv_n.S ZGEMVNKERNEL = zgemv_n.S -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S +SGEMVTKERNEL = gemv_t_sve_v1x3.c +DGEMVTKERNEL = gemv_t_sve_v1x3.c CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index fc7fe69302..b9dc23562b 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -65,8 +65,8 @@ DGEMVNKERNEL = gemv_n.S CGEMVNKERNEL = zgemv_n.S ZGEMVNKERNEL = zgemv_n.S -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S +SGEMVTKERNEL = gemv_t_sve_v1x3.c +DGEMVTKERNEL = gemv_t_sve_v1x3.c CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S From 51ba70f47bba6b7a161f61526a2f196fe896e1f6 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Thu, 10 Apr 2025 15:20:34 +0200 Subject: [PATCH 54/56] test_potrs.c: remove pragma darwin-aarch64 support Using GCC 14.2.0 on Darwin, the pragma ultimately causes a linker error "ld: invalid r_symbolnum=". The current workaround is to use the old linker, but (a) it's deprecated and (b) it can produce libraries that are subsequently not linkable with the newer linker in dependents: the new ld64 does not link to libraries with duplicate rpaths created by the classic linker. --- Makefile.system | 4 ---- utest/test_potrs.c | 1 - 2 files changed, 5 deletions(-) diff --git a/Makefile.system b/Makefile.system index 79544276b6..ac6a41c92a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -436,10 +436,6 @@ CCOMMON_OPT += -Wl,-ld_classic FCOMMON_OPT += -Wl,-ld_classic endif ifeq (x$(XCVER), x 16) -ifeq ($(C_COMPILER), GCC) -CCOMMON_OPT += -Wl,-ld_classic -FCOMMON_OPT += -Wl,-ld_classic -endif ifeq ($(F_COMPILER), GFORTRAN) override CEXTRALIB := $(filter-out(-lto_library, $(CEXTRALIB))) endif diff --git a/utest/test_potrs.c b/utest/test_potrs.c index 642ce1e376..e6ccf4bb64 100644 --- a/utest/test_potrs.c +++ b/utest/test_potrs.c @@ -32,7 +32,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "openblas_utest.h" -#pragma GCC optimize("no-gcse") /* void BLASFUNC(cpotrf)(char*, BLASINT*, complex float*, BLASINT*, BLASINT*); void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*, From 3d6d026fe1f61ed4ee501aa7d69b40196ee1b174 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Thu, 10 Apr 2025 15:44:31 +0200 Subject: [PATCH 55/56] no-gcse when loongarch64 --- utest/test_potrs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utest/test_potrs.c b/utest/test_potrs.c index e6ccf4bb64..bcb1f753b0 100644 --- a/utest/test_potrs.c +++ b/utest/test_potrs.c @@ -32,6 +32,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "openblas_utest.h" +#if defined(ARCH_LOONGARCH64) +#pragma GCC optimize("no-gcse") +#endif /* void BLASFUNC(cpotrf)(char*, BLASINT*, complex float*, BLASINT*, BLASINT*); void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*, From fd3afef1222b95d4c1a809667e7a76e259675fe1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Apr 2025 22:09:19 +0200 Subject: [PATCH 56/56] lapacke_mangling.h is no longer generated, so don't delete on make clean --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2083c3c789..90de509133 100644 --- a/Makefile +++ b/Makefile @@ -452,7 +452,7 @@ endif @rm -f cblas.tmp cblas.tmp2 @touch $(NETLIB_LAPACK_DIR)/make.inc @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean - @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h + @rm -f $(NETLIB_LAPACK_DIR)/make.inc @$(MAKE) -C relapack clean @rm -f *.grd Makefile.conf_last config_last.h @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)