Skip to content

Commit eb84aac

Browse files
authored
Merge pull request #5084 from quic/topic/sgemm_direct_sme1
Support for SGEMM_DIRECT Kernel based on SME1
2 parents abbd78a + f66ca05 commit eb84aac

25 files changed

+617
-14
lines changed

Makefile.arm64

+5
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ FCOMMON_OPT += -march=armv8-a+sve
3030
endif
3131
endif
3232

33+
ifeq ($(CORE), ARMV9SME)
34+
CCOMMON_OPT += -march=armv9-a+sve2+sme
35+
FCOMMON_OPT += -march=armv9-a+sve2
36+
endif
37+
3338
ifeq ($(CORE), CORTEXA53)
3439
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
3540
ifneq ($(F_COMPILER), NAG)

Makefile.system

+8
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,7 @@ ifeq ($(ARCH), arm64)
420420
export MACOSX_DEPLOYMENT_TARGET=11.0
421421
ifeq ($(C_COMPILER), GCC)
422422
export NO_SVE = 1
423+
export NO_SME = 1
423424
endif
424425
else
425426
export MACOSX_DEPLOYMENT_TARGET=10.8
@@ -709,6 +710,9 @@ DYNAMIC_CORE += NEOVERSEN2
709710
DYNAMIC_CORE += ARMV8SVE
710711
DYNAMIC_CORE += A64FX
711712
endif
713+
ifneq ($(NO_SME), 1)
714+
DYNAMIC_CORE += ARMV9SME
715+
endif
712716
DYNAMIC_CORE += THUNDERX
713717
DYNAMIC_CORE += THUNDERX2T99
714718
DYNAMIC_CORE += TSV110
@@ -1472,6 +1476,10 @@ ifeq ($(NO_SVE), 1)
14721476
CCOMMON_OPT += -DNO_SVE
14731477
endif
14741478

1479+
ifeq ($(NO_SME), 1)
1480+
CCOMMON_OPT += -DNO_SME
1481+
endif
1482+
14751483
ifdef SMP
14761484
CCOMMON_OPT += -DSMP_SERVER
14771485

TargetList.txt

+1
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ THUNDERX3T110
111111
VORTEX
112112
A64FX
113113
ARMV8SVE
114+
ARMV9SME
114115
FT2000
115116

116117
9.System Z:

c_check

+19
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,24 @@ if [ "$architecture" = "arm64" ]; then
334334
rm -rf "$tmpd"
335335
fi
336336

337+
no_sme=0
338+
if [ "$architecture" = "arm64" ]; then
339+
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
340+
tmpf="$tmpd/a.S"
341+
printf ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n">> "$tmpf"
342+
args=" -march=armv9-a+sve2+sme -c -o $tmpf.o $tmpf"
343+
no_sme=0
344+
{
345+
$compiler_name $flags $args >/dev/null 2>&1
346+
} || {
347+
args=" -march=armv9-a+sme -c -o $tmpf.o $tmpf"
348+
$compiler_name $flags $args >/dev/null 2>&1
349+
} || {
350+
no_sme=1
351+
}
352+
rm -rf "$tmpd"
353+
fi
354+
337355
c11_atomics=0
338356
case "$data" in
339357
*HAVE_C11*)
@@ -475,6 +493,7 @@ done
475493
printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a"
476494
[ "$no_msa" -eq 1 ] && printf "NO_MSA=1\n"
477495
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
496+
[ "$no_sme" -eq 1 ] && printf "NO_SME=1\n"
478497
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
479498
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
480499
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"

cmake/arch.cmake

+15-3
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,21 @@ endif ()
4444

4545
if (DYNAMIC_ARCH)
4646
if (ARM64)
47-
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
48-
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
49-
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
47+
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
48+
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
49+
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 10) # SVE ACLE supported in GCC >= 10
50+
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
51+
endif ()
52+
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14
53+
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME)
54+
endif()
55+
elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang")
56+
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11
57+
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
58+
endif ()
59+
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19
60+
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME)
61+
endif()
5062
endif ()
5163
if (DYNAMIC_LIST)
5264
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})

cmake/cc.cmake

+6
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,12 @@ if (${CORE} STREQUAL ARMV8SVE)
238238
endif ()
239239
endif ()
240240

241+
if (${CORE} STREQUAL ARMV9SME)
242+
if (NOT DYNAMIC_ARCH)
243+
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme")
244+
endif ()
245+
endif ()
246+
241247
if (${CORE} STREQUAL CORTEXA510)
242248
if (NOT DYNAMIC_ARCH)
243249
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")

cmake/prebuild.cmake

+1-1
Original file line numberDiff line numberDiff line change
@@ -1014,7 +1014,7 @@ endif ()
10141014
set(ZGEMM_UNROLL_M 4)
10151015
set(ZGEMM_UNROLL_N 4)
10161016
set(SYMV_P 16)
1017-
elseif ("${TCORE}" STREQUAL "NEOVERSEN2")
1017+
elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME") # ARMV9SME shares the NEOVERSEN2 settings appended below; the logical operator must be upper-case OR
10181018
file(APPEND ${TARGET_CONF_TEMP}
10191019
"#define L1_CODE_SIZE\t65536\n"
10201020
"#define L1_CODE_LINESIZE\t64\n"

cmake/system.cmake

+3
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,9 @@ if (${TARGET} STREQUAL NEOVERSEV1)
318318
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
319319
endif()
320320
endif()
321+
if (${TARGET} STREQUAL ARMV9SME)
322+
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3")
323+
endif()
321324
if (${TARGET} STREQUAL A64FX)
322325
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
323326
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx")

cmake/system_check.cmake

+11
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,17 @@ endif()
139139
endif()
140140
endif()
141141

142+
if (ARM64)
143+
if (NOT NO_SME)
144+
file(WRITE ${PROJECT_BINARY_DIR}/sme.c ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n")
145+
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv9-a+sve2+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o -x assembler ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME) # sme.c holds raw assembly, so select the assembler front end instead of the C parser
146+
if (NOT NO_SME EQUAL 0) # any result other than a clean exit (other exit codes or an error string) means the SME probe failed
147+
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_SME")
148+
endif()
149+
file(REMOVE "${PROJECT_BINARY_DIR}/sme.c" "${PROJECT_BINARY_DIR}/sme.o")
150+
endif()
151+
endif()
152+
142153
include(CheckIncludeFile)
143154
CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11)
144155
if (HAVE_C11 EQUAL 1)

common.h

+1
Original file line numberDiff line numberDiff line change
@@ -702,6 +702,7 @@ void gotoblas_profile_init(void);
702702
void gotoblas_profile_quit(void);
703703

704704
int support_avx512(void);
705+
int support_sme1(void);
705706

706707
#ifdef USE_OPENMP
707708

common_arm64.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
175175
#define HUGE_PAGESIZE ( 4 << 20)
176176

177177
#ifndef BUFFERSIZE
178-
#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE)
178+
#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) || defined(ARMV9SME)
179179
#define BUFFER_SIZE (32 << 22)
180180
#else
181181
#define BUFFER_SIZE (32 << 20)

common_param.h

+6
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,12 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
223223
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
224224
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K);
225225
#endif
226+
#ifdef ARCH_ARM64
227+
#ifdef HAVE_SME
228+
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
229+
#endif
230+
#endif
231+
226232

227233
int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
228234
int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);

common_s.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -215,9 +215,9 @@
215215
#ifdef ARCH_X86_64
216216
#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant
217217
#define SGEMM_DIRECT gotoblas -> sgemm_direct
218-
#else
218+
#elif defined(ARCH_ARM64) /* test for presence, like the #ifdef ARCH_X86_64 above, instead of evaluating the macro's value */
219219
#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant
220-
#define SGEMM_DIRECT sgemm_direct
220+
#define SGEMM_DIRECT gotoblas -> sgemm_direct
221221
#endif
222222

223223
#define SGEMM_ONCOPY gotoblas -> sgemm_oncopy

driver/others/dynamic_arm64.c

+34
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,11 @@ extern gotoblas_t gotoblas_ARMV8SVE;
115115
#else
116116
#define gotoblas_ARMV8SVE gotoblas_ARMV8
117117
#endif
118+
#ifdef DYN_ARMV9SME
119+
extern gotoblas_t gotoblas_ARMV9SME;
120+
#else
121+
#define gotoblas_ARMV9SME gotoblas_ARMV8
122+
#endif
118123
#ifdef DYN_CORTEX_A55
119124
extern gotoblas_t gotoblas_CORTEXA55;
120125
#else
@@ -148,6 +153,13 @@ extern gotoblas_t gotoblas_A64FX;
148153
#define gotoblas_ARMV8SVE gotoblas_ARMV8
149154
#define gotoblas_A64FX gotoblas_ARMV8
150155
#endif
156+
157+
#ifndef NO_SME
158+
extern gotoblas_t gotoblas_ARMV9SME;
159+
#else
160+
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE
161+
#endif
162+
151163
extern gotoblas_t gotoblas_THUNDERX3T110;
152164
#endif
153165
#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1
@@ -168,6 +180,9 @@ extern void openblas_warning(int verbose, const char * msg);
168180
#ifndef HWCAP_SVE
169181
#define HWCAP_SVE (1 << 22)
170182
#endif
183+
#ifndef HWCAP2_SME
184+
#define HWCAP2_SME (1 << 23) /* fallback when the headers do not provide it; parenthesized so it expands safely inside larger expressions */
185+
#endif
171186

172187
#define get_cpu_ftr(id, var) ({ \
173188
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
@@ -430,6 +445,13 @@ static gotoblas_t *get_coretype(void) {
430445
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
431446
openblas_warning(1, coremsg);
432447
}
448+
449+
#if !defined(NO_SME) && defined(HWCAP2_SME)
450+
if ((getauxval(AT_HWCAP2) & HWCAP2_SME)) {
451+
return &gotoblas_ARMV9SME;
452+
}
453+
#endif
454+
433455
#ifndef NO_SVE
434456
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
435457
return &gotoblas_ARMV8SVE;
@@ -480,3 +502,15 @@ void gotoblas_dynamic_init(void) {
480502
void gotoblas_dynamic_quit(void) {
481503
gotoblas = NULL;
482504
}
505+
506+
/*
 * Report whether the running CPU advertises SME.
 *
 * Returns 1 when the HWCAP2_SME bit is set in the AT_HWCAP2 auxiliary
 * vector entry, and 0 otherwise. The query is only compiled in when
 * OS_LINUX or OS_ANDROID is defined; on every other platform the
 * function always returns 0.
 */
int support_sme1(void) {
  int ret = 0;

#if (defined OS_LINUX || defined OS_ANDROID)
  /* Query the auxiliary vector once and normalize the masked bit to 0/1. */
  if (getauxval(AT_HWCAP2) & HWCAP2_SME) {
    ret = 1;
  }
#endif
  return ret;
}

getarch.c

+13
Original file line numberDiff line numberDiff line change
@@ -1289,6 +1289,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
12891289
#define CORENAME "ARMV8SVE"
12901290
#endif
12911291

1292+
#ifdef FORCE_ARMV9SME
1293+
#define FORCE
1294+
#define ARCHITECTURE "ARM64"
1295+
#define SUBARCHITECTURE "ARMV9SME"
1296+
#define SUBDIRNAME "arm64"
1297+
#define ARCHCONFIG "-DARMV9SME " \
1298+
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
1299+
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
1300+
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
1301+
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DHAVE_SME -DARMV8 -DARMV9"
1302+
#define LIBNAME "armv9sme"
1303+
#define CORENAME "ARMV9SME"
1304+
#endif
12921305

12931306
#ifdef FORCE_ARMV8
12941307
#define FORCE

interface/gemm.c

+10-3
Original file line numberDiff line numberDiff line change
@@ -393,14 +393,21 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
393393
PRINT_DEBUG_CNAME;
394394

395395
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT)
396-
#ifdef DYNAMIC_ARCH
396+
#if defined(DYNAMIC_ARCH) && defined(ARCH_X86_64) /* ARCH_X86_64 is the spelling used by the other SGEMM_DIRECT guards; the lower-case ARCH_x86 variant left this path compiled out */
397397
if (support_avx512() )
398-
#endif
399398
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
400399
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
401400
return;
402401
}
403-
402+
#endif
403+
#if defined(DYNAMIC_ARCH) && defined(ARCH_ARM64)
404+
if (support_sme1()){
405+
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) {
406+
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
407+
return;
408+
}
409+
}
410+
#endif
404411
#endif
405412

406413
#ifndef COMPLEX

kernel/CMakeLists.txt

+10-2
Original file line numberDiff line numberDiff line change
@@ -207,19 +207,27 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
207207
if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10))
208208
set(USE_TRMM true)
209209
endif ()
210-
211210
set(USE_DIRECT_SGEMM false)
212-
if (X86_64)
211+
if (X86_64 OR (ARM64 AND (UC_TARGET_CORE MATCHES ARMV9SME)))
213212
set(USE_DIRECT_SGEMM true)
214213
endif()
215214

216215
if (USE_DIRECT_SGEMM)
217216
# if (NOT DEFINED SGEMMDIRECTKERNEL)
217+
if (X86_64)
218218
set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c)
219219
set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c)
220220
# endif()
221221
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE)
222222
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE)
223+
elseif (ARM64)
224+
set (SGEMMDIRECTKERNEL sgemm_direct_arm64_sme1.c)
225+
set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S)
226+
set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S)
227+
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE)
228+
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE)
229+
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE)
230+
endif ()
223231
endif()
224232

225233
foreach (float_type SINGLE DOUBLE)

kernel/Makefile

+4
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,11 @@ ifdef NO_AVX2
2424
AVX2OPT=
2525
endif
2626

27+
2728
ifdef TARGET_CORE
29+
ifeq ($(TARGET_CORE), ARMV9SME)
30+
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv9-a+sve2+sme
31+
endif
2832
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
2933
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
3034
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12)))

0 commit comments

Comments
 (0)