Skip to content

Commit c748e6a

Browse files
committed
optimized sbgemm kernel for neoverse-v1 (sve-256)
Signed-off-by: Ye Tao <ye.tao@arm.com>
1 parent 4379a6f commit c748e6a

12 files changed

+789
-987
lines changed

CONTRIBUTORS.md

+2
Original file line numberDiff line numberDiff line change
@@ -240,3 +240,5 @@ In chronological order:
240240
* Marek Michalowski <https://github.com/michalowski-arm>
241241
* [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1`
242242

243+
* Ye Tao <ye.tao@arm.com>
244+
* [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1

Makefile.arm64

+2-2
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ ifeq ($(CORE), NEOVERSEV1)
101101
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
102102
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
103103
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
104-
CCOMMON_OPT += -march=armv8.4-a+sve
104+
CCOMMON_OPT += -march=armv8.4-a+sve+bf16
105105
ifeq (1, $(ISCLANG))
106106
CCOMMON_OPT += -mtune=cortex-x1
107107
else
@@ -111,7 +111,7 @@ ifneq ($(F_COMPILER), NAG)
111111
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
112112
endif
113113
else
114-
CCOMMON_OPT += -march=armv8.4-a+sve
114+
CCOMMON_OPT += -march=armv8.4-a+sve+bf16
115115
ifneq ($(CROSS), 1)
116116
CCOMMON_OPT += -mtune=native
117117
endif

cmake/system.cmake

+2-2
Original file line numberDiff line numberDiff line change
@@ -291,10 +291,10 @@ if (DEFINED TARGET)
291291

292292
if (${TARGET} STREQUAL NEOVERSEV1)
293293
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
294-
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
294+
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1")
295295
else ()
296296
if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4)
297-
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
297+
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1")
298298
else ()
299299
message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_VERSION} does not support Neoverse V1.")
300300
endif()

kernel/arm64/KERNEL.NEOVERSEV1

+8-4
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
include $(KERNELDIR)/KERNEL.ARMV8SVE
22

3-
SGEMVTKERNEL = gemv_t_sve.c
4-
DGEMVTKERNEL = gemv_t_sve.c
3+
SGEMVTKERNEL = gemv_t_sve_v1x3.c
4+
DGEMVTKERNEL = gemv_t_sve_v1x3.c
5+
ifeq ($(BUILD_BFLOAT16), 1)
56
SBGEMM_BETA = sbgemm_beta_neoversev1.c
67
SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversev1.c
8+
ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
79
SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversev1.c
810
SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversev1.c
9-
SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c
10-
SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c
1111
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
1212
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
13+
endif
14+
SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c
15+
SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c
1316
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
1417
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
18+
endif

kernel/arm64/sbgemm_kernel_8x4_neoversev1.c → kernel/arm64/sbgemm_kernel_4x4_neoversev1.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/***************************************************************************
2-
* Copyright (c) 2024, The OpenBLAS Project
2+
* Copyright (c) 2024-2025, The OpenBLAS Project
33
* All rights reserved.
44
* Redistribution and use in source and binary forms, with or without
55
* modification, are permitted provided that the following conditions are
@@ -31,9 +31,9 @@
3131
#include "common.h"
3232

3333
#define ALPHA_ONE
34-
#include "sbgemm_kernel_8x4_neoversev1_impl.c"
34+
#include "sbgemm_kernel_4x4_neoversev1_impl.c"
3535
#undef ALPHA_ONE
36-
#include "sbgemm_kernel_8x4_neoversev1_impl.c"
36+
#include "sbgemm_kernel_4x4_neoversev1_impl.c"
3737

3838
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B,
3939
FLOAT *C, BLASLONG ldc) {

0 commit comments

Comments
 (0)