Skip to content

Commit 9807f56

Browse files
committed Mar 13, 2025
Optimize aarch64 sgemm_ncopy
1 parent 1ba0265 commit 9807f56

File tree

3 files changed

+78
-119
lines changed

3 files changed

+78
-119
lines changed
 

‎CONTRIBUTORS.md

+1
Original file line number | Diff line number | Diff line change
@@ -238,6 +238,7 @@ In chronological order:
238238
* [2025-01-21] Optimize gemv_t_sve_v1x3 kernel
239239
* [2025-02-26] Add sbgemv_t_bfdot kernel
240240
* [2025-03-12] Fix aarch64 sbgemv_t compilation error for GCC < 13
241+
* [2025-03-12] Optimize aarch64 sgemm_ncopy
241242

242243
* Marek Michalowski <marek.michalowski@arm.com>
243244
* [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1`

‎kernel/arm64/sgemm_ncopy_4.S

+18 -29
Original file line number | Diff line number | Diff line change
@@ -88,28 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8888
prfm PLDL1KEEP, [A04, #A_PREFETCH]
8989

9090
ldr q0, [A01], #16
91-
ins v8.s[0], v0.s[0]
92-
ins v9.s[0], v0.s[1]
93-
ins v10.s[0], v0.s[2]
94-
ins v11.s[0], v0.s[3]
95-
9691
ldr q1, [A02], #16
97-
ins v8.s[1], v1.s[0]
98-
ins v9.s[1], v1.s[1]
99-
ins v10.s[1], v1.s[2]
100-
ins v11.s[1], v1.s[3]
101-
10292
ldr q2, [A03], #16
103-
ins v8.s[2], v2.s[0]
104-
ins v9.s[2], v2.s[1]
105-
ins v10.s[2], v2.s[2]
106-
ins v11.s[2], v2.s[3]
107-
10893
ldr q3, [A04], #16
109-
ins v8.s[3], v3.s[0]
110-
ins v9.s[3], v3.s[1]
111-
ins v10.s[3], v3.s[2]
112-
ins v11.s[3], v3.s[3]
94+
95+
zip1 v12.4s, v0.4s, v1.4s
96+
zip1 v13.4s, v2.4s, v3.4s
97+
zip2 v14.4s, v0.4s, v1.4s
98+
zip2 v15.4s, v2.4s, v3.4s
99+
100+
zip1 v8.2d, v12.2d, v13.2d
101+
zip2 v9.2d, v12.2d, v13.2d
102+
zip1 v10.2d, v14.2d, v15.2d
103+
zip2 v11.2d, v14.2d, v15.2d
113104

114105
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00]
115106
add B00, B00, #64
@@ -138,16 +129,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
138129
prfm PLDL1KEEP, [A02, #A_PREFETCH]
139130

140131
ldr q0, [A01], #16
141-
ins v8.s[0], v0.s[0]
142-
ins v9.s[0], v0.s[1]
143-
ins v10.s[0], v0.s[2]
144-
ins v11.s[0], v0.s[3]
145-
146132
ldr q1, [A02], #16
147-
ins v8.s[1], v1.s[0]
148-
ins v9.s[1], v1.s[1]
149-
ins v10.s[1], v1.s[2]
150-
ins v11.s[1], v1.s[3]
133+
134+
zip1 v12.4s, v0.4s, v1.4s
135+
zip2 v13.4s, v0.4s, v1.4s
136+
137+
dup v8.2d, v12.d[0]
138+
dup v9.2d, v12.d[1]
139+
dup v10.2d, v13.d[0]
140+
dup v11.2d , v13.d[1]
151141

152142
st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00]
153143
add B00, B00, #32
@@ -330,4 +320,3 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
330320
ret
331321

332322
EPILOGUE
333-

‎kernel/arm64/sgemm_ncopy_8.S

+59 -90
Original file line number | Diff line number | Diff line change
@@ -86,47 +86,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8686
.macro COPY4x8
8787
ldr q0, [A01], #16
8888
ldr q1, [A02], #16
89-
ins v8.s[0], v0.s[0]
90-
ins v10.s[0], v0.s[1]
91-
ins v12.s[0], v0.s[2]
92-
ins v14.s[0], v0.s[3]
93-
ins v8.s[1], v1.s[0]
94-
ins v10.s[1], v1.s[1]
95-
ins v12.s[1], v1.s[2]
96-
ins v14.s[1], v1.s[3]
97-
9889
ldr q2, [A03], #16
9990
ldr q3, [A04], #16
100-
ins v8.s[2], v2.s[0]
101-
ins v10.s[2], v2.s[1]
102-
ins v12.s[2], v2.s[2]
103-
ins v14.s[2], v2.s[3]
104-
ins v8.s[3], v3.s[0]
105-
ins v10.s[3], v3.s[1]
106-
ins v12.s[3], v3.s[2]
107-
ins v14.s[3], v3.s[3]
91+
92+
zip1 v16.4s, v0.4s, v1.4s
93+
zip1 v17.4s, v2.4s, v3.4s
94+
zip2 v18.4s, v0.4s, v1.4s
95+
zip2 v19.4s, v2.4s, v3.4s
96+
97+
zip1 v8.2d, v16.2d, v17.2d
98+
zip2 v10.2d, v16.2d, v17.2d
99+
zip1 v12.2d, v18.2d, v19.2d
100+
zip2 v14.2d, v18.2d, v19.2d
108101

109102
ldr q4, [A05], #16
110103
ldr q5, [A06], #16
111-
ins v9.s[0], v4.s[0]
112-
ins v11.s[0], v4.s[1]
113-
ins v13.s[0], v4.s[2]
114-
ins v15.s[0], v4.s[3]
115-
ins v9.s[1], v5.s[0]
116-
ins v11.s[1], v5.s[1]
117-
ins v13.s[1], v5.s[2]
118-
ins v15.s[1], v5.s[3]
119-
120104
ldr q6, [A07], #16
121105
ldr q7, [A08], #16
122-
ins v9.s[2], v6.s[0]
123-
ins v11.s[2], v6.s[1]
124-
ins v13.s[2], v6.s[2]
125-
ins v15.s[2], v6.s[3]
126-
ins v9.s[3], v7.s[0]
127-
ins v11.s[3], v7.s[1]
128-
ins v13.s[3], v7.s[2]
129-
ins v15.s[3], v7.s[3]
106+
107+
zip1 v16.4s, v4.4s, v5.4s
108+
zip1 v17.4s, v6.4s, v7.4s
109+
zip2 v18.4s, v4.4s, v5.4s
110+
zip2 v19.4s, v6.4s, v7.4s
111+
112+
zip1 v9.2d, v16.2d, v17.2d
113+
zip2 v11.2d, v16.2d, v17.2d
114+
zip1 v13.2d, v18.2d, v19.2d
115+
zip2 v15.2d, v18.2d, v19.2d
130116

131117
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64
132118
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B00], #64
@@ -135,31 +121,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
135121
.macro COPY2x8
136122
ldr d0, [A01], #8
137123
ldr d1, [A02], #8
138-
ins v8.s[0], v0.s[0]
139-
ins v10.s[0], v0.s[1]
140-
ins v8.s[1], v1.s[0]
141-
ins v10.s[1], v1.s[1]
142-
143124
ldr d2, [A03], #8
144125
ldr d3, [A04], #8
145-
ins v8.s[2], v2.s[0]
146-
ins v10.s[2], v2.s[1]
147-
ins v8.s[3], v3.s[0]
148-
ins v10.s[3], v3.s[1]
126+
127+
zip1 v12.4s, v0.4s, v1.4s
128+
zip1 v13.4s, v2.4s, v3.4s
129+
130+
zip1 v8.2d, v12.2d, v13.2d
131+
zip2 v10.2d, v12.2d, v13.2d
149132

150133
ldr d4, [A05], #8
151134
ldr d5, [A06], #8
152-
ins v9.s[0], v4.s[0]
153-
ins v11.s[0], v4.s[1]
154-
ins v9.s[1], v5.s[0]
155-
ins v11.s[1], v5.s[1]
156-
157135
ldr d6, [A07], #8
158136
ldr d7, [A08], #8
159-
ins v9.s[2], v6.s[0]
160-
ins v11.s[2], v6.s[1]
161-
ins v9.s[3], v7.s[0]
162-
ins v11.s[3], v7.s[1]
137+
138+
zip1 v12.4s, v4.4s, v5.4s
139+
zip1 v13.4s, v6.4s, v7.4s
140+
141+
zip1 v9.2d, v12.2d, v13.2d
142+
zip2 v11.2d, v12.2d, v13.2d
163143

164144
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64
165145
.endm
@@ -191,43 +171,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
191171
.macro COPY4x4
192172
ldr q0, [A01], #16
193173
ldr q1, [A02], #16
194-
ins v8.s[0], v0.s[0]
195-
ins v9.s[0], v0.s[1]
196-
ins v10.s[0], v0.s[2]
197-
ins v11.s[0], v0.s[3]
198-
ins v8.s[1], v1.s[0]
199-
ins v9.s[1], v1.s[1]
200-
ins v10.s[1], v1.s[2]
201-
ins v11.s[1], v1.s[3]
202-
203174
ldr q2, [A03], #16
204175
ldr q3, [A04], #16
205-
ins v8.s[2], v2.s[0]
206-
ins v9.s[2], v2.s[1]
207-
ins v10.s[2], v2.s[2]
208-
ins v11.s[2], v2.s[3]
209-
ins v8.s[3], v3.s[0]
210-
ins v9.s[3], v3.s[1]
211-
ins v10.s[3], v3.s[2]
212-
ins v11.s[3], v3.s[3]
176+
177+
zip1 v12.4s, v0.4s, v1.4s
178+
zip1 v13.4s, v2.4s, v3.4s
179+
zip2 v14.4s, v0.4s, v1.4s
180+
zip2 v15.4s, v2.4s, v3.4s
181+
182+
zip1 v8.2d, v12.2d, v13.2d
183+
zip2 v9.2d, v12.2d, v13.2d
184+
zip1 v10.2d, v14.2d, v15.2d
185+
zip2 v11.2d, v14.2d, v15.2d
213186

214187
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64
215188
.endm
216189

217190
.macro COPY2x4
218191
ldr d0, [A01], #8
219192
ldr d1, [A02], #8
220-
ins v8.s[0], v0.s[0]
221-
ins v9.s[0], v0.s[1]
222-
ins v8.s[1], v1.s[0]
223-
ins v9.s[1], v1.s[1]
224-
225193
ldr d2, [A03], #8
226194
ldr d3, [A04], #8
227-
ins v8.s[2], v2.s[0]
228-
ins v9.s[2], v2.s[1]
229-
ins v8.s[3], v3.s[0]
230-
ins v9.s[3], v3.s[1]
195+
196+
zip1 v10.4s, v0.4s, v1.4s
197+
zip1 v11.4s, v2.4s, v3.4s
198+
199+
zip1 v8.2d, v10.2d, v11.2d
200+
zip2 v9.2d, v10.2d, v11.2d
231201

232202
st1 {v8.4s, v9.4s}, [B00], #32
233203
.endm
@@ -249,25 +219,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
249219
.macro COPY4x2
250220
ldr q0, [A01], #16
251221
ldr q1, [A02], #16
252-
ins v8.s[0], v0.s[0]
253-
ins v9.s[0], v0.s[1]
254-
ins v10.s[0], v0.s[2]
255-
ins v11.s[0], v0.s[3]
256-
ins v8.s[1], v1.s[0]
257-
ins v9.s[1], v1.s[1]
258-
ins v10.s[1], v1.s[2]
259-
ins v11.s[1], v1.s[3]
222+
223+
zip1 v12.4s, v0.4s, v1.4s
224+
zip2 v13.4s, v0.4s, v1.4s
225+
226+
dup v8.2d, v12.d[0]
227+
dup v9.2d, v12.d[1]
228+
dup v10.2d, v13.d[0]
229+
dup v11.2d , v13.d[1]
260230

261231
st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32
262232
.endm
263233

264234
.macro COPY2x2
265235
ldr d0, [A01], #8
266236
ldr d1, [A02], #8
267-
ins v8.s[0], v0.s[0]
268-
ins v9.s[0], v0.s[1]
269-
ins v8.s[1], v1.s[0]
270-
ins v9.s[1], v1.s[1]
237+
238+
zip1 v8.2s, v0.2s, v1.2s
239+
zip2 v9.2s, v0.2s, v1.2s
271240

272241
st1 {v8.2s, v9.2s}, [B00], #16
273242
.endm

0 commit comments

Comments
 (0)
Please sign in to comment.