Skip to content

Commit 9276c90

Browse files
committed
Optimized sgemm and dgemm and tested again.
1 parent 391584a commit 9276c90

13 files changed

+1293
-4
lines changed

Diff for: Makefile.rule

+3
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ VERSION = 0.2.19.dev
5252
# USE_THREAD = 0
5353

5454
# If you're going to use this library with OpenMP, please uncomment the USE_OPENMP line below.
55+
# Always enable the USE_OPENMP flag below when building for POWER8.
5556
# USE_OPENMP = 1
5657

5758
# You can define maximum number of threads. Basically it should be
@@ -153,10 +154,12 @@ NO_AFFINITY = 1
153154

154155
# Common Optimization Flag;
155156
# The default -O2 is enough.
157+
# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
156158
# COMMON_OPT = -O2
157159

158160
# gfortran option for LAPACK
159161
# enable this flag only on 64bit Linux and if you need a thread safe lapack library
162+
# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
160163
# FCOMMON_OPT = -frecursive
161164

162165
# Profiling flags

Diff for: common_power.h

+5
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,13 @@
3939
#ifndef COMMON_POWER
4040
#define COMMON_POWER
4141

42+
/*
 * Memory-barrier macros MB and WMB.
 *
 * When POWER8 is defined, both macros expand to an "eieio" instruction whose
 * inline-asm statement lists "memory" as a clobber, so the compiler also
 * treats the statement as touching memory. Otherwise both macros expand to a
 * "sync" instruction.
 *
 * NOTE(review): the "sync" variants carry no "memory" clobber, unlike the
 * POWER8 variants; confirm whether that asymmetry is intended.
 * NOTE(review): "eieio" orders fewer kinds of accesses than "sync"; confirm
 * that every user of MB only needs the ordering "eieio" provides.
 */
#if defined(POWER8)
43+
#define MB __asm__ __volatile__ ("eieio":::"memory")
44+
#define WMB __asm__ __volatile__ ("eieio":::"memory")
45+
#else
4246
#define MB __asm__ __volatile__ ("sync")
4347
#define WMB __asm__ __volatile__ ("sync")
48+
#endif
4449

4550
#define INLINE inline
4651

Diff for: kernel/power/KERNEL.POWER8

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o
2020

2121
DGEMMKERNEL = dgemm_kernel_16x4_power8.S
2222
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
23-
DGEMMITCOPY = ../generic/gemm_tcopy_16.c
23+
DGEMMITCOPY = dgemm_tcopy_16_power8.S
2424
DGEMMONCOPY = gemm_ncopy_4.S
2525
DGEMMOTCOPY = gemm_tcopy_4.S
2626
DGEMMINCOPYOBJ = dgemm_incopy.o

Diff for: kernel/power/dgemm_kernel_16x4_power8.S

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626
*****************************************************************************/
2727

2828
/**************************************************************************************
29-
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
29+
* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
3030
* BLASTEST : OK
3131
* CTEST : OK
3232
* TEST : OK

Diff for: kernel/power/dgemm_tcopy_16_power8.S

+211
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
/***************************************************************************
2+
Copyright (c) 2013-2016, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
/**************************************************************************************
29+
* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
30+
* BLASTEST : OK
31+
* CTEST : OK
32+
* TEST : OK
33+
* LAPACK-TEST : OK
34+
**************************************************************************************/
35+
36+
/*********************************************************************/
37+
/* Copyright 2009, 2010 The University of Texas at Austin. */
38+
/* All rights reserved. */
39+
/* */
40+
/* Redistribution and use in source and binary forms, with or */
41+
/* without modification, are permitted provided that the following */
42+
/* conditions are met: */
43+
/* */
44+
/* 1. Redistributions of source code must retain the above */
45+
/* copyright notice, this list of conditions and the following */
46+
/* disclaimer. */
47+
/* */
48+
/* 2. Redistributions in binary form must reproduce the above */
49+
/* copyright notice, this list of conditions and the following */
50+
/* disclaimer in the documentation and/or other materials */
51+
/* provided with the distribution. */
52+
/* */
53+
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
54+
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
55+
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
56+
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
57+
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
58+
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
59+
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
60+
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
61+
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
62+
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
63+
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
64+
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
65+
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
66+
/* POSSIBILITY OF SUCH DAMAGE. */
67+
/* */
68+
/* The views and conclusions contained in the software and */
69+
/* documentation are those of the authors and should not be */
70+
/* interpreted as representing official policies, either expressed */
71+
/* or implied, of The University of Texas at Austin. */
72+
/*********************************************************************/
73+
74+
#define ASSEMBLER
75+
#include "common.h"
76+
#include "def_vsx.h"
77+
78+
/*
 * Symbolic register names used by this routine and by the files it includes.
 *
 * M, N, A, LDA and B live in r3-r7. In the routine below M and N are tested
 * for <= 0, LDA is scaled by BASE_SHIFT and B is used as a base address.
 * NOTE(review): presumably these are the routine's incoming arguments
 * (row count, column count, source matrix, leading dimension, destination
 * buffer) - confirm against the caller.
 */
#define M r3
79+
#define N r4
80+
#define A r5
81+
#define LDA r6
82+
#define B r7
83+
84+
/*
 * A0-A3 and J are not referenced in the code visible in this file;
 * presumably they are used by the included macro/logic files (TODO confirm).
 */
#define A0 r8
85+
#define A1 r9
86+
#define A2 r10
87+
#define A3 r11
88+
89+
#define J r12
90+
91+
/*
 * Aliases for r14-r26 and r29-r31. The routine saves r14-r31 on the stack
 * in its prologue and reloads them before returning.
 *
 * Roles that the visible routine establishes:
 *   PREA, PREB     - first used as temporaries for the masks -4 and -2, then
 *                    set to 768 and M16 + 128
 *   B8, B4, B2, B1 - B + (((N & mask) * M) << BASE_SHIFT) with mask
 *                    -16, -8, -4 and -2 respectively
 *   o8, o16, o32, o48 - hold the constants 8, 16, 32 and 48
 *   T1, T2         - temporaries holding the masks -16 and -8
 *   M16            - M << (4 + BASE_SHIFT)
 * BO, I and B16 are not referenced in the visible code; presumably they are
 * used by the included files (TODO confirm).
 */
#define PREA r14
92+
#define PREB r15
93+
#define BO r16
94+
#define B8 r17
95+
#define B4 r18
96+
#define B2 r19
97+
#define B1 r20
98+
#define o8 r21
99+
#define T2 r22
100+
#define I r23
101+
#define o16 r24
102+
#define o32 r25
103+
#define o48 r26
104+
#define B16 r29
105+
#define M16 r30
106+
#define T1 r31
107+
108+
/* o0 is the literal 0, not a register alias. */
#define o0 0
109+
110+
/* Included file is not visible here; presumably it provides the copy macros used by the logic file (TODO confirm). */
#include "dgemm_tcopy_macros_16_power8.S"
111+
112+
/* Number of bytes the routine subtracts from SP on entry and adds back on exit. */
#define STACKSIZE 384
113+
114+
115+
/*
 * Routine body of dgemm_tcopy_16_power8.S.
 *
 * Visible steps:
 *   1. Reserve STACKSIZE bytes of stack and save r14-r31.
 *   2. Branch to L999 if M <= 0 or N <= 0.
 *   3. Scale LDA by BASE_SHIFT and compute M16 = M << (4 + BASE_SHIFT).
 *   4. Compute the pointers B8, B4, B2 and B1 as
 *      B + (((N & mask) * M) << BASE_SHIFT), mask = -16, -8, -4, -2.
 *   5. Load PREA, PREB and the constant-offset registers o8..o48.
 *   6. Run the code from dgemm_tcopy_logic_16_power8.S (not visible here).
 *   7. At L999: set r3 = 0, restore r14-r31, release the stack and return.
 *
 * PROLOGUE, PROFCODE, EPILOGUE, SP and BASE_SHIFT are defined outside this
 * file.
 *
 * NOTE(review): cmpwi, slwi and mullw are 32-bit word operations. If M, N or
 * LDA can exceed the 32-bit range on this target, the doubleword forms would
 * be needed - confirm the expected operand range.
 */
PROLOGUE
116+
PROFCODE
117+
118+
/* Reserve STACKSIZE bytes below the incoming stack pointer. */
addi SP, SP, -STACKSIZE
119+
/* r0 = 0; not read in the code visible here (presumably used by the included logic file - TODO confirm). */
li r0, 0
120+
121+
/* Save r31 down to r14 at SP+144 .. SP+280; the same slots are reloaded at L999. */
std r31, 144(SP)
122+
std r30, 152(SP)
123+
std r29, 160(SP)
124+
std r28, 168(SP)
125+
std r27, 176(SP)
126+
std r26, 184(SP)
127+
std r25, 192(SP)
128+
std r24, 200(SP)
129+
std r23, 208(SP)
130+
std r22, 216(SP)
131+
std r21, 224(SP)
132+
std r20, 232(SP)
133+
std r19, 240(SP)
134+
std r18, 248(SP)
135+
std r17, 256(SP)
136+
std r16, 264(SP)
137+
std r15, 272(SP)
138+
std r14, 280(SP)
139+
140+
/* Nothing to do when M <= 0 or N <= 0: go straight to the exit path. */
cmpwi cr0, M, 0
141+
ble- L999
142+
cmpwi cr0, N, 0
143+
ble- L999
144+
145+
/* LDA <<= BASE_SHIFT (presumably converts the leading dimension to bytes - TODO confirm). */
slwi LDA, LDA, BASE_SHIFT
146+
/* M16 = M << (4 + BASE_SHIFT), i.e. M * 16 scaled by BASE_SHIFT. */
slwi M16, M, 4 + BASE_SHIFT
147+
148+
/* Masks that clear the low 4, 3, 2 and 1 bits; PREA and PREB serve as temporaries here. */
li T1, -16
149+
li T2, -8
150+
li PREA, -4
151+
li PREB, -2
152+
153+
/* Round N down to a multiple of 16, 8, 4 and 2 respectively. */
and B8, N, T1
154+
and B4, N, T2
155+
and B2, N, PREA
156+
and B1, N, PREB
157+
158+
/* Multiply each rounded-down N by M. */
mullw B8, B8, M
159+
mullw B4, B4, M
160+
mullw B2, B2, M
161+
mullw B1, B1, M
162+
163+
/* Scale each product by BASE_SHIFT. */
slwi B8, B8, BASE_SHIFT
164+
slwi B4, B4, BASE_SHIFT
165+
slwi B2, B2, BASE_SHIFT
166+
slwi B1, B1, BASE_SHIFT
167+
168+
/* Add the base address B, turning each offset into a pointer. */
add B8, B8, B
169+
add B4, B4, B
170+
add B2, B2, B
171+
add B1, B1, B
172+
173+
/* PREA = 768 and PREB = M16 + 128 (presumably prefetch distances used by the included code - TODO confirm). */
li PREA, 768
174+
addi PREB, M16, 128
175+
176+
/* Constants 8, 16, 32 and 48 kept in registers (presumably index offsets for the included code - TODO confirm). */
li o8, 8
177+
li o16, 16
178+
li o32, 32
179+
li o48, 48
180+
181+
/* The rest of the routine's work comes from this included file (not visible here). */
#include "dgemm_tcopy_logic_16_power8.S"
182+
183+
/* Common exit path, also reached by the early-out branches above. */
L999:
184+
185+
/* Put 0 in r3 before returning. */
li r3, 0
186+
187+
/* Restore r31 down to r14 from the slots written in the prologue. */
ld r31, 144(SP)
188+
ld r30, 152(SP)
189+
ld r29, 160(SP)
190+
ld r28, 168(SP)
191+
ld r27, 176(SP)
192+
ld r26, 184(SP)
193+
ld r25, 192(SP)
194+
ld r24, 200(SP)
195+
ld r23, 208(SP)
196+
ld r22, 216(SP)
197+
ld r21, 224(SP)
198+
ld r20, 232(SP)
199+
ld r19, 240(SP)
200+
ld r18, 248(SP)
201+
ld r17, 256(SP)
202+
ld r16, 264(SP)
203+
ld r15, 272(SP)
204+
ld r14, 280(SP)
205+
206+
/* Release the STACKSIZE bytes reserved on entry. */
addi SP, SP, STACKSIZE
207+
208+
blr
209+
EPILOGUE
210+
211+

0 commit comments

Comments
 (0)