Commit 49506d7
[SystemZ] Add CodeGen support for scalar f64 ops in vector registers
The z13 vector facility includes some instructions that operate only on the
high f64 in a v2f64, effectively extending the FP register set from 16 to 32
registers. It's still better to use the old instructions if the operands
happen to fit though, since the older instructions have a shorter encoding.

Based on a patch by Richard Sandiford.

llvm-svn: 236524
1 parent 80b3af7 commit 49506d7

40 files changed: +1102, -80 lines
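An illustrative sketch (not part of the commit; the function and the instruction mapping in the comments are assumptions about typical selection) of the kind of scalar f64 code this change affects: with the z13 vector facility the compiler may now keep such values in any of the 32 vector registers, falling back to the shorter FP-register encodings whenever the operands happen to sit in f0-f15.

// Hypothetical example, C++. Comments show one plausible instruction choice;
// the actual mix depends on register allocation.
#include <cmath>

double hypot_fma(double x, double y) {
  double t = std::fma(x, x, y * y); // fmul/fma -> MDBR/MADBR, or WFMDB/WFMADB
                                    // when the values live in v16-v31
  return std::sqrt(t);              // fsqrt -> SQDBR, or WFSQDB likewise
}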

llvm/lib/Target/SystemZ/SystemZ.td

+1 -1

@@ -40,8 +40,8 @@ include "SystemZOperands.td"
 include "SystemZPatterns.td"
 include "SystemZInstrFormats.td"
 include "SystemZInstrInfo.td"
-include "SystemZInstrFP.td"
 include "SystemZInstrVector.td"
+include "SystemZInstrFP.td"

 def SystemZInstrInfo : InstrInfo {}

llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp

+44

@@ -80,6 +80,27 @@ static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) {
                                  Context);
 }

+// MI loads the high part of a vector from memory. Return an instruction
+// that uses replicating vector load Opcode to do the same thing.
+static MCInst lowerSubvectorLoad(const MachineInstr *MI, unsigned Opcode) {
+  return MCInstBuilder(Opcode)
+    .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+    .addReg(MI->getOperand(1).getReg())
+    .addImm(MI->getOperand(2).getImm())
+    .addReg(MI->getOperand(3).getReg());
+}
+
+// MI stores the high part of a vector to memory. Return an instruction
+// that uses elemental vector store Opcode to do the same thing.
+static MCInst lowerSubvectorStore(const MachineInstr *MI, unsigned Opcode) {
+  return MCInstBuilder(Opcode)
+    .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+    .addReg(MI->getOperand(1).getReg())
+    .addImm(MI->getOperand(2).getImm())
+    .addReg(MI->getOperand(3).getReg())
+    .addImm(0);
+}
+
 void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   SystemZMCInstLower Lower(MF->getContext(), *this);
   MCInst LoweredMI;
@@ -158,6 +179,29 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()));
     break;

+  case SystemZ::VLR32:
+  case SystemZ::VLR64:
+    LoweredMI = MCInstBuilder(SystemZ::VLR)
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg()));
+    break;
+
+  case SystemZ::VL32:
+    LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPF);
+    break;
+
+  case SystemZ::VL64:
+    LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPG);
+    break;
+
+  case SystemZ::VST32:
+    LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEF);
+    break;
+
+  case SystemZ::VST64:
+    LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEG);
+    break;
+
   case SystemZ::LFER:
     LoweredMI = MCInstBuilder(SystemZ::VLGVF)
       .addReg(SystemZMC::getRegAsGR64(MI->getOperand(0).getReg()))
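As a rough illustration of what the helpers above emit (a sketch under assumptions, not code from the commit): a VL64 pseudo whose f64 result lives in a vector register becomes a VLREPG of the same address; the replicating load fills both doublewords, so the high half holds the value the pseudo asked for. The wrapper name, parameter names, and include paths below are hypothetical.

// Hypothetical C++ sketch mirroring lowerSubvectorLoad for the VL64 case.
#include "MCTargetDesc/SystemZMCTargetDesc.h" // assumed in-tree include path
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"

using namespace llvm;

// Build the VLREPG MCInst that a "VL64 dest, disp(index,base)" pseudo lowers to.
static MCInst buildVL64Lowering(unsigned DestFP64Reg, unsigned BaseReg,
                                int64_t Disp, unsigned IndexReg) {
  return MCInstBuilder(SystemZ::VLREPG)
      .addReg(SystemZMC::getRegAsVR128(DestFP64Reg)) // f64 reg widened to v128
      .addReg(BaseReg)
      .addImm(Disp)
      .addReg(IndexReg);
}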

llvm/lib/Target/SystemZ/SystemZISelLowering.cpp

+8 -3

@@ -91,9 +91,14 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
     addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
   else
     addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
-  addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
-  addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
-  addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
+  addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
+  if (Subtarget.hasVector()) {
+    addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
+    addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
+  } else {
+    addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
+    addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
+  }
   addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);

   if (Subtarget.hasVector()) {

llvm/lib/Target/SystemZ/SystemZInstrFP.td

+11 -3

@@ -46,9 +46,14 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
   defm LTDBR : LoadAndTestRRE<"ltdb", 0xB312, FP64>;
   defm LTXBR : LoadAndTestRRE<"ltxb", 0xB342, FP128>;
 }
-defm : CompareZeroFP<LTEBRCompare, FP32>;
-defm : CompareZeroFP<LTDBRCompare, FP64>;
-defm : CompareZeroFP<LTXBRCompare, FP128>;
+// Note that the comparison against zero operation is not available if we
+// have vector support, since load-and-test instructions will partially
+// clobber the target (vector) register.
+let Predicates = [FeatureNoVector] in {
+  defm : CompareZeroFP<LTEBRCompare, FP32>;
+  defm : CompareZeroFP<LTDBRCompare, FP64>;
+  defm : CompareZeroFP<LTXBRCompare, FP128>;
+}

 // Moves between 64-bit integer and floating-point registers.
 def LGDR : UnaryRRE<"lgd", 0xB3CD, bitconvert, GR64, FP64>;
@@ -98,6 +103,9 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
   defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>;
   defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;

+  // For z13 we prefer LDE over LE to avoid partial register dependencies.
+  def LDE32 : UnaryRXE<"lde", 0xED24, null_frag, FP32, 4>;
+
   // These instructions are split after register allocation, so we don't
   // want a custom inserter.
   let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {

llvm/lib/Target/SystemZ/SystemZInstrFormats.td

+21 -1

@@ -2151,10 +2151,13 @@ class PrefetchRILPC<string mnemonic, bits<12> opcode,

 // A floating-point load-and test operation. Create both a normal unary
 // operation and one that acts as a comparison against zero.
+// Note that the comparison against zero operation is not available if we
+// have vector support, since load-and-test instructions will partially
+// clobber the target (vector) register.
 multiclass LoadAndTestRRE<string mnemonic, bits<16> opcode,
                           RegisterOperand cls> {
   def "" : UnaryRRE<mnemonic, opcode, null_frag, cls, cls>;
-  let isCodeGenOnly = 1 in
+  let isCodeGenOnly = 1, Predicates = [FeatureNoVector] in
     def Compare : CompareRRE<mnemonic, opcode, null_frag, cls, cls>;
 }

@@ -2401,6 +2404,23 @@ class Alias<int size, dag outs, dag ins, list<dag> pattern>
 class UnaryAliasVRS<RegisterOperand cls1, RegisterOperand cls2>
   : Alias<6, (outs cls1:$src1), (ins cls2:$src2), []>;

+// An alias of a UnaryVRR*, but with different register sizes.
+class UnaryAliasVRR<SDPatternOperator operator, TypedReg tr1, TypedReg tr2>
+  : Alias<6, (outs tr1.op:$V1), (ins tr2.op:$V2),
+          [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2))))]>;
+
+// An alias of a UnaryVRX, but with different register sizes.
+class UnaryAliasVRX<SDPatternOperator operator, TypedReg tr,
+                    AddressingMode mode = bdxaddr12only>
+  : Alias<6, (outs tr.op:$V1), (ins mode:$XBD2),
+          [(set tr.op:$V1, (tr.vt (operator mode:$XBD2)))]>;
+
+// An alias of a StoreVRX, but with different register sizes.
+class StoreAliasVRX<SDPatternOperator operator, TypedReg tr,
+                    AddressingMode mode = bdxaddr12only>
+  : Alias<6, (outs), (ins tr.op:$V1, mode:$XBD2),
+          [(operator (tr.vt tr.op:$V1), mode:$XBD2)]>;
+
 // An alias of a BinaryRI, but with different register sizes.
 class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls,
                     Immediate imm>

llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp

+10

@@ -578,6 +578,10 @@ SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     Opcode = SystemZ::LDR;
   else if (SystemZ::FP128BitRegClass.contains(DestReg, SrcReg))
     Opcode = SystemZ::LXR;
+  else if (SystemZ::VR32BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::VLR32;
+  else if (SystemZ::VR64BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::VLR64;
   else if (SystemZ::VR128BitRegClass.contains(DestReg, SrcReg))
     Opcode = SystemZ::VLR;
   else
@@ -1118,6 +1122,12 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
   } else if (RC == &SystemZ::FP128BitRegClass) {
     LoadOpcode = SystemZ::LX;
     StoreOpcode = SystemZ::STX;
+  } else if (RC == &SystemZ::VR32BitRegClass) {
+    LoadOpcode = SystemZ::VL32;
+    StoreOpcode = SystemZ::VST32;
+  } else if (RC == &SystemZ::VR64BitRegClass) {
+    LoadOpcode = SystemZ::VL64;
+    StoreOpcode = SystemZ::VST64;
   } else if (RC == &SystemZ::VF128BitRegClass ||
              RC == &SystemZ::VR128BitRegClass) {
     LoadOpcode = SystemZ::VL;

llvm/lib/Target/SystemZ/SystemZInstrVector.td

+30 -12

@@ -14,6 +14,8 @@
 let Predicates = [FeatureVector] in {
   // Register move.
   def VLR : UnaryVRRa<"vlr", 0xE756, null_frag, v128any, v128any>;
+  def VLR32 : UnaryAliasVRR<null_frag, v32eb, v32eb>;
+  def VLR64 : UnaryAliasVRR<null_frag, v64db, v64db>;

   // Load GR from VR element.
   def VLGVB : BinaryVRSc<"vlgvb", 0xE721, null_frag, v128b, 0>;
@@ -123,6 +125,13 @@ let Predicates = [FeatureVector] in {
   def : Pat<(v2f64 (z_replicate_loadf64 bdxaddr12only:$addr)),
             (VLREPG bdxaddr12only:$addr)>;

+  // Use VLREP to load subvectors. These patterns use "12pair" because
+  // LEY and LDY offer full 20-bit displacement fields. It's often better
+  // to use those instructions rather than force a 20-bit displacement
+  // into a GPR temporary.
+  def VL32 : UnaryAliasVRX<load, v32eb, bdxaddr12pair>;
+  def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
+
   // Load logical element and zero.
   def VLLEZB : UnaryVRX<"vllezb", 0xE704, z_vllezi8, v128b, 1, 0>;
   def VLLEZH : UnaryVRX<"vllezh", 0xE704, z_vllezi16, v128h, 2, 1>;
@@ -193,6 +202,13 @@ let Predicates = [FeatureVector] in {
                          imm32zx1:$index),
             (VSTEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>;

+  // Use VSTE to store subvectors. These patterns use "12pair" because
+  // STEY and STDY offer full 20-bit displacement fields. It's often better
+  // to use those instructions rather than force a 20-bit displacement
+  // into a GPR temporary.
+  def VST32 : StoreAliasVRX<store, v32eb, bdxaddr12pair>;
+  def VST64 : StoreAliasVRX<store, v64db, bdxaddr12pair>;
+
   // Scatter element.
   def VSCEF : StoreBinaryVRV<"vscef", 0xE71B, 4, imm32zx2>;
   def VSCEG : StoreBinaryVRV<"vsceg", 0xE71A, 8, imm32zx1>;
@@ -778,7 +794,7 @@ multiclass VectorRounding<Instruction insn, TypedReg tr> {
 let Predicates = [FeatureVector] in {
   // Add.
   def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>;
-  def WFADB : BinaryVRRc<"wfadb", 0xE7E3, null_frag, v64db, v64db, 3, 8>;
+  def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>;

   // Convert from fixed 64-bit.
   def VCDGB : TernaryVRRa<"vcdgb", 0xE7C3, null_frag, v128db, v128g, 3, 0>;
@@ -804,53 +820,55 @@ let Predicates = [FeatureVector] in {

   // Divide.
   def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>;
-  def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, null_frag, v64db, v64db, 3, 8>;
+  def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>;

   // Load FP integer.
   def VFIDB : TernaryVRRa<"vfidb", 0xE7C7, null_frag, v128db, v128db, 3, 0>;
   def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>;
   defm : VectorRounding<VFIDB, v128db>;
+  defm : VectorRounding<WFIDB, v64db>;

   // Load lengthened.
   def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>;
-  def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, null_frag, v64db, v32eb, 2, 8>;
+  def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fextend, v64db, v32eb, 2, 8>;

   // Load rounded,
   def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>;
   def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>;
   def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>;
+  def : FPConversion<WLEDB, fround, v32eb, v64db, 0, 0>;

   // Multiply.
   def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>;
-  def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, null_frag, v64db, v64db, 3, 8>;
+  def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>;

   // Multiply and add.
   def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>;
-  def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, null_frag, v64db, v64db, 8, 3>;
+  def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>;

   // Multiply and subtract.
   def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>;
-  def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, null_frag, v64db, v64db, 8, 3>;
+  def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>;

   // Load complement,
   def VFLCDB : UnaryVRRa<"vflcdb", 0xE7CC, fneg, v128db, v128db, 3, 0, 0>;
-  def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 0>;
+  def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, fneg, v64db, v64db, 3, 8, 0>;

   // Load negative.
   def VFLNDB : UnaryVRRa<"vflndb", 0xE7CC, fnabs, v128db, v128db, 3, 0, 1>;
-  def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 1>;
+  def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, fnabs, v64db, v64db, 3, 8, 1>;

   // Load positive.
   def VFLPDB : UnaryVRRa<"vflpdb", 0xE7CC, fabs, v128db, v128db, 3, 0, 2>;
-  def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 2>;
+  def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, fabs, v64db, v64db, 3, 8, 2>;

   // Square root.
   def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>;
-  def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, null_frag, v64db, v64db, 3, 8>;
+  def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>;

   // Subtract.
   def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>;
-  def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, null_frag, v64db, v64db, 3, 8>;
+  def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>;

   // Test data class immediate.
   let Defs = [CC] in {
@@ -866,7 +884,7 @@ let Predicates = [FeatureVector] in {
 let Predicates = [FeatureVector] in {
   // Compare scalar.
   let Defs = [CC] in
-    def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, null_frag, v64db, 3>;
+    def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>;

   // Compare and signal scalar.
   let Defs = [CC] in
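Replacing null_frag with the scalar selection operators (fadd, fsub, fmul, fdiv, fma, fms, fabs, fneg, fsqrt, fextend, fround, z_fcmp) in the W* definitions above is what lets ordinary scalar f64 code be selected onto the high-element vector forms. A hedged example of code that exercises the newly wired patterns (illustrative only; which encoding is chosen depends on where the register allocator places the values):

// Hypothetical example, C++.
#include <cmath>

double clamp_magnitude(double x, double limit) {
  double m = std::fabs(x);            // fabs -> LPDBR, or WFLPDB in v16-v31
  if (m > limit)                      // fcmp -> CDBR, or WFCDB in v16-v31
    return x < 0.0 ? -limit : limit;  // fneg -> LCDBR, or WFLCDB in v16-v31
  return x;
}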
