Skip to content

Commit cd2a1b5

Browse files
committed
[SystemZ] Handle sub-128 vectors
The ABI allows sub-128 vectors to be passed and returned in registers, with the vector occupying the upper part of a register. We therefore want to legalize those types by widening the vector rather than promoting the elements. The patch includes some simple tests for sub-128 vectors and also tests that we can recognize various pack sequences, some of which use sub-128 vectors as temporary results. One of these forms is based on the pack sequences generated by llvmpipe when no intrinsics are used. Signed unpacks are recognized as BUILD_VECTORs whose elements are individually sign-extended. Unsigned unpacks can have the equivalent form with zero extension, but they also occur as shuffles in which some elements are zero. Based on a patch by Richard Sandiford. llvm-svn: 236525
1 parent 49506d7 commit cd2a1b5

20 files changed

+1175
-29
lines changed

llvm/lib/Target/SystemZ/SystemZCallingConv.h

+17
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,14 @@ class SystemZCCState : public CCState {
2828
/// See ISD::OutputArg::IsFixed.
2929
SmallVector<bool, 4> ArgIsFixed;
3030

31+
/// Records whether the value was widened from a short vector type.
32+
SmallVector<bool, 4> ArgIsShortVector;
33+
34+
// Check whether ArgVT is a short vector type.
35+
bool IsShortVectorType(EVT ArgVT) {
36+
return ArgVT.isVector() && ArgVT.getStoreSize() <= 8;
37+
}
38+
3139
public:
3240
SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
3341
SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
@@ -39,6 +47,10 @@ class SystemZCCState : public CCState {
3947
ArgIsFixed.clear();
4048
for (unsigned i = 0; i < Ins.size(); ++i)
4149
ArgIsFixed.push_back(true);
50+
// Record whether the formal argument was a short vector.
51+
ArgIsShortVector.clear();
52+
for (unsigned i = 0; i < Ins.size(); ++i)
53+
ArgIsShortVector.push_back(IsShortVectorType(Ins[i].ArgVT));
4254

4355
CCState::AnalyzeFormalArguments(Ins, Fn);
4456
}
@@ -49,6 +61,10 @@ class SystemZCCState : public CCState {
4961
ArgIsFixed.clear();
5062
for (unsigned i = 0; i < Outs.size(); ++i)
5163
ArgIsFixed.push_back(Outs[i].IsFixed);
64+
// Record whether the call operand was a short vector.
65+
ArgIsShortVector.clear();
66+
for (unsigned i = 0; i < Outs.size(); ++i)
67+
ArgIsShortVector.push_back(IsShortVectorType(Outs[i].ArgVT));
5268

5369
CCState::AnalyzeCallOperands(Outs, Fn);
5470
}
@@ -60,6 +76,7 @@ class SystemZCCState : public CCState {
6076
CCAssignFn Fn) = delete;
6177

6278
/// Return the IsFixed flag recorded for value number ValNo by the most
/// recent argument analysis.
bool IsFixed(unsigned ValNo) {
  const bool Fixed = ArgIsFixed[ValNo];
  return Fixed;
}
79+
/// Return the short-vector flag recorded for value number ValNo by the most
/// recent argument analysis.
bool IsShortVector(unsigned ValNo) {
  const bool WasShortVector = ArgIsShortVector[ValNo];
  return WasShortVector;
}
6380
};
6481

6582
} // end namespace llvm

llvm/lib/Target/SystemZ/SystemZCallingConv.td

+16-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@ class CCIfSubtarget<string F, CCAction A>
2121
class CCIfFixed<CCAction A>
2222
: CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>;
2323

24+
// Match if this specific argument was widened from a short vector type.
// The predicate is evaluated by calling SystemZCCState::IsShortVector on the
// current State with the argument's ValNo; action A is applied only when it
// returns true.
25+
class CCIfShortVector<CCAction A>
26+
: CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>;
27+
28+
2429
//===----------------------------------------------------------------------===//
2530
// z/Linux return value calling convention
2631
//===----------------------------------------------------------------------===//
@@ -43,6 +48,8 @@ def RetCC_SystemZ : CallingConv<[
4348
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
4449

4550
// Similarly for vectors, with V24 being the ABI-compliant choice.
51+
// Sub-128 vectors are returned in the same way, but they're widened
52+
// to one of these types during type legalization.
4653
CCIfSubtarget<"hasVector()",
4754
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
4855
CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
@@ -74,12 +81,20 @@ def CC_SystemZ : CallingConv<[
7481
CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
7582
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
7683

77-
// The first 8 named vector arguments are passed in V24-V31.
84+
// The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
85+
// are passed in the same way, but they're widened to one of these types
86+
// during type legalization.
7887
CCIfSubtarget<"hasVector()",
7988
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
8089
CCIfFixed<CCAssignToReg<[V24, V26, V28, V30,
8190
V25, V27, V29, V31]>>>>,
8291

92+
// However, sub-128 vectors which need to go on the stack occupy just a
93+
// single 8-byte-aligned 8-byte stack slot. Pass as i64.
94+
CCIfSubtarget<"hasVector()",
95+
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
96+
CCIfShortVector<CCBitConvertToType<i64>>>>,
97+
8398
// Other vector arguments are passed in 8-byte-aligned 16-byte stack slots.
8499
CCIfSubtarget<"hasVector()",
85100
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],

llvm/lib/Target/SystemZ/SystemZISelLowering.cpp

+72-9
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
318318
// Convert a GPR scalar to a vector by inserting it into element 0.
319319
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
320320

321+
// Use a series of unpacks for extensions.
322+
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
323+
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
324+
321325
// Detect shifts by a scalar amount and convert them into
322326
// V*_BY_SCALAR.
323327
setOperationAction(ISD::SHL, VT, Custom);
@@ -793,7 +797,15 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL,
793797
else if (VA.getLocInfo() == CCValAssign::Indirect)
794798
Value = DAG.getLoad(VA.getValVT(), DL, Chain, Value,
795799
MachinePointerInfo(), false, false, false, 0);
796-
else
800+
else if (VA.getLocInfo() == CCValAssign::BCvt) {
801+
// If this is a short vector argument loaded from the stack,
802+
// extend from i64 to full vector size and then bitcast.
803+
assert(VA.getLocVT() == MVT::i64);
804+
assert(VA.getValVT().isVector());
805+
Value = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i64,
806+
Value, DAG.getUNDEF(MVT::i64));
807+
Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value);
808+
} else
797809
assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo");
798810
return Value;
799811
}
@@ -810,6 +822,14 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL,
810822
return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
811823
case CCValAssign::AExt:
812824
return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
825+
case CCValAssign::BCvt:
826+
// If this is a short vector argument to be stored to the stack,
827+
// bitcast to v2i64 and then extract first element.
828+
assert(VA.getLocVT() == MVT::i64);
829+
assert(VA.getValVT().isVector());
830+
Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
831+
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
832+
DAG.getConstant(0, DL, MVT::i32));
813833
case CCValAssign::Full:
814834
return Value;
815835
default:
@@ -3910,6 +3930,23 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
39103930
return DAG.getNode(ISD::BITCAST, DL, VT, Res);
39113931
}
39123932

3933+
// Lower the in-register vector extension Op to a chain of unpack nodes.
//
// UnpackHigh is the SystemZISD opcode used for every step of the chain
// (UNPACK_HIGH for sign extension, UNPACKL_HIGH for zero extension).  Each
// step unpacks the first half of its operand into double-sized elements,
// so the step is repeated, doubling the element width each time, until the
// elements are as wide as those of Op's result type.
//
// Returns the value produced by the last unpack in the chain.
SDValue
SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
                                              unsigned UnpackHigh) const {
  SDValue PackedOp = Op.getOperand(0);
  EVT OutVT = Op.getValueType();
  EVT InVT = PackedOp.getValueType();
  unsigned ToBits = OutVT.getVectorElementType().getSizeInBits();
  unsigned FromBits = InVT.getVectorElementType().getSizeInBits();
  // The loop below only makes sense for a genuine widening; without this
  // precondition FromBits would keep doubling and never reach ToBits.
  assert(FromBits < ToBits && "Expected an extension to wider elements");
  do {
    FromBits *= 2;
    // Type of this intermediate step: a full-width vector of FromBits-wide
    // integers.  Named separately so that it does not shadow OutVT.
    EVT StepVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
                                  SystemZ::VectorBits / FromBits);
    PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), StepVT, PackedOp);
  } while (FromBits < ToBits);
  return PackedOp;
}
3949+
39133950
SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
39143951
unsigned ByScalar) const {
39153952
// Look for cases where a vector shift can use the *_BY_SCALAR form.
@@ -4058,6 +4095,10 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
40584095
return lowerINSERT_VECTOR_ELT(Op, DAG);
40594096
case ISD::EXTRACT_VECTOR_ELT:
40604097
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
4098+
case ISD::SIGN_EXTEND_VECTOR_INREG:
4099+
return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
4100+
case ISD::ZERO_EXTEND_VECTOR_INREG:
4101+
return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
40614102
case ISD::SHL:
40624103
return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
40634104
case ISD::SRL:
@@ -4122,6 +4163,10 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
41224163
OPCODE(PERMUTE_DWORDS);
41234164
OPCODE(PERMUTE);
41244165
OPCODE(PACK);
4166+
OPCODE(UNPACK_HIGH);
4167+
OPCODE(UNPACKL_HIGH);
4168+
OPCODE(UNPACK_LOW);
4169+
OPCODE(UNPACKL_LOW);
41254170
OPCODE(VSHL_BY_SCALAR);
41264171
OPCODE(VSRL_BY_SCALAR);
41274172
OPCODE(VSRA_BY_SCALAR);
@@ -4334,17 +4379,35 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
43344379
}
43354380
}
43364381
}
4337-
// (z_merge_high 0, 0) -> 0. This is mostly useful for using VLLEZF
4338-
// for v4f32.
4339-
if (Opcode == SystemZISD::MERGE_HIGH) {
4382+
if (Opcode == SystemZISD::MERGE_HIGH ||
4383+
Opcode == SystemZISD::MERGE_LOW) {
43404384
SDValue Op0 = N->getOperand(0);
43414385
SDValue Op1 = N->getOperand(1);
4342-
if (Op0 == Op1) {
4343-
if (Op0.getOpcode() == ISD::BITCAST)
4344-
Op0 = Op0.getOperand(0);
4345-
if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
4346-
cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0)
4386+
if (Op0.getOpcode() == ISD::BITCAST)
4387+
Op0 = Op0.getOperand(0);
4388+
if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
4389+
cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) {
4390+
// (z_merge_* 0, 0) -> 0. This is mostly useful for using VLLEZF
4391+
// for v4f32.
4392+
if (Op1 == N->getOperand(0))
43474393
return Op1;
4394+
// (z_merge_? 0, X) -> (z_unpackl_? X).
4395+
EVT VT = Op1.getValueType();
4396+
unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
4397+
if (ElemBytes <= 4) {
4398+
Opcode = (Opcode == SystemZISD::MERGE_HIGH ?
4399+
SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW);
4400+
EVT InVT = VT.changeVectorElementTypeToInteger();
4401+
EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
4402+
SystemZ::VectorBytes / ElemBytes / 2);
4403+
if (VT != InVT) {
4404+
Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1);
4405+
DCI.AddToWorklist(Op1.getNode());
4406+
}
4407+
SDValue Op = DAG.getNode(Opcode, SDLoc(N), OutVT, Op1);
4408+
DCI.AddToWorklist(Op.getNode());
4409+
return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
4410+
}
43484411
}
43494412
}
43504413
// If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better

llvm/lib/Target/SystemZ/SystemZISelLowering.h

+28
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,15 @@ enum {
201201
// Pack vector operands 0 and 1 into a single vector with half-sized elements.
202202
PACK,
203203

204+
// Unpack the first half of vector operand 0 into double-sized elements.
205+
// UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends.
206+
UNPACK_HIGH,
207+
UNPACKL_HIGH,
208+
209+
// Likewise for the second half.
210+
UNPACK_LOW,
211+
UNPACKL_LOW,
212+
204213
// Shift each element of vector operand 0 by the number of bits specified
205214
// by scalar operand 1.
206215
VSHL_BY_SCALAR,
@@ -306,6 +315,23 @@ class SystemZTargetLowering : public TargetLowering {
306315
// want to clobber the upper 32 bits of a GPR unnecessarily.
307316
return MVT::i32;
308317
}
318+
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
    const override {
  // Vectors whose elements are a whole number of bytes wide are widened to
  // the full vector width instead of having their integer elements
  // promoted.  Widening is preferred for three reasons:
  //
  //   (a) the ABI for passing and returning sub-128 vectors can then be
  //       handled without treating those vectors as legal types;
  //
  //   (b) there are no extend-on-load or truncate-on-store instructions,
  //       which makes integer promotion less efficient;
  //
  //   (c) the widest integer type (v2i64) has no multiplication
  //       instructions.
  //
  // Every other vector type gets the target-independent default.
  const unsigned ElemBits = VT.getVectorElementType().getSizeInBits();
  const bool HasByteSizedElements = (ElemBits % 8) == 0;
  return HasByteSizedElements
             ? TypeWidenVector
             : TargetLoweringBase::getPreferredVectorAction(VT);
}
309335
EVT getSetCCResultType(LLVMContext &, EVT) const override;
310336
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
311337
bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
@@ -417,6 +443,8 @@ class SystemZTargetLowering : public TargetLowering {
417443
SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
418444
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
419445
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
446+
SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
447+
unsigned UnpackHigh) const;
420448
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
421449

422450
SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp,

llvm/lib/Target/SystemZ/SystemZInstrVector.td

+12-12
Original file line numberDiff line numberDiff line change
@@ -290,24 +290,24 @@ let Predicates = [FeatureVector] in {
290290
def : Pat<(z_vsei32_by_parts (v4i32 VR128:$src)), (VSEGF VR128:$src)>;
291291

292292
// Unpack high.
293-
def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, null_frag, v128h, v128b, 0>;
294-
def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, null_frag, v128f, v128h, 1>;
295-
def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, null_frag, v128g, v128f, 2>;
293+
def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, z_unpack_high, v128h, v128b, 0>;
294+
def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, z_unpack_high, v128f, v128h, 1>;
295+
def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, z_unpack_high, v128g, v128f, 2>;
296296

297297
// Unpack logical high.
298-
def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, null_frag, v128h, v128b, 0>;
299-
def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, null_frag, v128f, v128h, 1>;
300-
def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, null_frag, v128g, v128f, 2>;
298+
def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, z_unpackl_high, v128h, v128b, 0>;
299+
def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, z_unpackl_high, v128f, v128h, 1>;
300+
def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, z_unpackl_high, v128g, v128f, 2>;
301301

302302
// Unpack low.
303-
def VUPLB : UnaryVRRa<"vuplb", 0xE7D6, null_frag, v128h, v128b, 0>;
304-
def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, null_frag, v128f, v128h, 1>;
305-
def VUPLF : UnaryVRRa<"vuplf", 0xE7D6, null_frag, v128g, v128f, 2>;
303+
def VUPLB : UnaryVRRa<"vuplb", 0xE7D6, z_unpack_low, v128h, v128b, 0>;
304+
def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, z_unpack_low, v128f, v128h, 1>;
305+
def VUPLF : UnaryVRRa<"vuplf", 0xE7D6, z_unpack_low, v128g, v128f, 2>;
306306

307307
// Unpack logical low.
308-
def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, null_frag, v128h, v128b, 0>;
309-
def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, null_frag, v128f, v128h, 1>;
310-
def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, null_frag, v128g, v128f, 2>;
308+
def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, z_unpackl_low, v128h, v128b, 0>;
309+
def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, z_unpackl_low, v128f, v128h, 1>;
310+
def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, z_unpackl_low, v128g, v128f, 2>;
311311
}
312312

313313
//===----------------------------------------------------------------------===//

llvm/lib/Target/SystemZ/SystemZOperators.td

+10-5
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,10 @@ def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS",
193193
SDT_ZVecTernaryInt>;
194194
def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>;
195195
def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>;
196+
def z_unpack_high : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnaryConv>;
197+
def z_unpackl_high : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnaryConv>;
198+
def z_unpack_low : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnaryConv>;
199+
def z_unpackl_low : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnaryConv>;
196200
def z_vshl_by_scalar : SDNode<"SystemZISD::VSHL_BY_SCALAR",
197201
SDT_ZVecBinaryInt>;
198202
def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR",
@@ -544,11 +548,12 @@ def z_vllezi64 : PatFrag<(ops node:$addr),
544548
def z_vllezf32 : PatFrag<(ops node:$addr),
545549
(bitconvert
546550
(z_merge_high
547-
(v2i64 (bitconvert
548-
(z_merge_high
549-
(v4f32 (z_vzero)),
550-
(v4f32 (scalar_to_vector
551-
(f32 (load node:$addr))))))),
551+
(v2i64
552+
(z_unpackl_high
553+
(v4i32
554+
(bitconvert
555+
(v4f32 (scalar_to_vector
556+
(f32 (load node:$addr)))))))),
552557
(v2i64 (z_vzero))))>;
553558
def z_vllezf64 : PatFrag<(ops node:$addr),
554559
(z_merge_high

llvm/test/CodeGen/SystemZ/vec-args-03.ll

+14
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,17 @@ define <4 x i32> @foo(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4
1414
%y = sub <4 x i32> %v2, %v10
1515
ret <4 x i32> %y
1616
}
17+
18+
; This routine has 10 vector arguments, which fill up %v24-%v31 and
19+
; the two single-wide stack slots at 160 and 168.
; The subtraction below uses the second argument, expected in %v26, and the
; tenth argument, expected to be loaded from the slot at 168(%r15).
20+
define <4 x i8> @bar(<4 x i8> %v1, <4 x i8> %v2, <4 x i8> %v3, <4 x i8> %v4,
21+
<4 x i8> %v5, <4 x i8> %v6, <4 x i8> %v7, <4 x i8> %v8,
22+
<4 x i8> %v9, <4 x i8> %v10) {
23+
; CHECK-LABEL: bar:
24+
; CHECK: vlrepg [[REG1:%v[0-9]+]], 168(%r15)
25+
; CHECK: vsb %v24, %v26, [[REG1]]
26+
; CHECK: br %r14
27+
%y = sub <4 x i8> %v2, %v10
28+
ret <4 x i8> %y
29+
}
30+

0 commit comments

Comments
 (0)