@@ -318,6 +318,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
318
318
// Convert a GPR scalar to a vector by inserting it into element 0.
319
319
setOperationAction (ISD::SCALAR_TO_VECTOR, VT, Custom);
320
320
321
+ // Use a series of unpacks for extensions.
322
+ setOperationAction (ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
323
+ setOperationAction (ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
324
+
321
325
// Detect shifts by a scalar amount and convert them into
322
326
// V*_BY_SCALAR.
323
327
setOperationAction (ISD::SHL, VT, Custom);
@@ -793,7 +797,15 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL,
793
797
else if (VA.getLocInfo () == CCValAssign::Indirect)
794
798
Value = DAG.getLoad (VA.getValVT (), DL, Chain, Value,
795
799
MachinePointerInfo (), false , false , false , 0 );
796
- else
800
+ else if (VA.getLocInfo () == CCValAssign::BCvt) {
801
+ // If this is a short vector argument loaded from the stack,
802
+ // extend from i64 to full vector size and then bitcast.
803
+ assert (VA.getLocVT () == MVT::i64);
804
+ assert (VA.getValVT ().isVector ());
805
+ Value = DAG.getNode (ISD::BUILD_VECTOR, DL, MVT::v2i64,
806
+ Value, DAG.getUNDEF (MVT::i64));
807
+ Value = DAG.getNode (ISD::BITCAST, DL, VA.getValVT (), Value);
808
+ } else
797
809
assert (VA.getLocInfo () == CCValAssign::Full && " Unsupported getLocInfo" );
798
810
return Value;
799
811
}
@@ -810,6 +822,14 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL,
810
822
return DAG.getNode (ISD::ZERO_EXTEND, DL, VA.getLocVT (), Value);
811
823
case CCValAssign::AExt:
812
824
return DAG.getNode (ISD::ANY_EXTEND, DL, VA.getLocVT (), Value);
825
+ case CCValAssign::BCvt:
826
+ // If this is a short vector argument to be stored to the stack,
827
+ // bitcast to v2i64 and then extract first element.
828
+ assert (VA.getLocVT () == MVT::i64);
829
+ assert (VA.getValVT ().isVector ());
830
+ Value = DAG.getNode (ISD::BITCAST, DL, MVT::v2i64, Value);
831
+ return DAG.getNode (ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT (), Value,
832
+ DAG.getConstant (0 , DL, MVT::i32));
813
833
case CCValAssign::Full:
814
834
return Value;
815
835
default :
@@ -3910,6 +3930,23 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
3910
3930
return DAG.getNode (ISD::BITCAST, DL, VT, Res);
3911
3931
}
3912
3932
3933
+ SDValue
3934
+ SystemZTargetLowering::lowerExtendVectorInreg (SDValue Op, SelectionDAG &DAG,
3935
+ unsigned UnpackHigh) const {
3936
+ SDValue PackedOp = Op.getOperand (0 );
3937
+ EVT OutVT = Op.getValueType ();
3938
+ EVT InVT = PackedOp.getValueType ();
3939
+ unsigned ToBits = OutVT.getVectorElementType ().getSizeInBits ();
3940
+ unsigned FromBits = InVT.getVectorElementType ().getSizeInBits ();
3941
+ do {
3942
+ FromBits *= 2 ;
3943
+ EVT OutVT = MVT::getVectorVT (MVT::getIntegerVT (FromBits),
3944
+ SystemZ::VectorBits / FromBits);
3945
+ PackedOp = DAG.getNode (UnpackHigh, SDLoc (PackedOp), OutVT, PackedOp);
3946
+ } while (FromBits != ToBits);
3947
+ return PackedOp;
3948
+ }
3949
+
3913
3950
SDValue SystemZTargetLowering::lowerShift (SDValue Op, SelectionDAG &DAG,
3914
3951
unsigned ByScalar) const {
3915
3952
// Look for cases where a vector shift can use the *_BY_SCALAR form.
@@ -4058,6 +4095,10 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
4058
4095
return lowerINSERT_VECTOR_ELT (Op, DAG);
4059
4096
case ISD::EXTRACT_VECTOR_ELT:
4060
4097
return lowerEXTRACT_VECTOR_ELT (Op, DAG);
4098
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
4099
+ return lowerExtendVectorInreg (Op, DAG, SystemZISD::UNPACK_HIGH);
4100
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
4101
+ return lowerExtendVectorInreg (Op, DAG, SystemZISD::UNPACKL_HIGH);
4061
4102
case ISD::SHL:
4062
4103
return lowerShift (Op, DAG, SystemZISD::VSHL_BY_SCALAR);
4063
4104
case ISD::SRL:
@@ -4122,6 +4163,10 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
4122
4163
OPCODE (PERMUTE_DWORDS);
4123
4164
OPCODE (PERMUTE);
4124
4165
OPCODE (PACK);
4166
+ OPCODE (UNPACK_HIGH);
4167
+ OPCODE (UNPACKL_HIGH);
4168
+ OPCODE (UNPACK_LOW);
4169
+ OPCODE (UNPACKL_LOW);
4125
4170
OPCODE (VSHL_BY_SCALAR);
4126
4171
OPCODE (VSRL_BY_SCALAR);
4127
4172
OPCODE (VSRA_BY_SCALAR);
@@ -4334,17 +4379,35 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
4334
4379
}
4335
4380
}
4336
4381
}
4337
- // (z_merge_high 0, 0) -> 0. This is mostly useful for using VLLEZF
4338
- // for v4f32.
4339
- if (Opcode == SystemZISD::MERGE_HIGH) {
4382
+ if (Opcode == SystemZISD::MERGE_HIGH ||
4383
+ Opcode == SystemZISD::MERGE_LOW) {
4340
4384
SDValue Op0 = N->getOperand (0 );
4341
4385
SDValue Op1 = N->getOperand (1 );
4342
- if (Op0 == Op1) {
4343
- if (Op0.getOpcode () == ISD::BITCAST)
4344
- Op0 = Op0.getOperand (0 );
4345
- if (Op0.getOpcode () == SystemZISD::BYTE_MASK &&
4346
- cast<ConstantSDNode>(Op0.getOperand (0 ))->getZExtValue () == 0 )
4386
+ if (Op0.getOpcode () == ISD::BITCAST)
4387
+ Op0 = Op0.getOperand (0 );
4388
+ if (Op0.getOpcode () == SystemZISD::BYTE_MASK &&
4389
+ cast<ConstantSDNode>(Op0.getOperand (0 ))->getZExtValue () == 0 ) {
4390
+ // (z_merge_* 0, 0) -> 0. This is mostly useful for using VLLEZF
4391
+ // for v4f32.
4392
+ if (Op1 == N->getOperand (0 ))
4347
4393
return Op1;
4394
+ // (z_merge_? 0, X) -> (z_unpackl_? X).
4395
+ EVT VT = Op1.getValueType ();
4396
+ unsigned ElemBytes = VT.getVectorElementType ().getStoreSize ();
4397
+ if (ElemBytes <= 4 ) {
4398
+ Opcode = (Opcode == SystemZISD::MERGE_HIGH ?
4399
+ SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW);
4400
+ EVT InVT = VT.changeVectorElementTypeToInteger ();
4401
+ EVT OutVT = MVT::getVectorVT (MVT::getIntegerVT (ElemBytes * 16 ),
4402
+ SystemZ::VectorBytes / ElemBytes / 2 );
4403
+ if (VT != InVT) {
4404
+ Op1 = DAG.getNode (ISD::BITCAST, SDLoc (N), InVT, Op1);
4405
+ DCI.AddToWorklist (Op1.getNode ());
4406
+ }
4407
+ SDValue Op = DAG.getNode (Opcode, SDLoc (N), OutVT, Op1);
4408
+ DCI.AddToWorklist (Op.getNode ());
4409
+ return DAG.getNode (ISD::BITCAST, SDLoc (N), VT, Op);
4410
+ }
4348
4411
}
4349
4412
}
4350
4413
// If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better
0 commit comments