
Commit 6998f8a

[LoopVectorize] Simplify scalar cost calculation in getInstructionCost
This patch simplifies the calculation of certain costs in getInstructionCost when isScalarAfterVectorization() returns true. There are a few places where we multiply a cost by a number N, i.e.

  unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
  return N * TTI.getArithmeticInstrCost(...

After some investigation it seems that only these cases occur in practice:

1. VF is a scalar, in which case N = 1.
2. VF is a vector. We can only get here if: a) the instruction is a GEP/bitcast/PHI with scalar uses, or b) this is an update to an induction variable that remains scalar.

I have changed the code so that N is assumed to always be 1. For GEPs the cost is always 0, since this is calculated later on as part of the load/store cost. PHI nodes are costed separately and were never previously multiplied by VF. For all other cases I have added an assert that none of the users needs scalarising, which didn't fire in any unit tests.

Only one test required fixing, and I believe the original cost for the scalar add instruction was wrong, since only one copy remains after vectorisation. I have also added a new test for the case when a pointer PHI feeds directly into a store that will be scalarised, as we were previously never testing this.

Differential Revision: https://reviews.llvm.org/D99718
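To make the shape of the change concrete, here is a minimal standalone C++ sketch of the before/after cost calculation. Everything in it (ElementCount, instrCost, oldCost, newCost) is a hypothetical stand-in, not the patch's actual code; the real logic lives in LoopVectorizationCostModel::getInstructionCost in the diff below.

  // Hypothetical stand-ins: only the shape of the calculation matters here.
  struct ElementCount {
    unsigned MinVal;
    unsigned getKnownMinValue() const { return MinVal; }
  };

  // Stub standing in for a TTI.get*InstrCost(...) query.
  unsigned instrCost() { return 1; }

  // Before the patch: the scalar cost was multiplied by VF whenever the
  // instruction was known to remain scalar after vectorization.
  unsigned oldCost(bool IsScalarAfterVectorization, ElementCount VF) {
    unsigned N = IsScalarAfterVectorization ? VF.getKnownMinValue() : 1;
    return N * instrCost();
  }

  // After the patch: only one copy of such an instruction is generated in
  // the vectorized loop, so N is always 1 and the multiply is dropped.
  unsigned newCost() { return instrCost(); }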
1 parent: c835630

File tree

4 files changed: +121, -29 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 45 additions & 28 deletions
@@ -7383,10 +7383,39 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   Type *RetTy = I->getType();
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
-  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
   auto SE = PSE.getSE();
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
+  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
+                                                ElementCount VF) -> bool {
+    if (VF.isScalar())
+      return true;
+
+    auto Scalarized = InstsToScalarize.find(VF);
+    assert(Scalarized != InstsToScalarize.end() &&
+           "VF not yet analyzed for scalarization profitability");
+    return !Scalarized->second.count(I) &&
+           llvm::all_of(I->users(), [&](User *U) {
+             auto *UI = cast<Instruction>(U);
+             return !Scalarized->second.count(UI);
+           });
+  };
+
+  if (isScalarAfterVectorization(I, VF)) {
+    // With the exception of GEPs and PHIs, after scalarization there should
+    // only be one copy of the instruction generated in the loop. This is
+    // because the VF is either 1, or any instructions that need scalarizing
+    // have already been dealt with by the time we get here. As a result,
+    // it means we don't have to multiply the instruction cost by VF.
+    assert(I->getOpcode() == Instruction::GetElementPtr ||
+           I->getOpcode() == Instruction::PHI ||
+           (I->getOpcode() == Instruction::BitCast &&
+            I->getType()->isPointerTy()) ||
+           hasSingleCopyAfterVectorization(I, VF));
+    VectorTy = RetTy;
+  } else
+    VectorTy = ToVectorTy(RetTy, VF);
+
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
@@ -7514,21 +7543,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
       Op2VK = TargetTransformInfo::OK_UniformValue;
 
     SmallVector<const Value *, 4> Operands(I->operand_values());
-    unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
-    return N * TTI.getArithmeticInstrCost(
-                   I->getOpcode(), VectorTy, CostKind,
-                   TargetTransformInfo::OK_AnyValue,
-                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
+    return TTI.getArithmeticInstrCost(
+        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
+        Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
   }
   case Instruction::FNeg: {
     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
-    unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
-    return N * TTI.getArithmeticInstrCost(
-                   I->getOpcode(), VectorTy, CostKind,
-                   TargetTransformInfo::OK_AnyValue,
-                   TargetTransformInfo::OK_AnyValue,
-                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
-                   I->getOperand(0), I);
+    return TTI.getArithmeticInstrCost(
+        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
+        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
+        TargetTransformInfo::OP_None, I->getOperand(0), I);
   }
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
@@ -7583,6 +7607,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
     return getMemoryInstructionCost(I, VF);
   }
+  case Instruction::BitCast:
+    if (I->getType()->isPointerTy())
+      return 0;
+    LLVM_FALLTHROUGH;
   case Instruction::ZExt:
   case Instruction::SExt:
   case Instruction::FPToUI:
@@ -7593,8 +7621,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   case Instruction::SIToFP:
   case Instruction::UIToFP:
   case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
+  case Instruction::FPTrunc: {
     // Computes the CastContextHint from a Load/Store instruction.
     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
@@ -7672,14 +7699,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
     }
   }
 
-    unsigned N;
-    if (isScalarAfterVectorization(I, VF)) {
-      assert(!VF.isScalable() && "VF is assumed to be non scalable");
-      N = VF.getKnownMinValue();
-    } else
-      N = 1;
-    return N *
-           TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
+    return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
   }
   case Instruction::Call: {
     bool NeedToScalarize;
@@ -7694,11 +7714,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   case Instruction::ExtractValue:
     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
   default:
-    // The cost of executing VF copies of the scalar instruction. This opcode
-    // is unknown. Assume that it is the same as 'mul'.
-    return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
-                                       Instruction::Mul, VectorTy, CostKind) +
-           getScalarizationOverhead(I, VF);
+    // This opcode is unknown. Assume that it is the same as 'mul'.
+    return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
   } // end of switch.
 }
 
77047721

llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ target triple = "aarch64--linux-gnu"
 
 ; CHECK-LABEL: all_scalar
 ; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
-; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
 ; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
 ;
 define void @all_scalar(i64* %a, i64 %n) {

llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll

Lines changed: 35 additions & 0 deletions
@@ -86,6 +86,41 @@ for.end:
   ret void
 }
 
+; CHECK-LABEL: predicated_store_phi
+;
+; Same as predicated_store except we use a pointer PHI to maintain the address.
+;
+; CHECK: Found new scalar instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
+; CHECK: Found new scalar instruction: %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
+; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %addr, align 4
+; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
+; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %addr, align 4
+;
+define void @predicated_store_phi(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
+  %tmp1 = load i32, i32* %addr, align 4
+  %tmp2 = add nsw i32 %tmp1, %x
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  store i32 %tmp2, i32* %addr, align 4
+  br label %for.inc
+
+for.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
 ; CHECK-LABEL: predicated_udiv_scalarized_operand
 ;
 ; This test checks that we correctly compute the cost of the predicated udiv
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -S -o - < %s 2>&1 | FileCheck %s
+
+%struct.foo = type { i32, i64 }
+
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %0 = bitcast i64* %b to i32*
+
+; The bitcast below will be scalarized due to the predication in the loop. Bitcasts
+; between pointer types should be treated as free, despite the scalarization.
+define void @foo(%struct.foo* noalias nocapture %in, i32* noalias nocapture readnone %out, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %if.end
+  %i.012 = phi i64 [ %inc, %if.end ], [ 0, %entry ]
+  %b = getelementptr inbounds %struct.foo, %struct.foo* %in, i64 %i.012, i32 1
+  %0 = bitcast i64* %b to i32*
+  %a = getelementptr inbounds %struct.foo, %struct.foo* %in, i64 %i.012, i32 0
+  %1 = load i32, i32* %a, align 8
+  %tobool.not = icmp eq i32 %1, 0
+  br i1 %tobool.not, label %if.end, label %land.lhs.true
+
+land.lhs.true:                                    ; preds = %for.body
+  %2 = load i32, i32* %0, align 4
+  %cmp2 = icmp sgt i32 %2, 0
+  br i1 %cmp2, label %if.then, label %if.end
+
+if.then:                                          ; preds = %land.lhs.true
+  %sub = add nsw i32 %2, -1
+  store i32 %sub, i32* %0, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %land.lhs.true, %for.body
+  %inc = add nuw nsw i64 %i.012, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %if.end
+  ret void
+}
