Skip to content

Commit cec8b61

Browse files
committed
[SLP]Do not reorder top nodes if they do not require reordering.
No need to reorder the top nodes if they are not stores or insertelement instructions, and each node should be analyzed only once when the bottom-to-top analysis is performed. We still end up with extractelements for the top node scalars, and the final shuffle just adds extra cost and currently crashes the compiler for PHI nodes. Differential Revision: https://reviews.llvm.org/D116760
1 parent c80d349 commit cec8b61

File tree

2 files changed

+51
-4
lines changed

2 files changed

+51
-4
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3307,9 +3307,14 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
33073307
MapVector<OrdersType, unsigned,
33083308
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
33093309
OrdersUses;
3310+
// Do the analysis for each tree entry only once, otherwise the order of
3311+
// the same node may be considered several times, though it might not be
3312+
// profitable.
33103313
SmallPtrSet<const TreeEntry *, 4> VisitedOps;
33113314
for (const auto &Op : Data.second) {
33123315
TreeEntry *OpTE = Op.second;
3316+
if (!VisitedOps.insert(OpTE).second)
3317+
continue;
33133318
if (!OpTE->ReuseShuffleIndices.empty() ||
33143319
(IgnoreReorder && OpTE == VectorizableTree.front().get()))
33153320
continue;
@@ -3333,9 +3338,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
33333338
} else {
33343339
++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
33353340
}
3336-
if (VisitedOps.insert(OpTE).second)
3337-
OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
3338-
OpTE->UserTreeIndices.size();
3341+
OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
3342+
OpTE->UserTreeIndices.size();
33393343
assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0.");
33403344
--OrdersUses[{}];
33413345
}
@@ -8444,7 +8448,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
84448448
if (R.isTreeTinyAndNotFullyVectorizable())
84458449
continue;
84468450
R.reorderTopToBottom();
8447-
R.reorderBottomToTop();
8451+
R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
84488452
R.buildExternalUses();
84498453

84508454
R.computeMinimumValueSizes();
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -S --slp-vectorizer -mtriple=x86_64-unknown %s -slp-threshold=-5 | FileCheck %s
3+
4+
define i32 @test(i32* %isec) {
5+
; CHECK-LABEL: @test(
6+
; CHECK-NEXT: entry:
7+
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[ISEC:%.*]], i32 0
8+
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX10]] to <2 x i32>*
9+
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8
10+
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
11+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
12+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
13+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
14+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i32 1
15+
; CHECK-NEXT: br i1 false, label [[BLOCK1:%.*]], label [[BLOCK3:%.*]]
16+
; CHECK: block1:
17+
; CHECK-NEXT: br i1 false, label [[BLOCK2:%.*]], label [[BLOCK3]]
18+
; CHECK: block2:
19+
; CHECK-NEXT: br label [[BLOCK3]]
20+
; CHECK: block3:
21+
; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[SHUFFLE]], [[BLOCK1]] ], [ [[SHUFFLE]], [[BLOCK2]] ], [ [[TMP5]], [[ENTRY:%.*]] ]
22+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
23+
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
24+
; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP7]], [[TMP8]]
25+
; CHECK-NEXT: ret i32 [[TMP9]]
26+
;
27+
entry:
28+
%arrayidx10 = getelementptr inbounds i32, i32* %isec, i32 0
29+
%0 = bitcast i32* %arrayidx10 to <2 x i32>*
30+
%1 = load <2 x i32>, <2 x i32>* %0, align 8
31+
%2 = extractelement <2 x i32> %1, i32 1
32+
%3 = extractelement <2 x i32> %1, i32 0
33+
br i1 false, label %block1, label %block3
34+
block1:
35+
br i1 false, label %block2, label %block3
36+
block2:
37+
br label %block3
38+
block3:
39+
%4 = phi i32 [ %2, %block1 ], [ %2, %block2 ], [ %3, %entry ]
40+
%5 = phi i32 [ %3, %block1 ], [ %3, %block2 ], [ %2, %entry ]
41+
%6 = mul i32 %4, %5
42+
ret i32 %6
43+
}

0 commit comments

Comments
 (0)