Skip to content

Commit f0c2a5a

Browse files
committed
[LV] Generalize conditions for sinking instrs for first order recurrences.
If the recurrence PHI node has a single user, we can sink any instruction without side effects, given that all users are dominated by the instruction computing the incoming value of the next iteration ('Previous'). We can sink instructions that may cause traps, because that only causes the trap to occur later, but not on any new paths. With the relaxed check, we also have to make sure that we do not have a direct cycle (meaning PHI user == 'Previous'), which indicates a reduction relation, which potentially gets missed by ReductionDescriptor. As follow-ups, we can also sink stores, iff they do not alias with other instructions we move them across, and we could also support sinking chains of instructions and multiple users of the PHI. Fixes PR43398. Reviewers: hsaito, dcaballe, Ayal, rengolin Reviewed By: Ayal Differential Revision: https://reviews.llvm.org/D69228
1 parent 505c4da commit f0c2a5a

File tree

2 files changed

+271
-14
lines changed

2 files changed

+271
-14
lines changed

llvm/lib/Analysis/IVDescriptors.cpp

+26-14
Original file line numberDiff line numberDiff line change
@@ -699,25 +699,37 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence(
699699
// Ensure every user of the phi node is dominated by the previous value.
700700
// The dominance requirement ensures the loop vectorizer will not need to
701701
// vectorize the initial value prior to the first iteration of the loop.
702-
// TODO: Consider extending this sinking to handle other kinds of instructions
703-
// and expressions, beyond sinking a single cast past Previous.
702+
// TODO: Consider extending this sinking to handle memory instructions and
703+
// phis with multiple users.
704+
705+
// Returns true if all uses of I are dominated by DominatedBy.
706+
auto allUsesDominatedBy = [DT](Instruction *I, Instruction *DominatedBy) {
707+
return all_of(I->uses(), [DT, DominatedBy](Use &U) {
708+
return DT->dominates(DominatedBy, U);
709+
});
710+
};
711+
704712
if (Phi->hasOneUse()) {
705-
auto *I = Phi->user_back();
706-
if (I->isCast() && (I->getParent() == Phi->getParent()) && I->hasOneUse() &&
707-
DT->dominates(Previous, I->user_back())) {
708-
if (!DT->dominates(Previous, I)) // Otherwise we're good w/o sinking.
709-
SinkAfter[I] = Previous;
713+
Instruction *I = Phi->user_back();
714+
715+
// If the user of the PHI is also the incoming value, we potentially have a
716+
// reduction, which cannot be handled by sinking.
717+
if (Previous == I)
718+
return false;
719+
720+
if (DT->dominates(Previous, I)) // We already are good w/o sinking.
710721
return true;
711-
}
712-
}
713722

714-
for (User *U : Phi->users())
715-
if (auto *I = dyn_cast<Instruction>(U)) {
716-
if (!DT->dominates(Previous, I))
717-
return false;
723+
// We can sink any instruction without side effects, as long as all users
724+
// are dominated by the instruction we are sinking after.
725+
if (I->getParent() == Phi->getParent() && !I->mayHaveSideEffects() &&
726+
allUsesDominatedBy(I, Previous)) {
727+
SinkAfter[I] = Previous;
728+
return true;
718729
}
730+
}
719731

720-
return true;
732+
return allUsesDominatedBy(Phi, Previous);
721733
}
722734

723735
/// This function returns the identity element (or neutral element) for
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
2+
3+
4+
@p = external local_unnamed_addr global [257 x i32], align 16
5+
@q = external local_unnamed_addr global [257 x i32], align 16
6+
7+
; Test case for PR43398.
8+
9+
define void @can_sink_after_store(i32 %x, i32* %ptr, i64 %tc) local_unnamed_addr #0 {
10+
; CHECK-LABEL: vector.ph:
11+
; CHECK: %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %x, i32 0
12+
; CHECK-NEXT: %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
13+
; CHECK-NEXT: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
14+
; CHECK-NEXT: br label %vector.body
15+
16+
; CHECK-LABEL: vector.body:
17+
; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
18+
; CHECK-NEXT: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
19+
; CHECK-NEXT: %offset.idx = add i64 1, %index
20+
; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %offset.idx, i32 0
21+
; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
22+
; CHECK-NEXT: %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
23+
; CHECK-NEXT: %0 = add i64 %offset.idx, 0
24+
; CHECK-NEXT: %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0
25+
; CHECK-NEXT: %2 = getelementptr inbounds i32, i32* %1, i32 0
26+
; CHECK-NEXT: %3 = bitcast i32* %2 to <4 x i32>*
27+
; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* %3, align 4
28+
; CHECK-NEXT: %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
29+
; CHECK-NEXT: %5 = add <4 x i32> %4, %broadcast.splat2
30+
; CHECK-NEXT: %6 = add <4 x i32> %5, %wide.load
31+
; CHECK-NEXT: %7 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0
32+
; CHECK-NEXT: %8 = getelementptr inbounds i32, i32* %7, i32 0
33+
; CHECK-NEXT: %9 = bitcast i32* %8 to <4 x i32>*
34+
; CHECK-NEXT: store <4 x i32> %6, <4 x i32>* %9, align 4
35+
; CHECK-NEXT: %index.next = add i64 %index, 4
36+
; CHECK-NEXT: %10 = icmp eq i64 %index.next, 1996
37+
; CHECK-NEXT: br i1 %10, label %middle.block, label %vector.body
38+
;
39+
entry:
40+
br label %preheader
41+
42+
preheader:
43+
%idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
44+
%.pre = load i32, i32* %idx.phi.trans, align 4
45+
br label %for
46+
47+
for:
48+
%pre.phi = phi i32 [ %.pre, %preheader ], [ %pre.next, %for ]
49+
%iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
50+
%add.1 = add i32 %pre.phi, %x
51+
%idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
52+
%pre.next = load i32, i32* %idx.1, align 4
53+
%add.2 = add i32 %add.1, %pre.next
54+
%idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
55+
store i32 %add.2, i32* %idx.2, align 4
56+
%iv.next = add nuw nsw i64 %iv, 1
57+
%exitcond = icmp eq i64 %iv.next, 2000
58+
br i1 %exitcond, label %exit, label %for
59+
60+
exit:
61+
ret void
62+
}
63+
64+
; We can sink potentially trapping instructions, as this will only delay the trap
65+
; and not introduce traps on additional paths.
66+
define void @sink_sdiv(i32 %x, i32* %ptr, i64 %tc) local_unnamed_addr #0 {
67+
; CHECK-LABEL: vector.ph:
68+
; CHECK: %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %x, i32 0
69+
; CHECK-NEXT: %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
70+
; CHECK-NEXT: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
71+
; CHECK-NEXT: br label %vector.body
72+
73+
; CHECK-LABEL: vector.body:
74+
; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
75+
; CHECK-NEXT: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
76+
; CHECK-NEXT: %offset.idx = add i64 1, %index
77+
; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %offset.idx, i32 0
78+
; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
79+
; CHECK-NEXT: %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
80+
; CHECK-NEXT: %0 = add i64 %offset.idx, 0
81+
; CHECK-NEXT: %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0
82+
; CHECK-NEXT: %2 = getelementptr inbounds i32, i32* %1, i32 0
83+
; CHECK-NEXT: %3 = bitcast i32* %2 to <4 x i32>*
84+
; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* %3, align 4
85+
; CHECK-NEXT: %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
86+
; CHECK-NEXT: %5 = sdiv <4 x i32> %4, %broadcast.splat2
87+
; CHECK-NEXT: %6 = add <4 x i32> %5, %wide.load
88+
; CHECK-NEXT: %7 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0
89+
; CHECK-NEXT: %8 = getelementptr inbounds i32, i32* %7, i32 0
90+
; CHECK-NEXT: %9 = bitcast i32* %8 to <4 x i32>*
91+
; CHECK-NEXT: store <4 x i32> %6, <4 x i32>* %9, align 4
92+
; CHECK-NEXT: %index.next = add i64 %index, 4
93+
; CHECK-NEXT: %10 = icmp eq i64 %index.next, 1996
94+
; CHECK-NEXT: br i1 %10, label %middle.block, label %vector.body
95+
;
96+
entry:
97+
br label %preheader
98+
99+
preheader:
100+
%idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
101+
%.pre = load i32, i32* %idx.phi.trans, align 4
102+
br label %for
103+
104+
for:
105+
%pre.phi = phi i32 [ %.pre, %preheader ], [ %pre.next, %for ]
106+
%iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
107+
%div.1 = sdiv i32 %pre.phi, %x
108+
%idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
109+
%pre.next = load i32, i32* %idx.1, align 4
110+
%add.2 = add i32 %div.1, %pre.next
111+
%idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
112+
store i32 %add.2, i32* %idx.2, align 4
113+
%iv.next = add nuw nsw i64 %iv, 1
114+
%exitcond = icmp eq i64 %iv.next, 2000
115+
br i1 %exitcond, label %exit, label %for
116+
117+
exit:
118+
ret void
119+
}
120+
121+
; FIXME: Currently we can only sink a single instruction. For the example below,
122+
; we also have to sink users.
123+
define void @cannot_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) {
124+
; CHECK-LABEL: define void @cannot_sink_with_additional_user(
125+
; CHECK-NEXT: entry:
126+
; CHECK-NEXT: br label %preheader
127+
128+
; CHECK-LABEL: preheader: ; preds = %entry
129+
; CHECK: br label %for
130+
131+
; CHECK-LABEL: for: ; preds = %for, %preheader
132+
; CHECK: br i1 %exitcond, label %exit, label %for
133+
134+
; CHECK-LABEL: exit:
135+
; CHECK-NEXT: ret void
136+
137+
entry:
138+
br label %preheader
139+
140+
preheader:
141+
%idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
142+
%.pre = load i32, i32* %idx.phi.trans, align 4
143+
br label %for
144+
145+
for:
146+
%pre.phi = phi i32 [ %.pre, %preheader ], [ %pre.next, %for ]
147+
%iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
148+
%add.1 = add i32 %pre.phi, %x
149+
%add.2 = add i32 %add.1, %x
150+
%idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
151+
%pre.next = load i32, i32* %idx.1, align 4
152+
%add.3 = add i32 %add.1, %pre.next
153+
%add.4 = add i32 %add.2, %add.3
154+
%idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
155+
store i32 %add.4, i32* %idx.2, align 4
156+
%iv.next = add nuw nsw i64 %iv, 1
157+
%exitcond = icmp eq i64 %iv.next, 2000
158+
br i1 %exitcond, label %exit, label %for
159+
160+
exit:
161+
ret void
162+
}
163+
164+
; FIXME: We can sink a store, if we can guarantee that it does not alias any
165+
; loads/stores in between.
166+
define void @cannot_sink_store(i32 %x, i32* %ptr, i64 %tc) {
167+
; CHECK-LABEL: define void @cannot_sink_store(
168+
; CHECK-NEXT: entry:
169+
; CHECK-NEXT: br label %preheader
170+
171+
; CHECK-LABEL: preheader: ; preds = %entry
172+
; CHECK: br label %for
173+
174+
; CHECK-LABEL: for: ; preds = %for, %preheader
175+
; CHECK: br i1 %exitcond, label %exit, label %for
176+
177+
; CHECK-LABEL: exit:
178+
; CHECK-NEXT: ret void
179+
;
180+
entry:
181+
br label %preheader
182+
183+
preheader:
184+
%idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
185+
%.pre = load i32, i32* %idx.phi.trans, align 4
186+
br label %for
187+
188+
for:
189+
%pre.phi = phi i32 [ %.pre, %preheader ], [ %pre.next, %for ]
190+
%iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
191+
%add.1 = add i32 %pre.phi, %x
192+
store i32 %add.1, i32* %ptr
193+
%idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
194+
%pre.next = load i32, i32* %idx.1, align 4
195+
%add.2 = add i32 %add.1, %pre.next
196+
%idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
197+
store i32 %add.2, i32* %idx.2, align 4
198+
%iv.next = add nuw nsw i64 %iv, 1
199+
%exitcond = icmp eq i64 %iv.next, 2000
200+
br i1 %exitcond, label %exit, label %for
201+
202+
exit:
203+
ret void
204+
}
205+
206+
; Some kinds of reductions are not detected by IVDescriptors. If we have a
207+
; cycle, we cannot sink it.
208+
define void @cannot_sink_reduction(i32 %x, i32* %ptr, i64 %tc) {
209+
; CHECK-LABEL: define void @cannot_sink_reduction(
210+
; CHECK-NEXT: entry:
211+
; CHECK-NEXT: br label %preheader
212+
213+
; CHECK-LABEL: preheader: ; preds = %entry
214+
; CHECK: br label %for
215+
216+
; CHECK-LABEL: for: ; preds = %for, %preheader
217+
; CHECK: br i1 %exitcond, label %exit, label %for
218+
219+
; CHECK-LABEL: exit: ; preds = %for
220+
; CHECK-NEXT: ret void
221+
;
222+
entry:
223+
br label %preheader
224+
225+
preheader:
226+
%idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
227+
%.pre = load i32, i32* %idx.phi.trans, align 4
228+
br label %for
229+
230+
for:
231+
%pre.phi = phi i32 [ %.pre, %preheader ], [ %d, %for ]
232+
%iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
233+
%d = sdiv i32 %pre.phi, %x
234+
%idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
235+
%pre.next = load i32, i32* %idx.1, align 4
236+
%add.2 = add i32 %x, %pre.next
237+
%idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
238+
store i32 %add.2, i32* %idx.2, align 4
239+
%iv.next = add nuw nsw i64 %iv, 1
240+
%exitcond = icmp eq i64 %iv.next, 2000
241+
br i1 %exitcond, label %exit, label %for
242+
243+
exit:
244+
ret void
245+
}

0 commit comments

Comments
 (0)